def __init__(self, config, *args, captcha_lock=None, browser_num=1, **kwargs):
    """Create a new SelScraper thread Instance.

    Args:
        captcha_lock: To sync captcha solving (stdin)
        proxy: Optional, if set, use the proxy to route all scraping through it.
        browser_num: A unique, semantic number for each thread.
    """
    self.search_input = None

    threading.Thread.__init__(self)
    SearchEngineScrape.__init__(self, config, *args, **kwargs)

    self.browser_type = self.config.get('sel_browser', 'chrome').lower()
    self.browser_mode = self.config.get('browser_mode', 'headless').lower()
    self.browser_num = browser_num
    self.captcha_lock = captcha_lock
    self.scrape_method = 'selenium'

    # number of tabs per instance
    self.number_of_tabs = self.config.get('num_tabs', 1)

    self.xvfb_display = self.config.get('xvfb_display', None)

    self.search_param_values = self._get_search_param_values()

    self.user_agent = random_user_agent()

    # get the base search url based on the search engine.
    self.base_search_url = get_base_search_url_by_search_engine(
        self.config, self.search_engine_name, self.scrape_method)
    super().instance_creation_info(self.__class__.__name__)
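# The constructor above reads several config keys. A minimal sketch of such a
# config dict follows; the values are illustrative assumptions, not defaults
# taken from the project.
example_config = {
    'sel_browser': 'chrome',     # selenium browser backend
    'browser_mode': 'headless',  # or 'normal'
    'num_tabs': 1,               # tabs per browser instance
    'xvfb_display': None,        # e.g. ':1' to render into an Xvfb display
}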
def search(self, rand=True, timeout=15):
    """The actual search for the search engine.

    When raising StopScrapingException, the scraper will stop.
    When returning False, the scraper tries to continue with the next keyword.
    """
    success = True

    self.build_search()

    if rand:
        self.headers['User-Agent'] = random_user_agent(only_desktop=True)

    try:
        super().detection_prevention_sleep()
        super().keyword_info()

        request = self.requests.get(self.base_search_url + urlencode(self.search_params),
                                    headers=self.headers, timeout=timeout)

        self.requested_at = datetime.datetime.utcnow()
        self.html = request.text

        logger.debug('[HTTP - {url}, headers={headers}, params={params}'.format(
            url=request.url, headers=self.headers, params=self.search_params))

    except self.requests.ConnectionError as ce:
        self.status = 'Network problem occurred {}'.format(ce)
        success = False
    except self.requests.Timeout as te:
        self.status = 'Connection timeout {}'.format(te)
        success = False
    except self.requests.exceptions.RequestException as e:
        # In case of any http networking exception that wasn't caught
        # in the actual request, just end the worker.
        self.status = 'Stopping scraping because {}'.format(e)
    else:
        if not request.ok:
            self.handle_request_denied(request.status_code)
            success = False

    super().after_search()

    return success
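# A minimal sketch (not part of the original module) of the caller-side
# contract the docstring above describes: a hypothetical worker loop that
# moves on to the next keyword when search() returns False and stops on
# StopScrapingException. `run_keywords` and the 'q' parameter name are
# assumptions made for illustration.
def run_keywords(scraper, keywords):
    for keyword in keywords:
        scraper.search_params['q'] = keyword  # hypothetical parameter name
        try:
            if not scraper.search():
                # False means: log the failure and continue with the next keyword.
                logger.warning('search failed for %r: %s', keyword, scraper.status)
                continue
        except StopScrapingException as e:
            # A raised StopScrapingException ends the whole worker.
            logger.error('stopping scraper: %s', e)
            break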
def search(self, rand=True, timeout=15):
    """The actual search for the search engine.

    When raising StopScrapingException, the scraper will stop.
    When returning False, the scraper tries to continue with the next keyword.
    """
    success = True

    self.build_search()

    if rand:
        self.headers['User-Agent'] = random_user_agent(only_desktop=True)

    try:
        super().detection_prevention_sleep()
        super().keyword_info()

        if self.proxy is not None:
            request = self.requests.get(self.base_search_url + urlencode(self.search_params),
                                        headers=self.headers, timeout=timeout,
                                        proxies={self.proxy.proto: '{}://{}:{}'.format(
                                            self.proxy.proto, self.proxy.host, self.proxy.port)})
        else:
            request = self.requests.get(self.base_search_url + urlencode(self.search_params),
                                        headers=self.headers, timeout=timeout)

        self.requested_at = datetime.datetime.utcnow()
        self.html = request.text

        logger.debug('[HTTP - {url}, headers={headers}, params={params}'.format(
            url=request.url, headers=self.headers, params=self.search_params))

    except self.requests.ConnectionError as ce:
        self.status = 'Network problem occurred {}'.format(ce)
        logger.error(self.status)
        success = False
        exit(1)
    except self.requests.Timeout as te:
        self.status = 'Connection timeout {}'.format(te)
        logger.error(self.status)
        success = False
        exit(1)
    except self.requests.exceptions.RequestException as e:
        # In case of any http networking exception that wasn't caught
        # in the actual request, just end the worker.
        self.status = 'Stopping scraping because {}'.format(e)
        logger.error(self.status)
        exit(1)
    else:
        if not request.ok:
            self.handle_request_denied(request.status_code)
            success = False

    super().after_search()

    return success
def search(self, rand=True, timeout=30):
    """The actual search for the search engine.

    When raising StopScrapingException, the scraper will stop.
    When returning False, the scraper tries to continue with the next keyword.
    """
    success = True

    self.html = ''
    self.status = 'successful'

    self.build_search()

    if rand:
        self.headers['User-Agent'] = random_user_agent(only_desktop=True)

    try:
        super().detection_prevention_sleep()
        super().keyword_info()

        # Build an authenticated proxy URL for requests; guard against a
        # missing proxy so the attribute access cannot raise on None.
        if self.proxy is not None:
            proxy_string = '%s:%s@%s:%s' % (self.proxy.username, self.proxy.password,
                                            self.proxy.host, self.proxy.port)
            proxies = dict(http='http://' + proxy_string,
                           https='https://' + proxy_string)
        else:
            proxies = None

        request = self.requests.get(self.base_search_url + urlencode(self.search_params),
                                    headers=self.headers, timeout=timeout,
                                    proxies=proxies)

        self.requested_at = datetime.datetime.utcnow()
        self.html = request.text

        logger.debug('[HTTP - {url}, headers={headers}, params={params}'.format(
            url=request.url, headers=self.headers, params=self.search_params))

    except self.requests.ConnectionError as ce:
        self.status = 'Network problem occurred {}'.format(ce)
        success = False
    except self.requests.Timeout as te:
        # logger.warning('{name}: request timeout'.format(name=self.name))
        self.status = 'Connection timeout {}'.format(te)
        success = False
    except self.requests.exceptions.RequestException as e:
        # In case of any http networking exception that wasn't caught
        # in the actual request, just end the worker.
        self.status = 'Stopping scraping because {}'.format(e)
    else:
        if not request.ok:
            self.handle_request_denied(request.status_code)
            success = False

    super().after_search()

    return success
def _get_PhantomJS(self):
    try:
        service_args = []

        if self.proxy:
            service_args.extend([
                '--proxy={}:{}'.format(self.proxy.host, self.proxy.port),
                '--proxy-type={}'.format(self.proxy.proto),
            ])

            if self.proxy.username and self.proxy.password:
                service_args.append(
                    '--proxy-auth={}:{}'.format(self.proxy.username, self.proxy.password)
                )

        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = random_user_agent(only_desktop=True)
        self.webdriver = webdriver.PhantomJS(executable_path=self.config['executable_path'],
                                             service_args=service_args,
                                             desired_capabilities=dcap)
        return True
    except WebDriverException as e:
        logger.error(e)
        return False
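# A minimal sketch (an assumption, not code from the original file) of how a
# browser factory like _get_PhantomJS() is typically selected based on the
# self.browser_type set in __init__. The _get_Chrome/_get_Firefox helper
# names are hypothetical.
def _get_webdriver(self):
    if self.browser_type == 'phantomjs':
        return self._get_PhantomJS()
    if self.browser_type == 'chrome':
        return self._get_Chrome()   # hypothetical helper
    if self.browser_type == 'firefox':
        return self._get_Firefox()  # hypothetical helper
    logger.error('unsupported sel_browser: {}'.format(self.browser_type))
    return False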
def put_in_database():
    # This block removes the database created in the previous scrape. It was
    # easier to do this than to remove the database code from this software.
    try:
        os.remove("google_scraper.db")
        print("\nSQLite database removed!")
    except OSError:
        print("\nNo database to remove")

    global counter
    counter += 1
    print("The current count is: ", counter)

    try:
        search = scrape_with_config(config)
    except GoogleSearchError as e:
        print(e)

    # Connect to the database.
    db = pymysql.connect(
        user=aws_username,    # Username of the AWS database
        passwd=aws_password,  # AWS database password
        host=aws_host,        # AWS instance
        port=3306,
        database='google_scraper',
        charset='utf8mb4')

    cur = db.cursor()

    target_directory = './images/'
    try:
        os.mkdir(target_directory)
    except FileExistsError:
        pass

    try:
        for serp in search.serps:
            # Create a new record in the serp table.
            sql = ("INSERT INTO serp (search_engine_name, scrape_method, "
                   "requested_at, search_query) VALUES (%s, %s, %s, %s)")
            cur.execute(sql, (serp.search_engine_name, serp.scrape_method,
                              serp.requested_at, serp.query))
            serp_id = cur.lastrowid
            print("\nserp_id is: ", serp_id)
            db.commit()

            for link in serp.links:
                link.has_image = 1 if link.has_image else 0

                if link.link_type == 'image_box':
                    # Parse pixel dimensions out of strings like 'height:90'.
                    h = re.search(r'height:(\d+)', link.image_dims)
                    image_height = int(h.group(1))

                    w = re.search(r'width:(\d+)', link.image_dims)
                    image_width = int(w.group(1))

                    # IMAGE SAVE LOOP
                    if counter % 24 == 0:
                        print("\nStarting image save loop...: ")
                        baseURL = 'https://www.google.com'

                        # This `imgURL` link should take you to the Google Image
                        # Search result with the image in question "selected" or
                        # expanded.
                        imgURL = baseURL + link.link
                        print("Creating 'candidate' ... \n", imgURL)

                        # Create the image output file name.
                        candidate = serp.query.replace(' ', '_')
                        print(candidate)
                        print("Creating candidate file name ... ")
                        imageFile = (str(serp_id) + '_' + str(link.id) + '_' +
                                     candidate + '_' + str(link.rank) + '.jpg')
                        print(imageFile)

                        # Initialize the PhantomJS headless browser.
                        # http://www.marinamele.com/selenium-tutorial-web-scraping-with-selenium-and-python
                        # http://stackoverflow.com/questions/15388057/extract-link-from-xpath-using-selenium-webdriver-and-python
                        dcap = dict(DesiredCapabilities.PHANTOMJS)
                        dcap["phantomjs.page.settings.userAgent"] = random_user_agent(only_desktop=True)
                        driver = webdriver.PhantomJS('./phantomjs', desired_capabilities=dcap)
                        driver.set_window_size(1020, 550)
                        driver.wait = WebDriverWait(driver, 5)

                        # The try statement lets image collection continue even
                        # when a single download fails. If you want scraping to
                        # quit when image collection fails, take it out.
                        try:
                            driver.get(imgURL)
                            # This XPath frequently changes, so keep an eye on it.
                            # If your images suddenly fail to download, this is
                            # the first place to check.
                            element = driver.wait.until(EC.presence_of_element_located((
                                By.XPATH,
                                '//*[@id="irc_cc"]/div[2]/div[1]/div[2]/div[2]/a/img'
                            )))  # this is the current one
                            time.sleep(4)
                            src = element.get_attribute('src')
                            print("This is the 'src': ", src)
                            time.sleep(4)

                            # Create a directory named after the current serp_id.
                            # There will be one serp_id per keyword search.
                            current_directory = str(serp_id) + '/'
                            path = os.path.join(target_directory, current_directory)
                            if not os.path.exists(path):
                                os.makedirs(path)
                            image_path = path + imageFile

                            with open(os.path.join(path, imageFile), 'wb') as q:
                                # Get the image from the link.
                                res = requests.get(src)
                                # Write the image to file in chunks.
                                for chunk in res.iter_content(100000):
                                    q.write(chunk)
                        except Exception:
                            # Image collection has recently been inconsistently
                            # buggy. If the image download fails, take a
                            # screenshot and save it to the main directory.
                            driver.save_screenshot(imageFile + '.png')

                        driver.close()
                        driver.quit()
                    else:
                        image_path = None
                else:
                    image_height = None
                    image_width = None
                    image_path = None

                # Create a new record in the search_engine_results table.
                sql = ("INSERT INTO search_engine_results (link, title, snippet, "
                       "visible_link, rank, link_type, serp_id, has_image, "
                       "image_dims, image_height, image_width, news_date, "
                       "news_source, image_path, top_stories) VALUES "
                       "(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
                cur.execute(sql, (link.link, link.title, link.snippet,
                                  link.visible_link, link.rank, link.link_type,
                                  serp_id, link.has_image, link.image_dims,
                                  image_height, image_width, link.news_date,
                                  link.news_source, image_path, link.top_stories))
                db.commit()
    finally:
        cur.close()
        db.close()
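# A minimal sketch (assumptions, not from the original script) of the
# module-level state put_in_database() relies on: a GoogleScraper config dict
# for scrape_with_config(), the AWS MySQL credentials, and the global counter.
# All concrete values here are placeholders.
config = {
    'use_own_ip': True,
    'keyword': 'example query',
    'search_engines': ['google'],
    'num_pages_for_keyword': 1,
    'scrape_method': 'http',
}
aws_username = 'user'                       # placeholder credentials
aws_password = 'secret'
aws_host = 'example.rds.amazonaws.com'
counter = 0

if __name__ == '__main__':
    put_in_database()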
def search(self, rand=True, timeout=15):
    """The actual search for the search engine.

    When raising StopScrapingException, the scraper will stop.
    When returning False, the scraper tries to continue with the next keyword.
    """
    import requests

    success = True

    self.build_search()

    if rand:
        self.headers['User-Agent'] = random_user_agent(only_desktop=True)

    try:
        super().detection_prevention_sleep()
        super().keyword_info()

        try:
            rurl = self.base_search_url + urlencode(self.search_params)
        except UnicodeError:
            return False

        if self.search_engine_name == 'google' and self.config.get('strict'):
            rurl = rurl + '&tbs=li:1'

        # Build the list of candidate proxies from the comma-separated
        # 'proxy_chain_ips' config value ("local" means no proxy at all).
        proxy_chain_ips = self.config.get('proxy_chain_ips')
        if proxy_chain_ips != "local":
            proxy_chain_ips = proxy_chain_ips.split(',')
        else:
            proxy_chain_ips = []

        proxies_ch = []
        for proxy in proxy_chain_ips:
            proxies_ch.append({
                'http': 'http://' + proxy,
                'https': 'http://' + proxy,
            })

        # Retry up to 100 times, picking a random proxy from the chain on
        # each attempt, until a response comes back with status code 200.
        request = None
        for attempt in range(100):
            if len(proxies_ch):
                proxies = choice(proxies_ch)
            else:
                proxies = {}

            session = requests.Session()
            session.trust_env = False
            try:
                if not self.config.get('check_proxies'):
                    request = session.get(rurl, headers=self.headers, timeout=timeout)
                else:
                    request = session.get(rurl, proxies=proxies,
                                          headers=self.headers, timeout=timeout)
            except self.requests.exceptions.RequestException:
                # The request never completed, so there is no response to
                # close; just try the next proxy.
                continue

            logger.warning(str(request.status_code))
            if request.status_code == 200:
                break
            request.close()

        if request is None:
            # Every attempt failed with a network error; reuse the generic
            # handler below instead of dereferencing a missing response.
            raise self.requests.exceptions.RequestException(
                'no response after 100 attempts')

        self.requested_at = datetime.datetime.utcnow()
        self.html = request.text

        logger.debug('[HTTP - {url}, headers={headers}, params={params}'.format(
            url=request.url, headers=self.headers, params=self.search_params))

    except self.requests.ConnectionError as ce:
        self.status = 'Network problem occurred {}'.format(ce)
        success = False
    except self.requests.Timeout as te:
        self.status = 'Connection timeout {}'.format(te)
        success = False
    except self.requests.exceptions.RequestException as e:
        # In case of any http networking exception that wasn't caught
        # in the actual request, just end the worker.
        self.status = 'Stopping scraping because {}'.format(e)
    else:
        if not request.ok:
            self.handle_request_denied(request.status_code)
            success = False

    super().after_search()

    return success
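# The proxy-chain variant above expects 'proxy_chain_ips' to be either the
# literal string "local" (route requests directly) or a comma-separated
# host:port list. A config sketch with illustrative placeholder addresses:
example_proxy_config = {
    'proxy_chain_ips': '203.0.113.10:8080,203.0.113.11:3128',
    # 'proxy_chain_ips': 'local',   # no proxy
    'check_proxies': True,          # actually route requests through the chain
}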
def search(self, rand=True, timeout=15):
    """The actual search for the search engine.

    When raising StopScrapingException, the scraper will stop.
    When returning False, the scraper tries to continue with the next keyword.
    """
    success = True

    self.build_search()

    if rand:
        self.headers['User-Agent'] = random_user_agent(only_desktop=True)

    try:
        super().detection_prevention_sleep()
        super().keyword_info()

        proxy_url = self.format_proxy(self.proxy) if self.proxy else None
        request = self.requests.get(self.base_search_url + urlencode(self.search_params),
                                    proxies=None if self.proxy is None else {
                                        'http': proxy_url,
                                        'https': proxy_url
                                    },
                                    headers=self.headers,
                                    timeout=timeout)

        self.requested_at = datetime.datetime.utcnow()
        self.html = request.text

        logger.debug('[HTTP - {url}, headers={headers}, params={params}'.format(
            url=request.url, headers=self.headers, params=self.search_params))

    except self.requests.ConnectionError as ce:
        self.status = 'Network problem occurred {}'.format(ce)
        success = False
    except self.requests.Timeout as te:
        self.status = 'Connection timeout {}'.format(te)
        success = False
    except self.requests.exceptions.RequestException as e:
        # In case of any http networking exception that wasn't caught
        # in the actual request, just end the worker.
        self.status = 'Stopping scraping because {}'.format(e)
    else:
        # A redirect to a captcha page counts as a failed (blocked) request.
        if re.search(r'capt?cha', request.url, re.I):
            logger.debug('captcha')
            success = False
        if not request.ok:
            self.handle_request_denied(request.status_code)
            success = False

    super().after_search()

    return success
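# format_proxy() is called above but not defined anywhere in this section.
# A plausible implementation, sketched here as an assumption: it renders a
# proxy object (proto/host/port with optional credentials) into a
# requests-style proxy URL.
def format_proxy(self, proxy):
    if proxy is None:
        return None
    if proxy.username and proxy.password:
        return '{}://{}:{}@{}:{}'.format(proxy.proto, proxy.username,
                                         proxy.password, proxy.host, proxy.port)
    return '{}://{}:{}'.format(proxy.proto, proxy.host, proxy.port)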