def main():
    """Scrape startup names from every geolocated site linked in the header
    of BASE_URL and write them, one name per line, to DATA_FILE.

    Exits with status 1 when the data file already exists.
    """
    data_file_path = os.path.abspath(DATA_FILE)
    if os.path.exists(data_file_path):
        print("Data for {} already downloaded!".format(BASE_URL))
        sys.exit(1)

    print("Initialization...")
    chrome_opts = selenium.webdriver.chrome.options.Options()
    chrome_opts.add_argument("--headless")
    driver = selenium.webdriver.Chrome(options=chrome_opts)
    driver.get(BASE_URL)

    # Collect the href of every anchor inside the page header.
    header = driver.find_element_by_tag_name("header")
    geo_links = [
        anchor.get_attribute("href")
        for anchor in header.find_elements_by_tag_name("a")
    ]
    print("Scraping {:d} geolocated sites...".format(len(geo_links)))

    startups = []
    for url in geo_links:
        print("Fetching {}...".format(url))
        scraped = scrape_geosite(driver, url)
        print("+ {:d} startup names".format(len(scraped)))
        startups.extend(scraped)
        time.sleep(1)  # pause between fetches

    # Normalise whitespace around each scraped name.
    startups = [name.strip() for name in startups]
    print("Scraped {:d} startup names".format(len(startups)))

    with open(data_file_path, "w", encoding="utf-8") as fp:
        fp.write("\n".join(startups))
    print("Result saved to {}".format(data_file_path))
def find_driver():
    """Create a Chrome WebDriver configured for English-language browsing.

    Returns:
        A ``webdriver.Chrome`` driving the chromedriver binary at the
        hard-coded Windows path below.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--lang=en")
    # Keep chromedriver's "DevTools listening" chatter out of the console.
    options.add_experimental_option("excludeSwitches", ["enable-logging"])
    options.add_argument('ignore-certificate-errors')
    # Raw string for the Windows path: the original non-raw literal only
    # worked because '\P', '\G', '\C', '\A' are not escape sequences, and it
    # raises SyntaxWarning on Python 3.12+.
    driver = webdriver.Chrome(
        options=options,
        executable_path=r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
    return driver
def get_browser(headless):
    """Return a Chrome WebDriver, optionally headless.

    Args:
        headless: when truthy, run Chrome headless with the flags needed
            on Heroku-style environments.

    Returns:
        A ``selenium.webdriver.Chrome`` instance.
    """
    if headless:
        options = selenium.webdriver.chrome.options.Options()
        # Options.set_headless() was removed from Selenium; the command-line
        # flag works across versions.
        options.add_argument("--headless")
        # Disabling scroll bars is important, see
        # https://bugs.chromium.org/p/chromedriver/issues/detail?id=2487.
        options.add_argument("--hide-scrollbars")
        # The Chrome binary is at a nonstandard location on Heroku,
        # see [1].
        #
        # [1]: https://github.com/heroku/heroku-buildpack-google-chrome.
        binary = os.environ.get("GOOGLE_CHROME_SHIM")
        if binary:
            options.binary_location = binary
        # The chrome_options= keyword was deprecated and later removed in
        # Selenium 4; options= is the supported spelling (cf. the sibling
        # get_browser() in this file).
        return selenium.webdriver.Chrome(options=options)
    else:
        return selenium.webdriver.Chrome()
def get_browser():
    """
    Build and return a Selenium Chrome browser object.

    The 'headless' config var controls whether the browser runs headless.
    """
    if not util.get_env_boolean("headless"):
        return selenium.webdriver.Chrome()

    opts = selenium.webdriver.chrome.options.Options()
    opts.headless = True
    # Scroll bars must be disabled in headless mode, see
    # <https://bugs.chromium.org/p/chromedriver/issues/detail?id=2487>.
    opts.add_argument("--hide-scrollbars")
    # On Heroku the Chrome binary lives at a nonstandard path, see
    # <https://github.com/heroku/heroku-buildpack-google-chrome>.
    shim = os.environ.get("GOOGLE_CHROME_SHIM")
    if shim:
        opts.binary_location = shim
    return selenium.webdriver.Chrome(options=opts)
def __init__(self, headless=False, debug=False, logging_queue=None):
    """Set up the automaton: remote Chrome driver, exam policy, SQLite
    state, and queue-based logging.

    Args:
        headless: if True, add headless/window-size/GPU flags to Chrome.
        debug: if True, connect to the bare chromedriver endpoint
            (port 9515) instead of the Selenium grid hub (port 4444).
        logging_queue: optional queue that receives log records; a fresh
            unbounded queue.Queue is created when omitted.
    """
    self.logger = logging.getLogger("RiskExamAutomaton")
    options = selenium.webdriver.chrome.options.Options()
    self.debug = debug
    if headless:
        options.add_argument("--headless")
        options.add_argument("--window-size=1024,768")
        options.add_argument("--disable-gpu")
    self.is_headless = headless
    if self.debug:
        # NOTE(review): hard-coded lab IP; 9515 is chromedriver's default
        # port — confirm this host is still reachable.
        self.driver = webdriver.Remote(
            "http://10.3.1.181:9515",
            desired_capabilities=options.to_capabilities())
    else:
        # Normal mode goes through the Selenium grid hub on the same host.
        self.driver = webdriver.Remote(
            "http://10.3.1.181:4444/wd/hub",
            desired_capabilities=options.to_capabilities())
    self.policy = exampolicy.ExamPolicy()
    self.skip_list = []
    self.init_sqlite()
    # Reuse the caller's queue when given so several components can share
    # one logging pipeline; otherwise create an unbounded queue.
    if logging_queue:
        self.logging_queue = logging_queue
    else:
        self.logging_queue = queue.Queue(-1)
    self.queue_handler = logging.handlers.QueueHandler(self.logging_queue)
    self.logger.addHandler(self.queue_handler)
def find_driver():
    """Return a headless, English-language Chrome driver that skips images."""
    opts = webdriver.ChromeOptions()
    # Image-free browsing mode: value 2 blocks image loading entirely.
    opts.add_experimental_option(
        "prefs", {"profile.managed_default_content_settings.images": 2})
    for flag in ("--lang=en", "--headless"):
        opts.add_argument(flag)
    # Silence chromedriver's console logging.
    opts.add_experimental_option("excludeSwitches", ["enable-logging"])
    opts.add_argument('ignore-certificate-errors')
    return webdriver.Chrome(options=opts, executable_path=find_path()[0])
def chrome_browser():
    """Start a headless Chrome suitable for CI/container use and return it."""
    opts = Options()
    # Sandbox/shared-memory flags are required inside Docker-like sandboxes.
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        opts.add_argument(flag)
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=opts)
    driver.implicitly_wait(5)
    driver.maximize_window()
    return driver
def main():
    """Crawl Google image search for a query, then either download every
    image found into the output directory or dump the <img> tags as HTML,
    depending on the --do-html flag. All behaviour is driven by the
    command-line flags defined below; nothing is returned.
    """
    parser = argparse.ArgumentParser(description='GoogleImageCrawler options',
                                     formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('-s', '--sentence', default='google', type=str,
                        help='Sentence what you want to search. '
                             'Default is "google".')
    # NOTE(review): default is the *string* '1'; argparse applies type=int
    # to string defaults, so argv.delay still ends up as the integer 1.
    parser.add_argument('-d', '--delay', default='1', type=int,
                        help='Sets delay for scrolling. Default is "1" '
                             'second.\nBe careful to "0" will put maximum '
                             'burden on the server.')
    parser.add_argument('-o', '--output-directory', default='images', type=str,
                        help='Sets output directory. Default is '
                             '"images".')
    # NOTE(review): the actual default is the lowercase string 'false',
    # while the help text says "False"; the checks below compare against
    # 'true'/'True' only.
    parser.add_argument('-dh', '--do-html', default='false', type=str,
                        help='This option will print result of img tags '
                             'like html.\nIf you set true then images '
                             'will not output.\nDefault is "False".')
    parser.add_argument('-ss', '--scroll-speed', default=2000, type=int,
                        help='For advanced users!\nScroll speed per '
                             'delay.\nDefault is "2000"px.')
    parser.add_argument('-gl', '--geolocation', default='', type=str,
                        help='For advanced users!\nSets geolocation '
                             'code.\nDefault is blank because Google '
                             'guess it from ip or get from your account '
                             'settings.\nIt can affect search '
                             'results.\nCode list is here\n '
                             '"https://developers.google.com/custom'
                             '-search/docs/xml_results_appendices'
                             '#countryCodes".')
    parser.add_argument('-it', '--image-type', default='', type=str,
                        help='For advanced users!\nSets image type.\n'
                             'Default is blank.\nValid types are '
                             '"clipart", "face", "lineart", "stock", '
                             '"photo", "animated".')
    parser.add_argument('-sp', '--safe-parameter', default='off', type=str,
                        help='For advanced users!\nSets safe '
                             'parameter.\nDefault is '
                             '"off".\nValid parameters are "off", '
                             '"medium", "high".')
    argv = parser.parse_args()
    delay = argv.delay
    html = argv.do_html
    directory = argv.output_directory
    scroll_speed = argv.scroll_speed
    gl = argv.geolocation
    it = argv.image_type
    safe = argv.safe_parameter
    if not os.path.exists(directory):
        os.makedirs(directory)
    # Build the search URL: '+'-join the words, then percent-encode, then
    # append the optional geolocation / image-type / safe-search parameters.
    query = argv.sentence.split()
    query = '+'.join(query)
    url = 'https://www.google.co.jp/search?tbm=isch&hl=ja&q=' + urllib.parse.quote_plus(query, encoding='utf-8')
    if gl != "":
        url += '&gl=' + gl
    if it != "":
        url += '&imgType=' + it
    if safe != "":
        url += '&safe=' + safe
    print('Starting crawl at "' + url + '".')
    # Headless Chrome session used only to scroll the results page until
    # every image thumbnail has been loaded.
    options = selenium.webdriver.chrome.options.Options()
    options.add_argument('--headless')
    options.add_argument('--start-maximized')
    options.add_argument('--no-sandbox')
    options.add_argument("--disable-setuid-sandbox")
    options.add_argument('--disable-extensions')
    driver = selenium.webdriver.Chrome(options=options)
    driver.get(url)
    # Rotating spinner characters for the console progress display; the
    # queue is cycled by get()+put() on each iteration.
    progress_icon = queue.Queue()
    progress_icon.put('|')
    progress_icon.put('/')
    progress_icon.put('-')
    progress_icon.put('\\')
    # Scroll (and click the Japanese "show more results" button when it
    # appears) until the "nothing more unread" marker becomes visible.
    while True:
        pi = progress_icon.get()
        progress_icon.put(pi)
        print('\r' + pi + ' ' + str(len(driver.find_elements_by_xpath('//img[@class="rg_i Q4LuWd"]'))) + ' images found', end='')
        time.sleep(delay)
        element = driver.find_element_by_xpath('//input[@value="結果をもっと表示"]')
        if element.is_displayed():
            # "Show more results" button is visible: click it.
            element.find_element_by_xpath('//input[@value="結果をもっと表示"]').click()
        elif driver.find_element_by_xpath('//div[text()="未読はありません"]').is_displayed():
            # End-of-results marker displayed: the page is fully loaded.
            break
        else:
            driver.execute_script('window.scrollTo(0,' + str(scroll_speed) + ');')
            scroll_speed += 2000
    # Parse the fully scrolled page, then release the browser.
    soup = bs4.BeautifulSoup(driver.page_source, 'html.parser')
    driver.quit()
    images = soup.select('img[class="rg_i Q4LuWd"]')
    print('\rCompleted crawl\nResult:' + str(len(images)) + ' images found')
    if html != 'true' and html != "True":
        print('Starting download images')
    for index, img in enumerate(images):
        # Lazily loaded thumbnails keep their real URL in data-src.
        if img.get('data-src') is not None:
            img['src'] = img['data-src']
        if html != 'true' and html != "True":
            src = str(img['src'])
            pi = progress_icon.get()
            progress_icon.put(pi)
            count = str(index + 1)
            print('\r' + pi + ' ' + count + '/' + str(len(images)), end='')
            if src.find('data:image/') != 0:
                # Plain URL: download, save under a numeric name, then
                # rename with the extension detected from the MIME type.
                response = requests.get(src)
                image = response.content
                with open(directory + '/' + count, 'wb') as im:
                    im.write(image)
                mime = magic.Magic(mime=True)
                ext = mime.from_file(directory + '/' + count)
                os.rename(directory + '/' + count, directory + '/' + count + '.' + ext[ext.rfind('/') + 1:])
            else:
                # data: URI — extension comes from the 'data:image/<ext>;'
                # prefix and the payload is base64 after the comma.
                ext = '.' + src[src.find('/') + 1:src.find(';')]
                data = src[src.find(','):]
                with open(directory + '/' + count + ext, 'wb') as im:
                    im.write(base64.b64decode(data))
    imgs = '\n'.join(map(str, images))
    if html != 'true' and html != "True":
        print('\rComplete download images')
    if html == 'true' or html == 'True':
        # HTML mode: write all <img> tags to '<query>.html' instead of
        # downloading. NOTE(review): query still contains '+' separators
        # here, so the filename does too.
        print('Starting print html')
        with open(query + '.html', mode='w') as f:
            f.write(imgs)
        print('Complete print')
def browser(config_browser, config_wait_time, request):
    """Pytest fixture: yield a configured headless Chrome WebDriver.

    Args:
        config_browser: browser name from config; only 'chrome' is supported.
        config_wait_time: implicit wait, in seconds, applied to the driver.
        request: pytest fixture request object (unused, kept for the
            fixture signature).

    Raises:
        Exception: when config_browser names an unsupported browser.
    """
    # Initialize WebDriver
    if config_browser == 'chrome':
        options = Options()
        options.add_argument('log-level=3')
        options.add_argument("--window-size=1920,1080")
        options.add_argument("--disable-extensions")
        options.add_argument("--proxy-server='direct://'")
        options.add_argument("--proxy-bypass-list=*")
        options.add_argument("--start-maximized")
        options.add_experimental_option('excludeSwitches', ['enable-logging'])
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--no-sandbox')
        options.add_argument('--ignore-certificate-errors')
        options.add_argument(
            '--user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"'
        )
        driver = Chrome(Browser_path, options=options)
    else:
        # Fail fast: without this branch (it was commented out) any
        # unsupported browser name hit an UnboundLocalError on `driver`.
        raise Exception(f'"{config_browser}" is not a supported browser')

    # Wait implicitly for elements to be ready before attempting interactions
    driver.implicitly_wait(config_wait_time)
    driver.maximize_window()

    # Return the driver object at the end of setup
    yield driver

    # For cleanup, quit the driver
    driver.quit()
'capital-histo-description' ).text # on lui dit de continuer normalement capital_social.append(cap_soc) except NoSuchElementException: capital_social.append('NaN') print(capital_social) browser.quit() # récupération du prix moyen sur pages jaunes : # série d'options pour notre webdriver comme navigation en mode privé, bloquer les pops ups et publicités (mais pas les cookies ...) options = webdriver.ChromeOptions() options.add_argument("private") options.add_argument("--start-maximized") options.add_argument("--ignore-certificate-errors") options.add_argument("--disable-popup-blocking") options.add_argument("--incognito") options.add_argument("--headless") browser = webdriver.Chrome( executable_path= "C:/Users/GUILLOT Robin/Documents/Robin Ensae/Matières/Python//chromedriver", options=options) browser.get('https://www.pagesjaunes.fr/activites') cookie = browser.find_element_by_id("didomi-notice-agree-button").click()
def follow_private_page():
    """Click the profile's follow button, but only while it still reads 'Follow'."""
    follow = browser.find_element_by_xpath(
        '//*[@id="react-root"]/section/main/div/header/section/div[1]/div[1]/div/div/button'
    )
    if follow.text == 'Follow':
        follow.click()


# ------------------------------------------------------------------------------
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--ignore-ssl-errors')

# --- main script: log in to Instagram and open a post's comment thread -------
browser = open_browser("https://www.instagram.com/")
username_input = type_in_box_by_name("username", user)
password_input = type_in_box_by_name("password", passw)
ClickLoginBox_ClickNotNowNotification()
search_and_select_account('mister_programmer_')
open_post(1)
click_plus_icon_load_comments()
list_of_dates = [ datetime.strptime(all_list[x]['buisday'], '%Y-%m-%d') for x in range(0, len(all_list)) ] #Find the last made transaction latest_transactions = max(list_of_dates) diff = datetime.now() - latest_transactions # Date between the last input in all transactions days_back = diff.days else: print('All list was empty, taking 365 days') days_back = 365 # Set headless options = webdriver.ChromeOptions() options.add_argument('headless') #This option doesn't seem to work, as it downloads to the python folder instead. options.add_argument( 'download.default_directory=/Users/albinjonfelt/Documents/programmering/aktier/bin' ) browser = webdriver.Chrome(options=options) print("Running chrome headless") # open the browser browser.get('https://www.nordnet.se/se') # login browser.find_elements_by_class_name('sv-font-button-white')[0].click() browser.implicitly_wait(7) open_login_button = browser.find_element_by_xpath( "/html/body/div[1]/section/section[2]/section/section/section/div[2]/div/button"