def __init__(self, *args, **kwargs):
    options = Options()
    options.headless = True
    self.driver = webdriver.Firefox(options=options)
def driver():
    # Note: `firefox_options=` is the deprecated Selenium 3 spelling of
    # `options=`; projectname/jobname are not plain-selenium kwargs and
    # appear to come from a vendor wrapper around webdriver.Firefox.
    driver = webdriver.Firefox(options=Options(),
                               projectname="Examples", jobname=None)
    yield driver
    driver.quit()
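# A minimal usage sketch, assuming the generator above is registered with
# @pytest.fixture; the test name and URL below are illustrative, not from
# the original source.
def test_homepage_loads(driver):
    driver.get("https://example.com")
    assert "Example" in driver.title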
""" For now this scraper is standalone """ import json from bs4 import BeautifulSoup from selenium import webdriver import requests from selenium.webdriver.firefox.options import Options options = Options() options.headless = True driver = webdriver.Firefox(options=options) courses_data = {} course_code_file = open("course_codes.txt", "r") for line in course_code_file.readlines(): course = line.split(" ")[0] print(course) url = "https://www.handbook.unsw.edu.au/undergraduate/courses/2021/{}".format( course) try: driver.get(url) inner = driver.find_element_by_class_name("OverviewInner") print("Getting course:", course) buttons = inner.find_elements_by_tag_name("button") buttons[0].click()
def __init__(self, client="firefox", username="******", proxy=None, command_executor=None, loadstyles=True, profile=None, headless=False, autoconnect=True, logger=None, extra_params=None, chrome_options=None): """Initialises the webdriver""" self.logger = logger or self.logger extra_params = extra_params or {} if profile is not None: self._profile_path = profile self.logger.info("Checking for profile at %s" % self._profile_path) if not os.path.exists(self._profile_path): self.logger.critical("Could not find profile at %s" % profile) raise WhatsAPIException("Could not find profile at %s" % profile) else: self._profile_path = None self.client = client.lower() if self.client == "firefox": if self._profile_path is not None: self._profile = webdriver.FirefoxProfile(self._profile_path) else: self._profile = webdriver.FirefoxProfile() if not loadstyles: # Disable CSS self._profile.set_preference('permissions.default.stylesheet', 2) # Disable images self._profile.set_preference('permissions.default.image', 2) # Disable Flash self._profile.set_preference( 'dom.ipc.plugins.enabled.libflashplayer.so', 'false') if proxy is not None: self.set_proxy(proxy) options = Options() if headless: options.headless = True options.profile = self._profile capabilities = DesiredCapabilities.FIREFOX.copy() capabilities['webStorageEnabled'] = True self.logger.info("Starting webdriver") self.executable_path = './WebWhatsApi/driver/geckodriver' if platform.system().lower() == "windows": self.executable_path += ".exe" elif platform.system().lower() == "linux": self.executable_path += "-linux" self.executable_path = os.path.abspath(self.executable_path) self.driver = webdriver.Firefox( executable_path=self.executable_path, capabilities=capabilities, options=options, **extra_params) elif self.client == "chrome": self._profile = webdriver.ChromeOptions() if self._profile_path is not None: self._profile.add_argument("user-data-dir=%s" % self._profile_path) if proxy is not None: self._profile.add_argument('--proxy-server=%s' % proxy) if headless: self._profile.add_argument('headless') if chrome_options is not None: for option in chrome_options: self._profile.add_argument(option) self.logger.info("Starting webdriver") self.executable_path = './WebWhatsApi/driver/chromedriver' if platform.system().lower() == "windows": self.executable_path += ".exe" elif platform.system().lower() == "linux": self.executable_path += "-linux" self.executable_path = os.path.abspath(self.executable_path) self.driver = webdriver.Chrome( executable_path=self.executable_path, chrome_options=self._profile, **extra_params) elif client == 'remote': if self._profile_path is not None: self._profile = webdriver.FirefoxProfile(self._profile_path) else: self._profile = webdriver.FirefoxProfile() capabilities = DesiredCapabilities.FIREFOX.copy() self.driver = webdriver.Remote(command_executor=command_executor, desired_capabilities=capabilities, **extra_params) else: self.logger.error("Invalid client: %s" % client) self.username = username self.wapi_functions = WapiJsWrapper(self.driver, self) self.driver.set_script_timeout(500) self.driver.implicitly_wait(10) if autoconnect: self.connect()
def scrap(request):
    options = Options()
    options.headless = True
    driver = webdriver.Firefox(options=options,
                               executable_path='/usr/local/bin/geckodriver')
    driver.get(
        "https://manage.travel.rakuten.co.jp/portal/inn/mp_kanri_image_up.main")
    time.sleep(3)
    id1 = driver.find_element_by_name("f_id")
    id1.send_keys("first-t")
    pw = driver.find_element_by_name("f_pass")
    pw.send_keys("first-75")
    submit = driver.find_element_by_xpath(
        "/html/body/table/tbody/tr/td/table[1]/tbody/tr/td/table/tbody/tr/td/table/tbody/tr[1]/td/table/tbody/tr[2]/td/form/table/tbody/tr[2]/td[3]/input")
    submit.click()
    driver.find_element_by_xpath(
        "/html/body/table[2]/tbody/tr/td[3]/table[3]/tbody/tr[2]/td[1]/table/tbody/tr[3]/td/table/tbody/tr[1]/td[4]/input[1]").click()
    jp_date = driver.find_element_by_xpath(
        "/html/body/table[11]/tbody/tr/td/table/tbody/tr[1]/td[2]").text
    jp_date = jp_date.replace('年', '')  # kanji meaning "year"
    jp_date = jp_date.replace('月', '')  # kanji meaning "month"
    yearmonth = str(jp_date)
    table = driver.find_element_by_xpath(
        "/html/body/table[11]/tbody/tr/td/table")
    my_list = []
    rows = table.find_elements_by_tag_name("tr")  # all rows in the table
    my_dict = dict()
    base = 1
    rowcount = 1
    for row in rows:
        print("#################### ROW " + str(rowcount) + " #################")
        # get the columns of this row (index starts from 0)
        cols = row.find_elements_by_tag_name("td")
        first_col = cols[0].text
        first_col_str = str(first_col)
        fca = first_col_str.split(':')
        first_col_first = fca[0]
        first_col_second = ''  # avoid a NameError on rows without ':'
        if 1 < len(fca):
            first_col_second = fca[1]
        tr_tuple_new = (1, 2, 3, 13, 14, 24, 25, 35, 36, 46, 47, 57)
        tr_tuple_date_new = (4, 15, 26, 37, 48)
        colcount = 1
        if rowcount not in tr_tuple_new:
            if rowcount in tr_tuple_date_new:
                print("Hello")
                for col in cols:
                    if col != cols[0] and col.text != '' and col.text != ' ':
                        base = col.text
                        break
            if rowcount not in tr_tuple_date_new:
                base = str(base)
                print("Base: " + base)
                base = base.replace('日', '')  # kanji meaning "day"
                base = int(base)
                print("Base" + str(base) + "BASE")
                day = 0
                for col in cols:
                    # '済' means "already (booked)"
                    if (col != cols[0] and col.text != ''
                            and col.text != ' ' and col.text != '済'):
                        print("###### COLUMN " + str(colcount) + " ######")
                        print("Room Type ID")
                        print(first_col_first)
                        print("Room Type Name")
                        print(first_col_second)
                        print("Stock")
                        print(col.text)
                        coltextarray = col.text.split('/')
                        col_text_first = coltextarray[0]
                        # the part after '/' is the reservation count;
                        # fall back to the stock value when it is missing
                        col_text_second = (coltextarray[1]
                                           if len(coltextarray) > 1
                                           else coltextarray[0])
                        print("Date")
                        date = base + day
                        if date < 10:
                            date = "0" + str(date)
                        else:
                            date = str(date)
                        fulldate = yearmonth + date
                        print(fulldate)
                        my_dict = {
                            "date": fulldate,
                            "hotel_id": 4304,
                            "room_type_id": first_col_first,
                            "room_type_name": first_col_second,
                            "room_stock": col_text_first,
                            "reservations": col_text_second
                        }
                        my_list.append(my_dict)
                        day += 1
                    colcount += 1
        rowcount += 1
    driver.quit()
    my_json = json.dumps(my_list, ensure_ascii=False)
    return HttpResponse(my_json)
    # parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')
    table = soup.find(name='table')

    # structure the content into a dataframe
    df_full = pd.read_html(str(table))[0].head(10)
    df = df_full[['Unnamed: 0', 'PLAYER', 'TEAM', label]]
    df.columns = ['pos', 'player', 'team', 'total']

    # turn the data into our own data dictionary
    return df.to_dict('records')


# instantiate Firefox
option = Options()
option.headless = True
# driver = webdriver.Firefox(options=option)
driver = webdriver.Firefox()  # note: the headless option above is not passed here
driver.get(url)
time.sleep(2)

for k in rankings:
    top10ranking[k] = buildrank(k)

driver.quit()

# convert and save as JSON
js = json.dumps(top10ranking)
fp = open('ranking.json', 'w')
logger.setLevel(logging.INFO)
fh = logging.StreamHandler()
fh.setLevel(logging.INFO)

# create formatter and add it to the handlers
formatter = logging.Formatter(
    "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
fh.setFormatter(formatter)

# add the handlers to the logger
logger.addHandler(fh)

eans = "0600753339886\n5050466796820"
logger.info(eans)
logger.info(inputpath)

opts = Options()
opts.headless = True
logger.info("Starting Firefox")
browser = Firefox(options=opts)
try:
    logger.info("Opening the start page")
    browser.get('https://www.bonavendi.de/verkaufen/sammeleingabe.html')
    time.sleep(10)
    try:
        browser.get_screenshot_as_file(inputpath + "/1.png")
    except WebDriverException:
        logger.warning("Screenshot 1 could not be saved.")
def __init__(self):
    self.opts = Options()
    self.opts.add_argument("--headless")  # make Firefox invisible (headless)
    # all the browsers we create, so we can control and reuse them later
    self.browsers = {}
    self.useragent = ""
    self.sessions_file = os.path.join("core", "sessions.json")
def get_driver():
    options = Options()
    options.headless = True
    driver = webdriver.Firefox(
        executable_path=GeckoDriverManager().install(), options=options)
    return driver
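# A hedged usage sketch for get_driver() above; the URL is illustrative and
# the imports (selenium, webdriver_manager) are assumed to be present in the
# surrounding module.
def fetch_title(url="https://example.com"):
    driver = get_driver()
    try:
        driver.get(url)
        return driver.title
    finally:
        driver.quit()  # always release the browser process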
def setUpClass(cls):
    options = Options()
    # options.headless = True
    cls.driver = webdriver.Firefox(options=options)
    print("in setUpClass")
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
import time

WINDOW_SIZE = "1920,1080"

firefox_options = Options()
firefox_options.add_argument("--headless")
firefox_options.add_argument("--window-size=%s" % WINDOW_SIZE)
firefox_options.set_preference("browser.download.folderList", 2)
firefox_options.set_preference("browser.download.dir", r"C:\Users\tians4")
firefox_options.set_preference("browser.download.useDownloadDir", True)
firefox_options.set_preference("browser.download.manager.showWhenStarting", False)
firefox_options.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/csv")


def test_download_csv():
    # `firefox_options=` is the deprecated spelling; `options=` is current
    browser = webdriver.Firefox(
        executable_path=r'C:\Users\tians4\geckodriver.exe',
        options=firefox_options)
    browser.get("https://jira.cec.lab.emc.com:8443/login.jsp")
    browser.implicitly_wait(10)
    elem_login = browser.find_element_by_id("login-form-username")
    elem_login.click()
def build_options():
    options = Options()
    options.headless = True
    return options
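# A minimal sketch of how build_options() might be consumed, assuming the
# standard selenium imports are available; the URL is illustrative only.
driver = webdriver.Firefox(options=build_options())
try:
    driver.get("https://example.com")
finally:
    driver.quit()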
def start_web_driver():
    from selenium.webdriver.firefox.options import Options

    options = Options()
    options.headless = True
    driver = webdriver.Firefox(options=options)
    return driver
def browser(self, userlist, index, channel, command_list):
    def chunks(lst, n):
        """Yield successive n-sized chunks from lst."""
        for i in range(0, len(lst), n):
            yield lst[i:i + n]

    try:
        with open(userlist, "r") as _namelist:
            if len(_namelist.readlines()) > 0:
                self.browser_status[index] = "Starting"
                _namelist.seek(0)
                _namelist_stripped = sorted(
                    map(str.strip, _namelist.readlines()))
                chunked_lists = chunks(_namelist_stripped, self.chunk_size)
                for chunk in chunked_lists:
                    self.thread_lock.acquire()
                    profile = webdriver.FirefoxProfile(
                        self.config["Firefox_profile"])
                    profile.set_preference(
                        "security.insecure_field_warning.contextual.enabled",
                        False)
                    profile.set_preference(
                        "security.enterprise_roots.enabled", True)
                    options = Options()
                    if index != 0 and self.headless_mode:
                        options.add_argument('--headless')
                    with webdriver.Firefox(
                            options=options,
                            executable_path="FirefoxPortable/App/Firefox64/geckodriver.exe",
                            firefox_profile=profile,
                            firefox_binary="FirefoxPortable/App/Firefox64/firefox.exe") as driver:
                        self.thread_lock.release()
                        driver.set_window_size(1000, 1000)
                        wait = WebDriverWait(driver, 120)
                        wait_rules = WebDriverWait(driver, 5)
                        driver.get(
                            "https://www.twitch.tv/popout/{channel}/chat".format(
                                channel=channel))
                        chat_field = wait.until(
                            presence_of_element_located(
                                (By.CSS_SELECTOR, ".ScInputBase-sc-1wz0osy-0")))
                        chat_welcome_message = wait.until(
                            presence_of_element_located(
                                (By.CSS_SELECTOR, ".chat-line__status")))
                        time.sleep(1)
                        if chat_field.is_displayed():
                            chat_field.click()
                        try:
                            # remove rules window
                            rules_button = wait_rules.until(
                                presence_of_element_located(
                                    (By.CSS_SELECTOR, ".jQtUJo")))
                            if rules_button.is_displayed():
                                rules_button.click()
                        except (NoSuchElementException, TimeoutException):
                            pass
                        if chat_field.is_displayed():
                            chat_field.click()
                        chat_field = wait.until(
                            presence_of_element_located(
                                (By.CSS_SELECTOR, ".ScInputBase-sc-1wz0osy-0")))
                        chat_field.send_keys(
                            f"{self.greeting_emote} {index} {self.greeting_emote}",
                            Keys.ENTER)
                        self.browser_status[index] = "Ready"
                        while not self.all_browsers_ready:
                            time.sleep(0.1)
                        with open("banned_part{index}.txt".format(index=index),
                                  "w") as banned_names:
                            for _name in chunk:
                                try:
                                    for command in command_list:
                                        chat_field = wait.until(
                                            presence_of_element_located(
                                                (By.CSS_SELECTOR,
                                                 ".ScInputBase-sc-1wz0osy-0")))
                                        chat_field.send_keys(
                                            "{cmd} {name}".format(cmd=command,
                                                                  name=_name),
                                            Keys.ENTER)
                                        banned_names.write(f"{_name}\n")
                                        self.counter[index] += 1
                                except (ElementNotInteractableException,
                                        ElementClickInterceptedException):
                                    try:
                                        # remove rules window again, if necessary
                                        rules_button = wait_rules.until(
                                            presence_of_element_located(
                                                (By.CSS_SELECTOR, ".jQtUJo")))
                                        if rules_button.is_displayed():
                                            rules_button.click()
                                    except (NoSuchElementException,
                                            TimeoutException):
                                        pass
                    with self.thread_lock:
                        with open("banned_lists/{streamer}.txt".format(
                                streamer=channel), "a") as banlist, \
                                open("banned_part{index}.txt".format(index=index),
                                     "r") as banned_names:
                            _names = banned_names.readlines()
                            banlist.writelines(_names)
    except LookupError:
        print("couldn't start instance {}".format(index))
    finally:
        self.browser_status[index] = "Done"
_LOGGER = logging.getLogger(__name__)
_LOGGER.setLevel(logging.DEBUG)
logging.debug("test")

HTML_PARSER = 'html.parser'
ATTRIBUTION = 'Information provided by Aesop'
LOGIN_URL = 'https://sub.aesoponline.com/Substitute/Home'
LOGIN_TIMEOUT = 10
COOKIE_PATH = './aesop_cookies.pickle'
CACHE_PATH = './aesop_cache'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
CHROME_WEBDRIVER_ARGS = [
    '--headless', '--user-agent={}'.format(USER_AGENT),
    '--disable-extensions', '--disable-gpu', '--no-sandbox'
]
CHROMEDRIVER_PATH = 'C:/Users/asaboo/Downloads/chromedriver_76/chromedriver'
FIREFOXOPTIONS = Options()
FIREFOXOPTIONS.add_argument("--headless")


class AESOPError(Exception):
    """AESOP error."""

    pass


def _save_cookies(requests_cookiejar, filename):
    """Save cookies to a file."""
    with open(filename, 'wb') as handle:
        pickle.dump(requests_cookiejar, handle)
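# A hedged sketch of how CHROME_WEBDRIVER_ARGS and CHROMEDRIVER_PATH might be
# wired into a Chrome driver. The module above only defines the constants, so
# this helper (_build_chrome) and the webdriver import are assumptions.
def _build_chrome():
    chrome_options = webdriver.ChromeOptions()
    for arg in CHROME_WEBDRIVER_ARGS:
        chrome_options.add_argument(arg)
    return webdriver.Chrome(executable_path=CHROMEDRIVER_PATH,
                            options=chrome_options)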
def setup(self):
    self.opts = Options()
    self.opts.headless = True  # Options.set_headless() is deprecated
    browser = Firefox(options=self.opts)
    browser.get(self.url)
    return browser
def main():
    # Parse the command line arguments
    models = ['hash', 'rr', 'random', 'cloudflare', 'google', 'quad9', 'nextdns']
    parser = argparse.ArgumentParser()
    parser.add_argument('website')
    parser.add_argument('dns_type',
                        choices=['dns', 'doh', 'dot', 'dnscrypt-proxy_doh'])
    parser.add_argument('trr_resolver_ip')
    parser.add_argument('trr_resolver_uri')
    parser.add_argument('model', choices=models)
    parser.add_argument('--timeout', type=int, default=45)
    args = parser.parse_args()

    dnscrypt_config_file = '/dnscrypt-proxy/dnscrypt-proxy/dnscrypt-proxy-{0}.toml'.format(
        args.model)

    # Enable devtools in Firefox
    options = Options()
    options.headless = True
    options.add_argument('-devtools')

    # Enable the netmonitor toolbox in devtools so we can save HARs
    profile = webdriver.FirefoxProfile()
    profile.set_preference('devtools.toolbox.selectedTool', 'netmonitor')

    # Set up DNS configuration
    subprocess.run(
        ["sudo", "cp", "/etc/resolv.conf", "/etc/resolv.upstream.conf"])
    subprocess.run(["sudo", "cp", "resolv.conf", "/etc/resolv.conf"])
    if args.dns_type == 'dnscrypt-proxy_doh':
        subprocess.run(
            "sudo /dnscrypt-proxy/dnscrypt-proxy/dnscrypt-proxy -config {0} &> /dev/null &"
            .format(dnscrypt_config_file),
            shell=True)
        subprocess.run(["sudo", "sleep", "5s"])

    # Configure the DNS settings in Firefox
    if args.dns_type in ('dns', 'dot', 'dnscrypt-proxy_doh'):
        options.set_preference('network.trr.mode', 0)
    elif args.dns_type == 'doh':
        options.set_preference('network.trr.mode', 3)
        options.set_preference('network.trr.request-timeout', 1500)
        options.set_preference('network.trr.max-fails', 5)
        trr_resolver_ip = args.trr_resolver_ip
        trr_resolver_uri = args.trr_resolver_uri
        if trr_resolver_ip:
            options.set_preference('network.trr.bootstrapAddress',
                                   trr_resolver_ip)
        if trr_resolver_uri:
            options.set_preference('network.trr.uri', trr_resolver_uri)

    # Launch Firefox and install our extension for getting HARs
    driver = webdriver.Firefox(options=options,
                               firefox_profile=profile,
                               firefox_binary="/opt/firefox/firefox-bin")
    driver.install_addon("/home/seluser/measure/harexporttrigger-0.6.2-fx.xpi")
    driver.set_page_load_timeout(args.timeout)

    # Make a page load
    started = datetime.now()
    driver.get(args.website)

    # Once the HAR is on disk in the container, write it to stdout so the
    # host machine can get it
    har_file = "/home/seluser/measure/har.json"

    def har_file_ready():
        return os.path.exists(har_file + ".ready")

    while (datetime.now() - started).total_seconds() < args.timeout \
            and not har_file_ready():
        time.sleep(1)
    if har_file_ready():
        with open(har_file, 'rb') as f:
            sys.stdout.buffer.write(f.read())
    driver.quit()
def main(cmd_args):
    group_id = cmd_args.group
    result_filename = cmd_args.out
    local = pytz.timezone("Europe/Moscow")
    study_calendar = Calendar()
    try:
        print(Fore.WHITE, Style.BRIGHT, "Checking for the existence of a "
              "group...", Style.RESET_ALL, end="")
        if not exist_group(group_id):
            raise NoSuchGroupID(group_id)
        print(Fore.GREEN, Style.BRIGHT, " Ok", Style.RESET_ALL)

        opts = Options()
        opts.headless = True
        browser = Firefox(options=opts)
        base_url = f"{MAI_SCHEDULE_DETAIL}?group={group_id}"
        browser.get(base_url)

        print(Fore.WHITE, Style.BRIGHT, "Number of university weeks:",
              Style.RESET_ALL, end="")
        # get all weeks
        number_study_weeks = len(
            browser.find_elements_by_css_selector(".table tr a"))
        print(number_study_weeks)

        for i in range(1, number_study_weeks + 1):  # iterate over weeks
            print(Fore.WHITE, Style.BRIGHT, f"\nGetting {i} week schedule...",
                  Style.RESET_ALL)
            browser.get(f"{base_url}&week={i}")
            # get year of current week
            stud_weeks = browser.find_elements_by_css_selector(".table tr a")
            year = stud_weeks[i - 1].text[-4:]
            stud_days = browser.find_elements_by_class_name("sc-container")
            for stud_day in stud_days:
                day, month = stud_day.find_element_by_class_name(
                    "sc-day-header").text[:5].split(".")
                start_date = f"{year}-{month}-{day}"  # YYYY-MM-DD
                items = stud_day.find_elements_by_css_selector(
                    ".sc-table-detail > .sc-table-row")
                for item in items:
                    event = Event()
                    start_time, end_time = item.find_element_by_class_name(
                        "sc-item-time").text.split(" – ")
                    event.name = item.find_element_by_class_name(
                        "sc-title").text  # title of the study item

                    # convert local end time to UTC
                    naive = datetime.datetime.strptime(
                        f"{start_date} {end_time}", "%Y-%m-%d %H:%M")
                    local_dt = local.localize(naive, is_dst=None)
                    utc_dt = local_dt.astimezone(pytz.utc)
                    event.end = utc_dt.strftime("%Y-%m-%d %H:%M")

                    # convert local begin time to UTC
                    naive = datetime.datetime.strptime(
                        f"{start_date} {start_time}", "%Y-%m-%d %H:%M")
                    local_dt = local.localize(naive, is_dst=None)
                    utc_dt = local_dt.astimezone(pytz.utc)
                    event.begin = utc_dt.strftime("%Y-%m-%d %H:%M")

                    type_lesson = item.find_element_by_class_name(
                        "sc-item-type").text
                    location = item.find_element_by_class_name(
                        "sc-item-location").text  # room at MAI
                    # handle the case when the lecturer field is empty
                    try:
                        lecturer = item.find_element_by_class_name(
                            "sc-lecturer").text
                    except NoSuchElementException:
                        lecturer = ''
                    event.description = (f"Type: {type_lesson}\nLocation: "
                                         f"{location}\nLecturer: {lecturer}\n")
                    study_calendar.events.add(event)
                print(Fore.WHITE, Style.BRIGHT, f'\t{start_date} -',
                      Fore.GREEN, '\u2713', Style.RESET_ALL)

        # save ics file
        with open(result_filename, "w") as ics_file:
            ics_file.writelines(study_calendar)
        print(Fore.GREEN, Style.BRIGHT,
              f"\n Done! Created {result_filename}\n", Style.RESET_ALL)
    except NoSuchGroupID as e:
        print(Fore.RED, Style.BRIGHT, e, Style.RESET_ALL)
def download_gisaid_EpiCoV(
        uname,    # username
        upass,    # password
        normal,   # normal mode (quiet)
        wd,       # output dir
        loc,      # location
        host,     # host
        cs,       # collection start date
        ce,       # collection end date
        ss,       # submission start date
        se,       # submission end date
        cg,       # complete genome only
        hc,       # high coverage only
        le,       # low coverage excluding
        to,       # timeout in sec
        rt,       # num of retry
        iv,       # interval in sec
        meta_dl   # also download meta
):
    """Download sequences and metadata from EpiCoV GISAID"""
    # output directory
    if not os.path.exists(wd):
        os.makedirs(wd, exist_ok=True)
    wd = os.path.abspath(wd)
    # GISAID_FASTA = f'{wd}/sequences.fasta.bz2'
    # GISAID_TABLE = f'{wd}/gisaid_cov2020_acknowledgement_table.xls'
    GISAID_DTL_JASON = f'{wd}/gisaid_detail_metadata.json'
    # GISAID_TSV = f'{wd}/metadata.tsv.bz2'
    metadata = []

    # MIME types
    mime_types = "application/octet-stream"
    mime_types += ",application/excel,application/vnd.ms-excel"
    mime_types += ",application/pdf,application/x-pdf"
    mime_types += ",application/x-bzip2"
    mime_types += ",application/x-gzip,application/gzip"

    # start fresh
    try:
        os.remove(GISAID_DTL_JASON)
    except OSError:
        pass

    print("Opening browser...")
    profile = webdriver.FirefoxProfile()
    profile.set_preference("browser.download.folderList", 2)
    profile.set_preference("browser.download.manager.showWhenStarting", False)
    profile.set_preference("browser.download.dir", wd)
    profile.set_preference("browser.helperApps.neverAsk.saveToDisk", mime_types)
    profile.set_preference("plugin.disable_full_page_plugin_for_types", mime_types)
    profile.set_preference("pdfjs.disabled", True)

    options = Options()
    if not normal:
        options.add_argument("--headless")
    driver = webdriver.Firefox(firefox_profile=profile, options=options)

    # driver wait
    driver.implicitly_wait(20)
    wait = WebDriverWait(driver, to)

    # open GISAID
    print("Opening website GISAID...")
    driver.get('https://platform.gisaid.org/epi3/frontend')
    waiting_sys_timer(wait)
    print(driver.title)
    assert 'GISAID' in driver.title

    # login
    print("Logging in to GISAID...")
    username = driver.find_element_by_name('login')
    username.send_keys(uname)
    password = driver.find_element_by_name('password')
    password.send_keys(upass)
    driver.execute_script("return doLogin();")
    waiting_sys_timer(wait)

    # navigate to EpiCoV
    print("Navigating to EpiCoV...")
    epicov_tab = driver.find_element_by_xpath("//div[@id='main_nav']//li[3]/a")
    epicov_tab.click()
    waiting_sys_timer(wait)

    # when the user doesn't enter time/location, download nextstrain
    # sequences and metadata
    if not (cs or ce or ss or se or loc):
        # download from downloads section
        print("Clicking downloads...")
        pd_button = wait.until(
            EC.element_to_be_clickable(
                (By.XPATH, "//div[@class='sys-actionbar-bar']//div[3]")))
        pd_button.click()
        waiting_sys_timer(wait)

        # have to click the first row twice to start the iframe
        iframe = waiting_for_iframe(wait, driver, rt, iv)
        driver.switch_to.frame(iframe)
        waiting_sys_timer(wait)

        print("Downloading Nextstrain sequences...")
        dl_button = wait.until(
            EC.element_to_be_clickable(
                (By.XPATH, '//div[contains(text(), "nextfasta")]')))
        dl_button.click()
        waiting_sys_timer(wait)
        fn = wait_downloaded_filename(wait, driver, 3600)
        print(f"Downloaded to {fn}.")
        waiting_sys_timer(wait)

        print("Downloading Nextstrain metadata...")
        dl_button = wait.until(
            EC.element_to_be_clickable(
                (By.XPATH, '//div[contains(text(), "nextmeta")]')))
        dl_button.click()
        fn = wait_downloaded_filename(wait, driver, 1800)
        print(f"Downloaded to {fn}.")
        waiting_sys_timer(wait)

        # go back to main frame
        back_button = wait.until(
            EC.element_to_be_clickable(
                (By.XPATH, '//button[contains(text(), "Back")]')))
        back_button.click()
        driver.switch_to.default_content()
        waiting_sys_timer(wait)

    # have to reduce the range of genomes
    if cs or ce or ss or se or loc:
        print("Browsing EpiCoV...")
        browse_tab = wait.until(
            EC.element_to_be_clickable(
                (By.XPATH, '//*[contains(text(), "Browse")]')))
        browse_tab.click()
        waiting_sys_timer(wait)
        waiting_table_to_get_ready(wait)

        # set location
        if loc:
            print("Setting location...")
            loc_input = driver.find_element_by_xpath(
                "//td/div[contains(text(), 'Location')]/../following-sibling::td/div/div/input")
            loc_input.send_keys(loc)
            waiting_sys_timer(wait, 7)

        # set host
        if host:
            print("Setting host...")
            host_input = driver.find_element_by_xpath(
                "//td/div[contains(text(), 'Host')]/../following-sibling::td/div/div/input")
            host_input.send_keys(host)
            waiting_sys_timer(wait, 7)

        # set dates
        date_inputs = driver.find_elements_by_css_selector(
            "div.sys-form-fi-date input")
        dates = (cs, ce, ss, se)
        for dinput, date in zip(date_inputs, dates):
            if date:
                print("Setting date...")
                dinput.send_keys(date)
        ActionChains(driver).send_keys(Keys.ESCAPE).perform()
        waiting_sys_timer(wait, 7)

        # complete genome only
        if cg:
            print("complete genome only...")
            checkbox = driver.find_element_by_xpath('//input[@value="complete"]')
            checkbox.click()
            waiting_sys_timer(wait)

        # high coverage only
        if hc:
            print("high coverage only...")
            checkbox = driver.find_element_by_xpath('//input[@value="highq"]')
            checkbox.click()
            waiting_sys_timer(wait)

        # excluding low coverage
        if le:
            print("low coverage excluding...")
            checkbox = driver.find_element_by_xpath('//input[@value="lowco"]')
            checkbox.click()
            waiting_sys_timer(wait)

        # check if any genomes pass the filters
        warning_message = None
        try:
            warning_message = driver.find_element_by_xpath(
                "//div[contains(text(), 'No data found.')]")
        except NoSuchElementException:
            pass
        if warning_message:
            print("No data found.")
            sys.exit(1)

        # select all genomes
        print("Selecting all genomes...")
        button_sa = driver.find_element_by_css_selector("span.yui-dt-label input")
        button_sa.click()
        waiting_sys_timer(wait)

        # downloading sequences
        retry = 0
        while retry <= rt:
            try:
                print("Downloading sequences for selected genomes...")
                button = driver.find_element_by_xpath(
                    "//td[@class='sys-datatable-info']/button[contains(text(), 'Download')]")
                button.click()
                waiting_sys_timer(wait)

                # switch to iframe
                iframe = waiting_for_iframe(wait, driver, rt, iv)
                driver.switch_to.frame(iframe)
                waiting_sys_timer(wait)

                button = driver.find_element_by_xpath(
                    "//button[contains(text(), 'Download')]")
                button.click()
                waiting_sys_timer(wait)
                driver.switch_to.default_content()

                fn = wait_downloaded_filename(wait, driver, 1800)
                print(f"Downloaded to {fn}.")
                break
            except:
                print(f"retrying...#{retry} in {iv} sec(s)")
                if retry == rt:
                    print("Unexpected error:", sys.exc_info())
                    sys.exit(1)
                else:
                    time.sleep(iv)
                    retry += 1

        # downloading metadata
        retry = 0
        while retry <= rt:
            try:
                print("Downloading acknowledgement table for selected genomes...")
                button = driver.find_element_by_xpath(
                    "//td[@class='sys-datatable-info']/button[contains(text(), 'Download')]")
                button.click()
                waiting_sys_timer(wait)

                # switch to iframe
                iframe = waiting_for_iframe(wait, driver, rt, iv)
                driver.switch_to.frame(iframe)
                waiting_sys_timer(wait)

                label = driver.find_element_by_xpath(
                    "//label[contains(text(), 'Acknowledgement Table')]")
                label.click()

                button = driver.find_element_by_xpath(
                    "//button[contains(text(), 'Download')]")
                button.click()
                waiting_sys_timer(wait)
                driver.switch_to.default_content()

                fn = wait_downloaded_filename(wait, driver, 180)
                print(f"Downloaded to {fn}.")
                break
            except:
                print(f"retrying...#{retry} in {iv} sec(s)")
                if retry == rt:
                    print("Unexpected error:", sys.exc_info())
                    sys.exit(1)
                else:
                    time.sleep(iv)
                    retry += 1

    # iterate over pages
    if meta_dl:
        page_num = 1
        print("Retrieving metadata...")
        while True:
            print(f"Starting processing page# {page_num}...")
            # retrieve tables
            tbody = wait.until(
                EC.presence_of_element_located(
                    (By.XPATH, "//tbody[@class='yui-dt-data']")))
            waiting_table_to_get_ready(wait)

            # iterate over each row
            for tr in tbody.find_elements_by_tag_name("tr"):
                td = tr.find_element_by_tag_name("td")
                driver.execute_script("arguments[0].scrollIntoView();", td)

                # have to click the first row twice to start the iframe
                iframe = None
                record_elem = None
                retry = 1
                while retry <= rt:
                    try:
                        td.click()
                        waiting_sys_timer(wait)
                        iframe = driver.find_element_by_xpath("//iframe")
                        if iframe:
                            break
                        else:
                            raise RuntimeError("iframe not found")
                    except:
                        print(f"retrying...#{retry} in {iv} sec(s)")
                        if retry == rt:
                            print("Failed")
                            sys.exit(1)
                        else:
                            time.sleep(iv)
                            retry += 1

                driver.switch_to.frame(iframe)

                # detect error: "An internal server error occurred."
                # and "error-token: DYX47"
                error_token = driver.find_element_by_xpath("//b")
                if error_token:
                    error_token_text = error_token.text
                    if "error-token" in error_token.text:
                        print("[FATAL ERROR] A website internal server error occurred.")
                        print(error_token_text)
                        sys.exit(1)

                # get the element of the table with metadata
                record_elem = wait.until(
                    EC.presence_of_element_located(
                        (By.XPATH, "//div[@class='packer']")))

                # parse metadata
                m = getMetadata(record_elem)
                metadata.append(m)
                print(f"{m['Accession ID']}\t{m['Virus name']}")

                # get back
                ActionChains(driver).send_keys(Keys.ESCAPE).perform()
                time.sleep(1)
                driver.switch_to.default_content()

            print(f"Completed page# {page_num}.")
            page_num += 1

            # go to the next page
            retry = 1
            button_next_page = None
            try:
                button_next_page = driver.find_element_by_xpath(
                    f'//a[@page="{page_num}"]')
            except NoSuchElementException:
                break

            if button_next_page:
                print(f"Entering page# {page_num}...")
                while retry <= rt:
                    try:
                        button_next_page.click()
                        time.sleep(10)
                        current_page = driver.find_element_by_xpath(
                            '//span[@class="yui-pg-current-page yui-pg-page"]').text
                        if current_page != str(page_num):
                            raise RuntimeError("page did not advance")
                        else:
                            break
                    except:
                        print(f"retrying...#{retry} in {iv} sec(s)")
                        if retry == rt:
                            print("Failed")
                            sys.exit(1)
                        else:
                            time.sleep(iv)
                            retry += 1

    # writing metadata to a JSON file
    print("Writing detail metadata...")
    with open(GISAID_DTL_JASON, 'w') as outfile:
        json.dump(metadata, outfile)

    # close driver
    driver.quit()
def webdriver_init():
    firefox_options = Options()
    firefox_options.headless = hide_firefox
    driver = webdriver.Firefox(options=firefox_options)
    return driver
    city_list = region_dict[regione_name]
else:
    print('specify: Region parameter, start day month year, end day month year')
    sys.exit()

regione_name = 'Lombardia'
parametro = 'Precipitazioni'
gi = '1'
mi = '1'
ai = '2010'
gf = '1'
mf = '1'
af = '2011'
city_list = region_dict['Lombardia']

url = "http://clima.meteoam.it/RichiestaDatiGenerica.php"
p = Path(os.path.realpath(__file__))
parent = p.parent.parent.parent
driver_path = os.path.join(parent, "geckodriver")
optionsFire = Options()
optionsFire.add_argument('--headless')

html_list = []
for c in city_list:
    print('checking ' + c)
    aeronatutica(parametro, c, gi, mi, ai, gf, mf, af, html_list)

if (len(html_list) != 0):
    filename = regione_name + ai + af + '.csv'
    finalParsing(html_list, filename)
    print('there are results')
# Standard boilerplate for Selenium programs in Python
import time

from selenium import webdriver
from selenium.webdriver.firefox.options import Options

# Configure the driver
opciones = Options()
opciones.headless = True  # Do all the work without showing the browser window;
                          # with this option the sleeps serve no visual purpose.
navegador = webdriver.Firefox(
    executable_path="./drivers/geckodriver", options=opciones
)  # Uses geckodriver to open a Firefox browser.
navegador.set_window_position(0, 0)
navegador.set_window_size(800, 500)

# Open the browser at a URL
navegador.get("http://google.es")

# Locate elements and act on them
navegador.find_element_by_xpath("//input[@type='text']").send_keys(
    "Sevilla"
)  # send_keys types a text; the selected element is Google's search box.
time.sleep(2)
navegador.find_element_by_xpath("//input[@name='btnK']").click()
time.sleep(3)
# note: the original xpath was missing the '=' in [@id='result-stats']
estadisticas = navegador.find_element_by_xpath("//div[@id='result-stats']").text
print(estadisticas)

# Close the browser
navegador.quit()
def options():
    return Options()
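# A minimal sketch, assuming the function above is a pytest fixture
# (decorated with @pytest.fixture); the test below is illustrative only.
def test_options_can_go_headless(options):
    options.headless = True
    assert options.headless is True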
def publicstorage_scrap(zip_code_lst, unique_url_lst):
    for zip in zip_code_lst:
        try:
            url = ("https://www.publicstorage.com/self-storage-search?location="
                   + str(zip))
            t = random.randint(25, 30)
            time.sleep(t)
            options = Options()
            options.headless = True
            root_dir = os.path.dirname(os.path.abspath(__file__))
            try:
                driver = webdriver.Firefox(
                    options=options, executable_path=root_dir + '/geckodriver')
                driver.get(url)
                time.sleep(3)
                content = soup(driver.page_source, 'html.parser')
                url_lst = content.findAll(
                    "a", {"class": "ps-property-v2__view-plp"})
                ur_lst = []
                for urls in url_lst:
                    try:
                        url = "https://www.publicstorage.com" + str(urls['href'])
                        if url in unique_url_lst:
                            continue
                        else:
                            unique_url_lst.append(url)
                            ur_lst.append(url)
                    except:
                        pass
                for i in range(len(ur_lst)):
                    try:
                        storage_link = ur_lst[i]
                        t = random.randint(10, 15)
                        time.sleep(t)
                        driver.get(storage_link)
                        time.sleep(3)
                        content = soup(driver.page_source, 'html.parser')
                        address = ''
                        addr_lst = re.findall(r'"FormattedAddress":"(.*?)","',
                                              content.text)[0].split(" ")
                        for addr in addr_lst:
                            if len(addr.strip()) > 0:
                                address = address + " " + addr.strip()
                        address = address.strip()
                        addr_zip_code = address.split(" ")[-1]
                        dt_lst = content.findAll(
                            "div",
                            {"class": "row ps-properties-propertyV2__units__summary"})
                        for index_data in dt_lst:
                            try:
                                size_type = index_data.find(
                                    "h4",
                                    {"class": "ps-properties-propertyV2__units__header"}
                                ).text.strip()
                                price_txt = index_data.find(
                                    "span",
                                    {"class": "ps-properties-propertyV2__units__prices__wrapper"})
                                price = price_txt.text.strip().split('/')[0]
                                print(size_type + ", " + price + ", " + address +
                                      ", " + addr_zip_code + ", " + storage_link)
                                isexit = False
                                sql = ("SELECT * FROM tbl_publicstorage WHERE "
                                       "address = %s AND price = %s AND "
                                       "size = %s AND link = %s")
                                adr = (address, price, size_type, storage_link)
                                mycursor.execute(sql, adr)
                                myresult = mycursor.fetchall()
                                for x in myresult:
                                    isexit = True
                                if isexit == False:
                                    sql = ("INSERT INTO tbl_publicstorage "
                                           "(address, price, size, zipcode, link) "
                                           "VALUES (%s, %s, %s, %s, %s)")
                                    val = (address, price, size_type,
                                           addr_zip_code, storage_link)
                                    mycursor.execute(sql, val)
                                    mydb.commit()
                            except:
                                pass
                    except:
                        pass
            except:
                pass
            try:
                driver.quit()
            except:
                pass
        except:
            pass
def query_storage_sync():
    """
    Query Firefox's browser.storage.sync to pull pomodoro data from the
    Firefox "Tomato Clock" plugin.
    """
    config = configparser.ConfigParser()
    config.read('config.ini')

    options = Options()
    options.headless = True
    profile = webdriver.FirefoxProfile(config['pomodoros']['profile_path'])
    driver = webdriver.Firefox(firefox_profile=profile, options=options)
    driver.implicitly_wait(100)

    try:
        url = config['pomodoros']['url']
        output_path = config['pomodoros']['output_path']
        driver.get(url)
        query = """
        const getStorageData = key =>
            new Promise((resolve, reject) =>
                browser.storage.sync.get(key, result =>
                    browser.runtime.lastError
                        ? reject(Error(browser.runtime.lastError.message))
                        : resolve(result)
                )
            )
        const timeline = getStorageData('timeline')
        return timeline
        """
        output = driver.execute_script(query.strip())
        if not output:
            driver.quit()
            raise ValueError("results are empty!")
    except Exception as e:
        driver.quit()
        raise e

    elements = filter(lambda d: d['type'] == 'tomato', output['timeline'])
    days = defaultdict(int)
    minutes = defaultdict(int)
    for element in elements:
        date_obj = datetime.strptime(
            element['date'],
            "%a %b %d %Y %H:%M:%S %Z%z (Eastern Daylight Time)")
        minute = datetime.strftime(date_obj, "%Y-%m-%d %H:%M")
        day = datetime.strftime(date_obj, "%Y-%m-%d")
        days[day] += 1
        minutes[minute] = days[day]

    override_output_path = False
    if not os.path.exists(output_path):
        override_output_path = True
    else:
        with open(output_path, "r") as r:
            r.readline()  # skip header
            processed = dict(map(lambda x: x.strip().split(','), r.readlines()))

    if override_output_path or len(processed.keys() & days.keys()) >= len(processed):
        with open(output_path, "w") as w:
            w.write("date,value\n")
            for d, v in sorted(days.items()):
                w.write(f"{d},{v}\n")
        with open("all_pomodoros.bsv", "w") as w:
            w.write("time,value\n")
            for d, v in sorted(minutes.items()):
                w.write(f"{d},{v}\n")
    else:
        print("something's up")

    driver.quit()
    return
def index():
    if (request.method == 'POST'
            and request.form.get('choices-single-default') == 'Twitter'):
        query = request.form['query']
        url = 'https://mobile.twitter.com/hashtag/' + query
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'html.parser')
        tweet = []
        newlink = []
        for tag in soup.find_all('a'):
            try:
                temp = tag['href']
                temp = temp[:-4]
                temp = 'https://twitter.com' + temp
                tweet.append(temp)
            except:
                pass
        for element in tweet:
            if 'status' in element:
                element = re.sub(':', '%3A', element)
                element = re.sub('/', '%2F', element)
                newlink.append(element)
        newlink = list(set(newlink))

        options = Options()
        options.headless = True
        options.add_argument('--disable-gpu')
        options.add_argument('--no-sandbox')
        browser = webdriver.Firefox(options=options)

        final = []
        newlink1 = []
        for element in newlink:
            y = re.search(".*[0-9]$", element)
            if y:
                if ("=" not in element and "?" not in element
                        and "_" not in element):
                    newlink1.append(element)
        for element in newlink1:
            try:
                url1 = ('https://publish.twitter.com/?query=' + element
                        + '&widget=Tweet')
                browser.get(url1)
                x = browser.find_element_by_xpath(
                    '//code[@class="EmbedCode-code"]')
                final.append(x.text)
            except:
                pass

        f = open("templates/twitter.html", "w", encoding='utf-8')
        f.write(
            '<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8">'
            '<style>body{display: table;margin: auto;}</style>'
            '<link rel="icon" href="https://cdn0.iconfinder.com/data/icons/social-flat-rounded-rects/512/twitter-512.png" type="image/icon type">'
            '<meta name="viewport" content="width=device-width, initial-scale=1.0">'
            '<title>Twitter</title></head><body>')
        f.write(
            '<img src="{{ url_for("static", filename="images/twitter-logo.png")}}" alt="logo" height="60px" width="200px"><br>')
        f.write("<h1>Showing results for " + query + "</h1>")
        for element in final:
            f.write(element)
            f.write("<br>")
        f.write("</body></html>")
        f.close()
        browser.quit()
        return render_template('twitter.html')
    elif (request.method == 'POST'
          and request.form.get('choices-single-default') == 'Facebook'):
        options = Options()
        options.headless = True
        options.add_argument('--disable-gpu')
        options.add_argument('--no-sandbox')
        browser = webdriver.Firefox(options=options)
        # log in to Facebook
        browser.get("http://www.facebook.com")
        username = browser.find_element_by_id("email")
        password = browser.find_element_by_id("pass")
        submit = browser.find_element_by_id("loginbutton")
        username.send_keys("*****@*****.**")
        password.send_keys("facebook@123")
        submit.click()
        query = request.form['query']
        url = ("https://www.facebook.com/search/posts/?q=%23" + query
               + "&epa=SERP_TAB")
        browser.get(url)
        temp = browser.find_elements_by_xpath('//a')
        link = []
        newlink = []
        for element in temp:
            link.append(str(element.get_attribute("href")))
        for element in link:
            try:
                if ("posts" in element and "#" not in element
                        and "=" not in element):
                    newlink.append(element)
            except:
                pass
        newlink = list(set(newlink))
        newlink.insert(0, query)
        browser.quit()
        return render_template('facebook.html', data=newlink)
    elif (request.method == 'POST'
          and request.form.get('choices-single-default') == 'Instagram'):
        options = Options()
        options.headless = True
        options.add_argument('--disable-gpu')
        options.add_argument('--no-sandbox')
        browser = webdriver.Firefox(options=options)
        query = request.form['query']
        url = "https://www.instagram.com/explore/tags/" + query + "/top/"
        browser.get(url)
        link = []
        link.append(query)
        temp = browser.find_elements_by_xpath('//img[@class="FFVAD"]')
        for element in temp:
            link.append(element.get_attribute("src"))
        browser.quit()
        return render_template('insta.html', data=link)
    else:
        return render_template('index.html')
def scrape_dea(zips):
    '''
    Purpose: Scrape the DEA Diversion Control Division website to obtain an
        up-to-date listing of all controlled substance public disposal
        locations. This is accomplished by entering a zip code and specifying
        a search radius of 50 miles, which returns an html table of all
        dropbox addresses within that search radius.

    Input:
        zips (list): list of zip codes

    Output:
        dropboxList (list): list of dropbox location rows
    '''
    count = 0

    # Open a headless Firefox web browser and direct it to the DEA's
    # dropbox search page
    options = Options()
    options.headless = True  # set_headless(headless=True) is deprecated
    browser = webdriver.Firefox(options=options)
    browser.get('https://apps.deadiversion.usdoj.gov/pubdispsearch')
    browser.implicitly_wait(100)

    # storage variable for table column names
    columnNames = []
    # final storage container for dropbox locations
    dropboxList = []

    # For every zip code in the US, run the dropbox location search on the site
    for code in zips:
        count += 1
        if count % 100 == 0:
            print(count)
        try:
            # Input the zip code into the page
            zipElem = browser.find_element_by_id('searchForm:zipCodeInput')
            zipElem.clear()  # clear the box in case any previous data exists
            zipElem.send_keys(code)

            # Specify the maximum radius of 50 miles
            desired_button = browser.find_element_by_xpath(
                '/html/body/div[1]/div[2]/div/div/div[2]/form/div[10]/table/tbody/tr/td[7]/div/div[2]/span')
            desired_button.click()

            # Click the submit button
            search_button = browser.find_element_by_id(
                'searchForm:submitSearchButton')
            search_button.click()

            # Use BeautifulSoup to extract the dropbox data from the generated page
            html = browser.page_source
            soup = BeautifulSoup(html, 'lxml')
            dropboxTable = soup.findAll('table', role='grid')[0]

            # On the first iteration, grab column names from the dropbox
            # location table (was `zipList[0]`, which is undefined here)
            if code == zips[0]:
                tableHeader = dropboxTable.find('tr')
                th = tableHeader.findAll('th')
                for col in th:
                    columnNames.append(col.text)

            # For every column in a row in the dropbox location table, grab
            # the data and place it in the list `rowList`. After each row is
            # read, add that data to the master list `dropboxList`.
            for tr in dropboxTable.findAll('tr')[1:]:
                rowList = []
                for td in tr.findAll('td'):
                    rowList.append(td.text)
                dropboxList.append(rowList)

            # Move back to the search page and start over
            browser.back()
        except:
            pass

    return dropboxList
def update(subscribe_list):
    # load driver and cookies
    vdis = Xvfb()
    vdis.start()
    try:
        os.remove('geckodriver.log')
    except:
        pass
    options = Options()
    options.log.level = "trace"
    driver = webdriver.Firefox(options=options)
    with open('cookies.json', 'r') as f:
        cookies = f.read()
    cookies = json.loads(cookies)
    driver.get('https://mp.weixin.qq.com/')
    for i in cookies:
        driver.add_cookie(i)
    time.sleep(delay / 3)
    update_pool = {}
    try:
        # open editor page
        driver.get('https://mp.weixin.qq.com/')
        get_by_css(driver, '#footer.mp-foot')
        real_url = driver.current_url
        if real_url.split('qq.com')[1] == '/':
            raise ValueError('cookies error!')
        token = urllib.parse.parse_qs(real_url)['token'][0]
        editor_url = ('https://mp.weixin.qq.com/cgi-bin/appmsg'
                      '?t=media/appmsg_edit_v2&action=edit&isNew=1'
                      '&type=10&createType=10&token=' + token + '&lang=zh_CN')
        driver.get(editor_url)
        get_by_css(driver, '#js_text_editor_tool_link').click()
        # search for articles
        for entry in subscribe_list:
            update_pool[entry] = []
            othergzh_button = get_by_css(
                driver, '.weui-desktop-btn.weui-desktop-btn_default')
            if othergzh_button != 0:
                othergzh_button.click()
            input_box = get_by_css(
                driver, '.weui-desktop-form__input_append-in > input')
            input_box.send_keys(entry)
            input_box.send_keys(Keys.ENTER)
            flag = 0
            for i in range(5):
                gzh_entry = get_by_css(
                    driver,
                    'ul.inner_link_account_list > li:nth-child({})'.format(i + 1))
                if gzh_entry == 0:
                    break
                if get_by_css(
                        driver,
                        'ul.inner_link_account_list > li:nth-child({}) strong'
                        .format(i + 1)).text == entry:
                    flag = 1
                    break
            if flag == 0:
                update_pool[entry].append({
                    "title": "no gzh found",
                    "link": "http://example.com",
                    "author": entry,
                    "date": "1970-01-01"
                })
                continue
            gzh_entry.click()
            article_entries = get_by_css(driver, '.inner_link_article_item', 1)
            for article_entry in article_entries:
                link_element = get_by_css(article_entry, 'span:nth-child(3) > a')
                title_element = get_by_css(
                    article_entry,
                    'div.inner_link_article_title > span:nth-child(2)')
                date_element = get_by_css(article_entry,
                                          'div.inner_link_article_date')
                link = link_element.get_attribute('href')
                title = title_element.get_attribute('innerHTML')
                date = date_element.get_attribute('innerHTML')
                update_pool[entry].append({
                    "title": title,
                    "link": link,
                    "author": entry,
                    "date": date
                })
    except ValueError as msg:
        update_pool = str(msg)
    finally:
        pass
    driver.close()
    vdis.stop()
    return update_pool
def browser():
    options = Options()
    # options.headless = True
    browser = webdriver.Firefox(options=options)
    browser.delete_all_cookies()
    yield browser
def setUpClass(cls):
    super().setUpClass()
    options = Options()
    options.headless = bool(os.environ.get("CI"))
    cls.selenium = webdriver.Firefox(options=options)