def fetch_image_urls(query: str, max_links_to_fetch: int, wd: webdriver,
                     sleep_between_interactions: int = 1):
    """Collect up to ``max_links_to_fetch`` full-size image URLs from Google Images.

    Args:
        query: search term typed into Google Images.
        max_links_to_fetch: stop once this many distinct URLs are collected.
        wd: an already-started Selenium webdriver.
        sleep_between_interactions: seconds to wait after each scroll/click.

    Returns:
        set[str]: the collected image URLs. May hold fewer than
        ``max_links_to_fetch`` entries if the results page runs dry.
    """

    def scroll_to_end(wd):
        # Scroll to the bottom so Google lazy-loads more thumbnails.
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_count = 0
    results_start = 0
    while image_count < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        print(
            f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}"
        )

        for img in thumbnail_results[results_start:number_results]:
            # click the thumbnail so the full-size image is revealed
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # extract image urls from the preview pane
            actual_images = wd.find_elements_by_css_selector('img.n3VNCb')
            for actual_image in actual_images:
                # hoisted: one DOM round-trip instead of three get_attribute calls
                src = actual_image.get_attribute('src')
                if src and 'http' in src:
                    image_urls.add(src)

            image_count = len(image_urls)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            # The whole batch was processed without reaching the target:
            # wait briefly for more results, then give up with the partial set.
            print("Found:", len(image_urls), "image links, looking for more ...")
            time.sleep(30)
            # BUG FIX: the original did a bare `return` here, handing callers
            # None instead of the partial result set (and making the
            # ".mye4qd" load-more code below it unreachable dead code, which
            # has been removed).
            return image_urls

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    return image_urls
def fetch_image_face(query: str, max_links_to_fetch: int, target_path: str,
                     wd: webdriver, sleep_between_interactions: int = 1):
    """Search Google Images and keep only downloaded images that contain a face.

    Each candidate URL is persisted via ``persist_image`` and screened with
    ``faceDetect.isFace``; non-face files are deleted again with ``os.unlink``.

    Args:
        query: search term typed into Google Images.
        max_links_to_fetch: stop once this many face images are kept.
        target_path: directory handed to ``persist_image`` for downloads.
        wd: an already-started Selenium webdriver.
        sleep_between_interactions: seconds to wait after each scroll/click.

    Returns:
        list[str]: file paths of the kept (face-containing) images. May hold
        fewer than ``max_links_to_fetch`` entries if results run dry.
    """

    def scroll_to_end(wd):
        # Scroll to the bottom so Google lazy-loads more thumbnails.
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_paths = []
    image_with_face_count = 0
    results_start = 0
    while image_with_face_count < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        print(
            f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}"
        )

        for img in thumbnail_results[results_start:number_results]:
            # click the thumbnail so the full-size image is revealed
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # extract image urls, download each, and face-screen it
            actual_images = wd.find_elements_by_css_selector('img.n3VNCb')
            for actual_image in actual_images:
                src = actual_image.get_attribute('src')
                if src and 'http' in src:
                    url = src
                    filepath, ok = persist_image(target_path, url)
                    if not ok:
                        continue
                    if faceDetect.isFace(filepath):
                        image_urls.add(url)
                        image_paths.append(filepath)
                    else:
                        # no face found: discard the downloaded file
                        os.unlink(filepath)

            image_with_face_count = len(image_urls)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            # Processed the whole batch without reaching the target: wait,
            # then give up with the partial list.
            print("Found:", len(image_urls), "image links, looking for more ...")
            time.sleep(30)
            # BUG FIX: the original did a bare `return` here, handing callers
            # None instead of the collected paths (and left the ".mye4qd"
            # load-more code below it as unreachable dead code; removed).
            return image_paths

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    return image_paths
def navigate_to_next_week(self, sportjaDriver: webdriver):
    """Click the 'next week' link in the page header and return the driver."""
    next_week_link = sportjaDriver.find_element_by_css_selector(
        "#head > a:nth-child(5)")
    next_week_link.click()
    # Give the new week's page a moment to load before the caller continues.
    sleep(1)
    return sportjaDriver
def send_to_field_with(css_sel_or_xpath: str, keys: str, _driver: webdriver):
    """Locate an input field by CSS selector, clear it, and type ``keys`` into it.

    Args:
        css_sel_or_xpath: CSS selector for the field (despite the name, it is
            passed to ``find_element_by_css_selector``, so it must be a CSS
            selector, not an XPath).
        keys: text to type into the field.
        _driver: the Selenium webdriver to act on.

    Returns:
        The located field element, so callers can interact with it further.
    """
    # Idiom fix: the original wrapped the selector in a no-op f-string
    # (f'{css_sel_or_xpath}'); the value is already a str.
    login_field = _driver.find_element_by_css_selector(css_sel_or_xpath)
    login_field.clear()
    login_field.send_keys(keys)
    return login_field
def send_single_mail(driver: webdriver, email: str, email_subject: str, email_body: list) -> None:
    """Compose and send one Gmail message through the web UI.

    Drives the already-logged-in Gmail page via CSS selectors: opens the
    compose window, fills recipient/subject/body, and clicks Send.

    Args:
        driver: Selenium webdriver on an open Gmail session.
        email: recipient address.
        email_subject: subject line.
        email_body: list of body segments; with more than one segment, each
            is typed in turn and `bolder(driver)` is called between them
            (presumably toggling bold formatting — confirm with `bolder`).
    """
    # Compose button
    try:
        driver.find_element_by_css_selector('.z0>.L3').click()
    # NOTE(review): Selenium lookups normally raise NoSuchElementException,
    # not IndexError, and '::before' targets a pseudo-element — it is unclear
    # this fallback can ever run; verify against the calling environment.
    except IndexError:
        driver.find_element_by_css_selector('.z0>.L3::before').click()
    sleep(1)
    # Input Recipient
    driver.find_element_by_css_selector(".oj div textarea").send_keys(email)
    sleep(0.5)
    # Input Subject
    driver.find_element_by_css_selector(".aoD.az6 input").send_keys(
        email_subject)
    sleep(0.5)
    # Input Text: multi-segment bodies alternate text with a bolder() toggle
    if (len(email_body) > 1):
        for i in range(len(email_body)):
            driver.find_element_by_css_selector(".Ar.Au div").send_keys(
                email_body[i])
            bolder(driver)
    else:
        driver.find_element_by_css_selector(".Ar.Au div").send_keys(
            email_body[0])
    sleep(0.5)
    # Send Button
    driver.find_element_by_css_selector(".T-I.J-J5-Ji.aoO.T-I-atl.L3").click()
    sleep(0.5)
    print("Email Sent to " + email)
def fetch_image_urls(query: str, max_links_to_fetch: int, wd: webdriver,
                     sleep_between_interactions: int = 1):
    """Loads image search query, fetching a set number of thumbnails.

    Args:
        query (str): the search term
        max_links_to_fetch (int): number of thumbnails to save
        wd (webdriver): webdriver
        sleep_between_interactions (int, optional): delay between images,
            Defaults to 1.

    Returns:
        set: collected full-size image URLs; may be smaller than
        ``max_links_to_fetch`` if the results page runs out of images.
    """

    def scroll_to_end(wd):
        # Scroll to the bottom so Google lazy-loads more thumbnails.
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load page
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_count = 0
    results_start = 0
    while image_count < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnails
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        print(
            f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}"
        )

        for img in thumbnail_results[results_start:number_results]:
            # try to click every thumbnail for the big image
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # extract image urls
            actual_images = wd.find_elements_by_css_selector('img.n3VNCb')
            for actual_image in actual_images:
                # hoisted: one get_attribute call instead of three
                src = actual_image.get_attribute('src')
                if src and 'http' in src:
                    image_urls.add(src)

            image_count = len(image_urls)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            # Whole batch processed without reaching the target: wait, then
            # return the partial set.
            print("Found:", len(image_urls), "image links, looking for more ...")
            time.sleep(30)
            # BUG FIX: the original's bare `return` handed callers None
            # instead of the partial set and made the ".mye4qd" load-more
            # code below it unreachable (dead code removed).
            return image_urls

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    return image_urls
def fetch_image_urls(query: str, max_links_to_fetch: int, wd: webdriver,
                     sleep_time: int = 1):
    """Gather full-size image URLs for ``query`` from Google Images.

    Scrolls the results page, clicks each unvisited thumbnail to reveal the
    full-resolution preview, records its http(s) URL, and clicks "load more"
    whenever the current batch is exhausted.

    Returns:
        set[str]: the collected image URLs.
    """

    def scroll_to_end(wd):
        # Trigger lazy loading of additional thumbnails.
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_time)

    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page
    wd.get(search_url.format(q=query))

    # set of image urls initialized to empty
    image_urls = set()
    image_count = 0
    results_start = 0

    # coerce once; callers may hand in a numeric string
    target = int(max_links_to_fetch)

    while image_count < target:
        scroll_to_end(wd)

        # gather every thumbnail currently on the page
        thumbs = wd.find_elements_by_css_selector("img.Q4LuWd")
        total_thumbs = len(thumbs)

        print(
            f"Found: {total_thumbs} search results. Extracting links from {results_start}:{total_thumbs}"
        )

        for thumb in thumbs[results_start:total_thumbs]:
            # clicking reveals the real image behind the thumbnail
            try:
                thumb.click()
                time.sleep(sleep_time)
            except Exception:
                continue

            # pull the src of each full-size preview image
            for preview in wd.find_elements_by_css_selector("img.n3VNCb"):
                candidate = preview.get_attribute("src")
                if candidate and "http" in candidate:
                    image_urls.add(candidate)

            image_count = len(image_urls)

            if image_count >= target:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            # exhausted this batch without hitting the target — load more
            print(
                f"Found: {len(image_urls)} image links, looking for more ..."
            )
            # time.sleep(30)
            load_more = wd.find_element_by_css_selector(".mye4qd")
            if load_more:
                wd.execute_script("document.querySelector('.mye4qd').click();")

        # only examine thumbnails we have not visited yet on the next pass
        results_start = len(thumbs)

    return image_urls
def test_publish_collaborator(driver: selenium.webdriver, *args, **kwargs):
    """
    Test that a project in Gigantum can be published, shared with a
    collaborator, and imported by the collaborator.

    Flow: publish a fresh minimal project as user 1, add user 2 as a
    collaborator, re-log-in as user 2 and verify the project shows in the
    cloud tab and imports, then log back in as user 1, delete the cloud
    project, and confirm the remote is gone (GraphQL + git remote check).

    Args:
        driver
    """
    r = testutils.prep_py3_minimal_base(driver)
    username, project_title = r.username, r.project_name

    # Publish project, then wait until it's rebuilt
    logging.info(f"Publishing private project {project_title}")
    publish_elts = testutils.PublishProjectElements(driver)
    publish_elts.publish_project_button.wait().click()
    time.sleep(1)
    publish_elts.publish_confirm_button.wait().click()
    time.sleep(5)
    wait = WebDriverWait(driver, 15)
    # ".flex>.Stopped" indicates the project container has finished rebuilding
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR,
                                                 ".flex>.Stopped")))
    time.sleep(5)

    # Add collaborator
    logging.info(f"Adding a collaborator to private project {project_title}")
    publish_elts.collaborators_button.click()
    time.sleep(2)
    # second credential set = the collaborator account
    username2 = testutils.load_credentials(user_index=1)[0].rstrip()
    publish_elts.collaborators_input.send_keys(username2)
    publish_elts.add_collaborators_button.click()
    time.sleep(2)
    publish_elts.close_collaborators_button.click()
    testutils.log_out(driver)

    # Collaborator checks that the project is in the cloud tab and that the project imports successfully
    logging.info(f"Logging in as {username2}")
    testutils.log_in(driver, user_index=1)
    time.sleep(2)
    # Guide overlay may or may not be present; best-effort dismissal
    # NOTE(review): bare except silently hides all errors here — intentional
    # best-effort, but consider narrowing.
    try:
        testutils.GuideElements.remove_guide(driver)
    except:
        pass
    time.sleep(2)
    publish_elts.cloud_tab.click()
    time.sleep(2)
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR,
                                                 ".RemoteLabbooks__panel-title")))

    # Test that shared cloud project is in cloud tab
    cloud_tab_first_project_title_delete = driver.find_element_by_css_selector(
        ".RemoteLabbooks__panel-title:first-child span span").text
    assert cloud_tab_first_project_title_delete == project_title, \
        f"Expected shared cloud project {project_title} in cloud tab"

    publish_elts.import_first_cloud_project_button.click()
    time.sleep(2)
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR,
                                                 ".flex>.Stopped")))

    # Test that after import, the shared project opens to overview page
    shared_project_title = publish_elts.owner_title
    assert project_title in shared_project_title, \
        f"After import, expected shared project {project_title} to open to overview page"

    testutils.log_out(driver)

    # Delete cloud project
    logging.info(f"Logging in as {username}")
    testutils.log_in(driver)
    time.sleep(2)
    # best-effort guide dismissal, as above
    try:
        testutils.GuideElements.remove_guide(driver)
    except:
        pass
    time.sleep(2)
    testutils.delete_project_cloud(driver, project_title)

    # Assert project does not exist remotely (Via GraphQL).
    # TODO - Put back in check for the UI in addition to this check.
    remote_projects = graphql.list_remote_projects()
    assert (username, project_title) not in remote_projects

    # Check that the actual Git repo in the project had the remote removed successfully
    # Note! Use Git 2.20+ (`git remote get-url` behavior)
    logging.info("Testing git remotes to check if set...")
    project_path = os.path.join(os.environ['GIGANTUM_HOME'], username, username,
                                'labbooks', project_title)
    git_get_remote_command_2 = Popen(['git', 'remote', 'get-url', 'origin'],
                                     cwd=project_path, stdout=PIPE, stderr=PIPE)
    # a deleted remote makes git print "fatal: ..." on stderr
    del_stderr = git_get_remote_command_2.stderr.readline().decode('utf-8').strip()
    assert "fatal" in del_stderr, f"Expected to not see a remote set for {project_title}, but got {del_stderr}"
def set_min_popularity(self, driver: webdriver, n: int) -> None:
    """Replace the minimum-popularity filter input's value with ``n``."""
    value = str(n)
    field = driver.find_element_by_css_selector(constants.SEL_MIN_POP)
    field.clear()
    field.send_keys(value)
def getImgURL(query: str, numURL: int, wd: webdriver):
    """Collect up to ``numURL`` full-size image URLs for ``query`` from Google Images.

    Uses the sibling helpers ``scroll`` (page scrolling), ``checkDuplicates``
    (drop URLs already logged for this query) and ``urlLogs`` (persist the
    final URL list), plus ``config.searchURL`` / ``config.timeToSleep``.

    Args:
        query: search term.
        numURL: number of distinct URLs wanted; non-positive values yield an
            empty set.
        wd: an already-started Selenium webdriver.

    Returns:
        set: the collected image URLs (possibly fewer than ``numURL``).
    """
    wd.get(config.searchURL.format(q=query))

    imgURLs = set()
    imageCount = 0
    resultStart = 0

    if (numURL < 0):
        print("Error please enter a valid integer")
        # BUG FIX: the original only printed and fell through; make the
        # invalid-input exit explicit (the loop below would not run anyway).
        return imgURLs

    while imageCount < numURL:
        scroll(wd)

        #get list of image thumbnails
        thumbnailList = wd.find_elements_by_css_selector("img.Q4LuWd")
        numThumbnail = len(thumbnailList)

        print(
            f"Found: {numThumbnail} search results. Extracting links from {resultStart}:{numThumbnail}"
        )

        for thumbnail in thumbnailList[resultStart:numThumbnail]:
            try:
                #click thumbnail to get image behind it
                thumbnail.click()
                #print("click")
                time.sleep(config.timeToSleep)
            except Exception:
                print(f"Error in clicking thumbnail")
                continue

            #get image URLs
            images = wd.find_elements_by_css_selector('img.n3VNCb')
            for image in images:
                #check if the image has attriubte 'src' and check that it also has 'http'
                src = image.get_attribute('src')  # hoisted: single DOM call
                if src and 'http' in src:
                    imgURLs.add(src)
            #print(imgURLs)

            #check duplicates
            imgURLs = checkDuplicates(query, imgURLs)
            imageCount = len(imgURLs)

            if len(imgURLs) >= numURL:
                print(f"Done! Found: {imageCount} image links")
                urlLogs(query, imgURLs)
                break
            #else, continue (for loop goes again)
        #if there are not enough images in the page, load more images (only hapens with huge amounts of images, > 50)
        #or if you run the program enough times with the same query
        else:
            print("Found:", len(imgURLs), "image links, looking for",
                  numURL - len(imgURLs), "more...")
            time.sleep(5)
            # BUG FIX: the original's bare `return` handed callers None and
            # left the load-more click below it unreachable; return the
            # partial set instead (dead code removed).
            return imgURLs

        resultStart = len(thumbnailList)

    return imgURLs
def set_lookback_period(self, driver: webdriver, val: str) -> None:
    """Choose the option whose value is ``val`` in the lookback-period dropdown."""
    dropdown = driver.find_element_by_css_selector(constants.SEL_SELECT)
    Select(dropdown).select_by_value(val)
def fetch_image_urls(query: str, max_links_to_fetch: int, wd: webdriver,
                     sleep_between_interactions: int):
    """Collect ``max_links_to_fetch`` image URLs for ``query`` from Google Images.

    Scrolls until the page height stops growing, then clicks the "load more"
    button; gives up when neither yields new content.

    Args:
        query: search term.
        max_links_to_fetch: number of URLs to gather before returning.
        wd: an already-started Selenium webdriver.
        sleep_between_interactions: seconds to wait after each thumbnail click.

    Returns:
        list[str]: collected URLs in discovery order. Note: a list, so the
        same URL can appear more than once and still count toward the target.
    """

    def scroll_to_end(wd):
        # Scroll to the bottom so more thumbnails lazy-load.
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1.5)

    # build the google query
    search_url = "http://www.google.com/search?q={q}&tbm=isch"

    # load the page
    wd.get(search_url.format(q=query))

    image_urls = []
    image_count = 0
    results_start = 0
    last_height = 0

    while image_count < max_links_to_fetch:
        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        for img in thumbnail_results[results_start:number_results]:
            # try to click every thumbnail such that we can get the real image behind it
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # extract image urls
            actual_images = wd.find_elements_by_css_selector('img.n3VNCb')
            for actual_image in actual_images:
                src = actual_image.get_attribute('src')  # hoisted: single DOM call
                if src and 'http' in src:
                    image_urls.append(src)
                    image_count += 1
                    if image_count == max_links_to_fetch:
                        print(f"Found: {len(image_urls)} image links, done!")
                        return image_urls

        scroll_to_end(wd)
        new_height = wd.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # Page stopped growing: try the "load more" button before giving up.
            # BUG FIX: was a bare `except:` (also swallowed KeyboardInterrupt/
            # SystemExit); narrowed to Exception.
            try:
                load_more_button = wd.find_element_by_css_selector(".mye4qd")
                time.sleep(2)
                wd.execute_script("document.querySelector('.mye4qd').click();")
                print("Loading more images..")
            except Exception:
                print("You arrived at the end of the page...")
                return image_urls
        else:
            last_height = new_height

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    return image_urls
def fetch_image_and_download(query: str, max_links_to_fetch: int, wd: webdriver,
                             sleep_between_interactions: int = 1):
    """Scrape Google Images for ``query``, downloading each http(s) image found.

    Downloads via the sibling ``download_image`` into
    ``OUTPUT_FOLDER_IMAGES/<query>`` and appends every collected URL to
    ``OUTPUT_FOLDER_LINKS/<query>`` after each batch.

    Args:
        query: search term (also used as the output subfolder/file name).
        max_links_to_fetch: stop once this many images are collected.
        wd: an already-started Selenium webdriver.
        sleep_between_interactions: seconds to wait after each scroll/click.

    Returns:
        tuple[set, set]: (image_urls, images_base64). The base64 handling is
        commented out below, so the second set is always empty as written.
    """
    def scroll_to_end(wd):
        # Scroll to the bottom so more thumbnails lazy-load.
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page
    wd.get(search_url.format(q=query))

    image_urls = set()
    images_base64 = set()
    image_count = 0
    results_start = 0
    output_folder = os.path.join(OUTPUT_FOLDER_IMAGES, query)
    filepath_links = os.path.join(OUTPUT_FOLDER_LINKS, query)

    while image_count < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        print(
            f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}"
        )

        for img in thumbnail_results[results_start:number_results]:
            # try to click every thumbnail such that we can get the real image behind it
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            try:
                # extract image urls and download each new http(s) hit
                actual_images = wd.find_elements_by_css_selector('img.n3VNCb')
                for actual_image in actual_images:
                    src = actual_image.get_attribute('src')
                    # print(len(src))
                    if src and 'http' in src:
                        image_urls.add(src)
                        download_image(output_folder, src)
                    # elif 'data:' in src:
                    #     images_base64.add(src)
                    #     download_base64(output_folder, src)
            except:
                # NOTE(review): bare except — any failure here (not just a
                # missing element) is swallowed with this message.
                print('error to find img.n3VNCb')

        # NOTE(review): appends the FULL current url set each pass, so the
        # links file accumulates duplicates across batches.
        with open(filepath_links, 'a') as f:
            for link in image_urls:
                f.write("%s\n" % link)

        image_count = len(image_urls) + len(images_base64)

        # progress display: print percentage, then move the cursor back up
        print('{p:2.2f}%'.format(p=100 * image_count / max_links_to_fetch))
        sys.stdout.write("\033[F")  # Cursor up one line

        if image_count >= max_links_to_fetch:
            print(f"Found: {image_count} image, done!")
            return image_urls, images_base64

        # '.YstHxe' wraps the "show more results" area; a 'none' in its
        # inline style means the button is hidden
        has_more = wd.find_element_by_css_selector('.YstHxe')
        if (has_more and not 'none' in has_more.get_attribute('style')):
            load_more_button = wd.find_element_by_css_selector(".mye4qd")
            if load_more_button:
                wd.execute_script("document.querySelector('.mye4qd').click();")
                time.sleep(10)
        else:
            # 'span.r0zKGf' is the end-of-results marker; if present, stop
            try:
                see_more = wd.find_element_by_css_selector('span.r0zKGf')
                if see_more:
                    print('dont has more')
                    break
            except:
                # marker not found: keep looping for more results
                continue

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    print(f"Found: {image_count} image, done!")
    print('{p:2.2f}%'.format(p=100 * image_count / max_links_to_fetch))
    return image_urls, images_base64
def find_next_page_button(driver: webdriver):
    """Locate and return the 'next page' arrow element on the current page."""
    selector = "a[id='f1-j_idt125_right']"
    return driver.find_element_by_css_selector(selector)
def fetch_image_urls(query: str, max_links_to_fetch: int, wd: webdriver,
                     sleep_between_interactions: int = 1):
    """Collect up to ``max_links_to_fetch`` full-size image URLs from Google Images.

    First performs a fixed pre-scroll phase (4 rounds of 10 quick scrolls plus
    a "Show more results" click) to force the page to load several hundred
    thumbnails, then walks the thumbnails extracting preview URLs.

    Args:
        query: search term.
        max_links_to_fetch: stop once this many distinct URLs are collected.
        wd: an already-started Selenium webdriver.
        sleep_between_interactions: seconds to wait after each scroll/click.

    Returns:
        set[str]: collected image URLs (possibly fewer than requested).
    """

    def scroll_to_end(wd):
        # Scroll to the bottom so Google lazy-loads more thumbnails.
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_count = 0
    results_start = 0

    for _ in range(4):
        for __ in range(10):  # multiple scrolls needed to show all 400 images
            wd.execute_script("window.scrollBy(0, 1000000)")
            time.sleep(0.2)
        # to load next 400 images
        time.sleep(1)
        try:
            wd.find_element_by_xpath(
                "//input[@value='Show more results']").click()
        except Exception as e:
            print("Less images found: {}".format(e))

    while image_count < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        print(
            f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}"
        )

        for img in thumbnail_results[results_start:number_results]:
            # click the thumbnail so the full-size image is revealed
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # extract image urls from the preview pane
            actual_images = wd.find_elements_by_css_selector('img.n3VNCb')
            for actual_image in actual_images:
                src = actual_image.get_attribute('src')  # hoisted: one DOM call
                if src and 'http' in src:
                    image_urls.add(src)

            image_count = len(image_urls)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            # Whole batch processed without reaching the target: wait, then
            # return the partial set.
            print("Found:", len(image_urls), "image links, looking for more ...")
            time.sleep(30)
            # BUG FIX: the original's bare `return` handed callers None
            # instead of the partial set and made the ".mye4qd" load-more
            # code below it unreachable (dead code removed).
            return image_urls

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    return image_urls
def test_publish_sync_delete_project(driver: selenium.webdriver, *args, **kwargs):
    """
    Test that a project in Gigantum can be published, synced, and deleted.

    Flow: publish a fresh minimal project, confirm it appears first in the
    cloud tab and has a git remote set, sync a new input file, then delete
    the cloud project and confirm the remote is gone (GraphQL + git check).

    Args:
        driver
    """
    r = testutils.prep_py3_minimal_base(driver)
    username, project_title = r.username, r.project_name

    # Publish project
    logging.info(f"Publishing private project {project_title}")
    publish_elts = testutils.PublishProjectElements(driver)
    publish_elts.publish_project_button.wait().click()
    time.sleep(1)
    publish_elts.publish_confirm_button.wait().click()
    time.sleep(5)
    wait = WebDriverWait(driver, 15)
    # ".flex>.Stopped" indicates the project container has finished rebuilding
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR,
                                                 ".flex>.Stopped")))
    time.sleep(5)

    # Navigate to cloud tab
    logging.info(f"Navigating to {username}'s' cloud view")
    driver.get(f'{os.environ["GIGANTUM_HOST"]}/projects/cloud')
    sel = 'div[data-selenium-id="RemoteLabbookPanel"]:first-child'
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, sel)))
    time.sleep(2)
    ssel = f'{sel} span'
    cloud_tab_first_project_title_publish = driver.find_element_by_css_selector(ssel).text
    logging.info(f"!!!!! {cloud_tab_first_project_title_publish}")
    assert cloud_tab_first_project_title_publish == project_title, \
        f"Expected {project_title} to be the first project in the cloud tab"

    logging.info("Testing git remotes to check if set...")
    project_path = os.path.join(os.environ['GIGANTUM_HOME'], username, username,
                                'labbooks', project_title)
    # after publishing, the repo should have an https origin remote
    git_get_remote_command_1 = Popen(['git', 'remote', 'get-url', 'origin'],
                                     cwd=project_path, stdout=PIPE, stderr=PIPE)
    pub_stdout = git_get_remote_command_1.stdout.readline().decode('utf-8').strip()
    assert "https://" in pub_stdout, f"Expected to see a remote set for private project " \
                                     f"{project_title}, but got {pub_stdout}"

    publish_elts.local_tab.click()
    driver.get(f'{os.environ["GIGANTUM_HOST"]}/projects/{username}/{project_title}')
    time.sleep(3)

    # Add file to input data and sync project
    logging.info("Adding a file to the project")
    with open('/tmp/sample-upload.txt', 'w') as example_file:
        example_file.write('Sample Text')
    input_path = os.path.join(os.environ['GIGANTUM_HOME'], username, username,
                              'labbooks', project_title, 'input')
    shutil.copy(example_file.name, input_path)
    logging.info(f"Syncing {project_title}")
    publish_elts.sync_project_button.click()
    time.sleep(5)
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR,
                                                 ".flex>.Stopped")))
    sync_message = driver.find_element_by_css_selector(".Footer__message-item > p").text
    assert "Sync complete" in sync_message, "Expected 'Sync complete' in footer"

    testutils.delete_project_cloud(driver, project_title)

    # Assert project does not exist remotely (Via GraphQL).
    # TODO - Put back in check for the UI in addition to this check.
    remote_projects = graphql.list_remote_projects()
    assert (username, project_title) not in remote_projects

    # Check that the actual Git repo in the project had the remote removed successfully
    # Note! Use Git 2.20+ (`git remote get-url` behavior)
    git_get_remote_command_2 = Popen(['git', 'remote', 'get-url', 'origin'],
                                     cwd=project_path, stdout=PIPE, stderr=PIPE)
    # a deleted remote makes git print "fatal: ..." on stderr
    del_stderr = git_get_remote_command_2.stderr.readline().decode('utf-8').strip()
    assert "fatal" in del_stderr, f"Expected to not see a remote set for {project_title}, but got {del_stderr}"
def fetch_image_urls(query: str, max_links_to_fetch: int, wd: webdriver,
                     sleep_between_interactions: int = 1):
    """Collect up to ``max_links_to_fetch`` image URLs from Google Images.

    Older-markup variant: thumbnails are 'img.rg_ic', full-size previews are
    'img.irc_mi', and the load-more button is '.ksb'. The entire per-batch
    loop body runs inside one try/except so a transient failure logs a
    traceback and the loop retries instead of aborting.

    Args:
        query: search term.
        max_links_to_fetch: stop once this many distinct URLs are collected.
        wd: an already-started Selenium webdriver.
        sleep_between_interactions: seconds to wait after each scroll/click.

    Returns:
        set[str]: collected image URLs.
    """
    def scroll_to_end(wd):
        # Scroll to the bottom so more thumbnails lazy-load.
        wd.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query
    # build the google query
    search_url = 'https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img'

    # NOTE(review): two fixed 12 s pauses around page load — presumably to
    # let a shared browser session settle; confirm they are still needed.
    time.sleep(12)
    print(f'current session id fetch_image_urls: {wd.session_id}')

    # # open tab
    # current = wd.current_window_handle
    # wd.execute_script("window.open();")
    # new_tab = [tab for tab in wd.window_handles if tab != current][0]
    # wd.switch_to.window(new_tab)  # You can use (Keys.CONTROL + 't') on other OSs

    # load the page
    time.sleep(12)
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_count = 0
    results_start = 0
    while image_count < max_links_to_fetch:
        try:
            scroll_to_end(wd)

            # get all image thumbnail results
            thumbnail_results = wd.find_elements_by_css_selector(
                "img.rg_ic")
            number_results = len(thumbnail_results)

            print(
                f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}"
            )

            for img in thumbnail_results[results_start:number_results]:
                # try to click every thumbnail such that we can get the real image behind it
                try:
                    img.click()
                    time.sleep(sleep_between_interactions)
                except Exception as e:
                    print("the problem is, ", str(e))
                    continue

                # extract image urls (no 'http' filter here, unlike the
                # other variants — data: URIs are kept too)
                actual_images = wd.find_elements_by_css_selector(
                    'img.irc_mi')
                for actual_image in actual_images:
                    if actual_image.get_attribute('src'):
                        image_urls.add(actual_image.get_attribute('src'))

                image_count = len(image_urls)

                if len(image_urls) >= max_links_to_fetch:
                    print(f"Found: {len(image_urls)} image links, done!")
                    break
            else:
                # for-else: batch exhausted without reaching the target —
                # click the load-more button and keep looping
                print("Found:", len(image_urls),
                      "image links, looking for more ...")
                time.sleep(1)

                load_more_button = wd.find_element_by_css_selector(".ksb")
                if load_more_button:
                    wd.execute_script(
                        "document.querySelector('.ksb').click();")

            # move the result startpoint further down
            results_start = len(thumbnail_results)
        except Exception as we:
            # keep the scrape alive on any per-batch failure; log and retry
            print('image_refresh_sequence Error occurred ' + str(we))
            print(traceback.format_exc())
            pass
    # close the tab
    # finally:
    #     wd.close()
    #     # wd.switch_to.window(current)

    return image_urls
def fetch_image_urls(query: str, max_links_to_fetch: int, wd: webdriver,
                     sleep_between_interactions: int = 1):
    """Collect up to ``max_links_to_fetch`` full-size image URLs from Google Images.

    Args:
        query: search term typed into Google Images.
        max_links_to_fetch: stop once this many distinct URLs are collected.
        wd: an already-started Selenium webdriver.
        sleep_between_interactions: seconds to wait after each scroll/click.

    Returns:
        set[str]: collected image URLs (possibly fewer than requested).
    """

    def scroll_to_end(wd):
        # Scroll to the bottom so Google lazy-loads more thumbnails.
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&biw=1851&bih=981&gs_l=img"
    # search_url = 'https://www.google.com/search?q=%7Bq%7D&tbm=isch&safe=off&tbs=isz:l&hl=en&sa=X&ved=0CAEQpwVqFwoTCIjuj8eT_O4CFQAAAAAdAAAAABAC&biw=1851&bih=981'

    # load the page
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_count = 0
    results_start = 0
    while image_count < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        print(
            f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}"
        )

        for img in thumbnail_results[results_start:number_results]:
            # click the thumbnail so the full-size image is revealed
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # extract image urls from the preview pane
            actual_images = wd.find_elements_by_css_selector('img.n3VNCb')
            for actual_image in actual_images:
                src = actual_image.get_attribute('src')  # hoisted: one DOM call
                if src and 'http' in src:
                    image_urls.add(src)

            image_count = len(image_urls)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            # Whole batch processed without reaching the target: wait, then
            # return the partial set.
            print("Found:", len(image_urls), "image links, looking for more ...")
            time.sleep(30)
            # BUG FIX: the original's bare `return` handed callers None
            # instead of the partial set and made the ".mye4qd" load-more
            # code below it unreachable (dead code removed).
            return image_urls

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    return image_urls
def fetch_image_urls(query: str, max_links_to_fetch: int, wd: webdriver,
                     sleep_between_interactions: int = 1):
    """Collect up to ``max_links_to_fetch`` full-size image URLs from Google Images.

    NOTE(review): despite accepting ``query``, this variant navigates to one
    of several hard-coded search URLs (``search_url = christmasTree`` below);
    the ``query`` argument has no effect on which page is scraped. Preserved
    as-is — confirm whether this is intentional.

    Args:
        query: nominal search term (currently unused; see note above).
        max_links_to_fetch: stop once this many distinct URLs are collected.
        wd: an already-started Selenium webdriver.
        sleep_between_interactions: seconds to wait after each scroll/click.

    Returns:
        set[str]: collected image URLs (possibly fewer than requested).
    """

    def scroll_to_end(wd):
        # Scroll to the bottom so Google lazy-loads more thumbnails.
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query — pre-baked result URLs for specific searches
    octopus = "https://www.google.com/search?q=octopus&tbm=isch&ved=2ahUKEwi3vo-xj7vtAhXE_KwKHaIMDtUQ2-cCegQIABAA&oq=octopus&gs_lcp=CgNpbWcQAzIECCMQJzIECCMQJzIECAAQQzIFCAAQsQMyBwgAELEDEEMyBwgAELEDEEMyBAgAEEMyBAgAEEMyBwgAELEDEEMyBAgAEENQ9X9YkIUBYIKIAWgAcAB4AIABWogBkQGSAQEymAEAoAEBqgELZ3dzLXdpei1pbWfAAQE&sclient=img&ei=OrvNX7e5KMT5swWimbioDQ&bih=978&biw=960"
    starfish = "https://www.google.com/search?q=starfish+photography&tbm=isch&ved=2ahUKEwi21-iFyrztAhWFJ6wKHXrbCsQQ2-cCegQIABAA&oq=starfish+photography&gs_lcp=CgNpbWcQAzICCAAyBggAEAUQHjIGCAAQBRAeMgYIABAFEB4yBggAEAgQHjIECAAQGDoECCMQJzoFCAAQsQNQwRJYuydghSloAHAAeACAAU2IAaoGkgECMTOYAQCgAQGqAQtnd3Mtd2l6LWltZ8ABAQ&sclient=img&ei=9X7OX_aIEYXPsAX6tqugDA&bih=978&biw=960"
    donuts = "https://www.google.com/search?q=donuts&tbm=isch&ved=2ahUKEwihnajekbvtAhVMja0KHSGiAQAQ2-cCegQIABAA&oq=donuts&gs_lcp=CgNpbWcQAzIECCMQJzIECCMQJzICCAAyBQgAELEDMgIIADICCAAyAggAMgIIADICCAAyAggAOgQIABAeUN0NWIIbYKIgaABwAHgAgAFSiAHLA5IBATiYAQCgAQGqAQtnd3Mtd2l6LWltZ8ABAQ&sclient=img&ei=sr3NX6H8EsyatgWhxAY&bih=978&biw=960"
    dog = "https://www.google.com/search?q=dog&sxsrf=ALeKk02JLX80UdJuenQBzVwhZUeGrDp_NA:1607394480643&source=lnms&tbm=isch&sa=X&ved=2ahUKEwiLhaGIq73tAhUCi6wKHfM8BiUQ_AUoAXoECC0QAw&biw=1536&bih=763&dpr=2.5"
    cat = "https://www.google.com/search?q=cat&tbm=isch&ved=2ahUKEwinz8aJq73tAhWQSawKHfqkDucQ2-cCegQIABAA&oq=cat&gs_lcp=CgNpbWcQAzIECCMQJzIECCMQJzIHCAAQsQMQQzIECAAQQzIECAAQQzIECAAQQzIECAAQQzIHCAAQsQMQQzIECAAQQzIHCAAQsQMQQ1CsX1ibYWDvYmgAcAB4AIABigGIAeICkgEDMS4ymAEAoAEBqgELZ3dzLXdpei1pbWfAAQE&sclient=img&ei=s-TOX-epFZCTsQX6ybq4Dg&bih=763&biw=1536"
    horse = "https://www.google.com/search?q=horse&sxsrf=ALeKk008HVfG_gNiB7N7Wer588-honTZtQ:1607796396656&source=lnms&tbm=isch&sa=X&ved=2ahUKEwiQ3d-ohMntAhUPna0KHRhnA2EQ_AUoAXoECBIQAw&biw=1536&bih=763"
    tomato = "https://www.google.com/search?q=tomato+leaves&sxsrf=ALeKk00vf98Tcz2KoaoOCw0O9HgjNiPcLg:1607797123450&source=lnms&tbm=isch&sa=X&ved=2ahUKEwiww6eDh8ntAhUMbKwKHax4A_8Q_AUoAXoECBoQAw&biw=1536&bih=763"
    christmasTree = "https://www.google.com/search?q=christmas+tree&tbm=isch&hl=en&chips=q:christmas+tree,g_1:decorated:ZZjyhDQ5A3I%3D&sa=X&ved=2ahUKEwiSqtKbq8ntAhXBRawKHbSpAmwQ4lYoAXoECAEQGw&biw=1903&bih=922"
    waterBottle = "https://www.google.com/search?q=plastic+water+bottles&tbm=isch&chips=q:plastic+water+bottles,g_1:empty:-nsNL-_ORx4%3D&hl=en&sa=X&ved=2ahUKEwi_9Mnci8ntAhUQRawKHTYLA08Q4lYoAXoECAEQGw&biw=1519&bih=763"

    search_url = christmasTree

    # load the page (the .format is a no-op: the chosen URL has no {q} slot)
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_count = 0
    results_start = 0
    while image_count < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        print(
            f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}"
        )

        for img in thumbnail_results[results_start:number_results]:
            # click the thumbnail so the full-size image is revealed
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # extract image urls from the preview pane
            actual_images = wd.find_elements_by_css_selector('img.n3VNCb')
            for actual_image in actual_images:
                src = actual_image.get_attribute('src')  # hoisted: one DOM call
                if src and 'http' in src:
                    image_urls.add(src)

            image_count = len(image_urls)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            # Whole batch processed without reaching the target: wait, then
            # return the partial set.
            print("Found:", len(image_urls), "image links, looking for more ...")
            time.sleep(30)
            # BUG FIX: the original's bare `return` handed callers None
            # instead of the partial set and made the ".mye4qd" load-more
            # code below it unreachable (dead code removed).
            return image_urls

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    return image_urls
def bolder(driver: "webdriver"):
    """Click the '.aaA.eN' control, opening its parent menu first if needed.

    Tries to click '.aaA.eN' directly; when that fails (element missing or
    not clickable), clicks '.oc .J-Z-I' and retries '.aaA.eN'.
    NOTE(review): selectors suggest a mail-compose formatting toolbar —
    confirm against the page this is used on.

    :param driver: Selenium webdriver with the target page loaded.
    """
    try:
        driver.find_element_by_css_selector('.aaA.eN').click()
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit; narrowed to Exception. Fallback: open the parent
        # control, then retry the original click.
        driver.find_element_by_css_selector('.oc .J-Z-I').click()
        driver.find_element_by_css_selector('.aaA.eN').click()
def get_total_postings(driver: webdriver) -> int:
    """Return the total number of postings shown in the page's info badge.

    :param driver: Selenium webdriver with the listing page loaded.
    :return: badge value parsed as an integer.
    """
    badge_text = driver.find_element_by_css_selector(
        "span[class='badge badge-info']").text
    return int(badge_text)
def fetch_image_urls(query: str,
                     max_links_to_fetch: int,
                     wd: "webdriver",
                     sleep_between_interactions: int = 1):
    """Collect image URLs from a Google Images search.

    Loads the result page for *query*, scrolls to lazy-load thumbnails,
    clicks each thumbnail to reveal the full-size image element and harvests
    its ``src`` URL.

    :param query: search term.
    :param max_links_to_fetch: stop once this many distinct URLs are found.
    :param wd: Selenium webdriver driving the browser.
    :param sleep_between_interactions: seconds to wait after scrolls/clicks.
    :return: set of image URLs (may be smaller than requested when results
        or clickable thumbnails run out).
    """

    def scroll_to_end(wd):
        # Scroll to the page bottom so Google lazily loads more thumbnails.
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_count = 0
    results_start = 0
    error_clicks = 0
    # Stop when enough links were collected or after 30 failed clicks
    # (no more results to show by Google Images; tune the threshold).
    # Fixed: the condition used bitwise `&`; `and` is the correct
    # short-circuiting boolean operator here.
    while image_count < max_links_to_fetch and error_clicks < 30:
        scroll_to_end(wd)
        print('Starting search for Images')

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        print(
            f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}"
        )

        # NOTE(review): the slice end is max_links_to_fetch, not
        # number_results — presumably to cap clicks per pass; confirm.
        for img in thumbnail_results[results_start:max_links_to_fetch]:
            # try to click every thumbnail such that we can get the real
            # image behind it
            print("Total Errors till now:", error_clicks)
            try:
                print('Trying to Click the Image')
                img.click()
                time.sleep(sleep_between_interactions)
                print('Image Click Successful!')
            except Exception:
                error_clicks = error_clicks + 1
                print('ERROR: Unable to Click the Image')
                if results_start < number_results:
                    continue
                else:
                    break
            results_start = results_start + 1

            # extract image urls
            print('Extracting of Image URLs')
            actual_images = wd.find_elements_by_css_selector('img.n3VNCb')
            for actual_image in actual_images:
                if actual_image.get_attribute(
                        'src') and 'http' in actual_image.get_attribute('src'):
                    image_urls.add(actual_image.get_attribute('src'))

            image_count = len(image_urls)
            print('Current Total Image Count:', image_count)

        if len(image_urls) >= max_links_to_fetch:
            print(f"Found: {len(image_urls)} image links, done!")
            break
        else:
            # Fixed: the unguarded lookup raised NoSuchElementException when
            # the "load more" button was absent, losing every URL collected
            # so far; stop gracefully and return what we have instead.
            try:
                load_more_button = wd.find_element_by_css_selector(".mye4qd")
            except Exception:
                break
            if load_more_button:
                wd.execute_script(
                    "document.querySelector('.mye4qd').click();")
            # move the result startpoint further down
            results_start = len(thumbnail_results)

    return image_urls
def CLR_Html(browser: "webdriver", Name: list) -> str:
    """Probe every Selenium locator strategy for each name and report hits.

    For each entry of *Name*, every ``find_element_by_*`` strategy (id, name,
    xpath, link text, partial link text, tag name, class name, css selector)
    is attempted in order; strategies that raise are silently skipped.

    Example:
        a = CLR_Html(self.b, ['page_name', 'profile_message_send',
                              'profile_action_btn', 'profile_msg_split'])
        print(a)

    :param browser: Selenium webdriver with a page already loaded.
    :param Name: iterable of locator values to probe.
    :return: one line per successful match, formatted as
        ``<name> |-| <strategy> |-| <element text>``.
    """
    # Same strategies, same order, as the original copy-pasted try blocks.
    strategies = (
        'find_element_by_id',
        'find_element_by_name',
        'find_element_by_xpath',
        'find_element_by_link_text',
        'find_element_by_partial_link_text',
        'find_element_by_tag_name',
        'find_element_by_class_name',
        'find_element_by_css_selector',
    )
    # Fixed: the original kept parallel lists and re-paired them with Name
    # by index, which raised IndexError whenever one name matched more than
    # one strategy. Record the queried name alongside each match instead.
    matches = []
    for name in Name:
        for strategy in strategies:
            try:
                element = getattr(browser, strategy)(name)
                matches.append((name, strategy, element.text))
            except Exception:
                pass
    io = ''
    for name, strategy, text in matches:
        io += '{} |-| {} |-| {}\n'.format(str(name), str(strategy), str(text))
    return io
def get_user_name(_driver: webdriver) -> str:
    """Return the profile name displayed in the page header.

    :param _driver: Selenium webdriver with the profile page loaded.
    :return: text content of the '.profile-header__name' element.
    """
    name_element = _driver.find_element_by_css_selector('.profile-header__name')
    return name_element.text
def collect_info(driver: webdriver, url: str, current_search_id: int) -> Dict:
    """Scrape one property-listing page and return its attributes as a dict.

    Fetches *url* with the Selenium driver, parses the rendered HTML with
    BeautifulSoup and extracts price, location, surfaces and amenities.
    Field labels on the site are French; the scraped keys in the returned
    dict are French as well.

    :param driver: Selenium webdriver already configured for browsing.
    :param url: absolute URL of the listing page to scrape.
    :param current_search_id: index into the module-level ``property_type``
        list used to label the property type in the result.
    :return: dict of listing attributes, or ``{}`` when the page is a
        multi-property ("Tous les biens") group listing.
    """
    ######################################
    #        Get the infos of a page     #
    ######################################
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, "lxml")
    # Look for the subtype marker "Tous les biens" (all properties) and skip
    # if present: the individual elements of such lots are scraped separately.
    if get_bool_presence("h2", "text-block__title", "Tous les biens", soup):
        return {}

    #####################
    #  Instantiations   #
    #####################
    # Base flags detected from page markers.
    vente_publique = get_bool_presence("h2", "text-block__title",
                                       "Vente publique", soup)
    rapport = get_bool_presence("th", "classified-table__header",
                                "Immeuble de rapport", soup)
    bien_neuf = get_bool_presence("span", "flag-list__text",
                                  "Nouvelle construction", soup)
    postal_code = None
    city = None
    property_subtype = None
    # General
    facade = None
    etat_batiment = None
    # Interior
    area = None
    chamber = None
    cuisine_equipe = None
    feu_ouvert = False
    meuble = False
    # Exterior
    jardin = False
    surface_jardin = None
    terrasse = False
    surface_terrasse = None
    surface_terrain = None
    # Facilities
    piscine = False
    # Town planning
    surface_constructible = None
    # Finances
    price = None

    #####################
    #   Informations    #
    #####################
    # Base: postal code comes from the URL itself.
    postal_code = re.search(search_for_postal_code, url).group("postal_code")
    # City: try the clickable address first, then the address row, and as a
    # last resort derive it from the URL path.
    try:
        city = driver.find_element_by_css_selector(
            "p.classified__information--address-clickable").text.split(
                " — ")[-1].strip()
    except NoSuchElementException:
        # fallback
        try:
            city = driver.find_element_by_css_selector(
                "span.classified__information--address-row").text.split(
                    " — ")[-1].replace("|", "").strip()
        except NoSuchElementException:
            city = url.split("?search")[0].split("/")[-3]
    property_subtype = driver.find_element_by_css_selector(
        "h1.classified__title")
    property_subtype = property_subtype.text
    # Strip the trailing "à vendre"-style suffix when the title matches the
    # module-level ``avendretext`` pattern.
    if re.match(avendretext, property_subtype):
        property_subtype = property_subtype[:-9]
    # Each accordion section holds one themed table of attributes.
    accordion = soup.find_all('div', {"class": "accordion accordion--section"})
    for elem in accordion:
        entete = elem.find("h2").text
        # "Général" (general) section
        if entete == "Général":
            lines = elem.find_all("div", {"class": "accordion__content"})
            for line in lines:
                trs = line.find_all("tr")
                for tr in trs:
                    th = tr.find("th").text.strip()
                    if th.startswith("Façades"):
                        facade = int(tr.find("td").text.strip())
                    elif th.startswith("État du bâtiment"):
                        etat_batiment = tr.find("td").text.strip()
        # "Intérieur" (interior) section
        elif entete == "Intérieur":
            lines = elem.find_all("div", {"class": "accordion__content"})
            for line in lines:
                trs = line.find_all("tr")
                for tr in trs:
                    th = tr.find("th").text.strip()
                    if th.startswith("Surface habitable"):
                        # Value is like "120 m²": keep the leading number.
                        area = tr.find("td").text.split()
                        area = int(area[0])
                    elif th.startswith("Chambres"):
                        chamber = int(tr.find("td").text.strip())
                    elif th.startswith("Feu ouvert"):
                        feu_ouvert = True
                    elif th.startswith("Type de cuisine"):
                        cuisine_equipe = tr.find("td").text.strip()
                    elif th.startswith("Meublé"):
                        meuble = True
        # "Extérieur" (exterior) section
        elif entete == "Extérieur":
            lines = elem.find_all("div", {"class": "accordion__content"})
            for line in lines:
                trs = line.find_all("tr")
                for tr in trs:
                    th = tr.find("th").text.strip()
                    if th.startswith("Surface du jardin"):
                        surface_jardin = tr.find("td").text.split()
                        surface_jardin = int(surface_jardin[0])
                        # A positive garden surface implies a garden even if
                        # the bare "Jardin" row is absent.
                        if surface_jardin > 0:
                            jardin = True
                    elif th.startswith("Jardin"):
                        jardin = True
                    elif th.startswith("Surface de la terrasse"):
                        surface_terrasse = tr.find("td").text.split()
                        surface_terrasse = int(surface_terrasse[0])
                        if surface_terrasse > 0:
                            terrasse = True
                    elif th.startswith("Terrasse"):
                        terrasse = True
                    elif th.startswith("Surface du terrain"):
                        surface_terrain = tr.find("td").text.split()
                        surface_terrain = int(surface_terrain[0])
        # "Installations" (facilities) section
        elif entete == "Installations":
            lines = elem.find_all("div", {"class": "accordion__content"})
            for line in lines:
                trs = line.find_all("tr")
                for tr in trs:
                    th = tr.find("th").text.strip()
                    if th.startswith("Piscine"):
                        piscine = True
        # "Urbanisme" (town planning) section
        elif entete == "Urbanisme":
            lines = elem.find_all("div", {"class": "accordion__content"})
            for line in lines:
                trs = line.find_all("tr")
                for tr in trs:
                    th = tr.find("th").text.strip()
                    if th.startswith("Surface constructible"):
                        surface_constructible = tr.find("td").text.split()
                        surface_constructible = int(surface_constructible[0])
        # "Finances" section: price, with a page-level fallback element.
        elif entete == "Finances":
            lines = elem.find_all("div", {"class": "accordion__content"})
            for line in lines:
                span = line.find_all("span", {"class": "sr-only"})
                try:
                    price = span[0].text.replace("€", "").strip()
                except IndexError:
                    # fallback
                    price = soup.find("p", {
                        "class": "classified__price"
                    }).find("span", {
                        "class": "sr-only"
                    }).text.replace("€", "").strip()
    # Assemble the scraped record (keys intentionally kept in French).
    data = {
        "Lien": url,
        "Prix": price,
        "Type de propriété": property_type[current_search_id],
        "Vente publique": vente_publique,
        "Immeuble de rapport": rapport,
        "Bien neuf": bien_neuf,
        "Code Postal": postal_code,
        "Ville": city,
        "Sous-type de propriété": property_subtype,
        "Nombre de façades": facade,
        "Etat du bâtiment": etat_batiment,
        "Surface habitable": area,
        "Nombre de chambre(s)": chamber,
        "Type de cuisine": cuisine_equipe,
        "Feu ouvert": feu_ouvert,
        "Meublé": meuble,
        "Jardin": jardin,
        "Surface du jardin": surface_jardin,
        "Terrasse": terrasse,
        "Surface de la terrasse": surface_terrasse,
        "Surface totale du terrain": surface_terrain,
        "Piscine": piscine,
        "Surface de la zone constructible": surface_constructible
    }
    return data
def fetch_image_urls(query: str,
                     max_links_to_fetch: int,
                     wd: "webdriver",
                     sleep_between_interactions: int = 1):
    """Collect up to *max_links_to_fetch* image URLs from Google Images.

    Scrolls the result page to lazy-load thumbnails, clicks each thumbnail
    to reveal the full-size image element and records its ``src`` URL.

    :param query: search term.
    :param max_links_to_fetch: stop once this many distinct URLs are found.
    :param wd: Selenium webdriver driving the browser.
    :param sleep_between_interactions: seconds to pause after scrolls/clicks.
    :return: set of collected image URLs (possibly fewer than requested when
        Google has no more results).
    """

    def scroll_to_end(wd):
        # Scroll to the page bottom so more thumbnails lazy-load.
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_count = 0
    results_start = 0
    while image_count < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        print(
            f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}"
        )

        for img in thumbnail_results[results_start:number_results]:
            # try to click every thumbnail such that we can get the real
            # image behind it
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # extract image urls
            actual_images = wd.find_elements_by_css_selector('img.n3VNCb')
            for actual_image in actual_images:
                if actual_image.get_attribute(
                        'src') and 'http' in actual_image.get_attribute('src'):
                    image_urls.add(actual_image.get_attribute('src'))

            image_count = len(image_urls)

        if len(image_urls) >= max_links_to_fetch:
            print(f"Found: {len(image_urls)} image links, done!")
            break
        else:
            print("Found:", len(image_urls),
                  "image links, looking for more ...")
            time.sleep(SLEEP_BEFORE_MORE)

            not_what_you_want_button = ""
            try:
                # Marker element whose presence means Google has nothing
                # more to show for this query.
                not_what_you_want_button = wd.find_element_by_css_selector(
                    ".r0zKGf")
            except Exception:
                # Fixed: was a bare `except:`, which also swallowed
                # KeyboardInterrupt and SystemExit.
                pass

            # If there are no more images return.
            if not_what_you_want_button:
                print("No more images available.")
                return image_urls

            # Fixed: the unguarded lookup raised NoSuchElementException when
            # the "load more" button was absent, losing every URL collected
            # so far; stop gracefully and return what we have instead.
            try:
                load_more_button = wd.find_element_by_css_selector(".mye4qd")
            except Exception:
                return image_urls
            if load_more_button and not not_what_you_want_button:
                wd.execute_script("document.querySelector('.mye4qd').click();")

        # move the result startpoint further down
        results_start = len(thumbnail_results)
    return image_urls
def fetch_image_urls(query: str,
                     max_links_to_fetch: int,
                     wd: webdriver,
                     sleep_between_interactions: int = 1):
    """
    Harvest image URLs for a Google Images query.

    :param query: search term to fetch urls for
    :param max_links_to_fetch: upper bound on the number of urls to collect
    :param wd: initialized selenium webdriver used to browse the results
    :param sleep_between_interactions: pause (seconds) between interactions;
        lower means faster
    :return: set of image urls found
    """

    def scroll_to_end(wd):
        '''
        Scroll to the bottom of the page so more thumbnails load.

        :param wd: Selenium webdriver
        :return: None
        '''
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # Google Images query template, then load the result page.
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_count = 0
    results_start = 0

    while image_count < max_links_to_fetch:
        scroll_to_end(wd)

        # every thumbnail currently present on the results page
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)
        print(
            f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}"
        )

        for thumbnail in thumbnail_results[results_start:number_results]:
            # Clicking a thumbnail swaps in the full-resolution element;
            # skip thumbnails that cannot be clicked.
            try:
                thumbnail.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # Record the src of every candidate full-size image element.
            for candidate in wd.find_elements_by_css_selector('img.n3VNCb'):
                src = candidate.get_attribute('src')
                if src and 'http' in src:
                    image_urls.add(src)

            image_count = len(image_urls)

        if len(image_urls) >= max_links_to_fetch:
            print(f"Found: {len(image_urls)} image links, done!")
            break

        print("Found:", len(image_urls), "image links, looking for more ...")
        time.sleep(1)
        load_more_button = wd.find_element_by_css_selector(".mye4qd")
        if load_more_button:
            wd.execute_script("document.querySelector('.mye4qd').click();")
            time.sleep(0.5)

        # continue the next pass where this one stopped
        results_start = len(thumbnail_results)
    return image_urls