def fetch_image_urls(query: str,
                     max_links_to_fetch: int,
                     wd: webdriver,
                     sleep_between_interactions: int = 1):
    """Search Google Images for *query* and collect full-size image URLs.

    Args:
        query: the search term.
        max_links_to_fetch: stop once this many distinct URLs are collected.
        wd: a selenium webdriver with an open browser session.
        sleep_between_interactions: seconds to pause after scrolls/clicks.

    Returns:
        A set of image URLs (http/https). May be smaller than
        max_links_to_fetch if the result page runs out of images.
    """
    def scroll_to_end(wd):
        # Scroll to the bottom so more thumbnails lazy-load.
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_count = 0
    results_start = 0
    while image_count < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        print(
            f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}"
        )

        for img in thumbnail_results[results_start:number_results]:
            # Click each thumbnail so the full-size image element appears.
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # Collect the URL of the enlarged image; skip data: URIs.
            actual_images = wd.find_elements_by_css_selector('img.n3VNCb')
            for actual_image in actual_images:
                src = actual_image.get_attribute('src')
                if src and 'http' in src:
                    image_urls.add(src)

            image_count = len(image_urls)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            # Bug fix: a premature `return` (returning None) made the
            # load-more click below unreachable and broke callers that
            # expect a set of URLs.
            print("Found:", len(image_urls),
                  "image links, looking for more ...")
            time.sleep(30)
            try:
                load_more_button = wd.find_element_by_css_selector(".mye4qd")
                if load_more_button:
                    wd.execute_script("document.querySelector('.mye4qd').click();")
            except Exception:
                # No "Show more results" button: nothing left to load.
                print("No more images available.")
                return image_urls

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    return image_urls
Ejemplo n.º 2
0
def fetch_image_face(query: str,
                     max_links_to_fetch: int,
                     target_path: str,
                     wd: webdriver,
                     sleep_between_interactions: int = 1):
    """Search Google Images for *query*, keeping only images with a face.

    Each candidate image is downloaded via ``persist_image`` and tested
    with ``faceDetect.isFace``; downloads without a face are deleted.

    Args:
        query: the search term.
        max_links_to_fetch: stop once this many face images are collected.
        target_path: directory passed to ``persist_image`` for downloads.
        wd: a selenium webdriver with an open browser session.
        sleep_between_interactions: seconds to pause after scrolls/clicks.

    Returns:
        List of file paths of the downloaded face images.
    """
    def scroll_to_end(wd):
        # Scroll to the bottom so more thumbnails lazy-load.
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_paths = []
    image_with_face_count = 0
    results_start = 0
    while image_with_face_count < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        print(
            f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}"
        )

        for img in thumbnail_results[results_start:number_results]:
            # Click each thumbnail so the full-size image element appears.
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # extract image urls
            actual_images = wd.find_elements_by_css_selector('img.n3VNCb')
            for actual_image in actual_images:
                url = actual_image.get_attribute('src')
                if url and 'http' in url:
                    filepath, ok = persist_image(target_path, url)
                    if not ok:
                        continue
                    if faceDetect.isFace(filepath):
                        image_urls.add(url)
                        image_paths.append(filepath)
                    else:
                        # Not a face: remove the downloaded file again.
                        os.unlink(filepath)

            image_with_face_count = len(image_urls)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            # Bug fix: a premature `return` (returning None) made the
            # load-more click below unreachable and broke callers that
            # expect a list of paths.
            print("Found:", len(image_urls),
                  "image links, looking for more ...")
            time.sleep(30)
            try:
                load_more_button = wd.find_element_by_css_selector(".mye4qd")
                if load_more_button:
                    wd.execute_script("document.querySelector('.mye4qd').click();")
            except Exception:
                # No "Show more results" button: nothing left to load.
                print("No more images available.")
                return image_paths

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    return image_paths
Ejemplo n.º 3
0
    def navigate_to_next_week(self, sportjaDriver: webdriver):
        """Click the 'next week' link in the page header, wait one second,
        and return the driver so calls can be chained."""
        next_week_link = sportjaDriver.find_element_by_css_selector(
            "#head > a:nth-child(5)")
        next_week_link.click()
        sleep(1)
        return sportjaDriver
Ejemplo n.º 4
0
def send_to_field_with(css_sel_or_xpath: str, keys: str, _driver: webdriver):
    """Clear the field matched by *css_sel_or_xpath* and type *keys* into it.

    Args:
        css_sel_or_xpath: CSS selector for the input field (despite the
            name, it is passed to ``find_element_by_css_selector``).
        keys: text to type into the field.
        _driver: selenium webdriver with the target page loaded.

    Returns:
        The located field element, after the keys were sent.
    """
    # Fixed: dropped the pointless f-string wrapper around the selector.
    login_field = _driver.find_element_by_css_selector(css_sel_or_xpath)
    login_field.clear()
    login_field.send_keys(keys)
    return login_field
Ejemplo n.º 5
0
def send_single_mail(driver: webdriver, email: str, email_subject: str,
                     email_body: list) -> None:
    """Compose and send one email through the Gmail web UI.

    Args:
        driver: selenium webdriver logged into Gmail.
        email: recipient address.
        email_subject: subject line.
        email_body: list of body paragraphs; with multiple entries each
            is sent separately with ``bolder`` applied between them.
    """
    # Open the compose window.
    try:
        driver.find_element_by_css_selector('.z0>.L3').click()
    except IndexError:
        driver.find_element_by_css_selector('.z0>.L3::before').click()
    sleep(1)

    # Fill in the recipient address.
    driver.find_element_by_css_selector(".oj div textarea").send_keys(email)
    sleep(0.5)

    # Fill in the subject line.
    driver.find_element_by_css_selector(".aoD.az6 input").send_keys(
        email_subject)
    sleep(0.5)

    # Fill in the body text.
    body_selector = ".Ar.Au div"
    if (len(email_body) > 1):
        for paragraph in email_body:
            driver.find_element_by_css_selector(body_selector).send_keys(
                paragraph)
            bolder(driver)
    else:
        driver.find_element_by_css_selector(body_selector).send_keys(
            email_body[0])
    sleep(0.5)

    # Click the send button.
    driver.find_element_by_css_selector(".T-I.J-J5-Ji.aoO.T-I-atl.L3").click()
    sleep(0.5)

    print("Email Sent to " + email)
Ejemplo n.º 6
0
def fetch_image_urls(query: str,
                     max_links_to_fetch: int,
                     wd: webdriver,
                     sleep_between_interactions: int = 1):
    """Loads image search query, fetching a set number of thumbnails.

    Args:
        query (str): the search term
        max_links_to_fetch (int): number of thumbnails to save
        wd (webdriver): webdriver
        sleep_between_interactions (int, optional): delay between images, Defaults to 1.

    Returns:
        set: the collected full-size image URLs (http/https). May be
        smaller than max_links_to_fetch if the page runs out of results.
    """
    def scroll_to_end(wd):
        # Scroll to the bottom so more thumbnails lazy-load.
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load page
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_count = 0
    results_start = 0
    while image_count < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnails
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        print(
            f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}"
        )

        for img in thumbnail_results[results_start:number_results]:
            # try to click every thumbnail for the big image
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # Collect the URL of the enlarged image; skip data: URIs.
            actual_images = wd.find_elements_by_css_selector('img.n3VNCb')
            for actual_image in actual_images:
                src = actual_image.get_attribute('src')
                if src and 'http' in src:
                    image_urls.add(src)

            image_count = len(image_urls)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            # Bug fix: a premature `return` (returning None) made the
            # load-more click below unreachable and broke callers that
            # expect a set of URLs.
            print("Found:", len(image_urls),
                  "image links, looking for more ...")
            time.sleep(30)
            try:
                load_more_button = wd.find_element_by_css_selector(".mye4qd")
                if load_more_button:
                    wd.execute_script("document.querySelector('.mye4qd').click();")
            except Exception:
                # No "Show more results" button: nothing left to load.
                print("No more images available.")
                return image_urls

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    return image_urls
Ejemplo n.º 7
0
def fetch_image_urls(query: str,
                     max_links_to_fetch: int,
                     wd: webdriver,
                     sleep_time: int = 1):
    """Search Google Images for *query* and collect full-size image URLs.

    Args:
        query: the search term.
        max_links_to_fetch: stop once this many distinct URLs are collected
            (coerced with int() in case a numeric string is passed).
        wd: a selenium webdriver with an open browser session.
        sleep_time: seconds to pause after scrolls/clicks.

    Returns:
        A set of image URLs (http/https).
    """
    def scroll_to_end(wd):
        # Scroll to the bottom so more thumbnails lazy-load.
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_time)

    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page
    wd.get(search_url.format(q=query))

    # set of image urls initialized to empty
    image_urls = set()
    image_count = 0
    results_start = 0

    while image_count < int(max_links_to_fetch):

        scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        print(
            f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}"
        )

        for img in thumbnail_results[results_start:number_results]:
            # Click each thumbnail so the full-size image element appears.
            try:
                img.click()
                time.sleep(sleep_time)
            except Exception:  # fixed: removed unused `as e` binding
                continue

            # Collect the URL of the enlarged image; skip data: URIs.
            actual_images = wd.find_elements_by_css_selector("img.n3VNCb")

            for actual_image in actual_images:
                src = actual_image.get_attribute("src")
                if src and "http" in src:
                    image_urls.add(src)

            image_count = len(image_urls)

            if len(image_urls) >= int(max_links_to_fetch):
                print(f"Found: {len(image_urls)} image links, done!")
                break

            else:
                print(
                    f"Found: {len(image_urls)} image links, looking for more ..."
                )

        # Fixed: find_element_by_css_selector raises NoSuchElementException
        # when the button is absent (it never returns a falsy value), so
        # guard it instead of letting the whole scrape crash.
        try:
            wd.find_element_by_css_selector(".mye4qd")
            wd.execute_script("document.querySelector('.mye4qd').click();")
        except Exception:
            pass  # no "Show more results" button on this page yet

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    return image_urls
def test_publish_collaborator(driver: selenium.webdriver, *args, ** kwargs):
    """
        Test that a project in Gigantum can be published, shared with a collaborator, and imported by the collaborator.

        Publishes a project as user 1, adds user 2 as collaborator, logs in
        as user 2 to verify the project appears in the cloud tab and imports,
        then logs back in as user 1, deletes the cloud project, and checks
        (via GraphQL and the local git remote) that the remote is gone.

        Args:
            driver: selenium webdriver with a Gigantum session.
    """
    # Create a minimal py3 project; r carries the generated username/title.
    r = testutils.prep_py3_minimal_base(driver)
    username, project_title = r.username, r.project_name

    # Publish project, then wait until its rebuilt
    logging.info(f"Publishing private project {project_title}")
    publish_elts = testutils.PublishProjectElements(driver)
    publish_elts.publish_project_button.wait().click()
    time.sleep(1)
    publish_elts.publish_confirm_button.wait().click()
    time.sleep(5)
    wait = WebDriverWait(driver, 15)
    # ".flex>.Stopped" signals the project container is idle again.
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".flex>.Stopped")))
    time.sleep(5)

    # Add collaborator
    logging.info(f"Adding a collaborator to private project {project_title}")
    publish_elts.collaborators_button.click()
    time.sleep(2)
    # user_index=1 is the second test account; rstrip drops a trailing newline.
    username2 = testutils.load_credentials(user_index=1)[0].rstrip()
    publish_elts.collaborators_input.send_keys(username2)
    publish_elts.add_collaborators_button.click()
    time.sleep(2)
    publish_elts.close_collaborators_button.click()
    testutils.log_out(driver)

    # Collaborator checks that the project is in the cloud tab and that the project imports successfully
    logging.info(f"Logging in as {username2}")
    testutils.log_in(driver, user_index=1)
    time.sleep(2)
    # Best-effort: the guide overlay may not be present for this account.
    try:
        testutils.GuideElements.remove_guide(driver)
    except:
        pass
    time.sleep(2)
    publish_elts.cloud_tab.click()
    time.sleep(2)
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".RemoteLabbooks__panel-title")))

    # Test that shared cloud project is in cloud tab
    cloud_tab_first_project_title_delete = driver.find_element_by_css_selector(
        ".RemoteLabbooks__panel-title:first-child span span").text
    assert cloud_tab_first_project_title_delete == project_title, \
        f"Expected shared cloud project {project_title} in cloud tab"

    publish_elts.import_first_cloud_project_button.click()
    time.sleep(2)
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".flex>.Stopped")))

    # Test that after import, the shared project opens to overview page
    shared_project_title = publish_elts.owner_title
    assert project_title in shared_project_title, \
        f"After import, expected shared project {project_title} to open to overview page"

    testutils.log_out(driver)

    # Delete cloud project
    logging.info(f"Logging in as {username}")
    testutils.log_in(driver)
    time.sleep(2)
    # Best-effort guide removal, as above.
    try:
        testutils.GuideElements.remove_guide(driver)
    except:
        pass
    time.sleep(2)
    testutils.delete_project_cloud(driver, project_title)

    # Assert project does not exist remotely (Via GraphQL).
    # TODO - Put back in check for the UI in addition to this check.
    remote_projects = graphql.list_remote_projects()
    assert (username, project_title) not in remote_projects

    # Check that the actual Git repo in the project had the remote removed successfully
    # Note! Use Git 2.20+
    logging.info("Testing git remotes to check if set...")
    # Local project layout: $GIGANTUM_HOME/<user>/<user>/labbooks/<title>
    project_path = os.path.join(os.environ['GIGANTUM_HOME'], username, username,
                                'labbooks', project_title)
    # `git remote get-url origin` prints to stderr ("fatal: ...") when no
    # remote is configured, which is the expected post-delete state.
    git_get_remote_command_2 = Popen(['git', 'remote', 'get-url', 'origin'],
                                     cwd=project_path, stdout=PIPE, stderr=PIPE)
    del_stderr = git_get_remote_command_2.stderr.readline().decode('utf-8').strip()

    assert "fatal" in del_stderr, f"Expected to not see a remote set for {project_title}, but got {del_stderr}"
Ejemplo n.º 9
0
 def set_min_popularity(self, driver: webdriver, n: int) -> None:
     """Type *n* into the minimum-popularity input, clearing it first."""
     field = driver.find_element_by_css_selector(constants.SEL_MIN_POP)
     field.clear()
     field.send_keys(f"{n}")
Ejemplo n.º 10
0
def getImgURL(query: str, numURL: int, wd: webdriver):
    """Collect up to *numURL* full-size image URLs for *query*.

    Uses the search URL template from ``config``, de-duplicates via
    ``checkDuplicates`` and logs results with ``urlLogs``.

    Args:
        query: the search term.
        numURL: number of URLs to collect; a negative value only prints
            an error and yields an empty result.
        wd: a selenium webdriver with an open browser session.

    Returns:
        A set of image URLs (http/https).
    """
    wd.get(config.searchURL.format(q=query))

    imgURLs = set()
    imageCount = 0
    resultStart = 0

    if (numURL < 0):
        print("Error please enter a valid integer")

    while imageCount < numURL:
        scroll(wd)

        #get list of image thumbnails
        thumbnailList = wd.find_elements_by_css_selector("img.Q4LuWd")
        numThumbnail = len(thumbnailList)
        print(
            f"Found: {numThumbnail} search results. Extracting links from {resultStart}:{numThumbnail}"
        )

        for thumbnail in thumbnailList[resultStart:numThumbnail]:
            try:
                #click thumbnail to get image behind it
                thumbnail.click()
                time.sleep(config.timeToSleep)
            except Exception:
                print("Error in clicking thumbnail")
                continue

            #get image URLs
            images = wd.find_elements_by_css_selector('img.n3VNCb')
            for image in images:
                #keep only real http(s) links, skipping data: URIs
                src = image.get_attribute('src')
                if src and 'http' in src:
                    imgURLs.add(src)

            #check duplicates
            imgURLs = checkDuplicates(query, imgURLs)
            imageCount = len(imgURLs)

            if len(imgURLs) >= numURL:
                print(f"Done! Found: {imageCount} image links")
                urlLogs(query, imgURLs)
                break

        #if the for loop finished without break there were not enough images:
        #click the "load more" button. Bug fix: a premature `return`
        #(returning None) made this code — and the resultStart update —
        #unreachable, so the same thumbnails were re-processed forever.
        else:
            print("Found:", len(imgURLs), "image links, looking for",
                  numURL - len(imgURLs), "more...")
            time.sleep(5)
            try:
                wd.find_element_by_css_selector(".mye4qd")
                wd.execute_script("document.querySelector('.mye4qd').click();"
                                  )  #click load more button
            except Exception:
                #no load-more button left: stop instead of looping forever
                print("No more images available.")
                return imgURLs

        #moved out of the else-branch so it runs after every pass
        resultStart = len(thumbnailList)

    return imgURLs
Ejemplo n.º 11
0
 def set_lookback_period(self, driver: webdriver, val: str) -> None:
     """Choose option *val* in the lookback-period <select> element."""
     dropdown = driver.find_element_by_css_selector(constants.SEL_SELECT)
     Select(dropdown).select_by_value(val)
Ejemplo n.º 12
0
def fetch_image_urls(query:str, max_links_to_fetch:int, wd:webdriver, sleep_between_interactions:int):
    """Search Google Images for *query* and collect full-size image URLs.

    Args:
        query: the search term.
        max_links_to_fetch: stop once this many distinct URLs are collected.
        wd: a selenium webdriver with an open browser session.
        sleep_between_interactions: seconds to pause after each click.

    Returns:
        A list of distinct image URLs in the order they were found.
    """
    def scroll_to_end(wd):
        # Scroll to the bottom so more thumbnails lazy-load.
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1.5)

    # build the google query
    search_url = "http://www.google.com/search?q={q}&tbm=isch"

    # load the page
    wd.get(search_url.format(q=query))

    image_urls = []
    seen = set()  # O(1) membership test alongside the ordered list
    image_count = 0
    results_start = 0
    last_height = 0

    while image_count < max_links_to_fetch:

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        for img in thumbnail_results[results_start:number_results]:
            # Click each thumbnail so the full-size image element appears.
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # extract image urls
            actual_images = wd.find_elements_by_css_selector('img.n3VNCb')
            for actual_image in actual_images:
                src = actual_image.get_attribute('src')
                # Bug fix: the original appended duplicates, so the same
                # URL could be counted several times toward the target.
                if src and 'http' in src and src not in seen:
                    seen.add(src)
                    image_urls.append(src)
                    image_count += 1
                    if image_count == max_links_to_fetch:
                        print(f"Found: {len(image_urls)} image links, done!")
                        return image_urls

        scroll_to_end(wd)
        new_height = wd.execute_script("return document.body.scrollHeight")

        # If scrolling did not grow the page, try the load-more button;
        # its absence means the end of the results was reached.
        if new_height == last_height:
            try:
                wd.find_element_by_css_selector(".mye4qd")
                time.sleep(2)
                wd.execute_script("document.querySelector('.mye4qd').click();")
                print("Loading more images..")
            except Exception:
                print("You arrived at the end of the page...")
                return image_urls
        else:
            last_height = new_height

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    return image_urls
Ejemplo n.º 13
0
def fetch_image_and_download(query: str,
                             max_links_to_fetch: int,
                             wd: webdriver,
                             sleep_between_interactions: int = 1):
    """Search Google Images for *query*, downloading each found image.

    Images are saved under OUTPUT_FOLDER_IMAGES/<query> via
    ``download_image`` and the collected links are appended to
    OUTPUT_FOLDER_LINKS/<query>.

    Args:
        query: the search term (also used as output sub-folder name).
        max_links_to_fetch: stop once this many images are collected.
        wd: a selenium webdriver with an open browser session.
        sleep_between_interactions: seconds to pause after clicks.

    Returns:
        Tuple (image_urls, images_base64): sets of http URLs and of
        base64 data URIs (the base64 branch is currently commented out,
        so the second set stays empty).
    """
    def scroll_to_end(wd):
        # Scroll to the bottom so more thumbnails lazy-load.
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page
    wd.get(search_url.format(q=query))

    image_urls = set()
    images_base64 = set()
    image_count = 0
    results_start = 0
    output_folder = os.path.join(OUTPUT_FOLDER_IMAGES, query)
    filepath_links = os.path.join(OUTPUT_FOLDER_LINKS, query)

    while image_count < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        print(
            f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}"
        )

        for img in thumbnail_results[results_start:number_results]:
            # try to click every thumbnail such that we can get the real image behind it
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            try:
                # extract image urls
                actual_images = wd.find_elements_by_css_selector('img.n3VNCb')
                for actual_image in actual_images:
                    src = actual_image.get_attribute('src')
                    # print(len(src))
                    if src and 'http' in src:
                        image_urls.add(src)
                        download_image(output_folder, src)
                    # elif 'data:' in src:
                    #     images_base64.add(src)
                    #     download_base64(output_folder, src)
            except:
                print('error to find img.n3VNCb')

            # NOTE(review): this appends *all* collected links on every
            # iteration, so the links file accumulates duplicates — confirm
            # whether a single write at the end was intended.
            with open(filepath_links, 'a') as f:
                for link in image_urls:
                    f.write("%s\n" % link)

            image_count = len(image_urls) + len(images_base64)
            # Progress display; cursor-up keeps it on a single line.
            print('{p:2.2f}%'.format(p=100 * image_count / max_links_to_fetch))
            sys.stdout.write("\033[F")  # Cursor up one line

            if image_count >= max_links_to_fetch:
                print(f"Found: {image_count} image, done!")
                return image_urls, images_base64

        # ".YstHxe" wraps the load-more area; a 'display:none' style on it
        # means the button is currently hidden.
        has_more = wd.find_element_by_css_selector('.YstHxe')
        if (has_more and not 'none' in has_more.get_attribute('style')):
            load_more_button = wd.find_element_by_css_selector(".mye4qd")
            if load_more_button:
                wd.execute_script("document.querySelector('.mye4qd').click();")
                time.sleep(10)
        else:
            # "span.r0zKGf" appears when Google reports no further results.
            try:
                see_more = wd.find_element_by_css_selector('span.r0zKGf')
                if see_more:
                    print('dont has more')
                    break
            except:
                # NOTE(review): this `continue` skips the results_start
                # update below, so the same thumbnails are re-scanned on
                # the next pass — confirm whether that is intended.
                continue

        # move the result startpoint further down
        results_start = len(thumbnail_results)
    print(f"Found: {image_count} image, done!")
    print('{p:2.2f}%'.format(p=100 * image_count / max_links_to_fetch))
    return image_urls, images_base64
Ejemplo n.º 14
0
def find_next_page_button(driver: webdriver):
    """Locate and return the pager's 'next page' anchor element."""
    next_selector = "a[id='f1-j_idt125_right']"
    return driver.find_element_by_css_selector(next_selector)
def fetch_image_urls(query: str,
                     max_links_to_fetch: int,
                     wd: webdriver,
                     sleep_between_interactions: int = 1):
    """Search Google Images for *query* and collect full-size image URLs.

    Pre-scrolls the page (and clicks "Show more results") a few times to
    force several batches of thumbnails to load before extraction starts.

    Args:
        query: the search term.
        max_links_to_fetch: stop once this many distinct URLs are collected.
        wd: a selenium webdriver with an open browser session.
        sleep_between_interactions: seconds to pause after scrolls/clicks.

    Returns:
        A set of image URLs (http/https).
    """
    def scroll_to_end(wd):
        # Scroll to the bottom so more thumbnails lazy-load.
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_count = 0
    results_start = 0

    for _ in range(4):
        for __ in range(10):
            # multiple scrolls needed to show all 400 images
            wd.execute_script("window.scrollBy(0, 1000000)")
            time.sleep(0.2)
        # to load next 400 images
        time.sleep(1)
        try:
            wd.find_element_by_xpath(
                "//input[@value='Show more results']").click()
        except Exception as e:
            print("Less images found: {}".format(e))

    while image_count < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        print(
            f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}"
        )

        for img in thumbnail_results[results_start:number_results]:
            # Click each thumbnail so the full-size image element appears.
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # Collect the URL of the enlarged image; skip data: URIs.
            actual_images = wd.find_elements_by_css_selector('img.n3VNCb')
            for actual_image in actual_images:
                src = actual_image.get_attribute('src')
                if src and 'http' in src:
                    image_urls.add(src)

            image_count = len(image_urls)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            # Bug fix: a premature `return` (returning None) made the
            # load-more click below unreachable and broke callers that
            # expect a set of URLs.
            print("Found:", len(image_urls),
                  "image links, looking for more ...")
            time.sleep(30)
            try:
                load_more_button = wd.find_element_by_css_selector(".mye4qd")
                if load_more_button:
                    wd.execute_script("document.querySelector('.mye4qd').click();")
            except Exception:
                # No "Show more results" button: nothing left to load.
                print("No more images available.")
                return image_urls

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    return image_urls
def test_publish_sync_delete_project(driver: selenium.webdriver, *args, **kwargs):
    """
        Test that a project in Gigantum can be published, synced, and deleted.

        Publishes a fresh project, verifies it appears in the cloud view and
        that a git remote was set, syncs a new input file, then deletes the
        cloud project and verifies the remote is gone (GraphQL + git).

        Args:
            driver: selenium webdriver with a Gigantum session.
    """
    # Create a minimal py3 project; r carries the generated username/title.
    r = testutils.prep_py3_minimal_base(driver)
    username, project_title = r.username, r.project_name
    # Publish project
    logging.info(f"Publishing private project {project_title}")
    publish_elts = testutils.PublishProjectElements(driver)
    publish_elts.publish_project_button.wait().click()
    time.sleep(1)
    publish_elts.publish_confirm_button.wait().click()
    time.sleep(5)
    wait = WebDriverWait(driver, 15)
    # ".flex>.Stopped" signals the project container is idle again.
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".flex>.Stopped")))
    time.sleep(5)

    # Navigate to cloud tab
    logging.info(f"Navigating to {username}'s' cloud view")
    driver.get(f'{os.environ["GIGANTUM_HOST"]}/projects/cloud')

    sel = 'div[data-selenium-id="RemoteLabbookPanel"]:first-child'
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, sel)))
    time.sleep(2)


    # The first panel's span holds the newest cloud project's title.
    ssel = f'{sel} span'
    cloud_tab_first_project_title_publish = driver.find_element_by_css_selector(ssel).text
    logging.info(f"!!!!! {cloud_tab_first_project_title_publish}")

    assert cloud_tab_first_project_title_publish == project_title, \
        f"Expected {project_title} to be the first project in the cloud tab"


    logging.info("Testing git remotes to check if set...")
    # Local project layout: $GIGANTUM_HOME/<user>/<user>/labbooks/<title>
    project_path = os.path.join(os.environ['GIGANTUM_HOME'], username, username,
                                'labbooks', project_title)
    # After publish, `git remote get-url origin` should print an https URL.
    git_get_remote_command_1 = Popen(['git', 'remote', 'get-url', 'origin'],
                                     cwd=project_path, stdout=PIPE, stderr=PIPE)
    pub_stdout = git_get_remote_command_1.stdout.readline().decode('utf-8').strip()
    assert "https://" in pub_stdout, f"Expected to see a remote set for private project " \
                                     f"{project_title}, but got {pub_stdout}"

    publish_elts.local_tab.click()
    driver.get(f'{os.environ["GIGANTUM_HOST"]}/projects/{username}/{project_title}')
    time.sleep(3)

    # Add file to input data and sync project
    logging.info("Adding a file to the project")
    with open('/tmp/sample-upload.txt', 'w') as example_file:
        example_file.write('Sample Text')
    input_path = os.path.join(os.environ['GIGANTUM_HOME'], username, username, 'labbooks', project_title,
                              'input')
    shutil.copy(example_file.name, input_path)
    logging.info(f"Syncing {project_title}")
    publish_elts.sync_project_button.click()
    time.sleep(5)
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".flex>.Stopped")))

    # The footer message confirms the sync round-trip finished.
    sync_message = driver.find_element_by_css_selector(".Footer__message-item > p").text
    assert "Sync complete" in sync_message, "Expected 'Sync complete' in footer"

    testutils.delete_project_cloud(driver, project_title)

    # Assert project does not exist remotely (Via GraphQL).
    # TODO - Put back in check for the UI in addition to this check.
    remote_projects = graphql.list_remote_projects()
    assert (username, project_title) not in remote_projects

    # Check that the actual Git repo in the project had the remote removed successfully
    # Note! Use Git 2.20+
    # After deletion, the same git command should fail with "fatal: ...".
    git_get_remote_command_2 = Popen(['git', 'remote', 'get-url', 'origin'],
                                     cwd=project_path, stdout=PIPE, stderr=PIPE)
    del_stderr = git_get_remote_command_2.stderr.readline().decode('utf-8').strip()

    assert "fatal" in del_stderr, f"Expected to not see a remote set for {project_title}, but got {del_stderr}"
Ejemplo n.º 17
0
    def fetch_image_urls(query: str,
                         max_links_to_fetch: int,
                         wd: webdriver,
                         sleep_between_interactions: int = 1):
        """Search Google Images for *query* and collect image URLs.

        Uses older result-page selectors (``img.rg_ic`` thumbnails,
        ``img.irc_mi`` full images, ``.ksb`` load-more button); each pass
        of the while loop is wrapped in a broad try/except so a transient
        failure is logged and the loop retries.

        Note: defined without ``self``/``@staticmethod`` despite the
        method-level indentation — presumably called in an unusual way;
        verify against the enclosing class.

        Args:
            query: the search term.
            max_links_to_fetch: stop once this many URLs are collected.
            wd: a selenium webdriver with an open browser session.
            sleep_between_interactions: seconds to pause after clicks.

        Returns:
            A set of image URLs.
        """
        def scroll_to_end(wd):
            # Scroll to the bottom so more thumbnails lazy-load.
            wd.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(sleep_between_interactions)

            # build the google query

        # build the google query
        search_url = 'https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img'

        # Long fixed delays around page load — presumably to let a fresh
        # browser session settle; confirm whether they can be shortened.
        time.sleep(12)
        print(f'current session id fetch_image_urls: {wd.session_id}')

        # # open tab
        # current = wd.current_window_handle
        # wd.execute_script("window.open();")
        # new_tab = [tab for tab in wd.window_handles if tab != current][0]
        # wd.switch_to.window(new_tab)
        # You can use (Keys.CONTROL + 't') on other OSs
        # load the page
        time.sleep(12)
        wd.get(search_url.format(q=query))

        image_urls = set()
        image_count = 0
        results_start = 0
        while image_count < max_links_to_fetch:
            # Whole pass guarded: any failure is logged and the loop retries.
            try:
                scroll_to_end(wd)

                # get all image thumbnail results
                thumbnail_results = wd.find_elements_by_css_selector(
                    "img.rg_ic")
                number_results = len(thumbnail_results)

                print(
                    f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}"
                )

                for img in thumbnail_results[results_start:number_results]:
                    # try to click every thumbnail such that we can get the real image behind it
                    try:
                        img.click()
                        time.sleep(sleep_between_interactions)
                    except Exception as e:
                        print("the problem is, ", str(e))
                        continue

                    # extract image urls
                    actual_images = wd.find_elements_by_css_selector(
                        'img.irc_mi')
                    for actual_image in actual_images:
                        if actual_image.get_attribute('src'):
                            image_urls.add(actual_image.get_attribute('src'))

                    image_count = len(image_urls)

                    if len(image_urls) >= max_links_to_fetch:
                        print(f"Found: {len(image_urls)} image links, done!")
                        break
                else:
                    # for-else: ran out of thumbnails without reaching the
                    # target — click the ".ksb" load-more button.
                    print("Found:", len(image_urls),
                          "image links, looking for more ...")
                    time.sleep(1)
                    load_more_button = wd.find_element_by_css_selector(".ksb")
                    if load_more_button:
                        wd.execute_script(
                            "document.querySelector('.ksb').click();")

                # move the result startpoint further down
                results_start = len(thumbnail_results)

            except Exception as we:
                print('image_refresh_sequence Error occurred ' + str(we))
                print(traceback.format_exc())
                pass

            # close the tab
            # finally:
            #     wd.close()
            #
            #     wd.switch_to.window(current)

        return image_urls
Ejemplo n.º 18
0
def fetch_image_urls(query: str,
                     max_links_to_fetch: int,
                     wd: webdriver,
                     sleep_between_interactions: int = 1):
    """Search Google Images for *query* and collect full-size image URLs.

    :param query: search term typed into Google Images
    :param max_links_to_fetch: stop once this many distinct URLs are found
    :param wd: Selenium webdriver already configured by the caller
    :param sleep_between_interactions: seconds to pause after each page
        interaction so results have time to load
    :return: set of image URL strings (may be smaller than requested if
        Google stops serving results)
    """
    def scroll_to_end(wd):
        # Scroll to the bottom so the lazy-loaded grid fetches more thumbnails.
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&biw=1851&bih=981&gs_l=img"

    # load the page
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_count = 0
    results_start = 0
    while image_count < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        print(
            f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}"
        )

        for img in thumbnail_results[results_start:number_results]:
            # Click the thumbnail so the full-size image element is rendered.
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # extract image urls
            for actual_image in wd.find_elements_by_css_selector('img.n3VNCb'):
                src = actual_image.get_attribute('src')
                if src and 'http' in src:
                    image_urls.add(src)

            image_count = len(image_urls)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            # BUG FIX: the original did a bare ``return`` here, which
            # returned None (instead of the set) and left the "Show more
            # results" click below unreachable.  Now we actually request
            # more results and keep looping.
            print("Found:", len(image_urls),
                  "image links, looking for more ...")
            time.sleep(1)
            load_more_button = wd.find_element_by_css_selector(".mye4qd")
            if load_more_button:
                wd.execute_script("document.querySelector('.mye4qd').click();")

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    return image_urls
# Ejemplo n.º 19
# 0
def fetch_image_urls(query: str,
                     max_links_to_fetch: int,
                     wd: webdriver,
                     sleep_between_interactions: int = 1):
    """Collect full-size image URLs from a preset Google Images result page.

    NOTE(review): ``search_url`` is hard-wired to the "christmas tree"
    preset below, and none of the preset URLs contain a ``{q}``
    placeholder, so the ``query`` argument is effectively ignored —
    confirm whether this is intentional before relying on ``query``.

    :param query: nominal search term (currently unused, see note above)
    :param max_links_to_fetch: stop once this many distinct URLs are found
    :param wd: Selenium webdriver already configured by the caller
    :param sleep_between_interactions: seconds to pause after interactions
    :return: set of image URL strings
    """
    def scroll_to_end(wd):
        # Scroll to the bottom so the lazy-loaded grid fetches more thumbnails.
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # Preset Google Images result pages; one of them is selected below.
    preset_urls = {
        "octopus": "https://www.google.com/search?q=octopus&tbm=isch&ved=2ahUKEwi3vo-xj7vtAhXE_KwKHaIMDtUQ2-cCegQIABAA&oq=octopus&gs_lcp=CgNpbWcQAzIECCMQJzIECCMQJzIECAAQQzIFCAAQsQMyBwgAELEDEEMyBwgAELEDEEMyBAgAEEMyBAgAEEMyBwgAELEDEEMyBAgAEENQ9X9YkIUBYIKIAWgAcAB4AIABWogBkQGSAQEymAEAoAEBqgELZ3dzLXdpei1pbWfAAQE&sclient=img&ei=OrvNX7e5KMT5swWimbioDQ&bih=978&biw=960",
        "starfish": "https://www.google.com/search?q=starfish+photography&tbm=isch&ved=2ahUKEwi21-iFyrztAhWFJ6wKHXrbCsQQ2-cCegQIABAA&oq=starfish+photography&gs_lcp=CgNpbWcQAzICCAAyBggAEAUQHjIGCAAQBRAeMgYIABAFEB4yBggAEAgQHjIECAAQGDoECCMQJzoFCAAQsQNQwRJYuydghSloAHAAeACAAU2IAaoGkgECMTOYAQCgAQGqAQtnd3Mtd2l6LWltZ8ABAQ&sclient=img&ei=9X7OX_aIEYXPsAX6tqugDA&bih=978&biw=960",
        "donuts": "https://www.google.com/search?q=donuts&tbm=isch&ved=2ahUKEwihnajekbvtAhVMja0KHSGiAQAQ2-cCegQIABAA&oq=donuts&gs_lcp=CgNpbWcQAzIECCMQJzIECCMQJzICCAAyBQgAELEDMgIIADICCAAyAggAMgIIADICCAAyAggAOgQIABAeUN0NWIIbYKIgaABwAHgAgAFSiAHLA5IBATiYAQCgAQGqAQtnd3Mtd2l6LWltZ8ABAQ&sclient=img&ei=sr3NX6H8EsyatgWhxAY&bih=978&biw=960",
        "dog": "https://www.google.com/search?q=dog&sxsrf=ALeKk02JLX80UdJuenQBzVwhZUeGrDp_NA:1607394480643&source=lnms&tbm=isch&sa=X&ved=2ahUKEwiLhaGIq73tAhUCi6wKHfM8BiUQ_AUoAXoECC0QAw&biw=1536&bih=763&dpr=2.5",
        "cat": "https://www.google.com/search?q=cat&tbm=isch&ved=2ahUKEwinz8aJq73tAhWQSawKHfqkDucQ2-cCegQIABAA&oq=cat&gs_lcp=CgNpbWcQAzIECCMQJzIECCMQJzIHCAAQsQMQQzIECAAQQzIECAAQQzIECAAQQzIECAAQQzIHCAAQsQMQQzIECAAQQzIHCAAQsQMQQ1CsX1ibYWDvYmgAcAB4AIABigGIAeICkgEDMS4ymAEAoAEBqgELZ3dzLXdpei1pbWfAAQE&sclient=img&ei=s-TOX-epFZCTsQX6ybq4Dg&bih=763&biw=1536",
        "horse": "https://www.google.com/search?q=horse&sxsrf=ALeKk008HVfG_gNiB7N7Wer588-honTZtQ:1607796396656&source=lnms&tbm=isch&sa=X&ved=2ahUKEwiQ3d-ohMntAhUPna0KHRhnA2EQ_AUoAXoECBIQAw&biw=1536&bih=763",
        "tomato": "https://www.google.com/search?q=tomato+leaves&sxsrf=ALeKk00vf98Tcz2KoaoOCw0O9HgjNiPcLg:1607797123450&source=lnms&tbm=isch&sa=X&ved=2ahUKEwiww6eDh8ntAhUMbKwKHax4A_8Q_AUoAXoECBoQAw&biw=1536&bih=763",
        "christmas tree": "https://www.google.com/search?q=christmas+tree&tbm=isch&hl=en&chips=q:christmas+tree,g_1:decorated:ZZjyhDQ5A3I%3D&sa=X&ved=2ahUKEwiSqtKbq8ntAhXBRawKHbSpAmwQ4lYoAXoECAEQGw&biw=1903&bih=922",
        "water bottle": "https://www.google.com/search?q=plastic+water+bottles&tbm=isch&chips=q:plastic+water+bottles,g_1:empty:-nsNL-_ORx4%3D&hl=en&sa=X&ved=2ahUKEwi_9Mnci8ntAhUQRawKHTYLA08Q4lYoAXoECAEQGw&biw=1519&bih=763",
    }
    search_url = preset_urls["christmas tree"]

    # load the page (format() is a no-op here: the preset has no {q} slot)
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_count = 0
    results_start = 0
    while image_count < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        print(
            f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}"
        )

        for img in thumbnail_results[results_start:number_results]:
            # Click the thumbnail so the full-size image element is rendered.
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # extract image urls
            for actual_image in wd.find_elements_by_css_selector('img.n3VNCb'):
                src = actual_image.get_attribute('src')
                if src and 'http' in src:
                    image_urls.add(src)

            image_count = len(image_urls)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            # BUG FIX: the original did a bare ``return`` here, which
            # returned None (instead of the set) and left the "Show more
            # results" click below unreachable.
            print("Found:", len(image_urls),
                  "image links, looking for more ...")
            time.sleep(1)
            load_more_button = wd.find_element_by_css_selector(".mye4qd")
            if load_more_button:
                wd.execute_script("document.querySelector('.mye4qd').click();")

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    return image_urls
# Ejemplo n.º 20
# 0
def bolder(driver: webdriver):
    """Click the bold button in a rich-text compose toolbar.

    Tries the bold control (``.aaA.eN``) directly; if that fails —
    presumably because the formatting toolbar is collapsed — it first
    clicks the toolbar toggle (``.oc .J-Z-I``) and retries.
    NOTE(review): selectors look like Gmail's compose window — confirm.
    """
    try:
        driver.find_element_by_css_selector('.aaA.eN').click()
    # BUG FIX: narrowed the bare ``except:`` (which also swallowed
    # KeyboardInterrupt/SystemExit) to ``except Exception``.
    except Exception:
        driver.find_element_by_css_selector('.oc .J-Z-I').click()
        driver.find_element_by_css_selector('.aaA.eN').click()
# Ejemplo n.º 21
# 0
def get_total_postings(driver: webdriver) -> int:
    """Return the total number of postings shown in the page's badge.

    Reads the ``span[class='badge badge-info']`` element and parses its
    text as an integer.
    """
    badge = driver.find_element_by_css_selector(
        "span[class='badge badge-info']")
    return int(badge.text)
def fetch_image_urls(query: str,
                     max_links_to_fetch: int,
                     wd: webdriver,
                     sleep_between_interactions: int = 1):
    """Search Google Images for *query* and collect full-size image URLs.

    Stops when *max_links_to_fetch* distinct URLs have been collected, or
    after 30 failed thumbnail clicks (taken to mean Google Images has no
    more results to show; the threshold is tunable).

    :param query: search term typed into Google Images
    :param max_links_to_fetch: maximum number of distinct URLs to return
    :param wd: Selenium webdriver already configured by the caller
    :param sleep_between_interactions: seconds to pause after interactions
    :return: set of image URL strings
    """
    def scroll_to_end(wd):
        # Scroll to the bottom so the lazy-loaded grid fetches more thumbnails.
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_count = 0
    results_start = 0
    error_clicks = 0
    # BUG FIX: use boolean ``and`` instead of bitwise ``&`` between the
    # two comparison results.
    while image_count < max_links_to_fetch and error_clicks < 30:
        scroll_to_end(wd)

        print('Starting search for Images')

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        print(
            f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}"
        )
        for img in thumbnail_results[results_start:max_links_to_fetch]:
            # try to click every thumbnail such that we can get the real image behind it
            print("Total Errors till now:", error_clicks)
            try:
                print('Trying to Click the Image')
                img.click()
                time.sleep(sleep_between_interactions)
                print('Image Click Successful!')
            except Exception:
                error_clicks = error_clicks + 1
                print('ERROR: Unable to Click the Image')
                if results_start < number_results:
                    continue
                else:
                    break

            results_start = results_start + 1

            # extract image urls
            print('Extracting of Image URLs')
            for actual_image in wd.find_elements_by_css_selector('img.n3VNCb'):
                src = actual_image.get_attribute('src')
                if src and 'http' in src:
                    image_urls.add(src)

            image_count = len(image_urls)

            print('Current Total Image Count:', image_count)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
            else:
                # BUG FIX: ``find_element_by_css_selector`` raises when the
                # "Show more results" button is absent, which previously
                # aborted the whole crawl; treat a missing button as
                # "nothing to click" instead.
                try:
                    load_more_button = wd.find_element_by_css_selector(
                        ".mye4qd")
                except Exception:
                    load_more_button = None
                if load_more_button:
                    wd.execute_script(
                        "document.querySelector('.mye4qd').click();")

        results_start = len(thumbnail_results)

    return image_urls
# Ejemplo n.º 23
# 0
def CLR_Html(browser: webdriver, Name: str) -> str:
    """Probe every Selenium locator strategy for each entry of *Name*.

    a = CLR_Html(self.b, ['page_name', 'profile_message_send', 'profile_action_btn', 'profile_msg_split'])
    print(a)

    For each name, every ``find_element_by_*`` strategy is tried in turn;
    each successful lookup contributes one output line of the form
    ``<name> |-| <strategy> |-| <element text>``.

    NOTE(review): despite the ``str`` annotation, *Name* is iterated as a
    sequence of locator strings — the docstring example passes a list.

    BUG FIX: the original indexed ``Name[x]`` with the running result
    index, so whenever a name matched several strategies (or none) the
    reported name was wrong and could raise IndexError.  Results are now
    recorded as (name, strategy, text) triples.

    :param browser: Selenium webdriver to query
    :param Name: iterable of locator strings (ids, xpaths, class names, ...)
    :return: newline-terminated report, one line per successful lookup
    """
    # All locator strategies to probe, in the original priority order.
    finders = (
        'find_element_by_id',
        'find_element_by_name',
        'find_element_by_xpath',
        'find_element_by_link_text',
        'find_element_by_partial_link_text',
        'find_element_by_tag_name',
        'find_element_by_class_name',
        'find_element_by_css_selector',
    )

    hits = []
    for name in Name:
        for finder in finders:
            try:
                text = getattr(browser, finder)(name).text
            except Exception:
                # Strategy did not match this name; try the next one.
                continue
            hits.append((name, finder, text))

    io = ''
    for name, finder, text in hits:
        io += '{} |-| {} |-| {}\n'.format(str(name), str(finder), str(text))
    return io
# Ejemplo n.º 24
# 0
def get_user_name(_driver: webdriver) -> str:
    """Return the display name read from the profile header element."""
    name_element = _driver.find_element_by_css_selector('.profile-header__name')
    return name_element.text
# Ejemplo n.º 25
# 0
def collect_info(driver: webdriver, url: str, current_search_id: int) -> Dict:
    """Scrape one real-estate classified page and return its fields as a dict.

    NOTE(review): relies on module-level names defined elsewhere in this
    file (``get_bool_presence``, ``search_for_postal_code``,
    ``avendretext``, ``property_type``) — presumably an Immoweb-style
    scraper; confirm against the rest of the module.

    :param driver: running Selenium webdriver; ``driver.get(url)`` is called
        here, so the driver navigates away from its current page.
    :param url: full URL of the classified ad to scrape.
    :param current_search_id: index into the module-level ``property_type``
        sequence, used to label the property type in the output.
    :return: dict of extracted fields (French keys), or ``{}`` when the page
        is a multi-property lot ("Tous les biens") that is skipped.
    """
    ######################################
    #    Get the infos of a page         #
    ######################################
    driver.get(url)

    soup = BeautifulSoup(driver.page_source, "lxml")

    # Look for the subtype in "Tous les biens" (all properties) and skip the
    # page if present: the individual items of a lot are scraped separately.
    if get_bool_presence("h2", "text-block__title", "Tous les biens", soup):
        return {}

    #####################
    #  Initialisation   #
    #####################

    # Base flags detected directly from the page markup.
    vente_publique = get_bool_presence("h2", "text-block__title",
                                       "Vente publique", soup)
    rapport = get_bool_presence("th", "classified-table__header",
                                "Immeuble de rapport", soup)
    bien_neuf = get_bool_presence("span", "flag-list__text",
                                  "Nouvelle construction", soup)
    postal_code = None
    city = None
    property_subtype = None

    # General
    facade = None
    etat_batiment = None

    # Interior
    area = None
    chamber = None
    cuisine_equipe = None
    feu_ouvert = False
    meuble = False

    # Exterior
    jardin = False
    surface_jardin = None
    terrasse = False
    surface_terrasse = None
    surface_terrain = None

    # Installations
    piscine = False

    # Urban planning
    surface_constructible = None

    # Finances
    price = None

    #####################
    #    Extraction     #
    #####################

    # Base: postal code comes from the URL itself.
    postal_code = re.search(search_for_postal_code, url).group("postal_code")
    try:
        # City name is the last " — "-separated token of the address line.
        city = driver.find_element_by_css_selector(
            "p.classified__information--address-clickable").text.split(
                " — ")[-1].strip()
    except NoSuchElementException:
        # fallback
        try:
            city = driver.find_element_by_css_selector(
                "span.classified__information--address-row").text.split(
                    " — ")[-1].replace("|", "").strip()
        except NoSuchElementException:
            # Last resort: take the city segment straight from the URL.
            city = url.split("?search")[0].split("/")[-3]
    property_subtype = driver.find_element_by_css_selector(
        "h1.classified__title")
    property_subtype = property_subtype.text

    # Drop a 9-character trailing suffix from the title — presumably
    # " à vendre"; confirm against the ``avendretext`` pattern.
    if re.match(avendretext, property_subtype):
        property_subtype = property_subtype[:-9]

    # Each accordion section holds one category of property details.
    accordion = soup.find_all('div', {"class": "accordion accordion--section"})

    for elem in accordion:
        entete = elem.find("h2").text

        # General
        if entete == "Général":
            lines = elem.find_all("div", {"class": "accordion__content"})
            for line in lines:
                trs = line.find_all("tr")
                for tr in trs:
                    th = tr.find("th").text.strip()

                    if th.startswith("Façades"):
                        facade = int(tr.find("td").text.strip())

                    elif th.startswith("État du bâtiment"):
                        etat_batiment = tr.find("td").text.strip()

        # Interior
        elif entete == "Intérieur":
            lines = elem.find_all("div", {"class": "accordion__content"})
            for line in lines:
                trs = line.find_all("tr")
                for tr in trs:
                    th = tr.find("th").text.strip()

                    if th.startswith("Surface habitable"):
                        # Value is like "120 m²"; keep the leading number.
                        area = tr.find("td").text.split()
                        area = int(area[0])

                    elif th.startswith("Chambres"):
                        chamber = int(tr.find("td").text.strip())

                    elif th.startswith("Feu ouvert"):
                        feu_ouvert = True

                    elif th.startswith("Type de cuisine"):
                        cuisine_equipe = tr.find("td").text.strip()

                    elif th.startswith("Meublé"):
                        meuble = True

        # Exterior
        elif entete == "Extérieur":
            lines = elem.find_all("div", {"class": "accordion__content"})
            for line in lines:
                trs = line.find_all("tr")
                for tr in trs:
                    th = tr.find("th").text.strip()

                    if th.startswith("Surface du jardin"):
                        surface_jardin = tr.find("td").text.split()
                        surface_jardin = int(surface_jardin[0])
                        if surface_jardin > 0:
                            jardin = True
                    elif th.startswith("Jardin"):
                        jardin = True

                    elif th.startswith("Surface de la terrasse"):
                        surface_terrasse = tr.find("td").text.split()
                        surface_terrasse = int(surface_terrasse[0])
                        if surface_terrasse > 0:
                            terrasse = True
                    elif th.startswith("Terrasse"):
                        terrasse = True

                    elif th.startswith("Surface du terrain"):
                        surface_terrain = tr.find("td").text.split()
                        surface_terrain = int(surface_terrain[0])

        # Installations
        elif entete == "Installations":
            lines = elem.find_all("div", {"class": "accordion__content"})
            for line in lines:
                trs = line.find_all("tr")
                for tr in trs:
                    th = tr.find("th").text.strip()

                    if th.startswith("Piscine"):
                        piscine = True

        # Urban planning
        elif entete == "Urbanisme":
            lines = elem.find_all("div", {"class": "accordion__content"})
            for line in lines:
                trs = line.find_all("tr")
                for tr in trs:
                    th = tr.find("th").text.strip()

                    if th.startswith("Surface constructible"):
                        surface_constructible = tr.find("td").text.split()
                        surface_constructible = int(surface_constructible[0])

        # Finances
        elif entete == "Finances":
            lines = elem.find_all("div", {"class": "accordion__content"})
            for line in lines:
                span = line.find_all("span", {"class": "sr-only"})
                try:
                    price = span[0].text.replace("€", "").strip()
                except IndexError:
                    # fallback
                    price = soup.find("p", {
                        "class": "classified__price"
                    }).find("span", {
                        "class": "sr-only"
                    }).text.replace("€", "").strip()

    # Assemble the output record (keys intentionally left in French — they
    # are runtime data consumed by the caller).
    data = {
        "Lien": url,
        "Prix": price,
        "Type de propriété": property_type[current_search_id],
        "Vente publique": vente_publique,
        "Immeuble de rapport": rapport,
        "Bien neuf": bien_neuf,
        "Code Postal": postal_code,
        "Ville": city,
        "Sous-type de propriété": property_subtype,
        "Nombre de façades": facade,
        "Etat du bâtiment": etat_batiment,
        "Surface habitable": area,
        "Nombre de chambre(s)": chamber,
        "Type de cuisine": cuisine_equipe,
        "Feu ouvert": feu_ouvert,
        "Meublé": meuble,
        "Jardin": jardin,
        "Surface du jardin": surface_jardin,
        "Terrasse": terrasse,
        "Surface de la terrasse": surface_terrasse,
        "Surface totale du terrain": surface_terrain,
        "Piscine": piscine,
        "Surface de la zone constructible": surface_constructible
    }
    return data
# Ejemplo n.º 26
# 0
def fetch_image_urls(query: str,
                     max_links_to_fetch: int,
                     wd: webdriver,
                     sleep_between_interactions: int = 1):
    """Search Google Images for *query* and collect full-size image URLs.

    Stops early and returns what was found when Google shows its
    end-of-results marker (``.r0zKGf``).

    :param query: search term typed into Google Images
    :param max_links_to_fetch: stop once this many distinct URLs are found
    :param wd: Selenium webdriver already configured by the caller
    :param sleep_between_interactions: seconds to pause after interactions
    :return: set of image URL strings
    """
    def scroll_to_end(wd):
        # Scroll to the bottom so the lazy-loaded grid fetches more thumbnails.
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_count = 0
    results_start = 0
    while image_count < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        print(
            f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}"
        )

        for img in thumbnail_results[results_start:number_results]:
            # Click the thumbnail so the full-size image element is rendered.
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # extract image urls
            for actual_image in wd.find_elements_by_css_selector('img.n3VNCb'):
                src = actual_image.get_attribute('src')
                if src and 'http' in src:
                    image_urls.add(src)

            image_count = len(image_urls)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            print("Found:", len(image_urls),
                  "image links, looking for more ...")
            # SLEEP_BEFORE_MORE is a module-level constant defined elsewhere
            # in this file — presumably a longer pause before paging.
            time.sleep(SLEEP_BEFORE_MORE)

            # End-of-results marker; present only when Google has nothing
            # more to serve.  BUG FIX: narrowed the bare ``except:`` and
            # replaced the "" sentinel with None.
            not_what_you_want_button = None
            try:
                not_what_you_want_button = wd.find_element_by_css_selector(
                    ".r0zKGf")
            except Exception:
                pass

            # If there are no more images return.
            if not_what_you_want_button:
                print("No more images available.")
                return image_urls

            # The marker is absent here, so the redundant
            # ``and not not_what_you_want_button`` guard was dropped.
            load_more_button = wd.find_element_by_css_selector(".mye4qd")
            if load_more_button:
                wd.execute_script("document.querySelector('.mye4qd').click();")

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    return image_urls
# Ejemplo n.º 27
# 0
def fetch_image_urls(query: str,
                     max_links_to_fetch: int,
                     wd: webdriver,
                     sleep_between_interactions: int = 1):
    """Drive a Google Images search for *query* and gather image URLs.

    :param query: search term to fetch urls for
    :param max_links_to_fetch: maximum number of links to obtain while fetching
    :param wd: Selenium webdriver used to drive the browser
    :param sleep_between_interactions: seconds to sleep between interactions;
        lower is faster (but you can't watch it happen)
    :return: set of image urls found
    """
    def scroll_to_end(wd):
        """Scroll the window to the bottom and pause so results can load."""
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # Google Images query template.
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # Open the results page for the requested term.
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_count = 0
    results_start = 0
    while image_count < max_links_to_fetch:
        scroll_to_end(wd)

        # Thumbnails currently present in the results grid.
        thumbnails = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnails)

        print(
            f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}"
        )

        for thumbnail in thumbnails[results_start:number_results]:
            # Clicking the thumbnail makes the full-size image render.
            try:
                thumbnail.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # Harvest the src of every rendered full-size image.
            for candidate in wd.find_elements_by_css_selector('img.n3VNCb'):
                if candidate.get_attribute(
                        'src') and 'http' in candidate.get_attribute('src'):
                    image_urls.add(candidate.get_attribute('src'))

            image_count = len(image_urls)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            # Exhausted the visible thumbnails without reaching the target:
            # click Google's "Show more results" button and loop again.
            print("Found:", len(image_urls),
                  "image links, looking for more ...")
            time.sleep(1)
            load_more_button = wd.find_element_by_css_selector(".mye4qd")
            if load_more_button:
                wd.execute_script("document.querySelector('.mye4qd').click();")
            time.sleep(0.5)

        # The next pass starts past the thumbnails already processed.
        results_start = len(thumbnails)

    return image_urls