Example #1
def fetch_image_urls(query: str,
                     max_links_to_fetch: int,
                     wd: webdriver,
                     sleep_between_interactions: int = 1):
    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_count = 0
    results_start = 0
    while image_count < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.rg_ic")
        number_results = len(thumbnail_results)

        print(
            f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}"
        )

        for img in thumbnail_results[results_start:number_results]:
            # try to click every thumbnail such that we can get the real image behind it
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # extract image urls
            actual_images = wd.find_elements_by_css_selector('img.irc_mi')
            for actual_image in actual_images:
                if actual_image.get_attribute('src'):
                    image_urls.add(actual_image.get_attribute('src'))

            image_count = len(image_urls)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            print("Found:", len(image_urls),
                  "image links, looking for more ...")
            time.sleep(1)
            load_more_button = wd.find_element_by_css_selector(".ksb")
            if load_more_button:
                wd.execute_script("document.querySelector('.ksb').click();")

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    return image_urls
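
All of the fetch_image_urls variants collected here share the same calling convention. A minimal usage sketch, assuming a local chromedriver on PATH (the driver setup is not part of any of the snippets):

from selenium import webdriver

driver = webdriver.Chrome()  # assumption: chromedriver is installed and on PATH
try:
    urls = fetch_image_urls("dog", max_links_to_fetch=10, wd=driver,
                            sleep_between_interactions=1)
    print(f"Collected {len(urls)} urls")
finally:
    driver.quit()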
Example #2
def fetch_image_urls(query: str,
                     max_links_to_fetch: int,
                     wd: webdriver,
                     sleep_between_interactions: int = 1):

    #search for query
    wd.get('https://www.google.ae/imghp?hl=en&ogbl')
    search_box = wd.find_element_by_css_selector('input.gLFyf')
    search_box.send_keys(query + ' company logo')
    search_box.submit()

    image_urls = set()
    image_count = 0
    results_start = 0
    while image_count < max_links_to_fetch:

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        print(
            f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}"
        )

        for img in thumbnail_results[results_start:number_results]:
            # try to click every thumbnail such that we can get the real image behind it
            try:
                time.sleep(sleep_between_interactions)

                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # extract image urls
            actual_images = wd.find_elements_by_css_selector('img.n3VNCb')
            for actual_image in actual_images:
                if actual_image.get_attribute(
                        'src') and 'http' in actual_image.get_attribute('src'):
                    image_urls.add(actual_image.get_attribute('src'))

            image_count = len(image_urls)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            print("Found:", len(image_urls),
                  "image links, looking for more ...")
            time.sleep(30)
            # 'find_elements' returns [] when the button is absent instead of raising
            load_more_button = wd.find_elements_by_css_selector(".mye4qd")
            if load_more_button:
                wd.execute_script("document.querySelector('.mye4qd').click();")

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    return image_urls
Example #3
def get_full_address(driver: webdriver) -> list:
    address_list = []
    addresses_1 = driver.find_elements_by_css_selector('.bfg-gallery-address')
    addresses_2 = driver.find_elements_by_css_selector('.bfg-gallery-address2')

    for add_1, add_2 in zip(addresses_1, addresses_2):
        address_list.append(add_1.text + ' ' + add_2.text)

    return address_list
Example #4
def fetch_image_urls(query: str, max_links_to_fetch: int, wd: webdriver, sleep_between_interactions: int = 1):
    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img" # manual google image url
    # {q} - the search string we want to search for
    # https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q=dog&oq=dog&gs_l=img
    # load the page - with q=dog this gives you the dog images
    wd.get(search_url.format(q=query)) # this function opens up the browser with the searched string

    image_urls = set() # we want unique urls, not duplicates, which is why a set is used
    image_count = 0
    results_start = 0
    while image_count < max_links_to_fetch:
        scroll_to_end(wd) # makes you browser scroll down

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd") # binary format of the results
        number_results = len(thumbnail_results) # length of the results


        print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")

        for img in thumbnail_results[results_start:number_results]:
            # try to click every thumbnail such that we can get the real image behind it
            try:
                img.click() # clicks the image
                time.sleep(sleep_between_interactions) # wait for some time so image will be loaded
            except Exception:
                continue

            # extract image urls
            actual_images = wd.find_elements_by_css_selector('img.n3VNCb') # after clicking fetch the image
            for actual_image in actual_images:
                if actual_image.get_attribute('src') and 'http' in actual_image.get_attribute('src'): # if it is holding a valid url then add
                    image_urls.add(actual_image.get_attribute('src'))

            image_count = len(image_urls)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            print("Found:", len(image_urls), "image links, looking for more ...")
            time.sleep(30)
            load_more_button = wd.find_elements_by_css_selector(".mye4qd")
            if load_more_button:
                wd.execute_script("document.querySelector('.mye4qd').click();")

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    return image_urls
Example #5
def fetch_image_urls(query: str,
                     max_links_to_fetch: int,
                     wd: webdriver,
                     sleep_between_interactions: int = 1):
    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    wd.get(search_url.format(q=query))

    image_urls = set()
    image_count = 0
    results_start = 0
    while image_count < max_links_to_fetch:
        scroll_to_end(wd)

        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        print(
            f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}"
        )

        for img in thumbnail_results[results_start:number_results]:

            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            actual_images = wd.find_elements_by_css_selector('img.n3VNCb')
            for actual_image in actual_images:
                if actual_image.get_attribute(
                        'src') and 'http' in actual_image.get_attribute('src'):
                    image_urls.add(actual_image.get_attribute('src'))

            image_count = len(image_urls)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            print("Found:", len(image_urls),
                  "image links, looking for more ...")
            time.sleep(30)
            load_more_button = wd.find_elements_by_css_selector(".mye4qd")
            if load_more_button:
                wd.execute_script("document.querySelector('.mye4qd').click();")

        results_start = len(thumbnail_results)

    return image_urls
Example #6
def fetch_image_urls(query: str, max_links_to_fetch: int, wd: webdriver, sleep_between_interaction: int = 1):
    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0,document.body.scrollHeight);")
        time.sleep(sleep_between_interaction)

    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_count = 0
    result_start = 0

    while image_count < max_links_to_fetch:
        scroll_to_end(wd)

        # get all the image thumbnail results
        thumbnails_result = wd.find_elements_by_css_selector("img.Q4LuWd")
        num_results = len(thumbnails_result)
        print(f"Found: {num_results} search results. Extracting links from {result_start}:{num_results}")

        for img in thumbnails_result[result_start:num_results]:
            # try to click every thumbnail to get the image page behind it
            try:
                img.click()
                time.sleep(sleep_between_interaction)
            except Exception:
                continue

            # extract image urls
            actual_images = wd.find_elements_by_css_selector('img.n3VNCb')
            for actual_image in actual_images:
                if actual_image.get_attribute('src') and 'http' in actual_image.get_attribute('src'):
                    image_urls.add(actual_image.get_attribute('src'))
            image_count = len(image_urls)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            print(f"Found: {len(image_urls)} image links, looking for more ...")
            time.sleep(30)
            load_more_button = wd.find_elements_by_css_selector('.mye4qd')
            if load_more_button:
                wd.execute_script("document.querySelector('.mye4qd').click();")

        # move the result start point further down
        result_start = len(thumbnails_result)
    return image_urls
Example #7
def fetch_image_urls(query: str, max_links_to_fetch: int, wd: webdriver, sleep_between_interactions: int = 1):
    """Allows the webdriver to look for a query in Google Image and fetches a number of image links
    corresponding to the query."""

    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_count = 0
    results_start = 0
    while image_count < max_links_to_fetch:
        _scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")

        for img in thumbnail_results[results_start:number_results]:
            # try to click every thumbnail such that we can get the real image behind it
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # extract image urls
            actual_images = wd.find_elements_by_css_selector('img.n3VNCb')
            for actual_image in actual_images:
                src = actual_image.get_attribute('src')
                if src and 'http' in src:
                    image_urls.add(src)
            image_count = len(image_urls)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            print("Found:", len(image_urls), "image links, looking for more ...")
            time.sleep(30)
            load_more_button = wd.find_element_by_css_selector(".mye4qd")
            if load_more_button:
                wd.execute_script("document.querySelector('.mye4qd').click();")

        # move the result start point further down
        results_start = len(thumbnail_results)

    return image_urls
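
This variant calls a module-level _scroll_to_end helper that the snippet never defines. A plausible sketch, assuming it mirrors the nested scroll_to_end helpers used by the other examples:

import time

def _scroll_to_end(wd, sleep_between_interactions: float = 1.0):
    # scroll to the bottom of the results page so more thumbnails lazy-load
    wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(sleep_between_interactions)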
Example #8
def fetch_image_urls(query: str, max_links_to_fetch: int, wd: webdriver, sleep_between_interactions: float = 1.0):
    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)



    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_count = 0
    results_start = 0
    while image_count < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        logger.info("Found: %i search results. Extracting links from %i:%i" % (number_results, results_start, number_results))

        for img in thumbnail_results[results_start:number_results]:
            # try to click every thumbnail such that we can get the real image behind it
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # extract image urls
            actual_images = wd.find_elements_by_css_selector("img.n3VNCb")
            for actual_image in actual_images:
                if actual_image.get_attribute("src") and "http" in actual_image.get_attribute("src"):
                    image_urls.add(actual_image.get_attribute("src"))

            image_count = len(image_urls)

            if len(image_urls) >= max_links_to_fetch:
                logger.info("Found: %i image links, done!" % len(image_urls))
                break
        else:
            logger.info("Found: %i image links, looking for more ..." % len(image_urls))
            time.sleep(30)
            load_more_button = wd.find_element_by_css_selector(".mye4qd")
            if load_more_button:
                wd.execute_script("document.querySelector('.mye4qd').click();")

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    return image_urls
Example #9
def fetch_image_urls(search_term: str, n_links: int, web_driver: webdriver, sleep_between_interactions: int = 1):
    def scroll_to_end(web_driver):
        web_driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    search_url = f"https://www.google.com/search?tbm=isch&q={'+'.join(search_term.split())}"
    web_driver.get(search_url)

    image_counter = 0
    res_start = 0
    image_urls = set()

    while image_counter < n_links:

        thumbnail_results = web_driver.find_elements_by_css_selector(
            'img.Q4LuWd')
        number_results = len(thumbnail_results)
        scroll_to_end(web_driver)
        print(
            f'Found {number_results} search results. Extracting links {res_start} to {number_results}')

        for img in thumbnail_results[res_start:number_results]:
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            actual_images = web_driver.find_elements_by_css_selector(
                'img.n3VNCb')
            for actual_image in actual_images:
                src = actual_image.get_attribute('src')
                if src and 'http' in src:
                    image_urls.add(src)

            image_counter = len(image_urls)
            if image_counter >= n_links:
                print(f"Found {n_links} image links.")
                break

        else:
            print("Found:", len(image_urls),
                  "image links, looking for more ...")
            time.sleep(3)
            load_more_button = web_driver.find_elements_by_css_selector(
                ".mye4qd")
            if load_more_button:
                web_driver.execute_script(
                    "document.querySelector('.mye4qd').click();")

        res_start = len(thumbnail_results)

    return image_urls
Example #10
def fetch_image_urls(search_keyword: str, download_number: int, wd: webdriver):
    print('------------------------------------------------')
    print('Start getting thumbnails')

    fetch_thumbnail_count = 0
    thumbnails = None
    # fetch thumbnails up to number of images to download
    while fetch_thumbnail_count < download_number:
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)
        thumbnails = wd.find_elements_by_css_selector('img.Q4LuWd')
        fetch_thumbnail_count = len(thumbnails)

        # break if the number of fetched thumbnails reaches the number of images to download
        if len(thumbnails) >= download_number:
            print('Success: Fetched thumbnails count', download_number)
            break
        else:
            # load more thumbnails when the load_more_button appears
            load_more_button = wd.find_elements_by_css_selector(".mye4qd")
            if load_more_button:
                wd.execute_script("document.querySelector('.mye4qd').click();")

        # break when end_text appears (this is the limit of thumbnails that can be fetched)
        end_text = wd.find_elements_by_class_name('OuJzKb')
        if end_text and end_text[0].text == 'Looks like you\'ve reached the end':
            print('Success: Fetched maximum thumbnails count', len(thumbnails))
            break

    print('Start getting image urls')
    image_urls = []
    # extract the image url from the elements displayed by clicking the thumbnails
    for thumbnail in thumbnails[:download_number]:
        try:
            thumbnail.click()
            time.sleep(1)
        except Exception:
            continue

        # extract only the original image url by matching alt text, since the viewer holds several urls
        thumbnail_alt = thumbnail.get_attribute('alt')
        images = wd.find_elements_by_css_selector('img.n3VNCb')
        for image in images:
            image_alt = image.get_attribute('alt')
            if thumbnail_alt == image_alt and 'http' in image.get_attribute('src'):
                image_urls.append(image.get_attribute('src'))

    print('Success: Fetched image urls count', len(image_urls))
    return image_urls
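
Once a variant has produced its urls, downloading them is a separate step that none of these snippets include. A hedged sketch using requests and Pillow (the function name and hashing scheme are illustrative, not from the original code):

import hashlib
import io
import os

import requests
from PIL import Image

def persist_image(folder_path: str, url: str):
    # fetch the raw bytes and save them as a JPEG named by a content hash
    image_content = requests.get(url, timeout=10).content
    image = Image.open(io.BytesIO(image_content)).convert('RGB')
    os.makedirs(folder_path, exist_ok=True)
    file_path = os.path.join(folder_path,
                             hashlib.sha1(image_content).hexdigest()[:10] + '.jpg')
    with open(file_path, 'wb') as f:
        image.save(f, "JPEG", quality=85)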
Example #11
def fetch_image_urls(query: str,
                     max_links_to_fetch: int,
                     wd: webdriver,
                     sleep_between_interactions: int = 3):
    def scroll_to_end(wd, scroll_point):
        wd.execute_script(f"window.scrollTo(0, {scroll_point});")
        time.sleep(sleep_between_interactions)

    # to build this out as a python package, take the search string from input() when constructing search_url
    # build the google query
    search_url = f"https://www.google.com/search?q=google+images+{query}&rlz=1C1CHBF_enUS830US830&sxsrf=ALeKk03SgL8-qRAfeZd1QDzweydJ4MlDgg:1628187781073&source=lnms&tbm=isch&sa=X&ved=2ahUKEwi61sWSwJryAhXNTDABHYhPDpsQ_AUoAXoECAEQAw&biw=1536&bih=722&dpr=1.25"
    # load the page
    wd.get(search_url)
    time.sleep(sleep_between_interactions)

    image_urls = set()
    image_count = 0
    number_results = 0

    for i in range(1, 20):
        scroll_to_end(wd, i * 1000)
        time.sleep(5)
        thumb = wd.find_elements_by_css_selector("img")
        time.sleep(5)
        for img in thumb:
            src = img.get_attribute('src')
            if src:  # skip elements that have no src attribute
                image_urls.add(src)
            image_count = len(image_urls)
            number_results = image_count
            time.sleep(.5)
        print(
            f"Found: {number_results} search results. Extracting links...")
    return image_urls
Example #12
def get_niche_grade(driver: webdriver, property_id: int):
    neighborhood = 'WASHINGTON'  # getNeighborhood(property_id)
    niche_base_url = 'https://www.niche.com/places-to-live/'
    url = niche_base_url + 'washington-dc-district-of-columbia-dc'
    driver.get(url)

    grades = driver.find_elements_by_css_selector(
        '.profile-grade--two .niche__grade')
    keys = [
        'public_school', 'safety', 'jobs', 'nightlife', 'cost_of_living',
        'housing'
    ]
    values = []
    for i in grades:
        text_list = i.text.split()
        if len(text_list) > 0:
            values.append(text_list[1])

    result = {}
    for k, v in zip(keys, values):
        result[k] = [v]

    result['neighborhood'] = [neighborhood]

    return result
Example #13
def fetch_image_unsplash(query: str,
                         max_links_to_fetch: int,
                         wd: webdriver,
                         sleep_between_interactions: int = 3):
    def scroll_to_end(wd, scroll_point):
        wd.execute_script(f"window.scrollTo(0, {scroll_point});")
        time.sleep(sleep_between_interactions)

    # build the unsplash query
    search_url = f"https://unsplash.com/s/photos/{query}"
    # load the page
    wd.get(search_url)
    time.sleep(sleep_between_interactions)

    image_urls = set()
    image_count = 0
    number_results = 0

    for i in range(1, 20):
        scroll_to_end(wd, i * 1000)
        time.sleep(5)
        thumb = wd.find_elements_by_css_selector("img._2UpQX")
        time.sleep(5)
        for img in thumb:
            src = img.get_attribute('src')
            if src:  # skip thumbnails that have no src yet
                image_urls.add(src)
            image_count = len(image_urls)
            number_results = image_count
            time.sleep(.5)
        print(f"Found: {number_results} search results. Extracting links...")
    return image_urls
Example #14
def launch_twitter(driver: webdriver):
    """
    :type driver: selenium.webdriver.firefox.webdriver.WebDriver
    """
    twitter_url = "https://twitter.com/search?f=tweets&vertical=default&q=gleam.io&src=typd&lang=en"
    driver.get(twitter_url)
    URLs = driver.find_elements_by_css_selector("li[data-item-id]")
    lurl = []

    for tweet in URLs:
        if tweet.find_elements_by_class_name("twitter-timeline-link"):
            linkr = tweet.find_element_by_class_name("twitter-timeline-link")
            text = linkr.get_attribute("href")
            if len(tweet.find_elements_by_class_name("card2")) == 0 and len(
                    text) == 0:
                if len(
                        tweet.find_elements_by_xpath(
                            ".//*[starts-with(@id,'xdm')]")) != 0:
                    frame = tweet.find_element_by_xpath(
                        ".//*[starts-with(@id,'xdm')]")
                    driver.switch_to.frame(frame)
                    link = driver.find_element_by_xpath("/html/body/div/div/a")
                    text = link.get_attribute("href")
                    driver.switch_to.default_content()
            lurl.append(text)
    return lurl
Example #15
def get_image_urls(wd: webdriver, max_links, query):
    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    search_url = f"https://www.google.com/search?tbm=isch&sxsrf=ALeKk00SMu3Udk8ijCHEDJ_BC6AHQG0Leg%3A1612191933731&source=hp&biw=1920&bih=979&ei=vRgYYPiiKpm-9QO8kIzACg&q={query}&oq={query}&gs_lcp=CgNpbWcQAzIFCAAQsQMyBQgAELEDMgUIABCxAzIFCAAQsQMyBQgAELEDMgUIABCxAzIFCAAQsQMyBQgAELEDMgIIADIFCAAQsQM6CAgAELEDEIMBUNYCWO8RYNgSaABwAHgAgAGkAogB1hKSAQUwLjUuNpgBAKABAaoBC2d3cy13aXotaW1n&sclient=img&ved=0ahUKEwi49ZH8-sjuAhUZX30KHTwIA6gQ4dUDCAY&uact=5"

    wd.get(search_url)
    image_urls = []
    count = 0
    while count < max_links:
        scroll_to_end(wd)
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        num_results = len(thumbnail_results)
        print(
            f"Found {num_results} search results. Getting up to {max_links} image sources"
        )

        for img in thumbnail_results:
            try:
                img.click()
            except Exception:
                pass
            time.sleep(3)  # change according to resolution

            actual_images = wd.find_elements_by_css_selector("img.n3VNCb")
            for actual in actual_images:
                if "http" in actual.get_attribute(
                        'src') and "encrypted" not in actual.get_attribute(
                            'src'):
                    image_urls.append(actual.get_attribute('src'))
                    print(f"{count+1}:{actual.get_attribute('src')}")

                count = len(image_urls)
                if count >= max_links:
                    print(f"Fetched {count} urls...Downloading!")
                    return image_urls

        else:
            print("Found:", len(image_urls),
                  "image links, looking for more ...")
            time.sleep(30)
            load_more_button = wd.find_elements_by_css_selector(".mye4qd")
            if load_more_button:
                wd.execute_script("document.querySelector('.mye4qd').click();")

    return image_urls
Example #16
def retrieve_image_url(search: str,
                       max_links: int,
                       wd: webdriver,
                       sleep_bw_interact: float = 1):

    #output set of image urls
    img_urls = set()

    google_img_url = 'https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img'
    #print('final_url_to_search- ',google_img_url.format(q=search))

    #time to get the page
    wd.get(google_img_url.format(q=search))
    img_count = 0
    start_ind = 0
    while_loop = 0
    while img_count < max_links:
        while_loop += 1
        print('count started: ', while_loop)
        thumbnails = wd.find_elements_by_css_selector('img.Q4LuWd')
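        # assumes a module-level scroll_to_end(wd) helper like the nested ones in the other examples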
        scroll_to_end(wd)
        print('thumbnails found ', len(thumbnails))
        results_found = len(thumbnails)
        print('start with: {a} and end with : {b} '.format(a=start_ind,
                                                           b=results_found))
        for i in thumbnails[start_ind:results_found]:
            try:
                i.click()
                time.sleep(sleep_bw_interact)
            except Exception as e:
                print(e, 'while doing thumbnail click')
                continue

            #after click extract image url
            actual_img = wd.find_elements_by_css_selector('img.n3VNCb')
            for j in actual_img:
                if j.get_attribute('src') and 'http' in j.get_attribute('src'):
                    img_urls.add(j.get_attribute('src'))

            img_count = len(img_urls)
            if img_count >= max_links:
                print('got enough links ', img_count)
                break
        start_ind = len(thumbnails)

    return img_urls
Example #17
def get_price_list(driver: webdriver) -> list:
    price_list = []
    asking_prices = driver.find_elements_by_css_selector('.bfg-gallery-price')
    for i in asking_prices:
        price_str = i.text.split()[2][1:].replace(',', '')
        price_list.append(float(price_str))

    return price_list
Example #18
def extract_info_from_iframe(browser: webdriver) -> list:
    try:
        # open all other pages with comments
        button = browser.find_element_by_css_selector(
            "button[data-dot='strankovani/nacist_dalsi']")
        while button:
            try:
                action = ActionChains(browser)
                action.move_to_element(button).click().perform()
            except (ElementClickInterceptedException,
                    StaleElementReferenceException):
                pass
            sleep(0.5 * SETTINGS["lazy_factor"])
            button = browser.find_element_by_css_selector(
                "button[data-dot='strankovani/nacist_dalsi']")
    except NoSuchElementException:
        pass

    try:
        # open threads of subcomments
        for button in browser.find_elements_by_css_selector(
                "button[data-dot='nacist_nove_podkomentare']"):
            try:
                action = ActionChains(browser)
                action.move_to_element(button).click().perform()
            except (ElementClickInterceptedException,
                    StaleElementReferenceException):
                pass
            sleep(0.3 * SETTINGS["lazy_factor"])
    except NoSuchElementException:
        pass

    # now we can use familiar beautiful soup
    soup = BeautifulSoup(browser.page_source, "html.parser")

    authors = soup.select("a[class='f_bO'] span")
    texts = soup.select("p[class='d_aJ']")
    reactions = soup.select("a[class='f_cQ']")

    comments = []
    progress_bar = tqdm(total=len(authors), desc="Comments", position=0)
    for i in range(len(authors)):
        author = authors[i].text
        text = texts[i].text

        # expect that reaction can be missing in some comments
        reactions_count = 0
        if len(reactions) > i:
            reactions_count = reactions[i].text

        comments.append(
            Comment(author=author, text=text, reactions=reactions_count))
        progress_bar.update(1)

    # browser doesn't need to be open from now on
    browser.quit()

    return comments
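
extract_info_from_iframe leans on several module-level names the snippet omits: SETTINGS, Comment, and a handful of imports. A plausible setup block, assuming Comment is a simple record and lazy_factor scales the sleeps:

from dataclasses import dataclass
from time import sleep

from bs4 import BeautifulSoup
from selenium.common.exceptions import (ElementClickInterceptedException,
                                        NoSuchElementException,
                                        StaleElementReferenceException)
from selenium.webdriver.common.action_chains import ActionChains
from tqdm import tqdm

SETTINGS = {"lazy_factor": 1.0}  # assumption: multiplier for all waits

@dataclass
class Comment:  # assumed shape, inferred from the constructor call in the snippet
    author: str
    text: str
    reactions: int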
Example #19
def fetch_image_urls(query: str,
                     max_links_to_fetch: int,
                     wd: webdriver,
                     sleep_between_interactions: int = 1):
    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    search_url = "https://www.google.com/search?q={q}&source=lnms&tbm=isch"
    # load the page
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_count = 0
    results_start = 0
    while image_count < max_links_to_fetch:
        for _ in range(10):
            scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        for img in thumbnail_results[results_start:number_results]:

            if img.get_attribute('src') and 'http' in img.get_attribute('src'):
                image_urls.add(img.get_attribute('src'))

            if img.get_attribute('src') and 'data' in img.get_attribute('src'):
                image_urls.add(img.get_attribute('src'))

            image_count = len(image_urls)

        if len(image_urls) >= max_links_to_fetch:
            print(f"Found: {len(image_urls)} image links, done!")
            break
        else:
            print("Found:", len(image_urls),
                  "image links, looking for more ...")
            load_more_button = wd.find_elements_by_css_selector(".mye4qd")
            if load_more_button:
                wd.execute_script("document.querySelector('.mye4qd').click();")
                time.sleep(3)

            # end_of_page = wd.find_element_by_xpath("//div[@class='OuJzKb Yu2Dnd']")
            end_of_page = wd.find_elements_by_xpath(
                "//*[ contains (text(), 'Looks like') ]")
            if end_of_page:
                print("end of the page")
                break

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    return image_urls
Example #20
    def get_documents(self, browser: webdriver, url: str):
        browser.get(url)

        try:
            WebDriverWait(browser, DELAY).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.headlineContaniner > a')))
        except TimeoutException:
            return []

        try:
            links = [link.get_attribute('href') for link in browser.find_elements_by_css_selector('div.headlineContaniner > a')]
        except StaleElementReferenceException:
            time.sleep(2)
            links = [link.get_attribute('href') for link in browser.find_elements_by_css_selector('div.headlineContaniner > a')]

        documents = []
        for link in links:
            documents.append(self.get_document_details(browser, link))
            
        return documents
Example #21
def get_price(driver: webdriver) -> Tuple[datetime, str, Set[str]]:
    driver.get(URL)

    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located(
            (By.LINK_TEXT, "Book")  # Button like 'Book' or 'Confirm'
        ))

    # CSS selectors for currency and price elements on page
    currency = driver.find_elements_by_css_selector("p.price_info span")
    prices = driver.find_elements_by_css_selector("p.price_info span.num")

    timestamp = datetime.now()

    driver.get_screenshot_as_file(
        f"{SCREENSHOT_PATH}/ticket-prices_{timestamp.strftime('%d-%m-%Y_%H-%M-%S')}.png"
    )

    return timestamp, currency[0].text, {i.text for i in prices}
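
get_price references module-level URL and SCREENSHOT_PATH constants plus imports that the snippet leaves out. A minimal, assumed setup and invocation (the URL value is a placeholder, not the real target page):

import os
from datetime import datetime
from typing import Set, Tuple

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

URL = "https://example.com/tickets"  # placeholder target page
SCREENSHOT_PATH = "./screenshots"
os.makedirs(SCREENSHOT_PATH, exist_ok=True)

driver = webdriver.Chrome()
try:
    timestamp, currency, prices = get_price(driver)
    print(timestamp, currency, prices)
finally:
    driver.quit()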
Example #22
def scrape_followers(
    driver: webdriver,
    username: str,
    cookies: List[Dict[str,
                       Any]] = None) -> Tuple[str, str, Set[str], Set[str]]:
    # CSS Selector for followers and following lists
    list_css: str = "div[role='dialog'] a.notranslate"

    if cookies:
        # Load any page before setting cookies
        driver.get("https://www.instagram.com/data/manifest.json")
        for cookie in cookies:
            driver.add_cookie(cookie)

    # Load account page
    driver.get(f"https://www.instagram.com/{username}/")

    num_followers: str = driver.find_element_by_css_selector(
        "a[href*='followers'] span").text
    num_following: str = driver.find_element_by_css_selector(
        "a[href*='following'] span").text

    # Click the 'Followers' link
    driver.find_element_by_partial_link_text("followers").click()
    WebDriverWait(driver, 10).until(
        EC.visibility_of_all_elements_located((By.CSS_SELECTOR, list_css)))
    # TODO: Scrolling Magic here
    _followers: List = driver.find_elements_by_css_selector(list_css)
    followers: Set[str] = {i.text for i in _followers}

    driver.find_element_by_css_selector(
        "div[role='dialog'] button span[aria-label='Close']").click()

    # Click the 'Following' link
    driver.find_element_by_partial_link_text("following").click()
    WebDriverWait(driver, 10).until(
        EC.visibility_of_all_elements_located((By.CSS_SELECTOR, list_css)))
    # TODO: Scrolling Magic here
    _following: List = driver.find_elements_by_css_selector(list_css)
    following: Set[str] = {i.text for i in _following}

    return (num_followers, num_following, followers, following)
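
The cookies parameter expects the list-of-dicts shape produced by Selenium's own get_cookies(). A hedged sketch of capturing cookies from a logged-in session and feeding them back in (the pickle filename is illustrative):

import pickle
from typing import Any, Dict, List

from selenium import webdriver

def save_login_cookies(path: str = "instagram_cookies.pkl") -> None:
    # log in manually in the opened browser, then press Enter to dump the session cookies
    driver = webdriver.Firefox()
    driver.get("https://www.instagram.com/accounts/login/")
    input("Log in, then press Enter... ")
    with open(path, "wb") as f:
        pickle.dump(driver.get_cookies(), f)
    driver.quit()

def load_login_cookies(path: str = "instagram_cookies.pkl") -> List[Dict[str, Any]]:
    with open(path, "rb") as f:
        return pickle.load(f)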
Example #23
def get_img(driver: webdriver) -> List[str]:
    try:
        driver.implicitly_wait(5)
        imgs = driver.find_elements_by_css_selector(  # WebElements, not strings
            'div.smallImg>ul>li>a>img')
        src: List[str] = []
        for img in imgs:
            src.append(img.get_attribute('src').replace('_S', '_L'))
        return src
    except Exception as e:
        print('Can\'t get Images SRC List. Reason %s.' % e)
Example #24
    def get_img(self, driver: webdriver) -> List[str]:
        try:
            driver.implicitly_wait(5)
            imgs = driver.find_elements_by_css_selector(  # WebElements, not strings
                'body > div.page-popup.exhibited-vehicle > div.clfix > div.vehicle-photo-wrap > div.vehicle-thumbnail > ul > li > a > img'
            )
            src: List[str] = []
            for img in imgs:
                src.append(img.get_attribute('src'))
            return src
        except Exception as e:
            print("Can't get Images SRC List. Reason %s." % e)
Example #25
    def find_sizes(self, driver: webdriver) -> List[Size]:
        # pylint: disable=missing-function-docstring
        sizes = driver.find_elements_by_css_selector(
            '.size-list .product-size')
        obj_sizes = []
        for size in sizes:
            deque_size_types = collections.deque(
                size.get_attribute('data-name').split(' ('), 2)
            obj_size = Size(deque_size_types.popleft(),
                            self.clean_nubmer_size(deque_size_types.popleft()),
                            "disabled" in size.get_attribute("class"))
            obj_sizes.append(obj_size)
        return obj_sizes
Example #26
def fetch_image_urls_from_google(query: str, wd: webdriver, sleep_between_interactions: int = 1):
    """
    Fetches all the urls of images found on the first result page for received search query.

    :param query:                         query to search for
    :param wd:                            selenium web driver
    :param sleep_between_interactions:    time for browser to load photos
    :return:                              set of found urls
    """

    # Google search - large images
    search_url = "https://www.google.com/search?q={q}&tbm=isch&hl=en-US&hl=en-US&tbs=isz%3Al&client=ubuntu&hs=hdu&ved" \
                 "=0CAEQpwVqFwoTCKDZh9KqmOgCFQAAAAAdAAAAABAD&biw=1908&bih=955 "
    image_urls = set()

    # load the page
    wd.get(search_url.format(q=query))
    wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(sleep_between_interactions)
    # get all image thumbnail results
    thumbnails = wd.find_elements_by_css_selector("img.Q4LuWd")

    for thumbnail in thumbnails:
        try:
            # get big image from the thumbnail
            thumbnail.click()
            time.sleep(sleep_between_interactions)
        except Exception:
            continue

        # extract image urls
        images = wd.find_elements_by_css_selector('img.n3VNCb')
        for img in images:
            if img.get_attribute('src') and 'http' in img.get_attribute('src'):
                image_urls.add(img.get_attribute('src'))

    print(f"Found: {len(image_urls)} image links for the search query: {query}")

    return image_urls
Example #27
def get_neighborhood_state_zip(driver: webdriver) -> tuple:
    state_list = []
    zipcode_list = []
    neighborhood_list = []
    addresses_2 = driver.find_elements_by_css_selector('.bfg-gallery-address2')

    for t in addresses_2:
        t_list = t.text.split()
        neighborhood_list.append(t_list[0][:-1])
        state_list.append(t_list[1])
        zipcode_list.append(int(t_list[2]))

    return neighborhood_list, state_list, zipcode_list
Example #28
def launch_twitter(driver: webdriver):
    """
    :type driver: selenium.webdriver.firefox.webdriver.WebDriver
    """
    twitter_url = "https://twitter.com/search?f=tweets&vertical=default&q=discord.gg&src=unkn"
    driver.get(twitter_url)
    URLs = driver.find_elements_by_css_selector("li[data-item-id]")
    lurl = []

    for tweet in URLs:
        if tweet.find_elements_by_class_name("twitter-timeline-link"):
            linkr = tweet.find_element_by_class_name("twitter-timeline-link")
            text = linkr.get_attribute("href")
            lurl.append(text)
    return lurl
Example #29
def get_img_str(driver: webdriver) -> str:
    try:
        driver.implicitly_wait(5)
        imgs = driver.find_elements_by_css_selector(  # WebElements, not strings
            'div.smallImg>ul>li>a>img')
        img_str: str = ''
        for img in imgs:
            src_str = img.get_attribute('src')
            str_arr = str(src_str).split('/')
            str_arr.reverse()
            img_str += str_arr[0].replace(
                '_S',
                '_L') + '[:param:][alt=' + driver.find_element_by_css_selector(
                    'h1.vehicle-Tit'
                ).text + '][title=' + driver.find_element_by_css_selector(
                    'h1.vehicle-Tit').text + ']|'
        return img_str[:-1]
    except Exception as e:
        print('Can\'t get Images str. Reason %s' % e)
Example #30
    def retrieve_child_elements(self, web_driver, parent_element: im_webdriver,
                                target_identity: str, identity_type: int):
        """
        find child elements
        
        :args:
         - parent_element: parent element used to find child elements
         - target_identity: partial url of child elements
         - identity_type: type of identity
           0: href link
           1: class name

        :return:
         - child elements list
        """

        child_elements = []
        if identity_type == 0:
            child_elements = parent_element.find_elements_by_css_selector(
                'div[class="div_t"]>a')
        # identity_type == 1 (class name) is not handled in this snippet
        return child_elements