Example #1
0
def get_url(webdriver):
    """Collect resource URLs from the current page and return its anchor links.

    The ``src`` of every <script> and <img> element and the ``href`` of every
    <link> element are appended, in that order, to the module-level
    ``requests_url`` list.

    Parameters: the active webdriver, or None.

    Returns: a list holding the ``href`` attribute of every <a> element
    (each entry is whatever ``get_attribute`` returned), or None when
    ``webdriver`` is None.
    """
    global requests_url

    # Identity check: ``== None`` would defer to a custom ``__eq__``.
    if webdriver is None:
        return None

    def attribute_values(tag_name, attribute):
        elements = webdriver.find_elements_by_tag_name(tag_name)
        return [element.get_attribute(attribute) for element in elements]

    # The page is queried in the same order as before: img, script, link, a.
    img_urls = attribute_values('img', 'src')
    script_urls = attribute_values('script', 'src')
    link_urls = attribute_values('link', 'href')
    anchor_urls = attribute_values('a', 'href')

    requests_url.extend(script_urls + img_urls + link_urls)

    return anchor_urls
Example #2
0
def _find_first_element(finders, failure_message):
    """Return the first truthy element produced by ``finders``, else None.

    Each finder is a zero-argument callable. A finder that raises is reported
    with ``failure_message`` (formatted with the exception class name) and the
    next finder is tried.
    """
    for finder in finders:
        try:
            element = finder()
        except Exception as err:
            print(failure_message % err.__class__.__name__)
            print('### Retrying...')
            continue
        if element:
            return element
    return None


def search_trulia(neighborhood_code, webdriver):
    """Run a 'Buy' search for a Brooklyn neighborhood on the current page.

    Clicks the button labelled 'Buy', types
    '<neighborhood name>, Brooklyn, NY' into the search box and clicks the
    search button. Several locators are tried for the search box and the
    search button. If any of the three elements cannot be found, a message is
    printed and the function returns without searching.

    Parameters: a key of BK_NEIGHBORHOOD_CODES, the active webdriver

    Returns: None
    '''Raises''': KeyError if ``neighborhood_code`` is not in
    BK_NEIGHBORHOOD_CODES.
    """
    buy_button = None
    for button in webdriver.find_elements_by_tag_name('button'):
        if button.text == 'Buy':
            buy_button = button  # the last matching button wins

    if buy_button is None:
        # Previously this crashed with AttributeError on None.click().
        print('Could not find Buy button in page.')
        return

    buy_button.click()

    # Locators are plain callables; no need to eval() code strings.
    search_input = _find_first_element(
        (
            lambda: webdriver.find_element_by_id('homepageSearchBoxTextInput'),
            lambda: webdriver.find_element_by_xpath(
                "//input[@type='text'][@data-testid='location-search-input']"),
        ),
        '### Received %s finding search input element.',
    )
    if not search_input:
        print('Could not find search-input element in page.')
        return

    search_input.send_keys('%s, Brooklyn, NY' % BK_NEIGHBORHOOD_CODES[neighborhood_code])

    search_button = _find_first_element(
        (
            lambda: webdriver.find_element_by_css_selector(
                'button.addon.btn.btnDefault.homepageSearchButton'),
            lambda: webdriver.find_element_by_xpath(
                "//div[@data-testid='location-search-button'][@class='LocationAutosuggestInput__RightIcon-sc-789ttj-3 jtFvQn']"),
            lambda: webdriver.find_element_by_xpath(
                "//button[@data-auto-test-id='searchButton']"),
        ),
        '### Received %s finding search button.',
    )
    if not search_button:
        print('Could not find search button element.')
        return

    search_button.click()
Example #3
0
def get_picture_links(webdriver, total_posts):
    '''Collect the links on the current page whose URL contains '/p/'.

    The page is read once as loaded, then scrolled to the bottom
    (total posts // 12) times, re-reading the links after every scroll so
    that content loaded by the last scroll is included too. Profiles with
    fewer than 12 posts are therefore read once without scrolling.

    Parameters: the active webdriver, the total posts for that user as text
    whose first space-separated token is the count (e.g. '1,234 posts')

    Returns: a set of link URLs (strings)

    Raises: ValueError if the leading token of total_posts is not an integer
    '''
    num_posts = int(total_posts.split(' ')[0].replace(',', ''))
    scrolls = num_posts // 12  # one scroll per 12 posts, as before

    link_set = set()
    html = webdriver.find_element_by_tag_name('html')

    def collect_visible_links():
        for link in webdriver.find_elements_by_tag_name('a'):
            picture_link = link.get_attribute('href')
            # Anchors without an href yield None; skip them instead of crashing.
            if picture_link and '/p/' in picture_link:
                link_set.add(picture_link)

    collect_visible_links()
    for _ in range(scrolls):
        html.send_keys(Keys.END)
        time.sleep(3)  # fixed wait for the next batch to load
        collect_visible_links()

    return link_set
Example #4
0
def totals(webdriver):
    '''Read the post and follower totals from the current profile page.

    The text of every <li> element is read once; the first one containing
    'posts' and the first one containing 'followers' are returned unparsed.

    Parameters: the active webdriver

    Returns: tuple of (total posts text, total followers text)

    Raises: IndexError if no <li> text contains 'posts' or 'followers'
    '''
    item_texts = [item.text for item in webdriver.find_elements_by_tag_name('li')]

    total_posts = [text for text in item_texts if 'posts' in text][0]
    total_followers = [text for text in item_texts if 'followers' in text][0]

    # The documented return value was missing; callers received None.
    return total_posts, total_followers
def find_all(webdriver, by, css_selector_val):
    '''
    Find every element matching a locator by dispatching to the matching
    ``webdriver.find_elements_by_<by>`` method.

    Arguments
    ---------

    webdriver       -   selenium webdriver object to search with.
    by              -   locator name; one of: name, xpath, link_text,
                        partial_link_text, tag_name, class_name, css_selector.
                        Any other value makes the function return None.
    css_selector_val-   value handed to the chosen locator, e.g. 'contact'
                        when by is 'class_name' to find class="contact".

    Return
    ---------

    Whatever the selected ``find_elements_by_*`` call returns when ``by`` is
    a supported locator name, otherwise None.
    '''
    supported_locators = (
        'name',
        'xpath',
        'link_text',
        'partial_link_text',
        'tag_name',
        'class_name',
        'css_selector',
    )

    for locator in supported_locators:
        if by == locator:
            finder = getattr(webdriver, 'find_elements_by_' + locator)
            return finder(css_selector_val)

    return None
Example #6
0
    executable_path=r'Enter_your_path_for_geckodriver.exe',
    firefox_profile=firefox_profile)
# Fixed 3-second pause before navigating.
# NOTE(review): presumably gives the browser created above time to start - confirm.
sleep(3)

# Go To Login Page, then wait a fixed 3 seconds
webdriver.get('https://www.instagram.com/accounts/login/')
sleep(3)

# User Credentials: type the placeholder values into the fields named
# 'username' and 'password' (replace the placeholders before running)
username = webdriver.find_element_by_name('username')
username.send_keys('enter_your_username')
password = webdriver.find_element_by_name('password')
password.send_keys('enter_your_password')

# Click the third <button> on the page.
# NOTE(review): this runs right after the credentials are typed, so it is
# presumably the login/submit button rather than a 'not now' button; the
# index-based lookups here depend on the page layout - confirm.
buttons = webdriver.find_elements_by_tag_name('button')
buttons[2].click()
sleep(3)

# Look the buttons up again and click the second one.
# NOTE(review): presumably dismisses a follow-up prompt ('not now') - confirm.
buttons = webdriver.find_elements_by_tag_name('button')
buttons[1].click()
sleep(3)

# Click the element with class "aOOlW".
# NOTE(review): the variable name suggests another 'not now' prompt - confirm.
notnow = webdriver.find_element_by_class_name("aOOlW")
notnow.click()
sleep(3)

# Desired Hashtags
hashtag_list = ['travel', 'summer', 'design']
# NOTE(review): presumably an index into hashtag_list that is incremented
# before first use by code not shown here - confirm.
tag = -1
Example #7
0
# Type the password into the password field and submit with ENTER, then wait
# a fixed 2 seconds (passwordInput, password and browser are defined earlier).
passwordInput.send_keys(password)
passwordInput.send_keys(Keys.ENTER)
time.sleep(2)

# Open the profile page and click the first element matching 'ul li a'.
# NOTE(review): presumably that element is the followers link - confirm.
browser.get('https://www.instagram.com/include._')
followersLink = browser.find_element_by_css_selector('ul li a')
followersLink.click()
time.sleep(2)
# The <ul> inside the dialog; its <li> children are counted as followers.
followersList = browser.find_element_by_css_selector('div[role=\'dialog\'] ul')
numberOfFollowersInList = len(
    followersList.find_elements_by_css_selector('li'))

# Click the list, then build an action chain bound to the browser.
followersList.click()
actionChain = webdriver.ActionChains(browser)
# NOTE(review): this chain is built on `webdriver`, not `browser`. The line
# above uses `webdriver.ActionChains`, so `webdriver` looks like the selenium
# module, which has no find_elements_by_tag_name; move_to_element is also
# handed the list returned by find_elements_*. This statement likely fails at
# runtime - confirm the intended target element and driver.
ActionChains(webdriver).move_to_element(
    webdriver.find_elements_by_tag_name('button')).click().perform()
# Press and release SPACE until at least 1500 <li> entries are counted.
# NOTE(review): there is no exit condition other than the count, so this
# looks like it never terminates when fewer than 1500 entries load - confirm.
while (numberOfFollowersInList < 1500):
    actionChain.key_down(Keys.SPACE).key_up(Keys.SPACE).perform()
    numberOfFollowersInList = len(
        followersList.find_elements_by_css_selector('li'))

# Collect (and print) the href of the first <a> in each list entry, stopping
# once 1500 have been gathered.
followers = []
for user in followersList.find_elements_by_css_selector('li'):
    userLink = user.find_element_by_css_selector('a').get_attribute('href')
    print(userLink)
    followers.append(userLink)
    if (len(followers) == 1500):
        break
    else:
        # Runs after every appended entry except the 1500th.
        print("lst line")
Example #8
0
def fill_forms(email_producer, num_links, page_timeout, debug, visit_id,
               webdriver, proxy_queue, browser_params, manager_params,
               extension_socket, failfile, furl):
    """Find and submit a newsletter form, clicking internal links if needed.

    The current page is scanned first. If no form is submitted there, up to
    ``num_links`` rounds follow: each round ranks the displayed, unvisited,
    internal, non-blacklisted links against ``_LINK_TEXT_RANK``, clicks the
    highest-ranked one, and scans the resulting page (and any window other
    than the main one) for a form. The function returns as soon as a form is
    submitted. Otherwise ``furl`` is appended as a line to ``failfile``.

    ``page_timeout``, ``proxy_queue`` and ``extension_socket`` are accepted
    but not used in this function.

    Returns None in all cases. An error while examining links terminates the
    process via ``sys.exit(1)``.
    """
    # skipping: load the site
    # skipping: connecting to logger

    # try to find a newsletter form on the landing page
    if _find_and_fill_form(webdriver, email_producer, visit_id, debug,
                           browser_params, manager_params):
        return

    # otherwise, scan more pages
    print("couldn't find form, going to click around")
    main_handle = webdriver.current_window_handle
    visited_links = set()
    for _ in range(num_links):
        # get all links on the page
        links = webdriver.find_elements_by_tag_name('a')
        random.shuffle(links)

        current_url = webdriver.current_url
        current_ps1 = domain_utils.get_ps_plus_1(current_url)

        # find links to click
        match_links = []
        start_time = timeit.default_timer()
        for link in links:
            try:
                if not link.is_displayed():
                    continue

                # check if link is valid and not already visited
                href = link.get_attribute('href')
                if href is None or href in visited_links:
                    continue

                # check if this is an internal link
                if not _is_internal_link(href, current_url, current_ps1):
                    continue

                link_text = link.text.lower()

                # skip links with blacklisted text
                if any(bl_text in link_text
                       for bl_text in _LINK_TEXT_BLACKLIST):
                    continue

                # should we click this link?
                link_rank = 0
                for match_type, s, rank, flags in _LINK_TEXT_RANK:
                    if (match_type == _TYPE_TEXT and s in link_text) or (
                            match_type == _TYPE_HREF and s in href):
                        if flags & _FLAG_IN_NEW_URL_ONLY:
                            # don't use this link if the current page URL
                            # already matches too
                            if match_type == _TYPE_HREF and s in current_url:
                                continue

                        # link matches!
                        link_rank = rank
                        match_links.append(
                            (link, rank, link_text, href, flags))
                        break
                if link_rank >= _LINK_RANK_SKIP:  # good enough, stop looking
                    break
            except Exception:
                # Exception (not a bare except) so Ctrl-C / SystemExit are
                # not relabelled as a link-loop error.
                print("ERROR while looping through links...")
                sys.exit(1)

            # quit if too much time passed (for some reason, this is really
            # slow...); only reached for links that were not skipped above
            if match_links and timeit.default_timer(
            ) - start_time > _LINK_MATCH_TIMEOUT:
                break

        # find the best link to click
        if not match_links:
            break  # no more links to click
        # stable sort: among equal ranks the most recently matched link wins
        match_links.sort(key=lambda l: l[1])
        next_link = match_links[-1]
        visited_links.add(next_link[3])

        # click the link
        try:
            # load the page
            print("clicking on link '%s' - %s" % (next_link[2], next_link[3]))
            next_link[0].click()
            time.sleep(_PAGE_LOAD_TIME)
            wait_until_loaded(webdriver, _PAGE_LOAD_TIME)
            # if browser_params['bot_mitigation']:
            #     bot_mitigation(webdriver)

            # find newsletter form
            if _find_and_fill_form(webdriver, email_producer, visit_id, debug,
                                   browser_params, manager_params):
                return

            # should we stay on this page?
            if next_link[4] & _FLAG_STAY_ON_PAGE:
                continue

            # go back
            webdriver.back()
            wait_until_loaded(webdriver, _PAGE_LOAD_TIME)

            # check other windows (ex. pop-ups)
            windows = webdriver.window_handles
            if len(windows) > 1:
                form_found_in_popup = False
                for window in windows:
                    if window != main_handle:
                        webdriver.switch_to_window(window)
                        wait_until_loaded(webdriver, _PAGE_LOAD_TIME)

                        # find newsletter form
                        if _find_and_fill_form(webdriver, email_producer,
                                               visit_id, debug, browser_params,
                                               manager_params):
                            form_found_in_popup = True

                        webdriver.close()
                webdriver.switch_to_window(main_handle)
                time.sleep(1)

                if form_found_in_popup:
                    return
        except Exception as err:
            # Best effort: a failed visit must not abort the crawl, but it is
            # reported, and KeyboardInterrupt/SystemExit now propagate.
            print("ERROR while visiting link: %s" % err.__class__.__name__)

    # if you reach here, signup wasn't successful -- save the information
    with open(failfile, 'a') as wh:
        wh.write(furl + '\n')
Example #9
0
def _find_and_fill_form(webdriver, email_producer, visit_id, debug,
                        browser_params, manager_params):
    """Find a newsletter form, fill and submit it, then handle a follow-up.

    The form is looked for on the current page and, failing that, inside each
    <iframe> in turn. After submitting it, one follow-up form is looked for:
    first in every window other than the main one (those windows are closed),
    then on the current page. When ``debug`` is true the page source is dumped
    before the initial and before the follow-up submission, under separate
    names prefixed with ``visit_id``.

    ``email_producer`` is called with the current URL and the ASCII-encoded
    page title and must return the email address to sign up with.

    Returns True if an initial form was found and submitted, False otherwise.
    """
    current_url = webdriver.current_url
    current_site_title = webdriver.title.encode('ascii', 'replace')
    main_handle = webdriver.current_window_handle
    in_iframe = False

    # debug: pre-submit screenshot names and page source dump names
    debug_file_prefix = str(visit_id) + '_'
    debug_form_pre_initial = debug_file_prefix + 'form_initial_presubmit'
    debug_form_pre_followup = debug_file_prefix + 'form_followup_presubmit'
    debug_page_source_initial = debug_file_prefix + 'src_initial'
    debug_page_source_followup = debug_file_prefix + 'src_followup'
    # NOTE: post-submit screenshots (save_screenshot) are currently disabled.

    # try to find newsletter form on landing page
    newsletter_form = _find_newsletter_form(webdriver)
    if newsletter_form is None:
        # search for forms in iframes (if present)
        for iframe in webdriver.find_elements_by_tag_name('iframe'):
            # switch to the iframe
            webdriver.switch_to_frame(iframe)

            # is there a form?
            newsletter_form = _find_newsletter_form(webdriver)
            if newsletter_form is not None:
                if debug:
                    dump_page_source(debug_page_source_initial, webdriver,
                                     _SRC_DUMP_PATH)
                in_iframe = True
                break  # form found, stay on the iframe

            # switch back
            webdriver.switch_to_default_content()

        # still no form?
        if newsletter_form is None:
            return False
    elif debug:
        dump_page_source(debug_page_source_initial, webdriver, _SRC_DUMP_PATH)

    email = email_producer(current_url, current_site_title)
    user_info = _get_user_info(email)
    _form_fill_and_submit(newsletter_form, user_info, webdriver, False,
                          browser_params, manager_params,
                          debug_form_pre_initial if debug else None)
    print('submitted form on [%s] with email [%s]' % (current_url, email))
    time.sleep(_FORM_SUBMIT_SLEEP)
    _dismiss_alert(webdriver)

    def submit_follow_up(form):
        # Shared by the pop-up and current-page cases below.
        if debug:
            # Dump under the follow-up name; using the initial name here
            # overwrote the initial page source dump.
            dump_page_source(debug_page_source_followup, webdriver,
                             _SRC_DUMP_PATH)
        _form_fill_and_submit(form, user_info, webdriver, True,
                              browser_params, manager_params,
                              debug_form_pre_followup if debug else None)
        time.sleep(_FORM_SUBMIT_SLEEP)
        _dismiss_alert(webdriver)

    # fill any follow-up forms...
    wait_until_loaded(webdriver, _PAGE_LOAD_TIME)  # wait if we got redirected
    follow_up_form = None

    # first check other windows (ex. pop-ups)
    windows = webdriver.window_handles
    if len(windows) > 1:
        for window in windows:
            if window != main_handle:
                webdriver.switch_to_window(window)

                # only the first follow-up form found is submitted
                if follow_up_form is None:
                    follow_up_form = _find_newsletter_form(webdriver)
                    if follow_up_form is not None:
                        submit_follow_up(follow_up_form)

                webdriver.close()
        webdriver.switch_to_window(main_handle)
        time.sleep(1)

    # else check current page
    if follow_up_form is None:
        follow_up_form = _find_newsletter_form(webdriver)
        if follow_up_form is not None:
            submit_follow_up(follow_up_form)

    # switch back
    if in_iframe:
        webdriver.switch_to_default_content()

    # close other windows (ex. pop-ups)
    windows = webdriver.window_handles
    if len(windows) > 1:
        for window in windows:
            if window != main_handle:
                webdriver.switch_to_window(window)
                webdriver.close()
        webdriver.switch_to_window(main_handle)
        time.sleep(1)

    return True
Example #10
0
def _find_newsletter_form(webdriver):
    """Try to find the element to use as the newsletter sign-up form.

    Displayed <form> elements whose HTML contains an email keyword and which
    hold a displayed email input are ranked by (z-index, modal/dialog text,
    fewer login keywords, number of displayed inputs); the best one is
    returned. If no <form> qualifies, the ancestors of each displayed email
    input (up to ``_FORM_CONTAINER_SEARCH_LIMIT`` levels) are searched for a
    <div> or <span> that has a submit button.

    Returns the chosen form/container element, or None if nothing was found.
    """
    # find all forms that match
    newsletter_forms = []
    for form in webdriver.find_elements_by_tag_name('form'):
        if not form.is_displayed():
            continue

        # find email keywords in the form HTML (preliminary filtering)
        form_html = form.get_attribute('outerHTML').lower()
        if not any(s in form_html for s in _KEYWORDS_EMAIL):
            continue

        # check if an input field contains an email element
        input_fields = form.find_elements_by_tag_name('input')
        if not any(input_field.is_displayed() and _is_email_input(input_field)
                   for input_field in input_fields):
            continue

        # form matched, get some other ranking criteria:
        # - rank modal/pop-up/dialogs higher, since these are likely to be
        #   sign-up forms
        z_index = _get_z_index(form, webdriver)
        has_modal_text = 'modal' in form_html or 'dialog' in form_html
        # - rank login dialogs lower, in case better forms exist
        #   (count occurrences of these keywords, since they might just be
        #   in a URL)
        login_text_count = -sum(
            form_html.count(s) for s in ('login', 'log in', 'sign in'))
        # - rank forms with more input elements higher
        input_field_count = len([x for x in input_fields if x.is_displayed()])
        newsletter_forms.append((form, (z_index, int(has_modal_text),
                                        login_text_count, input_field_count)))

    # return highest ranked form
    if newsletter_forms:
        newsletter_forms.sort(key=lambda x: x[1], reverse=True)
        return newsletter_forms[0][0]

    # try to find any container with email input fields and a submit button
    visited_containers = set()
    for input_field in webdriver.find_elements_by_tag_name('input'):
        if not input_field.is_displayed() or not _is_email_input(input_field):
            continue

        # email input field found, check parents for container with a
        # submit button
        try:
            e = input_field
            for _ in range(_FORM_CONTAINER_SEARCH_LIMIT):
                e = e.find_element_by_xpath('..')  # get parent
                if e is None or e.id in visited_containers:
                    continue  # already visited

                # is this a container type? (<div> or <span>)
                if e.tag_name.lower() in ('div', 'span'):
                    # does this contain a submit button?
                    if _has_submit_button(e):
                        return e  # yes, we're done

                visited_containers.add(e.id)
        except Exception:
            # Best effort: a failure while walking up from this input just
            # moves on to the next input. Exception (not a bare except) lets
            # KeyboardInterrupt/SystemExit propagate.
            pass

    # still no matches?
    return None
Example #11
0
# 2.2 ------- 18 common ways to locate elements ------
# Locate a single element: 8 ways
webdriver.find_element_by_id("")
webdriver.find_element_by_name("")
webdriver.find_element_by_class_name("")
webdriver.find_element_by_tag_name("")
webdriver.find_element_by_link_text("")
webdriver.find_element_by_partial_link_text("")
webdriver.find_element_by_css_selector("")
webdriver.find_element_by_xpath("")
# Locate a group of elements (returns a list of objects): 8 ways
webdriver.find_elements_by_id("")  # plural lookup by id
webdriver.find_elements_by_name("")  # plural lookup by name
webdriver.find_elements_by_class_name("")  # plural lookup by class
webdriver.find_elements_by_tag_name("")  # plural lookup by tag
webdriver.find_elements_by_link_text("")  # plural lookup by link text
webdriver.find_elements_by_partial_link_text("")  # plural lookup by partial link text
webdriver.find_elements_by_css_selector("")  # plural lookup by css_selector
webdriver.find_elements_by_xpath("")  # plural lookup by xpath
# These two are the parameterized methods, generalizing the 8 of each kind above
webdriver.find_element(by='id', value="")
webdriver.find_elements(by='id', value="")
'''
1、xpath语法:
指明标签 //*或者 //input[@id="kw"]
根据@属性定位,id name class 或其他属性
逻辑运算and or not,用的最多的是and,同时满足两个属性,//*[@id="kw" and @name="aa"]
层级定位/,
索引定位,从1开始,input[1],多个相同标签,用索引定位
模糊匹配:标签对之间的文本信息的模糊匹配//*[contains(text(),"hao123)]