Ejemplo n.º 1
0
def getFormItems(driver: webdriver):
    """Describe every <input> and <select> element found by the driver.

    Each element becomes a dict with the keys ``label``, ``id``, ``name``,
    ``type``, ``value``, ``location_x`` and ``location_y``.  All inputs are
    listed first, followed by all selects, each group in the order the
    driver returned it.

    :param driver: WebDriver used to look the elements up.
    :return: list of descriptor dicts (empty when nothing is found).
    """
    described = []

    for field in driver.find_elements_by_tag_name('input'):
        entry = {
            'label': getLabel(field),
            'id': field.get_attribute('id'),
            'name': field.get_attribute('name'),
            'type': field.get_attribute('type'),
            'value': field.get_attribute('value'),
        }
        # Read the location once and split it into its two coordinates.
        position = field.location
        entry['location_x'] = position['x']
        entry['location_y'] = position['y']
        described.append(entry)

    for dropdown in driver.find_elements_by_tag_name('select'):
        entry = {
            'label': getLabel(dropdown),
            'id': dropdown.get_attribute('id'),
            'name': dropdown.get_attribute('name'),
            # Selects are tagged with a fixed type instead of the attribute.
            'type': 'select',
            # For selects the "value" is whatever getSelectOptions returns.
            'value': getSelectOptions(dropdown),
        }
        position = dropdown.location
        entry['location_x'] = position['x']
        entry['location_y'] = position['y']
        described.append(entry)

    return described
Ejemplo n.º 2
0
def get_image_links(driver: webdriver, site: str) -> Links:
    """Return the image URLs found on the page currently loaded in *driver*.

    Two extraction strategies are supported, selected by *site*:

    * ``'comics'`` - the URLs are embedded in inline scripts as
      ``lstImages.push("<url>")`` calls; every match in every ``<script>``
      element is collected, in document order.
    * ``'manga'`` - the URLs are the ``src`` attributes of the ``<img>``
      elements carrying ``onerror="onErrorImg(this)"``.  These are read
      through the driver rather than from raw HTML.

    Any other *site* value yields an empty list.

    :param driver: WebDriver with the target page already loaded.
    :param site: ``'comics'`` or ``'manga'``.
    :return: flat list of image links.
    """
    image_links = []
    if site == 'comics':
        # Raw string with an escaped dot so only a literal "lstImages.push("
        # is matched.
        pattern = re.compile(r'lstImages\.push\("(.*?)"')

        for script in driver.find_elements_by_tag_name('script'):
            # innerHTML may come back as None; treat that as an empty script.
            source = script.get_attribute('innerHTML') or ''
            # extend (not append) keeps the result flat even when scripts
            # contribute different numbers of links.
            image_links.extend(pattern.findall(source))

    elif site == 'manga':
        elements = driver.find_elements_by_xpath(
            '//img[@onerror="onErrorImg(this)"]')
        image_links.extend(elem.get_attribute('src') for elem in elements)

    print(image_links)
    return image_links
Ejemplo n.º 3
0
def execSearch(browser: webdriver, inifile):
    """Run a series of searches using words scraped from a trend page.

    The trend page's ``<h1>`` texts are used as search words; the first two
    headings are discarded.  The next word is used to open the search page,
    from which the login form is reached and submitted.  Every remaining word
    is then typed into the search box and submitted, with a pause of
    ``_getRandomNum()`` seconds after the login and after each search.

    :param browser: WebDriver that performs the navigation.
    :param inifile: configuration handle passed through to ``_get``.
    """
    # Extract the search words.
    browser.get(_get(inifile, 'search', 'url_trend'))
    searchWords = [
        heading.text for heading in browser.find_elements_by_tag_name('h1')
    ]
    # The first two headings are not used as search words.
    searchWords.pop(0)
    searchWords.pop(0)

    # Press the login button (shared by search and "Mail DE Point").
    first_search_url = (_get(inifile, 'search', 'url') + '/Web?qt=' +
                        searchWords[0])
    browser.get(first_search_url)
    searchWords.pop(0)
    login_link_text = _get(inifile, 'search', 'login_context')
    browser.find_element_by_link_text(login_link_text).click()

    user_field = browser.find_element_by_name(_get(inifile, 'user', 'id_name'))
    user_field.send_keys(_get(inifile, 'user', 'id'))
    password_field = browser.find_element_by_name(
        _get(inifile, 'user', 'pass_name'))
    password_field.send_keys(_get(inifile, 'user', 'pass'))
    browser.find_element_by_name("submit").click()
    sleep(_getRandomNum())

    # Process the remaining search words one at a time.
    for word in searchWords:
        query_box = browser.find_element_by_name("qt")
        query_box.clear()
        query_box.send_keys(word)
        browser.find_element_by_id('searchBtn').click()
        sleep(_getRandomNum())
Ejemplo n.º 4
0
def collectLinks(driver: webdriver):
    """Return one dict per <a> element found by the driver.

    Each dict holds the element's ``location``, its visible ``text`` and the
    value of its ``href`` attribute, in the order the driver returned the
    elements.

    :param driver: WebDriver used to look the anchors up.
    :return: list of ``{'location', 'text', 'href'}`` dicts.
    """
    return [
        {
            'location': anchor.location,
            'text': anchor.text,
            'href': anchor.get_attribute('href'),
        }
        for anchor in driver.find_elements_by_tag_name('a')
    ]
Ejemplo n.º 5
0
    def click_a_element_by_languagecode_name(self, browser: webdriver,
                                             name: str):
        """Click the first <a> whose ``languagecode`` attribute equals *name*.

        Anchors are examined in the order the driver returns them; only the
        first match is clicked and nothing happens when no anchor matches.

        :param browser: WebDriver used to look the anchors up.
        :param name: value the ``languagecode`` attribute must equal.
        """
        anchors = browser.find_elements_by_tag_name('a')
        # The generator stops at the first match, so later anchors are not
        # inspected.
        target = next(
            (anchor for anchor in anchors
             if anchor.get_attribute("languagecode") == name),
            None)
        if target is not None:
            target.click()
Ejemplo n.º 6
0
def getCourseList(driver: webdriver) -> []:
    """Ask the user which of the listed courses they are enrolled in.

    For every ``<li>`` element the course key is taken from the class of the
    item's first ``<div>`` (the part after the first ``-``) and the course
    name from the ``<div>`` inside the item's ``<h2>``.  The user is prompted
    on standard output and answers on standard input; an answer of ``y`` in
    either case selects the course.

    :param driver: WebDriver used to look the list items up.
    :return: list of ``[course_url, course_name]`` pairs for selected courses.
    """
    selected = []
    for item in driver.find_elements_by_tag_name('li'):
        css_class = item.find_element_by_tag_name('div').get_attribute("class")
        course_key = css_class.split('-')[1]
        heading = item.find_element_by_tag_name('h2')
        name = heading.find_element_by_tag_name('div').text
        print("Are you currently enrolled in", name, "(Y/N)? ", end='')
        answer = input()
        if answer.lower() == 'y':
            course_url = "https://classroom.google.com/u/0/c/" + course_key
            selected.append([course_url, name])
    return selected
Ejemplo n.º 7
0
def download_images(driver: webdriver) -> list:
    """Download every <img> on the page except the first into ``images/``.

    The first image returned by the driver is skipped.  Each remaining image
    is saved as ``images/<fragment><n>.png`` where ``<fragment>`` is the
    ``src`` with its first 69 and last 4 characters removed and ``<n>`` is
    the image's 1-based position among the non-skipped images.  Images
    without a ``src`` are ignored instead of aborting the whole download.

    NOTE(review): the offset 69 presumably strips a fixed URL prefix of the
    site this was written for - confirm before reusing elsewhere.  The
    ``images`` directory is not created here and must already exist.

    :param driver: WebDriver used to look the images up.
    :return: list of the local file names that were written.
    """
    saved_files = []
    remaining = driver.find_elements_by_tag_name('img')[1:]
    for position, image in enumerate(remaining, start=1):
        # Ask the driver for the attribute once instead of four times.
        src = image.get_attribute('src')
        if not src:
            # No URL to fetch; the position counter still advances.
            continue
        # TODO: change naming method
        name = "images/%s%d.png" % (src[69:-4], position)
        urllib.request.urlretrieve(src, name)
        saved_files.append(name)
    return saved_files
Ejemplo n.º 8
0
def removeKataomoi(browser: webdriver, safe_accounts):
    """Log in to kataomoi.net and click through its one-way-follow list.

    The function opens the site's redirect page.  If that lands on Twitter's
    OAuth authorisation page the "allow" button is clicked; otherwise the
    login form is filled with ``USER_NAME`` / ``PASSWORD`` and submitted.
    It then opens the one-way-follow page and walks the table rows from last
    to first.  For every row whose account name (the link text in the second
    cell) is not in *safe_accounts*, the name is printed and the row's first
    ``<span>`` is clicked (presumably the unfollow control - confirm against
    the page markup).

    The module-level ``remain_remove_count`` is decremented for each click,
    and processing stops once it reaches zero.

    :param browser: WebDriver that performs the navigation.
    :param safe_accounts: container of account names that must not be touched.
    """
    global remain_remove_count

    # Open the site; the redirect page forwards to Twitter.
    browser.get('http://kataomoi.net/redirect.php')
    sleep(1)

    is_confirm = browser.current_url.startswith(
        "https://api.twitter.com/oauth/authorize?oauth_token")

    if is_confirm:
        browser.find_element_by_id("allow").click()
        sleep(1)
    else:
        # Enter the login credentials.
        username_or_email = browser.find_element_by_xpath(
            "//*[@id='username_or_email']")
        username_or_email.send_keys(USER_NAME)
        password = browser.find_element_by_xpath("//*[@id='password']")
        password.send_keys(PASSWORD)
        # Log in.
        password.submit()
        sleep(1)

    browser.get('http://kataomoi.net/find_one_way.php')
    sleep(1)

    rows = browser.find_elements_by_tag_name("tr")

    print('▼フォロー解除中…▼')
    # Walk the rows in reverse so that the oldest entries come first.
    for row in reversed(rows):
        if remain_remove_count <= 0:
            break
        cells = row.find_elements_by_tag_name("td")
        if len(cells) <= 1:
            # Rows without a second cell carry no account link.
            continue
        # Look the name up once so the check and the log line cannot disagree.
        account = cells[1].find_element_by_tag_name("a").text
        if account in safe_accounts:
            continue
        print(account)
        row.find_elements_by_tag_name("span")[0].click()
        remain_remove_count -= 1
        sleep(0.5)
Ejemplo n.º 9
0
def scrape_once(driver: webdriver, save_into_dict: bool, tagDict=None):
    """Scrape hashtags from the posts linked on Instagram's explore page.

    The explore page is loaded in *driver*; every link whose ``href`` ends
    with ``?explore=true`` is fetched with ``requests`` and parsed for
    ``instapp:hashtags`` meta properties.  Each hashtag found is passed to
    ``put_tag`` together with the time at which its post was fetched.  A
    summary line with the number of tags and posts is printed at the end.

    :param driver: WebDriver used to load the explore page.
    :param save_into_dict: when true, also count every hashtag in *tagDict*.
    :param tagDict: mapping of hashtag -> count, updated in place.  When
        omitted, a throw-away dict is used so calls never share state.
    """
    if tagDict is None:
        tagDict = {}

    tagCount = 0
    postCount = 0
    driver.get('https://www.instagram.com/explore')
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.TAG_NAME, 'article')))

    for link in driver.find_elements_by_tag_name('a'):
        href = link.get_attribute('href')
        # Anchors without an href yield None, which has no endswith().
        if not href or not href.endswith('?explore=true'):
            continue

        media_html = requests.get(href).text
        soup = BeautifulSoup(media_html, 'lxml')
        timestamp: float = time.time()
        for tagElement in soup.findAll(attrs={"property": "instapp:hashtags"}):
            tag = tagElement.get('content')
            if save_into_dict:
                # .get() lets tags that were not seen before start at zero.
                tagDict[tag] = tagDict.get(tag, 0) + 1
            put_tag(tag, timestamp)
            tagCount += 1
        postCount += 1

    print('%d tags processed in %d posts' % (tagCount, postCount))
Ejemplo n.º 10
0
    def __duo_authenticator(driver: webdriver, duo_bypass: str):
        """Complete the Duo step of the auth flow with a bypass code.

        When the page contains at least one iframe, the first one is entered,
        the passcode option is chosen, *duo_bypass* is typed into the passcode
        field and the "Log In" button is pressed, followed by a two-second
        pause.  When there is no iframe nothing beyond logging is done.

        :param driver: WebDriver positioned on the Duo prompt page.
        :param duo_bypass: bypass code to enter.
        """
        logger.info('Authenticating with Duo')
        frames = driver.find_elements_by_tag_name("iframe")

        if not frames:
            # No iframe found, so there is nothing to fill in.
            return

        # An iframe is present: Duo needs to be authenticated.
        logger.info('Authenticating with Duo bypass code...')
        driver.switch_to.frame(frames[0])

        driver.find_element_by_xpath("//button[@id='passcode']").click()

        code_input = driver.find_element_by_xpath(
            "//input[@class='passcode-input']")
        code_input.clear()
        code_input.send_keys(duo_bypass)

        driver.find_element_by_xpath('//button[text()="Log In"]').click()
        logger.info('Successfully authenticated with Duo bypass code')
        time.sleep(2)

        return
Ejemplo n.º 11
0
    def fetch_image_urls(self,
                         query: str,
                         wd: webdriver,
                         sleep_between_interactions: int = 1,
                         max_timeout=5,
                         imgs_offset=5):
        """Collect image URLs for *query* from a Google Images result page.

        The result page is scrolled and its thumbnail container scanned.  For
        every link in the container the ``href`` is recorded as a reference
        URL; thumbnails of at least 60x60 are clicked, the link's (possibly
        updated) ``href`` is opened in a second window, and the ``src`` of
        the first image of at least 60x60 on that page is recorded as an
        image URL.  Scanning repeats, pressing the "load more" control
        between passes, until ``self.imgs2download + imgs_offset`` image URLs
        have been collected or no "load more" control is present.

        NOTE(review): the ids/classes ``islrg``, ``islrc`` and ``.mye4qd``
        are taken as-is from the page markup this was written against -
        confirm they still match.

        :param query: search text, inserted into the search URL unmodified.
        :param wd: WebDriver used for all browsing.
        :param sleep_between_interactions: seconds to pause after scrolling,
            clicking and page loads.
        :param max_timeout: seconds to wait for expected elements.
        :param imgs_offset: number of URLs to collect beyond
            ``self.imgs2download``.
        :return: tuple ``(image_urls, reference_imgs_urls)``.
        """
        min_side = 60  # anything smaller in either dimension is skipped
        wanted = self.imgs2download + imgs_offset

        def scroll_to_end(wd):
            wd.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(sleep_between_interactions)

        def is_large_enough(img):
            # Width and height are read separately; both must reach min_side.
            width = int(img.get_attribute("width"))
            height = int(img.get_attribute("height"))
            return width >= min_side and height >= min_side

        def close_extra_windows():
            # close() acts on the current window, so switch to each extra
            # window before closing it, then return to the first one.
            for handle in wd.window_handles[1:]:
                wd.switch_to_window(handle)
                wd.close()
            wd.switch_to_window(wd.window_handles[0])

        # build the google query
        search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

        # load the page
        wd.get(search_url.format(q=query))

        image_urls = []
        reference_imgs_urls = []
        wait = WebDriverWait(wd, max_timeout)
        results_start = 0
        while len(image_urls) < wanted:
            scroll_to_end(wd)
            # locate the thumbnail container and all divs below it
            wait.until(EC.presence_of_element_located((By.ID, "islrg")))
            thumbnail_div = wd.find_element_by_id('islrg')
            WebDriverWait(thumbnail_div, max_timeout).until(
                EC.presence_of_element_located((By.CLASS_NAME, "islrc")))
            thumbnail_div = thumbnail_div.find_elements_by_class_name(
                'islrc')[0]
            WebDriverWait(thumbnail_div, max_timeout).until(
                EC.presence_of_element_located((By.TAG_NAME, "div")))
            div_with_link_img = thumbnail_div.find_elements_by_tag_name('div')
            n_found_divs = len(div_with_link_img)

            # only look at divs that were not handled in an earlier pass
            for div_of_img in div_with_link_img[results_start:n_found_divs]:
                if len(image_urls) >= wanted:
                    break
                try:
                    wd.switch_to_window(wd.window_handles[0])
                    for pos_link in div_of_img.find_elements_by_tag_name('a'):
                        possible_imgs = pos_link.find_elements_by_tag_name(
                            'img')
                        p_links = pos_link.get_attribute("href")
                        print("PL: ", p_links)
                        if p_links is not None:
                            reference_imgs_urls.append(p_links)

                        # Click on imgs in order to let the link appear.
                        # Track clicks with a flag instead of removing items
                        # from the list that is being iterated.
                        clicked_any = False
                        for img in possible_imgs:
                            if not is_large_enough(img):
                                continue
                            img.click()
                            # wait until the click has had effect
                            time.sleep(sleep_between_interactions)
                            clicked_any = True
                        if not clicked_any:
                            continue

                        # re-read the href: it is fetched again after the
                        # click on purpose
                        new_img_url = pos_link.get_attribute("href")
                        if new_img_url is None:
                            continue

                        wd.execute_script("window.open()")
                        wd.switch_to_window(wd.window_handles[1])
                        wd.get(new_img_url)
                        # wait until the new page has loaded
                        time.sleep(sleep_between_interactions)
                        wait.until(
                            EC.presence_of_element_located(
                                (By.TAG_NAME, "img")))
                        for big_img in wd.find_elements_by_tag_name('img'):
                            if is_large_enough(big_img):
                                big_img_src = big_img.get_attribute("src")
                                print("IMG:", big_img_src)
                                image_urls.append(big_img_src)
                                break
                        wd.close()
                        wd.switch_to_window(wd.window_handles[0])
                except Exception as e:
                    # best effort: skip this div and tidy up the windows
                    print(f"ERROR - {e} (continue ...)")
                    close_extra_windows()

            # LOAD MORE
            # Press load button if not enough imgs
            if len(image_urls) >= wanted:
                print(f"Found: {len(image_urls)} image links, done!")
                break

            print("Found:", len(image_urls),
                  "image links, looking for more ...")
            # The plural lookup returns an empty list instead of raising, so
            # a missing button ends the search with what was collected.
            if not wd.find_elements_by_css_selector(".mye4qd"):
                break
            wd.execute_script("document.querySelector('.mye4qd').click();")
            # move the result startpoint further down
            results_start = n_found_divs

        return image_urls, reference_imgs_urls