# Ejemplo n.º 1
def main():
    """Scroll a podcast page to trigger lazy loading, then scrape episode
    titles and direct .mp3 links from the rendered HTML.

    Prints each title/link as it is found, then the collected lists.
    """
    options = EdgeOptions()
    options.use_chromium = True
    driver = Edge(options=options)

    url = 'https://podbay.fm/p/sach-noi-danh-cho-ban/'
    driver.get(url)

    # Press END on <body> a few times so the infinite-scroll list fetches
    # more episodes before we snapshot the page source.
    body = driver.find_element_by_tag_name('body')
    for _ in range(4):
        body.send_keys("webdriver" + Keys.END)
        sleep(3)

    titles = []
    urls = []

    soup = BeautifulSoup(driver.page_source, 'lxml')
    # `find_all` is the modern bs4 spelling; `findAll` is a legacy alias.
    for a in soup.find_all('a', href=True, attrs={'class': 'jsx-1043497740'}):
        title = a.string
        if title is not None:
            print(title)
            titles.append(title)

        link = a.get('href')
        if link.endswith(".mp3"):
            print(link)
            urls.append(link)

    driver.close()
    print(titles)
    print(urls)
# Ejemplo n.º 2
def checkEmailBreached():
    """Prompt for an email address and look it up on haveibeenpwned.

    Loads the unifiedsearch endpoint in Edge (to get past the site's bot
    protection), extracts the JSON payload from the rendered page and prints
    any breaches and public pastes found.
    """
    email = input(default_color + "Email Address> " + reset)
    print()
    url = f"https://haveibeenpwned.com/unifiedsearch/{email}"

    options = EdgeOptions()
    options.use_chromium = True
    options.add_argument('ignore-certificate-errors')
    options.add_argument("--log-level=OFF")
    driver = Edge(options=options,
                  executable_path=r"WebDriver\msedgedriver.exe")
    driver.get(url)
    try:
        # The JSON body is rendered inside a <pre>; strip the HTML shell
        # around it. A missing payload raises IndexError here.
        dirty_response = \
            driver.page_source.split(
                '<html><head></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">')[1]

        raw_json = dirty_response.split("</pre></body></html>")[0]

        res = json.loads(raw_json)

        print(heading_color +
              "---------------------Check For Breached Email--------------" +
              reset)

        for i in range(len(res['Breaches'])):
            print(
                content_color + f"Name        : {res['Breaches'][i]['Name']}\n"
                f"Title       : {res['Breaches'][i]['Title']}\n"
                f"Domain      : {res['Breaches'][i]['Domain']}\n"
                f"Breached On : {res['Breaches'][i]['BreachDate']}\n{reset}"
                f"{heading_color}--------------------------------------\n{reset}"
            )
        if res['Pastes'] is None:
            print(fg("red") + "[*] No Public Paste Found" + reset)
        else:
            print(heading_color + "[*] Public Paste Found\n" + reset)
            pastes = res["Pastes"]
            for i in range(len(pastes)):
                print(content_color + f"Source     : {pastes[i]['Source']}\n"
                      f"Title      : {pastes[i]['Title']}\n"
                      f"Date       : {pastes[i]['Date']}\n"
                      f"EmailCount : {pastes[i]['EmailCount']}\n")
                if pastes[i]['Source'] == "Pastebin":
                    print(
                        f"Paste URL  : https://pastebin.com/{pastes[i]['Id']}\n{reset}"
                        f"{heading_color}----------------------------------------------------{reset}"
                    )
    except Exception:
        # BUG FIX: was a bare `except:` (also swallowed SystemExit /
        # KeyboardInterrupt). IndexError/JSONDecodeError here means the site
        # returned no breach payload for this address.
        print(
            fg("red") + f"[*] The provided Email {email} is not breached!" +
            reset)
    finally:
        # BUG FIX: the browser was leaked on the failure path.
        driver.close()
# Ejemplo n.º 3
def chinahpo(hpo):
    """Fetch the chinahpo.org search page for one HPO term (e.g. "HP:0000001")
    through a random proxy and save the rendered HTML to html2/hp_<id>.html.

    The finished term is appended to finish.txt so interrupted runs can be
    resumed. When using a proxy pool, no random delay is inserted.
    """
    ip = randomIP()
    print("使用IP " + ip)
    options = EdgeOptions()
    options.use_chromium = True
    options.add_argument("headless")
    options.add_argument("--proxy-server={ip}".format(ip=ip))
    # Hide the automation fingerprints Chromium exposes by default.
    options.add_argument("--disable-blink-features")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("start-maximized")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    msedge = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedgedriver.exe"

    driver = Edge(options=options, executable_path=msedge)
    # NOTE(review): this runs on the initial blank page only; it is unlikely
    # to mask navigator.webdriver on pages loaded afterwards — confirm.
    script = "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
    driver.execute_script(script)
    UA = randomUA()
    driver.execute_cdp_cmd("Network.setUserAgentOverride", {"userAgent": UA})
    print(driver.execute_script("return navigator.userAgent;"))

    hpid = hpo.split(":")[1]
    url = "http://www.chinahpo.org/#/searchList?trigger=1&tabType=1&searchContent=HP%3A{hpid}".format(
        hpid=hpid)

    try:
        driver.get(url)
        strtemp = url
        print("网址:", strtemp)
    except Exception:
        print("get page error", hpo)

    time.sleep(2)
    with open("html2/hp_" + hpid + ".html", "a+", encoding="utf-8") as f:
        f.write(str(driver.page_source))

    driver.close()
    # BUG FIX: finish.txt was opened without a `with`, leaking the handle if
    # the write raised.
    with open("finish.txt", "a") as fin:
        fin.write(hpo + "\n")
# Ejemplo n.º 4
def judith(SITE):
    """Submit the module-level tracking code CODE to the parcel-tracking form
    at SITE and save a screenshot of the result page.

    Relies on module globals PATH (driver binary) and CODE (tracking code).
    Raises AssertionError when the site reports the code as not found.
    """
    driver = Edge(PATH)
    driver.get(SITE)
    # Locate the tracking-code field by its name attribute (could equally be
    # found by ID or CSS selector).
    element = driver.find_element_by_name("objetos")
    element.clear()
    # Type the tracking code and submit with Enter to reach the result page.
    element.send_keys(CODE)
    element.send_keys(Keys.RETURN)

    # TODO: run fully headless and print the tracking state to the terminal
    # instead of screenshotting.

    # Persist the tracking-progress page as an image.
    driver.save_screenshot('consulta.png')
    # The site shows this message when the tracking code is unknown.
    assert "Sem bagulhos encontrados, mano." not in driver.page_source
    # Shut down the browser window.
    driver.close()
def verify_account(url):
    """Open the verification page at `url`, follow its confirmation link via a
    cloudflare-scraper session and report the outcome to the GUI window.

    Relies on the module-level `window` (GUI) and `cfscrape`.
    """
    runner = cfscrape.create_scraper()
    options = EdgeOptions()
    options.use_chromium = True
    options.add_argument("disable-gpu")
    options.add_argument('headless')
    options.add_argument('ignore-certificate-errors')
    driver = Edge(options=options,
                  executable_path=r"WebDriver\msedgedriver.exe")
    driver.get(url)

    try:
        # The confirmation anchor is identified by its inline Spotify-green style.
        link = driver.find_element_by_xpath(
            r"//a[@style='text-decoration: none; color: #1ed760']").get_attribute(
                'href')
    finally:
        # BUG FIX: the browser was leaked when the XPath lookup raised.
        driver.close()

    result = runner.get(url=link).text
    if "all set" in result:
        # (dropped a pointless f-prefix; the string has no placeholders)
        window["Verified_Email"].print("[*] Verification Completed")
    else:
        window["Verified_Email"].print(f"[*] {result}")
mswebdriverpath = parseargs.getMSWebDriverPath()
number_of_searches = parseargs.getNumSearches()
start_number = parseargs.getStartNum()

# Load the per-host config file (config/config-<hostname>.json) to find the
# Edge user-data directory.
config_path = os.path.join('config',
                           'config-' + socket.gethostname() + '.json')
with open(config_path) as json_data_file:
    config = json.load(json_data_file)

user_data_dir = config["user.data.dir.edge"]
print("user.data.dir.edge: " + user_data_dir)

# Launch Edge with a throwaway profile.
# NOTE(review): `user_data_dir` is read above but never used below — the
# capabilities hard-code /tmp/temp_profile; confirm which profile is intended.
desired_cap = {
    "args": ["userDataDir=/tmp/temp_profile"],
    "userDataDir": "/tmp/temp_profile"
}

browser = Edge(executable_path=mswebdriverpath, capabilities=desired_cap)

# Go to Bing and run the requested number of searches.
browser.get("http://www.bing.com")

searchText = "test"

SearchUtil.runSearches(browser, searchText, number_of_searches, start_number)

browser.close()
# Ejemplo n.º 7
class QCourse:
    """Downloader for ke.qq.com course videos via an automated Edge browser.

    Logs in once (cookies cached in cookies.json), then resolves each video
    page to its .ts stream and decryption-key URLs and hands them to
    `download_single`.
    """

    def __init__(self):
        # Build the Edge options: download to cwd, quiet logging, muted audio.
        self.prefs = {"download.default_directory": os.getcwd()}
        self.options = EdgeOptions()
        self.options.use_chromium = True
        self.options.add_argument("log-level=3")
        self.options.add_experimental_option('excludeSwitches',
                                             ['enable-logging'])
        self.options.add_experimental_option('prefs', self.prefs)
        self.options.add_argument("--mute-audio")

        self.login_url = 'https://ke.qq.com/'

        # Passing `options` raised an error on macOS, so it is dropped here.
        # On Windows, use msedgedriver.exe instead (comment the next
        # statement, uncomment the following line).
        self.driver = Edge(executable_path=os.path.join(
            BASE_DIR, 'msedgedriver'),
                           capabilities={})
        # self.driver = Edge(executable_path='msedgedriver.exe', options=self.options)

        # self.driver = Edge(executable_path=os.path.join(BASE_DIR, 'msedgedriver'), capabilities=desired_cap, options=self.options)

    def login(self):
        """Open ke.qq.com, let the user log in interactively (up to 5 min),
        then save the session cookies to cookies.json."""
        self.driver.get('https://ke.qq.com/')
        self.driver.find_element_by_id('js_login').click()
        time.sleep(1)

        # Wait until the login overlay disappears, i.e. the user finished.
        WebDriverWait(self.driver, 300).until_not(
            EC.presence_of_element_located((By.CLASS_NAME, 'ptlogin-mask')))

        dictCookies = self.driver.get_cookies()
        jsonCookies = json.dumps(dictCookies)
        with open('cookies.json', 'w') as f:
            f.write(jsonCookies)
        print('登陆成功!')

    def close(self):
        """Close the browser window."""
        self.driver.close()

    def _get_video(self, video_url=None, path=None, index=None):
        """Resolve one video page and download its stream.

        index, when given, prefixes the saved title with a zero-padded number.
        Falls back to downloading attached course material when no video
        starts playing within 60 s.
        """
        if not video_url:
            print('请输入视频url!')
        # A single navigation sometimes fails to leave the current page
        # (possibly a preventDefault handler), so load the URL twice.
        self.driver.get(video_url)
        self.driver.get(video_url)
        try:
            # Wait for the player clock to appear and start advancing.
            WebDriverWait(self.driver, 60).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'loki-time')))
            WebDriverWait(
                self.driver,
                60).until_not(lambda driver: driver.find_element_by_class_name(
                    'loki-time').get_attribute("innerHTML") == '00:00 / 00:00')

            title = self.driver.title
            if index is not None:
                title = "{:02}_{}".format(index, title)

            # Mine the performance log for the stream (.ts) and key (get_dk)
            # request URLs issued by the player.
            networks = self.driver.execute_script(
                'return window.performance.getEntries()')
            ts_url = key_url = ''
            for network in networks:
                if '.ts?start' in network.get('name'):
                    ts_url = network.get('name')
                elif 'get_dk' in network.get('name'):
                    key_url = network.get('name')
            download_single(ts_url, key_url, title, path)
        except TimeoutException:
            # Timeout probably means this entry is course material, not a
            # video — look for a download button instead.
            title = self.driver.title
            try:
                down_btn = self.driver.find_element_by_class_name(
                    'download-btn')
                if down_btn.text == '下载资料':
                    url = down_btn.get_attribute('href')
                    download_zip_doc(url, title, path)
            except Exception:
                print('没有找到视频,也没有找到可下载的文件,可能是还未开课')

    def get_video(self, video_url=None, path=None, index=None):
        """Download one video URL or each URL in a list (skipping falsy entries)."""
        if isinstance(video_url, list):
            for url in video_url:
                if url:
                    self._get_video(url, path, index)
        else:
            self._get_video(video_url, path, index)

    def load_cookies(self):
        """Restore the cached login cookies (logging in first if needed) and
        apply them to the .ke.qq.com domain."""
        if not os.path.exists('cookies.json'):
            self.login()
        with open('cookies.json', 'r') as f:
            listCookies = json.loads(f.read())
        self.driver.get(self.login_url)
        for cookie in listCookies:
            self.driver.add_cookie({
                'domain': '.ke.qq.com',
                'httpOnly': cookie['httpOnly'],
                'name': cookie['name'],
                'path': '/',
                'secure': cookie['secure'],
                'value': cookie['value']
            })
        for cookie in utils.get_cookies_dic_list():
            self.driver.add_cookie({
                'domain': '.ke.qq.com',
                'httpOnly': False,
                'name': cookie[0],
                'path': '/',
                'secure': False,
                'value': cookie[1]
            })
# Ejemplo n.º 8
def chooseAccount():
    """Log in to the Instagram account stored in data.txt, collect up to the
    requested number of photo URLs from a post link and download them into a
    fresh ./image folder, then return to the menu.
    """
    with open('data.txt') as json_file:
        data = json.load(json_file)

    userInfo = 'account: ' + data['username']
    print(userInfo)

    userName = data['username']
    passWord = data['password']
    print("link:")
    link = input()
    print("number of photos: ")
    amount = input()

    # format text and amount
    amount = int(amount)

    # Headless auto-login with the stored credentials.
    options = EdgeOptions()
    options.use_chromium = True
    options.add_argument('headless')
    driver = Edge('msedgedriver', options=options)
    driver.get(link)
    time.sleep(2)
    userForm = driver.find_element_by_css_selector("input[name='username']")
    passForm = driver.find_element_by_css_selector("input[name='password']")
    userForm.send_keys(userName)
    passForm.send_keys(passWord)
    driver.find_element_by_css_selector("button[type='submit']").click()
    time.sleep(3)
    driver.execute_script("document.querySelector('.sqdOP.yWX7d.y3zKF').click()")

    # Collect image URLs; the "next" chevron only exists on multi-photo posts.
    time.sleep(2)
    spriteBtn = None  # BUG FIX: was unbound when amount <= 1
    if amount > 1:
        spriteBtn = driver.find_element_by_css_selector(".coreSpriteRightChevron")
    list_link = []

    def get_url1():
        # First slide: grab the first visible cover image.
        list_element = driver.find_elements_by_css_selector("img[style='object-fit: cover;']")
        for image in list_element[:1]:
            src = image.get_attribute("src")
            list_link.append(src)

    def get_url2():
        # Later slides: the first match is the already-collected previous image.
        list_element = driver.find_elements_by_css_selector("img[style='object-fit: cover;']")
        list_element.pop(0)
        for image in list_element[:1]:
            src = image.get_attribute("src")
            list_link.append(src)

    for x in range(0, amount + 1):
        if len(list_link) > 0:
            get_url2()
        else:
            get_url1()
        if len(list_link) == amount:
            break
        elif spriteBtn:
            spriteBtn.click()
        else:
            break
        time.sleep(0.5)

    # Recreate a clean ./image folder.
    if os.path.isdir("./image"):
        rmtree("./image")
    # (os.path.join avoids the '\i' escape-sequence smell of '\image')
    folderPath = os.path.join(os.getcwd(), 'image')
    os.mkdir(folderPath)

    # clear screen
    clear = lambda: os.system('cls')
    clear()

    for i in tqdm(range(100)):
        pass

    print("\nnumber of photos:", len(list_link))

    pos = 0
    for href in list_link:
        print(pos + 1, "DONE")
        imagePathResult = "./image/image_" + str(pos) + ".png"
        try:
            downloadFile(href)
            copy("./image/image.png", imagePathResult)
        except Exception:
            # BUG FIX: '"error at %s" % pos + 1' formatted first and then
            # concatenated str + int, raising TypeError inside the handler.
            print("error at %s" % (pos + 1))
        pos += 1
    os.remove("./image/image.png")

    resultPath = os.path.join(os.getcwd(), 'image')
    os.startfile(resultPath)

    driver.close()
    chooseMenu()
    # NOTE(review): `path` is not defined in this function — presumably a
    # module global; confirm before relying on this branch.
    if os.path.isfile(path):
        key = 2
    else:
        key = 1
    menu(key)
# Ejemplo n.º 9
class QCourse:
    """Downloader for ke.qq.com course videos via an automated Edge browser.

    Logs in once (cookies cached in cookies.json), then resolves a video page
    to its .ts stream and decryption-key URLs and hands them to
    `download_single`.
    """

    def __init__(self):
        # Build the Edge options: download to cwd, quiet logs, muted audio.
        self.prefs = {"download.default_directory": os.getcwd()}
        self.options = EdgeOptions()
        self.options.use_chromium = True
        self.options.add_argument("log-level=3")
        self.options.add_experimental_option('excludeSwitches',
                                             ['enable-logging'])
        self.options.add_experimental_option('prefs', self.prefs)
        self.options.add_argument("--mute-audio")

        self.login_url = 'https://ke.qq.com/'

        self.driver = Edge(executable_path='msedgedriver.exe',
                           options=self.options)

    def login(self):
        """Open ke.qq.com, let the user log in interactively (up to 5 min),
        then save the session cookies to cookies.json."""
        self.driver.get('https://ke.qq.com/')
        self.driver.find_element_by_id('js_login').click()
        time.sleep(1)

        # Wait until the login overlay disappears, i.e. the user finished.
        WebDriverWait(self.driver, 300).until_not(
            EC.presence_of_element_located((By.CLASS_NAME, 'ptlogin-mask')))

        dictCookies = self.driver.get_cookies()
        jsonCookies = json.dumps(dictCookies)
        with open('cookies.json', 'w') as f:
            f.write(jsonCookies)
        print('登陆成功!')

    def close(self):
        """Close the browser window."""
        self.driver.close()

    def get_video(self, video_url=None, path=None):
        """Load `video_url` with the cached login cookies, sniff the player's
        stream/key request URLs and download the video via `download_single`."""
        if not video_url:
            print('请输入视频url!')
        # os.chdir(BASE_DIR)
        if not os.path.exists('cookies.json'):
            self.login()
        with open('cookies.json', 'r') as f:
            listCookies = json.loads(f.read())
        # First load establishes the domain so cookies can be attached.
        self.driver.get(video_url)
        for cookie in listCookies:
            self.driver.add_cookie({
                'domain': '.ke.qq.com',
                'httpOnly': cookie['httpOnly'],
                'name': cookie['name'],
                'path': '/',
                'secure': cookie['secure'],
                'value': cookie['value']
            })
        # Reload now that the session cookies are in place.
        self.driver.get(video_url)
        # Wait for the player clock to appear and start advancing.
        WebDriverWait(self.driver, 300).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'loki-time')))
        WebDriverWait(
            self.driver,
            300).until_not(lambda driver: driver.find_element_by_class_name(
                'loki-time').get_attribute("innerHTML") == '00:00 / 00:00')

        # Mine the performance log for the stream (.ts) and key (get_dk) URLs.
        networks = self.driver.execute_script(
            'return window.performance.getEntries()')
        ts_url = key_url = ''
        for network in networks:
            if '.ts?start' in network.get('name'):
                ts_url = network.get('name')
            elif 'get_dk' in network.get('name'):
                key_url = network.get('name')
        title = self.driver.title
        # catalog = self.driver.execute_script('return document.getElementsByClassName("task-item task-info active")'
        #                                      '[0].parentNode.firstElementChild.innerText')
        # os.chdir(os.path.join(os.getcwd(), catalog))
        download_single(ts_url, key_url, title, path)
# Ejemplo n.º 10
def chinahpo(hpo_queue):
    """Worker: drain `hpo_queue` of HPO terms, fetch each term's chinahpo.org
    search page through a fresh random proxy and save the rendered HTML to
    html2/hp_<id>.html.

    Finished terms are appended to finish.txt; proxies whose saved page size
    looks plausible (9–15 kB) are recorded in ip_check_better.txt.
    """
    while hpo_queue.empty() is not True:
        hpo = hpo_queue.get()

        # Random pause between requests (skip this when rotating a proxy pool).
        s = random.randint(5, 10)
        print(hpo, "等待 " + str(s) + "秒")
        time.sleep(s)
        ip = randomIP()
        hpo_ip = hpo + "\t" + ip
        print(hpo_ip)
        options = EdgeOptions()
        options.use_chromium = True
        options.add_argument("headless")
        options.add_argument("--proxy-server=http://{ip}".format(ip=ip))
        # Hide the automation fingerprints Chromium exposes by default.
        options.add_argument("--disable-blink-features")
        options.add_argument("--disable-blink-features=AutomationControlled")
        options.add_argument("start-maximized")
        options.add_experimental_option("excludeSwitches",
                                        ["enable-automation"])
        options.add_experimental_option("useAutomationExtension", False)

        # Match the browser's geolocation/timezone to the proxy's IP.
        geo = get_timezone_geolocation(ip)
        print(geo)
        geo_json = {"latitude": geo[1], "longitude": geo[2], "accuracy": 1}
        timezone = {"timezoneId": geo[0]}

        # Disable non-proxied WebRTC so the real IP cannot leak past the proxy.
        preferences = {
            "webrtc.ip_handling_policy": "disable_non_proxied_udp",
            "webrtc.multiple_routes_enabled": False,
            "webrtc.nonproxied_udp_enabled": False
        }
        options.add_experimental_option("prefs", preferences)

        msedge = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedgedriver.exe"

        driver = Edge(options=options, executable_path=msedge)
        # NOTE(review): executed on the initial blank page only; confirm it
        # actually masks navigator.webdriver on later navigations.
        script = "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
        driver.execute_script(script)
        UA = UserAgent().random
        driver.execute_cdp_cmd("Network.setUserAgentOverride",
                               {"userAgent": UA})
        driver.execute_cdp_cmd("Emulation.setGeolocationOverride", geo_json)
        driver.execute_cdp_cmd("Emulation.setTimezoneOverride", timezone)

        print(driver.execute_script("return navigator.userAgent;"))

        hpid = hpo.split(":")[1]
        url = "http://www.chinahpo.org/#/searchList?trigger=1&tabType=1&searchContent=HP%3A{hpid}".format(
            hpid=hpid)

        try:
            driver.get(url)
            strtemp = url
            print("网址:", strtemp)
        except Exception:
            print("get page error", hpo)

        time.sleep(2)
        with open("html2/hp_" + hpid + ".html", "a+", encoding="utf-8") as f:
            f.write(str(driver.page_source))

        driver.close()
        # BUG FIX: finish.txt and ip_check_better.txt were opened without
        # `with`, leaking the handles if a write raised.
        with open("finish.txt", "a") as fin:
            fin.write(hpo + "\n")

        size = getDocSize("html2/hp_" + hpid + ".html")
        if 9000 <= size <= 15000:
            with open("ip_check_better.txt", "a") as checkIP:
                checkIP.write(hpo_ip + "\n")
# Ejemplo n.º 11
def main():
    """Read people from the CSV passed as argv[4], scrape verified tweets
    about each name from Twitter and run the sentiment/toxicity/Big5 models,
    producing a report per name."""
    args = sys.argv
    # BUG FIX: the input file was opened but never closed.
    with open(args[4], "r") as f:
        lines = f.readlines()
    names, profession, nationality, job = [], [], [], []
    for line in lines:
        array = line.split(",")
        names.append(array[0])
        profession.append(array[1])
        nationality.append(array[2])
        job.append(array[3].replace("\n", ""))
    for name in names:
        print("Query:", name, ".\nProcessing...")
        user = '******'
        search_term = f'{name} filter:verified'
        options = EdgeOptions()
        options.use_chromium = True
        driver = Edge(options=options)
        driver.get('https://www.twitter.com/login')
        driver.maximize_window()
        sleep(2)
        username = driver.find_element_by_xpath(
            '//input[@name="session[username_or_email]"]')
        username.send_keys(user)
        password = driver.find_element_by_xpath(
            '//input[@name="session[password]"]')
        # SECURITY: hard-coded credential — move it to a config file or
        # environment variable.
        password.send_keys('donkey123')
        password.send_keys(Keys.RETURN)
        sleep(1)
        search_input = driver.find_element_by_xpath(
            '//input[@aria-label="Search query"]')
        search_input.send_keys(search_term)
        search_input.send_keys(Keys.RETURN)
        sleep(1)
        driver.find_element_by_link_text('People').click()
        sleep(3)
        driver.find_element_by_xpath(
            '//div[@class="css-1dbjc4n r-j7yic r-qklmqi r-1adg3ll r-1ny4l3l"]'
        ).click()
        sleep(3)
        tweet_data = []
        start = 0
        end = 500
        # Scroll five screens, scraping every visible tweet card each pass.
        for i in range(0, 5):
            sleep(1)
            cards = driver.find_elements_by_xpath(
                '//div[@data-testid="tweet"]')
            # BUG FIX: dropped the dead `cards[i]` lookup whose result was
            # discarded and which raised IndexError when fewer than five
            # cards were on screen.
            for card in cards:
                data = get_tweet_data(card)
                if data:
                    tweet_data.append(data)
            driver.execute_script(f'window.scrollTo({start},{end});')
            start += 500
            end += 500
        driver.close()
        tweets = set(tweet_data)
        write_to_csv(name, tweets)
        df = pd.read_csv(f'{name}.csv')
        Twitter_sentiment = Twitter_sentiment_model(df)
        Twitter_toxic = Twitter_toxic_model(df)
        Big5 = Big5_model(df)

        create_report(name, tweets, Twitter_sentiment, Twitter_toxic, Big5)
# Ejemplo n.º 12
class Sei:
    """Selenium-based automation wrapper for the SEI document system.

    Supports Chrome (chromedriver) and Edge (msedgedriver) back ends and can
    be used as a context manager.
    """

    __area_inicial = None   # unit selected right after login (set by start_driver)
    __windows_before = 0    # window count before a quick-search submit
    __windows_after = 0     # window count after the submit
    def __init__(self, headless=False, executable_path='chromedriver'):
        """Create the browser driver.

        Picks Chrome or Edge from the driver binary name in `executable_path`;
        `headless` hides the browser window.
        """
        if 'chromedriver' in executable_path:
            chrome_options = Options()
            chrome_options.add_argument('--enable-javascript')
            chrome_options.add_argument('--window-size=1440,900')
            chrome_options.add_argument("--disable-extensions")
            # Bypass any system proxy so SEI is reached directly.
            chrome_options.add_argument("--proxy-server='direct://'")
            chrome_options.add_argument("--proxy-bypass-list=*")
            chrome_options.add_argument("--start-maximized")
            chrome_options.add_argument('--disable-dev-shm-usage')
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--ignore-certificate-errors')
            if headless:
                chrome_options.add_argument('--headless')
                chrome_options.add_argument('--disable-gpu')
            self.driver = webdriver.Chrome(executable_path=executable_path,
                                           options=chrome_options)
        elif 'msedgedriver' in executable_path:
            # Same flag set as Chrome, in Edge's no-leading-dashes spelling.
            edge_options = EdgeOptions()
            edge_options.use_chromium = True
            edge_options.add_argument('enable-javascript')
            edge_options.add_argument('window-size=1440,900')
            edge_options.add_argument("disable-extensions")
            edge_options.add_argument("proxy-server='direct://'")
            edge_options.add_argument("proxy-bypass-list=*")
            edge_options.add_argument("start-maximized")
            edge_options.add_argument('disable-dev-shm-usage')
            edge_options.add_argument('no-sandbox')
            edge_options.add_argument('ignore-certificate-errors')
            if headless:
                edge_options.add_argument('headless')
                edge_options.add_argument('disable-gpu')
            self.driver = Edge(executable_path=executable_path,
                               options=edge_options)

    def __enter__(self):
        """Allow use as a context manager: `with Sei(...) as sei:`."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the browser when leaving the `with` block."""
        self.close()

    def start_driver(self, url, usuario=None, senha=None):
        """Open the SEI login page at `url` and sign in.

        Prompts interactively for any missing credential. Raises Exception
        with the alert text when SEI rejects the login (invalid user/password).
        """
        if usuario is None:  # idiom fix: was `== None`
            usuario = input('Digite o usuário: ')
        if senha is None:
            senha = getpass('Digite a senha: ')

        self.driver.get(url)

        usuario_field = WebDriverWait(self.driver, 3).until(
            EC.presence_of_element_located((By.ID, "txtUsuario")))

        senha_field = self.driver.find_element_by_id('pwdSenha')
        botao_acessar = self.driver.find_element_by_id('sbmLogin')

        usuario_field.clear()
        usuario_field.send_keys(usuario)
        senha_field.clear()
        senha_field.send_keys(senha)
        botao_acessar.click()
        alerta = self.fechar_alerta()
        if alerta:
            raise Exception(alerta)  # usuário ou senha inválido
        # BUG FIX: was misspelled `self.__area_incial`, which silently created
        # a new attribute instead of setting the declared `__area_inicial`.
        self.__area_inicial = self.get_area()

    def go_to(self, numero_sei):
        """Jump to a process/document via SEI's quick-search box.

        If the previous search opened an extra window, close it and return to
        the original window first; after submitting, switch to any new window
        the search opened.
        """
        if self.__windows_after > self.__windows_before:
            self.driver.close()
            self.driver.switch_to.window(
                self.driver.window_handles[self.__windows_before - 1])
        self.driver.switch_to.default_content()
        pesquisa = WebDriverWait(self.driver, 3).until(
            EC.presence_of_element_located((By.ID, "txtPesquisaRapida")))
        pesquisa.clear()
        pesquisa.send_keys(str(numero_sei))
        formPesquisaRapida = WebDriverWait(self.driver, 3).until(
            EC.presence_of_element_located(
                (By.ID, "frmProtocoloPesquisaRapida")))
        # Record the window count around the submit so the next call can
        # detect (and clean up) a search-opened window.
        self.__windows_before = len(self.driver.window_handles)
        formPesquisaRapida.submit()
        self.__windows_after = len(self.driver.window_handles)
        if self.__windows_after > self.__windows_before:
            self.driver.switch_to.window(
                self.driver.window_handles[self.__windows_after - 1])

    def is_processo_aberto(self, area=None, processo=None):
        """Return (aberto, mensagem): whether the current (or `processo`'s)
        process is open — restricted to unit `area` when given — plus the
        info-panel text.

        `aberto` is None when the info panel cannot be read.
        """
        if processo:
            self.go_to(processo)
        else:
            self.driver.switch_to.default_content()
        try:
            ifrVisualizacao = WebDriverWait(self.driver, 3).until(
                EC.presence_of_element_located((By.ID, "ifrVisualizacao")))
            self.driver.switch_to.frame(ifrVisualizacao)
            informacao = WebDriverWait(self.driver, 3).until(
                EC.presence_of_element_located((By.ID, "divInformacao")))
            mensagem = informacao.text
            aberto = 'aberto' in mensagem
            if area:
                # Only count it open if the info text mentions this unit.
                regex = '(?im)^(.*)(' + area + ')[^0-9a-z](.*)$'
                aberto = search(regex, mensagem) is not None
            self.driver.switch_to.default_content()
        except Exception:
            # BUG FIX: was a bare `except:` (also swallowed SystemExit /
            # KeyboardInterrupt).
            aberto = None
            mensagem = 'Impossível abrir mensagem do processo'
        return aberto, mensagem

    def get_processo_anexador(self, processo=None):
        """Return the number of the process this one is attached to, or None.

        When `processo` is given, navigate to it first.
        """
        if processo:
            self.go_to(processo)
        else:
            self.driver.switch_to.default_content()
        ifrVisualizacao = WebDriverWait(self.driver, 3).until(
            EC.presence_of_element_located((By.ID, "ifrVisualizacao")))
        self.driver.switch_to.frame(ifrVisualizacao)
        informacao = WebDriverWait(self.driver, 3).until(
            EC.presence_of_element_located((By.ID, "divInformacao")))
        procAnex = None
        # The attaching process number is the link inside the info panel.
        if 'Processo anexado ao processo' in informacao.text:
            processoAnexador = WebDriverWait(self.driver, 3).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//*[@id=\"divInformacao\"]/div/a")))
            procAnex = processoAnexador.text
        self.driver.switch_to.default_content()
        return procAnex

    def get_area(self):
        """Return the label of the unit ("área") currently selected in SEI."""
        self.driver.switch_to.default_content()
        unidades = self.driver.find_element_by_id('selInfraUnidades')
        selected = Select(unidades).all_selected_options
        return selected[0].text

    def seleciona_area(self, area):
        """Switch SEI's unit selector to `area`.

        Returns True when `area` is already selected or was selected now,
        False when no option with that label exists.
        """
        self.driver.switch_to.default_content()
        selector = Select(self.driver.find_element_by_id('selInfraUnidades'))
        # Already on the requested unit? Nothing to do.
        if any(option.text == area for option in selector.all_selected_options):
            return True

        # Re-read the selector and pick the option by its visible label.
        selector = Select(self.driver.find_element_by_id('selInfraUnidades'))
        if any(option.text == area for option in selector.options):
            selector.select_by_visible_text(area)
            # Wait for the page to rebuild the selector after the switch.
            Select(
                WebDriverWait(self.driver, 3).until(
                    EC.presence_of_element_located(
                        (By.ID, 'selInfraUnidades'))))
            return True

        return False

    def clicar_botao(self, botao):
        """Click the document-toolbar button whose icon title contains `botao`.

        Returns True when a matching button was found and clicked, False
        otherwise. Any confirmation alert is waited for (1 s); when none
        appears, focus is restored to the top document on a best-effort basis.
        """
        self.driver.switch_to.default_content()
        ifrVisualizacao = WebDriverWait(self.driver, 3).until(
            EC.presence_of_element_located((By.ID, "ifrVisualizacao")))
        self.driver.switch_to.frame(ifrVisualizacao)
        arvore = WebDriverWait(self.driver, 3).until(
            EC.presence_of_element_located((By.ID, "divArvoreAcoes")))
        botoes = arvore.find_elements(By.XPATH,
                                      '//*[@id=\"divArvoreAcoes\"]/a')

        for b in botoes:
            img = b.find_element(By.XPATH, 'img')
            if botao in img.get_attribute('title'):
                b.click()
                try:
                    WebDriverWait(self.driver, 1).until(
                        EC.alert_is_present(),
                        'Timed out waiting for PA creation ' +
                        'confirmation popup to appear.')
                except Exception:  # BUG FIX: was a bare `except:`
                    try:
                        self.driver.switch_to.default_content()
                    except Exception:
                        pass  # BUG FIX: was a bare `except:` with a stray `None`
                return True
        return False

    def fechar_alerta(self):
        """Accept a pending JS alert if one appears within 3 s.

        Returns the alert's text, or None when no alert showed up.
        """
        alerta = None
        try:
            WebDriverWait(self.driver, 3).until(
                EC.alert_is_present(), 'Timed out waiting for PA creation ' +
                'confirmation popup to appear.')
            alert = self.driver.switch_to.alert
            alerta = alert.text
            alert.accept()
            self.driver.switch_to.default_content()
        except TimeoutException:
            pass  # idiom fix: was a bare `None` expression statement
        return alerta

    def is_sobrestado(self, area=None, processo=None):
        """Check whether the current (or given) process is suspended.

        Args:
            area: Optional unit/area name to look for in the info panel.
            processo: Optional process identifier to navigate to first.

        Returns:
            ``(sobrestado, area_found)`` when ``area`` is given, otherwise
            ``(sobrestado, mensagem)`` with the raw info-panel text.
        """
        if processo:
            self.go_to(processo)
        else:
            self.driver.switch_to.default_content()
        ifrVisualizacao = WebDriverWait(self.driver, 3).until(
            EC.presence_of_element_located((By.ID, "ifrVisualizacao")))
        self.driver.switch_to.frame(ifrVisualizacao)
        informacao = WebDriverWait(self.driver, 3).until(
            EC.presence_of_element_located((By.ID, "divInformacao")))
        # Capture the text while we are still inside the frame; the element
        # reference must not be read again after switching back out.
        mensagem = informacao.text
        sobrestado = 'sobrestado' in mensagem
        self.driver.switch_to.default_content()
        if area:
            from re import escape
            # Escape `area` so names containing regex metacharacters
            # (parentheses, dots, ...) are matched literally.
            regex = '(?im)^(.*)(' + escape(area) + ')[^0-9a-z](.*)$'
            matches = search(regex, mensagem)
            return sobrestado, matches is not None
        else:
            return sobrestado, mensagem

    def sobrestar_processo(self, motivo, processo=None):
        """Suspend ('sobrestar') the current or given process.

        Args:
            motivo: Justification text typed into the reason field.
            processo: Optional process identifier to navigate to first.

        Returns:
            True when the suspend form was filled and saved, else False.
        """
        if processo:
            self.go_to(processo)
        else:
            self.driver.switch_to.default_content()
        # Guard clause: bail out early if the action button is not there.
        if not self.clicar_botao('Sobrestar Processo'):
            return False
        frame_el = WebDriverWait(self.driver, 3).until(
            EC.presence_of_element_located((By.ID, "ifrVisualizacao")))
        self.driver.switch_to.frame(frame_el)
        self.driver.find_element(By.ID, 'divOptSomenteSobrestar').click()
        reason_box = self.driver.find_element(By.ID, 'txaMotivo')
        reason_box.clear()
        reason_box.send_keys(motivo)
        self.driver.find_element(By.ID, 'sbmSalvar').click()
        self.driver.switch_to.default_content()
        return True

    def remover_sobrestamento(self, processo=None):
        """Remove the suspension from the current or given process.

        Args:
            processo: Optional process identifier to navigate to first.

        Returns:
            True when the removal button was clicked, else False.
        """
        if processo:
            self.go_to(processo)
        if not self.clicar_botao('Remover Sobrestamento do Processo'):
            return False
        self.fechar_alerta()
        return True

    def publicar(self,
                 resumo_ementa,
                 data_disponibilizacao,
                 documento=None,
                 dou=False,
                 secao=None,
                 pagina=None):
        """Schedule the publication of the current (or given) document.

        Args:
            resumo_ementa: Summary text for the publication.
            data_disponibilizacao: Availability date (as the form expects).
            documento: Optional document identifier to navigate to first.
            dou: When True, also fill in the DOU (official gazette) fields.
            secao: DOU section; defaults to '3' when omitted.
            pagina: DOU page number; left empty when omitted.

        Returns:
            True when the publication form was filled and saved, else False.
        """
        if documento:
            self.go_to(documento)
        else:
            self.driver.switch_to.default_content()
        if self.clicar_botao('Agendar Publicação'):
            ifrVisualizacao = WebDriverWait(self.driver, 3).until(
                EC.presence_of_element_located((By.ID, "ifrVisualizacao")))
            self.driver.switch_to.frame(ifrVisualizacao)

            resumo_ementa_text_field = self.driver.find_element(
                By.ID, 'txaResumo')
            resumo_ementa_text_field.clear()
            resumo_ementa_text_field.send_keys(resumo_ementa)

            disponibilizacao = self.driver.find_element(
                By.ID, 'txtDisponibilizacao')
            disponibilizacao.clear()
            disponibilizacao.send_keys(data_disponibilizacao)

            if dou:
                select = Select(self.driver.find_element(By.ID,
                                                         'selVeiculoIO'))
                select.select_by_visible_text('DOU')

                select = Select(
                    WebDriverWait(self.driver, 3).until(
                        EC.presence_of_element_located((By.ID, "selSecaoIO"))))
                # Compute the section once. BUG FIX: the old inline
                # `"...'" + secao if secao else '3' + "'"` parsed as
                # `("...'" + secao) if secao else ("3'")` because the
                # conditional expression binds looser than `+`, producing a
                # broken CSS selector whenever `secao` was None.
                secao_value = secao if secao else '3'
                WebDriverWait(self.driver, 3).until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR,
                         "option[value='" + secao_value + "']")))
                select.select_by_visible_text(secao_value)

                pagina_text_field = self.driver.find_element(
                    By.ID, 'txtPaginaIO')
                pagina_text_field.clear()
                pagina_text_field.send_keys(pagina if pagina else '')

                disponibilizacao = self.driver.find_element(By.ID, 'txtDataIO')
                disponibilizacao.clear()
                disponibilizacao.send_keys(data_disponibilizacao)

            # Use the By-locator style for consistency with the rest of
            # this method.
            self.driver.find_element(By.ID, 'btnSalvar').click()

            self.driver.switch_to.default_content()
            return True
        return False

    def get_conteudo_documento(self, documento=None):
        """Return the document body's inner HTML, whitespace-normalised.

        Args:
            documento: Optional document identifier to navigate to first.

        Raises:
            Exception: When the document content cannot be located; the
                underlying Selenium error is chained as the cause.
        """
        if documento:
            self.go_to(documento)
        else:
            self.driver.switch_to.default_content()
        try:
            ifrVisualizacao = WebDriverWait(self.driver, 3).until(
                EC.presence_of_element_located((By.ID, "ifrVisualizacao")))
            self.driver.switch_to.frame(ifrVisualizacao)
            ifrArvoreHtml = WebDriverWait(self.driver, 3).until(
                EC.presence_of_element_located((By.ID, "ifrArvoreHtml")))
            self.driver.switch_to.frame(ifrArvoreHtml)
            documento_conteudo = self.driver.find_element_by_xpath(
                '/html/body').get_attribute('innerHTML')
            # Drop literal "\n" sequences (page breaks in the exported HTML).
            documento_conteudo = sub(r'\\n', '', documento_conteudo)
            # Collapse runs of whitespace into a single space.
            documento_conteudo = sub(r'\s\s+?', ' ', documento_conteudo)
            # Replace HTML non-breaking space entities.
            documento_conteudo = sub(r'&nbsp;', ' ', documento_conteudo)
            # Strip any remaining leading/trailing whitespace.
            return documento_conteudo.strip()
        except Exception as err:
            # Chain the original error instead of discarding it (this was a
            # bare `except:` that hid the root cause).
            raise Exception('Conteúdo do documento %s não encontrado.' %
                            documento) from err
        finally:
            self.driver.switch_to.default_content()

    def get_documento_element_by_id(self, id, documento=None):
        """Return the text of the element with DOM id ``id`` in the document.

        Args:
            id: DOM id of the target element.
            documento: Optional document identifier to navigate to first.

        Raises:
            Exception: When the element cannot be located; the underlying
                Selenium error is chained as the cause.
        """
        if documento:
            self.go_to(documento)
        else:
            self.driver.switch_to.default_content()
        try:
            # Only descend into the viewer frames when the document opened
            # in the same window (no extra window/tab was spawned).
            if (self.__windows_after == self.__windows_before):
                ifrVisualizacao = WebDriverWait(self.driver, 3).until(
                    EC.presence_of_element_located((By.ID, "ifrVisualizacao")))
                self.driver.switch_to.frame(ifrVisualizacao)
                ifrArvoreHtml = WebDriverWait(self.driver, 3).until(
                    EC.presence_of_element_located((By.ID, "ifrArvoreHtml")))
                self.driver.switch_to.frame(ifrArvoreHtml)
            return self.driver.find_element_by_id(id).text
        except Exception as err:
            # Chain the original error instead of discarding it (was a bare
            # `except:`).
            raise Exception('Conteúdo do documento %s não encontrado.' %
                            documento) from err
        finally:
            self.driver.switch_to.default_content()

    def get_documento_elements_by_id(self, id, documento=None):
        """Return the texts of all elements with DOM id ``id`` in the document.

        Args:
            id: DOM id shared by the target elements.
            documento: Optional document identifier to navigate to first.

        Returns:
            A list with each matching element's text.

        Raises:
            Exception: When the elements cannot be located; the underlying
                Selenium error is chained as the cause.
        """
        if documento:
            self.go_to(documento)
        else:
            self.driver.switch_to.default_content()
        try:
            # Only descend into the viewer frames when the document opened
            # in the same window (no extra window/tab was spawned).
            if (self.__windows_after == self.__windows_before):
                ifrVisualizacao = WebDriverWait(self.driver, 3).until(
                    EC.presence_of_element_located((By.ID, "ifrVisualizacao")))
                self.driver.switch_to.frame(ifrVisualizacao)
                ifrArvoreHtml = WebDriverWait(self.driver, 3).until(
                    EC.presence_of_element_located((By.ID, "ifrArvoreHtml")))
                self.driver.switch_to.frame(ifrArvoreHtml)
            elements = self.driver.find_elements_by_id(id)
            return [element.text for element in elements]
        except Exception as err:
            # Chain the original error instead of discarding it (was a bare
            # `except:`).
            raise Exception('Conteúdo do documento %s não encontrado.' %
                            documento) from err
        finally:
            self.driver.switch_to.default_content()

    def get_documento_element_by_xpath(self, xpath, documento=None):
        """Return the text of the element matching ``xpath`` in the document.

        Args:
            xpath: XPath expression locating the target element.
            documento: Optional document identifier to navigate to first.

        Raises:
            Exception: When the element cannot be located; the underlying
                Selenium error is chained as the cause.
        """
        if documento:
            self.go_to(documento)
        else:
            self.driver.switch_to.default_content()
        try:
            # Only descend into the viewer frames when the document opened
            # in the same window (no extra window/tab was spawned).
            if (self.__windows_after == self.__windows_before):
                ifrVisualizacao = WebDriverWait(self.driver, 3).until(
                    EC.presence_of_element_located((By.ID, "ifrVisualizacao")))
                self.driver.switch_to.frame(ifrVisualizacao)
                ifrArvoreHtml = WebDriverWait(self.driver, 3).until(
                    EC.presence_of_element_located((By.ID, "ifrArvoreHtml")))
                self.driver.switch_to.frame(ifrArvoreHtml)
            return self.driver.find_element_by_xpath(xpath).text
        except Exception as err:
            # Chain the original error instead of discarding it (was a bare
            # `except:`).
            raise Exception('Conteúdo do documento %s não encontrado.' %
                            documento) from err
        finally:
            self.driver.switch_to.default_content()

    def get_documento_elements_by_xpath(self, xpath, documento=None):
        """Return the texts of all elements matching ``xpath`` in the document.

        Args:
            xpath: XPath expression locating the target elements.
            documento: Optional document identifier to navigate to first.

        Returns:
            A list with each matching element's text.

        Raises:
            Exception: When the elements cannot be located; the underlying
                Selenium error is chained as the cause.
        """
        if documento:
            self.go_to(documento)
        else:
            self.driver.switch_to.default_content()
        try:
            # Only descend into the viewer frames when the document opened
            # in the same window (no extra window/tab was spawned).
            if (self.__windows_after == self.__windows_before):
                ifrVisualizacao = WebDriverWait(self.driver, 3).until(
                    EC.presence_of_element_located((By.ID, "ifrVisualizacao")))
                self.driver.switch_to.frame(ifrVisualizacao)
                ifrArvoreHtml = WebDriverWait(self.driver, 3).until(
                    EC.presence_of_element_located((By.ID, "ifrArvoreHtml")))
                self.driver.switch_to.frame(ifrArvoreHtml)
            elements = self.driver.find_elements_by_xpath(xpath)
            return [element.text for element in elements]
        except Exception as err:
            # Chain the original error instead of discarding it (was a bare
            # `except:`).
            raise Exception('Conteúdo do documento %s não encontrado.' %
                            documento) from err
        finally:
            self.driver.switch_to.default_content()

    def close(self, voltar=True):
        """Shut down the browser session.

        Args:
            voltar: When True, first switch back to the unit/area that was
                selected when the session started.
        """
        if voltar:
            # NOTE(review): attribute name looks misspelled ('incial' vs
            # 'inicial'); it is defined elsewhere in the class, so it is
            # kept as-is here.
            self.seleciona_area(self.__area_incial)
        self.driver.close()
        self.driver.quit()
Ejemplo n.º 13
0
def main():
    """Smoke-test a locally served page at http://127.0.0.1:<port>.

    Launches an Edge webdriver, waits for the page to load, then exercises
    each element listed in ``test_element_ids``: every option of the select
    widgets is clicked through, and the number inputs are fed five random
    values each. On any error an interactive IPython shell (``embed``) is
    opened at the point of failure. Uses the module-level ``args`` for the
    port and load delay.
    """
    driver = None  # defined up-front so the finally block is always safe
    try:
        print("ウェブドライバーを立ち上げています・・・")
        port = str(args.port[0])
        load_delay_time = args.load_delay_time[0]

        # Validate the port BEFORE paying the cost of launching a browser
        # (previously the driver was started and then leaked via quit()).
        if len(port) != 4:
            print("入力した番号は4桁ではないです。4桁のポート番号を記入してください。")
            return

        options = EdgeOptions()
        options.use_chromium = True
        driver = Edge(options=options)
        driver.maximize_window()

        print("ページを開いています・・・")
        driver.get(f"http://127.0.0.1:{port}")
        print(f"ページの読み込みのため{str(load_delay_time)}秒待機します・・・")

        for i in range(load_delay_time, 0, -1):
            time.sleep(1)
            print(f"終わるまで{i}秒")

        print("Interactive Pythonコンソールを立ち上げています・・・")

        # Web elements to be tested, keyed by their DOM ids. Select widgets
        # also carry the XPath of the clickable wrapper that opens them.
        test_element_ids = {
            "dtFilter": {
                "tag":
                "select",
                "click_el_xpath":
                "/html/body/div/div[1]/div[2]/div/div/div[1]/div/div/div[1]/div[1]/div/div/div"
            },
            "maxAmount": {
                "tag": "input",
            },
            "maxSigma": {
                "tag": "input",
            },
            "pl": {
                "tag":
                "select",
                "click_el_xpath":
                "/html/body/div/div[1]/div[2]/div/div/div[1]/div/div/div[1]/div[5]/div/div/div"
            },
            "reason": {
                "tag":
                "select",
                "click_el_xpath":
                "/html/body/div/div[1]/div[2]/div/div/div[1]/div/div/div[1]/div[6]/div/div/div/div[1]"
            }
        }
        for test_el_id in test_element_ids:
            test_el = test_element_ids[test_el_id]
            if test_el["tag"] == "select":
                el = driver.find_element_by_xpath(test_el["click_el_xpath"])
                el.click()
                # Re-parse the page now that the dropdown options are open.
                soup = BeautifulSoup(driver.page_source, features="lxml")
                select_items = [
                    tag.text for tag in soup.find(
                        id=test_el_id).find_next_sibling().select("div.option")
                ]
                print(f"number of items in select box: {len(select_items)}")
                for select_item in select_items:
                    click_el = driver.find_element_by_css_selector(
                        f"[data-value='{select_item}']")
                    el.click()
                    click_el.click()
                    time.sleep(5)
            elif test_el["tag"] == "input":
                test_round = 1
                while test_round < 6:
                    # Random magnitude spread over several orders of ten.
                    test_input_number = int(random.random() * random.choice([
                        10, 100, 1000, 10000, 100000, 1000000, 10000000,
                        10000000, 100000000
                    ]))
                    el = driver.find_element_by_id(test_el_id)
                    el.clear()
                    el.click()
                    el.send_keys(test_input_number)
                    time.sleep(5)
                    test_round += 1
                # Reset the field after the test rounds.
                el.clear()
                el.send_keys(0)
    except Exception as e:
        print(
            f"(EXCEPT) An error occurred: {str(e)} Attempting to enter debug mode at point of error."
        )
        embed()
    finally:
        print("プログラムが正常終了しました。ウェブドライバーを終了します。お疲れ様でした。")
        embed()
        # Guard against the driver never having been created (previously a
        # startup failure raised NameError here, masking the real error).
        if driver is not None:
            driver.close()
Ejemplo n.º 14
0
class Settings:
    """Defines the prerequisites and starts the scraping process.

    The arguments represent the default values for each instance variable.

    Setting the user's LinkedIn credentials directly from the class variables
    is allowed, they're also accessible via an external module, or use
    the os module to access them from your system environment variables.

    Args:
        df_dir(str): Directory of the original file containing the dataframe.
        df_path(str): Relative path of the original dataframe file.
        df_file_name(str): Name for the dataframe output file.
        cookies_path(str): Path where your LinkedIn session cookies are stored.
        driver_path(str): Webdriver's path.

    Attributes:
        email(str): User's LinkedIn email or phone number.
        password(str): User's LinkedIn password.
        read_file(str): Existing file containing pandas readable dataframe.
    """
    email: Optional[str] = None
    password: Optional[str] = None
    read_file: Optional[str] = None
    driver_path: Optional[str] = None

    def __init__(self,
                 df_dir='src/data',
                 df_path='data/dataframe.csv',
                 df_file_name='new_dataframe.csv',
                 cookies_path='src/cookies/cookies.pkl',
                 driver_path='',
                 _driver=None,
                 _check=False):
        self.df_dir = df_dir
        # A class-level read_file (set via start_scraper) overrides df_path.
        self.df_path = df_path if self.read_file is None else f'data/{self.read_file}'
        self.df_file_name = df_file_name
        self.cookies_path = cookies_path
        self.driver_path = driver_path
        self.driver = _driver
        self.check = _check

        if self.driver is None:
            # Options for microsoft edge (chromium)
            edge_options = EdgeOptions()
            edge_options.use_chromium = True
            edge_options.add_argument('log-level=3')
            edge_options.add_argument('lang=en')
            edge_options.add_argument('--start-maximized')

            # Main webdriver
            self.driver = Edge(executable_path=self.driver_path,
                               options=edge_options)

        self._check_connection()

    def _check_connection(self, tries=0, max_try=10, err=None):
        """Make initial network stability check, then start scraping.

        Retries up to ``max_try`` times; closes the driver when the network
        never becomes reachable, otherwise hands off to
        :meth:`scraper_logic_handler`.
        """
        while not self.check:
            if tries == max_try:
                break
            try:
                requests.get('https://www.google.com/')
                self.driver.get('https://www.google.com/')
                while 'Google' not in self.driver.title:
                    time.sleep(0.1)
                self.check = True
            except Exception as e:
                err = e
                tries += 1

        if not self.check:
            print(f'({__name__}) Tries: {max_try}', err)
            self.driver.close()
        else:
            self.scraper_logic_handler()

    def csv_to_df(self, row):
        """Convert bytes in memory buffer (i.e. csv file data) to a pandas dataframe.
        """
        return pd.read_csv(io.BytesIO(row))

    def edit_dataframe(self, row, df, scraped_info):
        """Edit the active dataframe with the scraped information.

        The 'location' column (position 4) is replaced with 'None' when the
        scraped value contains digits.
        """
        headers = [
            'URL', 'title', 'role', 'current company', 'location', 'website',
            'twitter', 'email', 'industry', 'company url', 'company size',
            'specialties'
        ]
        # BUG FIX: the old `scraped_info.index(info)` returned the FIRST
        # occurrence of the value, so duplicate scraped values made the
        # position-4 digit check (and the column writes) hit the wrong slot.
        # enumerate() yields the true position.
        for info_index, (header, info) in enumerate(zip(headers,
                                                        scraped_info)):
            if info_index == 4 and any(map(str.isdigit, str(info))):
                scraped_info[info_index] = 'None'
            df.at[row, header] = str(scraped_info[info_index])
        return df

    def define_search(self, row, df, site='site:linkedin.com', search=None):
        """Take a row from the dataframe and convert it to a search value.

        ``search`` is kept for backward compatibility but its value is
        ignored; the previous mutable default ([]) was shared across calls
        and the passed-in list was cleared as a side effect.
        """
        record = df.iloc[row]
        name = record[:].values[0]
        search = (f'\t{record[:].values[1]}\t{record[:].values[2]}').split()
        search.insert(0, name)
        # Was `insert(len(search) - 1 + 1, ...)`, which is simply append().
        search.append(site + f' intitle:{name}')
        return search

    def select_pandas_io(self, df_file):
        """Determine the df_file extention for which pandas I/O to use.
        """
        pandas_io_dict = {'.csv': self.csv_to_df}
        _, ext = os.path.splitext(df_file)
        row = pkgutil.get_data('src', df_file)
        return pandas_io_dict[ext](row)

    def repack_to_csv(self, df):
        """Write the dataframe to a file; return True on success.
        """
        try:
            new_df_path = os.path.join(self.df_dir, self.df_file_name)
            df.to_csv(new_df_path, index=False)
            return True
        except Exception as e:
            print(e)
        return False

    def scraper_logic_handler(self):
        """Main handler for the entire scraping process.

        Signs in, scrapes every row whose 'URL' column is still 'None', and
        writes the updated dataframe back to disk.
        """
        username = self.email
        password = self.password
        cookies = self.cookies_path
        driver = self.driver
        scraped_info = None

        if self.check:
            print('''\nGeneral tip: 
                Do not minimize the webdriver while it is running.
                This will allow some elements to properly load.''')
            df = self.select_pandas_io(self.df_path)

            session = SignIn.sign_in(driver, username, password, cookies)
            while session is None:
                time.sleep(0.1)

            for row in df.index:
                if df.at[row, 'URL'] == 'None':
                    search = self.define_search(row, df)
                    scraped_info = Scrape(driver, search).startstop()
                    df = self.edit_dataframe(row, df, scraped_info)

            if self.repack_to_csv(df):
                driver.close()

    @classmethod
    def start_scraper(cls,
                      user_email=None,
                      user_password=None,
                      read_file='',
                      driver_path=''):
        """Alternate constructor: set class-level credentials, then build an
        instance (which immediately checks connectivity and scrapes)."""
        if isinstance(user_email,
                      str) and isinstance(user_password, str) and isinstance(
                          read_file, str) and isinstance(driver_path, str):
            cls.email = user_email
            cls.password = user_password
            cls.read_file = read_file
            cls.driver_path = driver_path
            return cls()
        else:
            print(
                f'({__name__}) Invalid DataType: {user_email, type(user_email)}, {user_password, type(user_password)}, {read_file, type(read_file)}, {driver_path, type(driver_path)}'
            )
Ejemplo n.º 15
0
                            else:
                                idtweet = list(tweet)
                                idtweet[2]=str(idtweet[2])
                                tweet_id = ''.join(idtweet)
                                if tweet_id not in tweet_ids:
                                    tweet_ids.add(tweet_id)
                                    data.append(tweet)
                    
            scroll_attempt = 0
            while True:
                # check scroll position
                driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
                sleep(2)
                curr_position = driver.execute_script("return window.pageYOffset;")
                if last_position == curr_position:
                    scroll_attempt += 1
                    
                    # end of scroll region
                    if scroll_attempt >= 3:
                        scrolling = False
                        break
                    else:
                        sleep(2) # attempt another scroll
                else:
                    last_position = curr_position
                    break
    
        # close the web driver
        driver.close()

Ejemplo n.º 16
0
class ReservationEngine:
    """Automates booking an Ikon Pass day reservation with a Selenium Edge
    webdriver: log in, pick a resort and date, confirm, and log the result.
    """

    def __init__(self, email, password, headless=True):
        """Store credentials and launch the Edge webdriver.

        Args:
            email: Ikon Pass account email.
            password: Ikon Pass account password.
            headless: Run the browser without a visible window.
        """
        self.email = email
        self.password = password
        self.available = False  # selected date can be reserved
        self.booked = False  # a reservation was confirmed
        self.reservations_left = False  # account still has reservations
        options = EdgeOptions()
        options.add_argument("--log-level=3")
        options.use_chromium = True
        if headless:
            options.add_argument("headless")
        self.driver = Edge(options=options)
        print("Starting web driver...")

    def remove_overlay(self):
        """Dismiss the cookie-consent overlay by clicking every visible
        'a.cc-btn' button until none has a nonzero height."""
        buttons = self.driver.find_elements_by_css_selector("a.cc-btn")
        while any(map(lambda x: x.size["height"] != 0, buttons)):
            for button in buttons:
                try:
                    button.click()
                except Exception:
                    # Button may be stale or obscured; retry on the next
                    # pass. (Narrowed from a bare `except:`.)
                    pass
            buttons = self.driver.find_elements_by_css_selector("a.cc-btn")

    def login(self):
        """Log into the Ikon Pass account page and wait for the
        reservation search box to appear."""
        print("Logging in")
        self.driver.get(
            "https://account.ikonpass.com/en/login?redirect_uri=/en/myaccount/add-reservations/"
        )
        self.remove_overlay()
        email_box = self.driver.find_element_by_css_selector("input#email")
        email_box.send_keys(self.email)
        password_box = self.driver.find_element_by_css_selector(
            "input#sign-in-password")
        password_box.send_keys(self.password)
        submit = self.driver.find_element_by_css_selector("button.submit")
        submit.click()
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'input.react-autosuggest__input')))
        print("Logged in")

    def refresh(self):
        """Reload the current page."""
        self.driver.refresh()

    def find_date(self, date, resort):
        """Select ``resort`` and ``date`` in the reservation UI and update
        the availability flags.

        Args:
            date: A date/datetime for the desired reservation day.
            resort: Resort name typed into the autosuggest search box.
        """
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'input.react-autosuggest__input')))
        self.remove_overlay()
        # Select the resort from the autosuggest list.
        search = self.driver.find_element_by_css_selector(
            "input.react-autosuggest__input")
        search.send_keys(resort)
        button = self.driver.find_element_by_css_selector(
            "li#react-autowhatever-resort-picker-section-1-item-0")
        button.click()
        button = self.driver.find_element_by_xpath(
            "//span[contains(text(), 'Continue')]")
        button.click()

        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'div.DayPicker-wrapper')))
        self.remove_overlay()

        # Page forward through the calendar until the target month shows.
        datepicker = self.driver.find_element_by_css_selector(
            "div.DayPicker-wrapper")
        month_selected = False
        while not month_selected:
            month_text = calendar.month_name[date.month]
            month = datepicker.find_elements_by_xpath(
                "//span[contains(text(), " + "'" + month_text + "')]")
            if len(month) > 0:
                month_selected = True
            else:
                button = datepicker.find_element_by_class_name(
                    "icon-chevron-right")
                button.click()

        day = datepicker.find_element_by_xpath("//div[@aria-label='" +
                                               date.strftime("%a %b %d %Y") +
                                               "']")
        day.click()
        # The day cell's CSS classes encode its reservation state.
        day_classes = day.get_attribute(name="class")

        self.available = "past" not in day_classes and "unavailable" not in day_classes
        self.booked = "confirmed" in day_classes
        div = self.driver.find_elements_by_xpath(
            "//div[contains(text(), 'Reservation Limit Reached')]")
        self.reservations_left = len(div) == 0
        print("Date Selected: " + date.strftime("%m/%d/%Y"))

    def reserve(self):
        """Confirm the reservation for the currently selected date, if it is
        available, not already booked, and reservations remain.

        Returns:
            True when a reservation is (now) booked.
        """
        if self.available and not self.booked and self.reservations_left:
            self.remove_overlay()
            button = self.driver.find_element_by_xpath(
                "//span[contains(text(), 'Save')]")
            button.click()
            button = self.driver.find_element_by_xpath(
                "//span[contains(text(), 'Continue to Confirm')]")
            button.click()

            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//input[@type='checkbox']")))
            button = self.driver.find_element_by_xpath(
                "//input[@type='checkbox']")
            button.click()
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH,
                     "//span[contains(text(), 'Confirm Reservations')]")))
            button = self.driver.find_element_by_xpath(
                "//span[contains(text(), 'Confirm Reservations')]")
            button.click()
            self.booked = True
            print("Booked")
        return self.booked

    def log_results(self, log_file_name):
        """Append a timestamped line with the current state flags to the
        given log file."""
        with open(log_file_name, "a") as f:
            f.write(datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
            f.write(": Available - %r, Booked - %r, Reservations Left- %r" %
                    (self.available, self.booked, self.reservations_left))
            f.write("\n")

    def close_driver(self):
        """Shut down the webdriver window."""
        self.driver.close()
Ejemplo n.º 17
0
class CnblogsSpider(scrapy.Spider):
    """Spider that exports a cnblogs.com account's posts as Markdown.

    Flow: list API -> paginated list pages -> post view pages -> raw "MD"
    pages, yielding one ``CnblogsPostItem`` per post.  A Selenium Edge
    browser is opened alongside the spider so the operator can log in
    manually before crawling starts.
    """

    # spider name
    name = 'cnblogs'

    # domains the spider is allowed to crawl
    allowed_domains = ['i.cnblogs.com', 'www.cnblogs.com']

    # entry point: the authenticated post-list API
    start_urls = [
        'https://i.cnblogs.com/api/posts/list?p=1&cid=&tid=&t=1&cfg=0&search='
    ]

    # posts fetched per list page
    page_size = 100

    # session cookie; currently unused (requests rely on the browser login,
    # see the commented-out cookies= arguments below)
    cn_blogs_cookie = {
        '.CNBlogsCookie':
        'D1BE43FCE6861944C20286B08281F79D48AA2C47E3144A7E4E9429AE26B66C17071AFADFBDB7F45E7D85583CF6AA07CA0CCD6512B0DC01B7C5D5CB774D867B3E70A3FFC843EA90AF218C30B44D1979320533B0D6D9C4E6BBC5FBC337ED7E1663E832CC7A'
    }

    @classmethod
    def from_crawler(cls, crawler):
        """Scrapy factory hook: wire the open/close signals to this spider."""
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s

    # Create the driver once per spider, not per request.  With a single
    # spider in the project it could live in middleware init; with several
    # spider files, each spider should own its driver (as done here).
    def spider_opened(self, spider):
        """Start an Edge browser and give the operator time to log in.

        NOTE: create the Selenium driver as rarely as possible — in
        __init__ or open_spider/spider_opened, never in process_request().
        """
        options = EdgeOptions()

        # Using the Chromium core raised "chrome is not reachable" here, and
        # then the driver binary must be named msedgedriver.exe instead of
        # MicrosoftWebDriver.exe — left disabled.
        # options.use_chromium = True

        # Absolute path of the browser executable to drive.
        # NOTE(review): this points at the *driver* executable name, not a
        # browser binary — confirm this is intentional on this machine.
        options.binary_location = r"MicrosoftWebDriver.exe"

        # options.add_argument("--remote-debugging-port=59692")

        # Headless mode (required on Linux without a display) — disabled so
        # the operator can interact with the login page.
        # options.headless = True
        # options.add_argument("--headless")

        # "--no-sandbox" lets the browser run as root.
        # options.add_argument('--no-sandbox')

        # options.add_argument('--disable-dev-shm-usage')

        # Chromium docs suggest this to work around a GPU bug.
        # options.add_argument("disable-gpu")

        # Private browsing mode.
        # options.add_argument("-inprivate")

        # Reuse the local Edge profile so existing cookies/logins carry over.
        options.add_argument(
            "user-data-dir=C:\\Users\\wangy\\AppData\\Local\\Microsoft\\Edge\\User Data"
        )

        options.add_argument(
            "profile-directory=C:\\Users\\wangy\\AppData\\Local\\Microsoft\\Edge\\User Data\\Default"
        )

        # options.binary_location = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe"

        self.driver = Edge(options=options)

        # Open the post list and give the operator time to log in manually.
        self.driver.get('https://i.cnblogs.com/posts?pageSize=100')
        time.sleep(30)

        spider.logger.info('Spider opened: %s' % spider.name)

    def spider_closed(self, spider):
        """Shut the browser down when the spider finishes."""
        self.driver.close()

        spider.logger.info('Spider closed: %s' % spider.name)

    def start_requests(self):
        """Kick off the crawl from the post-list API."""
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                # cookies=self.cn_blogs_cookie,
                callback=self.parse_list_api)

    def parse_list_api(self, response):
        """Read the total post count and request every list page."""
        post_list_json_obj = json.loads(response.xpath('//pre/text()').get())
        # Number of list pages needed to cover all posts.
        page_count = math.ceil(post_list_json_obj['postsCount'] /
                               self.page_size)
        for i in range(page_count):
            try:
                post_list_page_url = 'https://i.cnblogs.com/posts?pageSize=' \
                                     + str(self.page_size) + '&page=' + str(i + 1)
                yield scrapy.Request(
                    post_list_page_url,
                    # cookies=self.cn_blogs_cookie,
                    callback=self.parse_list_page)
            except (IndexError, TypeError):
                continue

    def parse_list_page(self, response):
        """Follow each post link on a list page to its view page."""
        post_list_items = response.xpath('//a[contains(@class,"entry")]')
        for post_list_item in post_list_items:
            try:
                view_page_url = post_list_item.xpath('@href').extract_first()
                if view_page_url:
                    # startswith instead of index(): index() raises an
                    # uncaught ValueError when '//' is absent from the URL.
                    if view_page_url.startswith('//'):
                        view_page_url = 'http:' + view_page_url

                    yield scrapy.Request(
                        view_page_url,
                        # cookies=self.cn_blogs_cookie,
                        callback=self.parse_view_page)
            except (IndexError, TypeError):
                continue

    def parse_view_page(self, response):
        """From a post's view page, follow its raw-Markdown ("MD") link."""
        md_page_url_node = response.xpath('//a[text()=\'MD\']')
        if md_page_url_node:
            md_page_url = md_page_url_node.xpath('@href').extract_first()
            post_title = response.xpath(
                '//a[@id="cb_post_title_url"]/span/text()').get()
            if md_page_url:
                yield scrapy.Request(
                    md_page_url,
                    callback=self.parse_md_page,
                    # cookies=self.cn_blogs_cookie,
                    # carry the title along to the MD page callback
                    meta={'item': post_title})

    def parse_md_page(self, response):
        """Parse the raw-Markdown page and yield the final post item."""
        post_title = response.meta['item']
        post_content = response.xpath('//pre').xpath('text()').extract_first()
        # Guard against a missing <pre> body (extract_first() -> None), and
        # strip the leading CRLF the MD endpoint prepends to the content.
        if post_content and post_content.startswith('\r\n'):
            post_content = post_content[2:]
        item = CnblogsPostItem()
        item['title'] = post_title
        item['content'] = post_content
        yield item