Ejemplo n.º 1
0
    def __init_browser(self):
        """Configure and start the headless Firefox web driver.

        Builds the Firefox options and profile, shuts down the browser
        created by an earlier call (if any), starts a new
        ``webdriver.Firefox`` sized to 1920x1080 and creates the
        ``WebDriverWait`` helper bound to it.

        The attributes read from ``self.__args`` are ``geckodriver``,
        ``firefox``, ``console``, ``log`` and ``timeout``.
        """
        self.__options = webdriver.FirefoxOptions()
        self.__options.headless = True
        self.__options.accept_insecure_certs = True
        self.__geckodriver_binary = self.__args.geckodriver
        self.__firefox_binary = FirefoxBinary(self.__args.firefox)
        # Set firefox profile
        self.__profile = webdriver.FirefoxProfile()
        firefox_profile(self.__profile)
        if self.__browser is not None:
            # quit() ends the whole session and its driver process; close()
            # would only close the current window and leave them behind.
            self.__browser.quit()

        # In console mode the driver log is discarded, otherwise it goes to
        # the path given in the arguments. Everything else is identical.
        if self.__args.console:
            log_path = os.path.devnull
        else:
            log_path = self.__args.log

        self.__browser = webdriver.Firefox(
            options=self.__options,
            firefox_binary=self.__firefox_binary,
            firefox_profile=self.__profile,
            executable_path=self.__geckodriver_binary,
            log_path=log_path)
        self.__browser.set_window_size(1920, 1080)
        self.__wait = WebDriverWait(self.__browser, self.__args.timeout)
Ejemplo n.º 2
0
    def build(cfg, fetch_driver=True):
        """
        Create a selenium-wire webdriver from a configuration object.

        :param cfg: Configuration object; the attributes read here are
            ``proxy``, ``driver``, ``profile``, ``user_agent``,
            ``user_data_dir``, ``executable_path``, ``debug``, ``headless``
            and ``binary``
        :param fetch_driver: bool (default=True); when true,
            ``Loader.fetch`` is called before the browser is started
        :return: a ``webdriver.Firefox`` or ``webdriver.Chrome`` instance
        :raises NotImplementedError: when ``cfg.driver`` is listed in neither
            ``WebDriver.FIREFOX_DRIVER_NAMES`` nor
            ``WebDriver.CHROME_DRIVER_NAMES``
        """
        # selenium-wire options come from the proxy object when one is set.
        wire_options = {} if cfg.proxy is None else cfg.proxy.create_options()

        if cfg.driver in WebDriver.FIREFOX_DRIVER_NAMES:
            is_firefox = True
            driver_cls = webdriver.Firefox
            browser_options = webdriver.FirefoxOptions()
            if cfg.profile is None:
                profile = webdriver.FirefoxProfile()
            else:
                profile = webdriver.FirefoxProfile(cfg.profile)
            for pref_name, pref_value in (
                    ("general.useragent.override", cfg.user_agent),
                    ("media.volume_scale", "0.0")):
                profile.set_preference(pref_name, pref_value)
            # Proxy settings are not written into the profile; they are
            # passed through the selenium-wire options instead.
        elif cfg.driver in WebDriver.CHROME_DRIVER_NAMES:
            is_firefox = False
            driver_cls = webdriver.Chrome
            browser_options = webdriver.ChromeOptions()
            browser_options.add_argument(
                "user-agent={0}".format(cfg.user_agent))
            if cfg.user_data_dir:
                browser_options.add_argument(
                    "user-data-dir={0}".format(cfg.user_data_dir))
            # No --proxy-server flag is added; the proxy is passed through
            # the selenium-wire options instead.
            profile = None
        else:
            raise NotImplementedError

        if fetch_driver:
            Loader.fetch(cfg.executable_path, cfg.debug, cfg.driver)

        browser_options.binary_location = cfg.executable_path
        browser_options.headless = cfg.headless

        # seleniumwire_options is only passed when a proxy is configured.
        launch_kwargs = {"options": browser_options}
        if cfg.proxy is not None:
            launch_kwargs["seleniumwire_options"] = wire_options

        if is_firefox:
            return driver_cls(profile, cfg.binary, **launch_kwargs)
        return driver_cls(**launch_kwargs)
Ejemplo n.º 3
0
def get_driver():
    """Return a Firefox webdriver with two preferences applied.

    The user agent is overridden with the ``user_agent`` name taken from the
    enclosing scope, and ``dom.webdriver.enabled`` is set to ``False``.
    Headless mode is not enabled.
    """
    firefox_options = webdriver.FirefoxOptions()
    preferences = (
        ("general.useragent.override", user_agent),
        ("dom.webdriver.enabled", False),
    )
    for pref_name, pref_value in preferences:
        firefox_options.set_preference(pref_name, pref_value)

    browser = webdriver.Firefox(executable_path='path_to_geckodriver',
                                options=firefox_options)
    return browser
Ejemplo n.º 4
0
def launch_browser(headers=None,
                   user_agent=None,
                   proxy=None,
                   browser_type="Firefox"):
    """Start a selenium-wire browser and return the driver.

    The driver executable is looked up next to this file, or in
    ``sys._MEIPASS`` when running from a frozen bundle.

    :param headers: optional header overrides; when truthy they are handed
        to ``browser._client.set_header_overrides``
    :param user_agent: user agent for Firefox; when falsy one is produced by
        ``generate_user_agent()``. Not used on the Chrome path.
    :param proxy: optional upstream proxy, used for both ``http`` and
        ``https`` in the selenium-wire options
    :param browser_type: ``"Firefox"`` starts Firefox, any other value
        starts Chrome
    :return: the started driver, resized to 1920x1080
    :raises SystemExit: when no driver could be started (after prompting)
    """
    wire_options = {}
    if proxy:
        wire_options["proxy"] = {
            "http": proxy,
            "https": proxy,
        }

    if getattr(sys, "frozen", False) and hasattr(sys, "_MEIPASS"):
        directory = sys._MEIPASS
    else:
        directory = os.path.dirname(__file__)

    driver = None
    if browser_type == "Firefox":
        candidates = [
            os.path.join(directory, name)
            for name in ("geckodriver.exe", "geckodriver")
        ]
        driver_path = next(
            (path for path in candidates if os.path.exists(path)), None)
        if driver_path is not None:
            opts = webdriver.FirefoxOptions()
            profile = webdriver.FirefoxProfile()
            if not user_agent:
                user_agent = generate_user_agent()
            profile.set_preference("general.useragent.override", user_agent)
            driver = webdriver.Firefox(
                firefox_profile=profile,
                executable_path=driver_path,
                options=opts,
                seleniumwire_options=wire_options,
            )
        else:
            message = f"Download geckodriver from https://github.com/mozilla/geckodriver/releases/tag/v0.27.0 and paste it in {directory}"
            input(message)
    else:
        driver_path = os.path.join(directory, "chromedriver.exe")
        opts = webdriver.ChromeOptions()
        # No --proxy-server flag here: the previous code formatted the
        # options object itself into that flag, which is never a valid
        # address. The upstream proxy is already supplied through
        # seleniumwire_options.
        driver = webdriver.Chrome(executable_path=driver_path,
                                  options=opts,
                                  seleniumwire_options=wire_options)
    if driver is None:
        input("DRIVER NOT FOUND")
        # sys.exit rather than the interactive ``exit`` helper, which is
        # installed by ``site`` and may be missing in frozen builds.
        sys.exit(0)
    driver.set_window_size(1920, 1080)
    if headers:
        driver._client.set_header_overrides(headers=headers)
    return driver
Ejemplo n.º 5
0
def get_firefox_options(heroku=False):
    """Build headless Firefox options with a fixed window size and user agent.

    :param heroku: accepted for backward compatibility; it has no effect
        (the former branch for it contained no executable code)
    :return: a configured ``webdriver.FirefoxOptions`` instance
    """
    options = webdriver.FirefoxOptions()
    options.add_argument("--headless")
    # Firefox takes the window size as separate width/height arguments;
    # "window-size=500x1024" is not an option it understands.
    options.add_argument("--width=500")
    options.add_argument("--height=1024")
    # The user agent is a preference, not a command-line argument.
    options.set_preference(
        "general.useragent.override",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0"
    )
    return options
def get_driver(config_params):
    """
    Create a new hidden (headless) Firefox driver that sends HTTPS traffic
    through a proxy.

    :param config_params: mapping providing ``'proxy_host'`` and
        ``'proxy_port'``
    :return: driver with a 7 second implicit wait
    :raises KeyError: when one of the two proxy keys is missing
    :raises ValueError: when the port cannot be converted to an integer
    """
    options = webdriver.FirefoxOptions()
    options.headless = True  # make it hidden

    proxy_host = config_params['proxy_host']
    proxy_port = config_params['proxy_port']

    profile = webdriver.FirefoxProfile()
    # Firefox only honours the host/port preferences when manual proxy
    # configuration (type 1) is selected.
    profile.set_preference('network.proxy.type', 1)
    # The HTTPS proxy preferences are named "ssl"; "network.proxy.https" is
    # not a Firefox preference. The port preference is an integer.
    profile.set_preference('network.proxy.ssl', proxy_host)
    profile.set_preference('network.proxy.ssl_port', int(proxy_port))

    # "options" replaces the deprecated "firefox_options" keyword.
    created_driver = webdriver.Firefox(options=options,
                                       firefox_profile=profile)
    created_driver.implicitly_wait(7)

    return created_driver
Ejemplo n.º 7
0
    def __init__(self,
                 proxy=None,
                 headless=True,
                 wait_increment=WAIT_INCREMENT,
                 id=None):
        """Store the settings and start a Firefox driver via selenium-wire.

        :param proxy: optional mapping with ``username``, ``password``,
            ``host`` and ``port``; when truthy an authenticated proxy is
            passed in the selenium-wire options
        :param headless: when truthy, ``--headless`` is added to the
            browser arguments
        :param wait_increment: stored as ``self.wait_increment``
        :param id: stored as ``self.id``
        """
        self.wait_increment = wait_increment  # sleep increment
        self.id = id  # identifier of the associated streamer

        # Browser command-line arguments, added in this order.
        browser_flags = [
            '--no-sandbox',
            '--window-size=1420,1080',
            '--disable-gpu',
            '--disable-notifications',
            '--dns-prefetch-disable',
            '--disable-dev-shm-usage',
        ]
        if headless:
            browser_flags.append('--headless')

        options = webdriver.FirefoxOptions()
        for flag in browser_flags:
            options.add_argument(flag)

        # Without a proxy the selenium-wire options stay None.
        prox_options = None
        if proxy:
            credentials = f"{proxy['username']}:{proxy['password']}"
            endpoint = f"{proxy['host']}:{proxy['port']}"
            authority = f"{credentials}@{endpoint}"
            prox_options = {
                'proxy': {
                    'http': f"http://{authority}",
                    'https': f"https://{authority}",
                    'no_proxy': 'localhost,127.0.0.1,dev_server:8080'
                }
            }

        self.driver = webdriver.Firefox(options=options,
                                        seleniumwire_options=prox_options)
Ejemplo n.º 8
0
    def init_driver(self, driver_path):
        """Start Firefox, install the captcha-solver add-on and set its key.

        The started driver is stored in ``self.driver``. The window is moved
        to (0, 0) and resized to 100x300, the add-on is installed from
        ``extensions/anticaptcha-plugin_v0.52.xpi``, a blank page is opened
        and the API key from ``config.CAPTCHA_KEY`` is sent to the add-on.

        :param driver_path: passed to ``webdriver.Firefox`` as
            ``executable_path``
        """
        # Proxy support is switched off: the lookup is disabled, so the
        # selenium-wire options always stay empty.
        # upstream_proxy = get_proxy()
        upstream_proxy = None
        wire_options = {}
        if upstream_proxy:
            wire_options['proxy'] = {
                'http': upstream_proxy,
                'https': upstream_proxy,
                'no_proxy': ''
            }

        firefox_options = webdriver.FirefoxOptions()
        # True - the window is hidden, False - the window is shown
        firefox_options.headless = config.headless

        self.driver = webdriver.Firefox(
            options=firefox_options,
            seleniumwire_options=wire_options,
            executable_path=driver_path)
        browser = self.driver

        # Put the window in the top-left corner with a fixed size.
        browser.set_window_position(0, 0)
        browser.set_window_size(100, 300)

        # Install the plugin that solves the captcha.
        addon_path = os.path.abspath('extensions/anticaptcha-plugin_v0.52.xpi')
        browser.install_addon(addon_path)

        # Insert the API key into the captcha solver.
        browser.get('https://antcpt.com/blank.html')
        solver_settings = {'options': {'antiCaptchaApiKey': config.CAPTCHA_KEY}}
        acp_api_send_request(browser, 'setOptions', solver_settings)
def login():
    """Log in to Instacart over HTTP and copy the cookies into the browser.

    Starts a headless Firefox (stored in the module-level ``driver``), opens
    the site, copies the browser cookies into a ``requests`` session, reads
    the CSRF token from the home page, posts the credentials and finally
    adds the cookies of the login response to the browser.

    :raises AttributeError: when the home page has no ``csrf-token`` element
    """
    global driver

    options = webdriver.FirefoxOptions()
    options.add_argument('-headless')
    driver = webdriver.Firefox(options=options)
    driver.get('https://www.instacart.com')

    # Carry the browser's cookies over to the HTTP session.
    session = requests.Session()
    for cookie in driver.get_cookies():
        session.cookies.set(cookie['name'], cookie['value'])

    home = session.get('https://www.instacart.com/',
                       headers={'user-agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(home.text, 'lxml')
    token_tag = soup.select_one("[name='csrf-token']")
    if token_tag is None:
        # Same exception type as dereferencing None would have produced,
        # but with a message that names the actual problem.
        raise AttributeError(
            "no element with name='csrf-token' found on the home page")

    data = {
        "user": {
            "email": "*****@*****.**",
            "password": "******"
        },
        "authenticity_token": token_tag.get('content')
    }
    headers = {
        'user-agent': 'Mozilla/5.0',
        'x-requested-with': 'XMLHttpRequest'
    }
    resp = session.post("https://www.instacart.com/accounts/login",
                        json=data,
                        headers=headers)

    # Hand the cookies set by the login response back to the browser.
    for name, value in resp.cookies.get_dict().items():
        driver.add_cookie({'name': name, 'value': value})
Ejemplo n.º 10
0
    # Body fragment of a crawler entry point (the enclosing ``def`` is not
    # part of this excerpt). It resolves the data paths, starts a Firefox
    # driver and hands it to WebExplorer591.
    data_dir = os.path.join(BASE_DIR, 'data', 'taipei_shop_rent_price')
    data_info_path = os.path.join(data_dir, 'data_info.csv')
    data_info = DataInfo(data_info_path)

    # Input and output files both live in the download directory reported
    # by DataInfo.
    download_dirpath = data_info.get_download_dirpath()
    main_xhr_response_filepath = os.path.join(download_dirpath, '591_xhr_responses.json')
    output_filename = '591_lat_long_lookup.json'
    output_filepath = os.path.join(download_dirpath, output_filename)

    list_post_id = get_listing_list_id(main_xhr_response_filepath)

    # set webdriver, request interceptor scope, and wait object
    print("note: this scrapping will take hours (there are some brakes "
        "to respect the website). The program heavily depend on your internet connection")
    print("INFO: setup crawler, use Firefox driver")
    webdriver_options = webdriver.FirefoxOptions()
    # ``option`` is defined outside this excerpt; any value other than
    # 'hide' or 'show' leaves the headless setting at its default.
    if option == 'hide':
        webdriver_options.headless = True
    elif option == 'show':
        webdriver_options.headless = False

    # GeckoDriverManager().install() supplies the driver executable path
    # (presumably downloading geckodriver when needed - TODO confirm).
    driver = webdriver.Firefox(executable_path=GeckoDriverManager().install(),
                               options=webdriver_options)
    driver.set_page_load_timeout(60)
    # NOTE(review): this pattern is a non-raw string containing "\." and
    # "\/" escapes, and "maps?" makes the "s" optional rather than matching
    # a literal "?" - confirm the intended pattern.
    url_regex = '.*maps\.google\.com\.tw\/maps?.*'
    driver.scopes = [url_regex]
    start_url = 'https://www.591.com.tw/'

    web_explorer = WebExplorer591(driver, start_url)
    check_page(driver)
Ejemplo n.º 11
0
def bot(id):
    """Worker loop: open a random URL through a fresh proxy and watch it.

    Each iteration takes a proxy from the shared ``proxies`` list (refilled
    with ``get_proxies()`` when empty), starts a Chrome or Firefox driver
    through selenium-wire, opens a random entry of ``urls`` and, when the
    page title ends with ``YouTube``, plays the video for ``args.duration``
    seconds or for a random share of its length. The driver is always quit
    at the end of the iteration and its process ids are removed from
    ``drivers``.

    :param id: worker number, used only in log messages
    """
    global args, locks, urls, user_agents, referers, proxies, drivers, watched_videos
    while True:
        # Per-iteration state. Resetting it here keeps the cleanup below
        # from touching the driver or pids of a previous iteration.
        driver = None
        pids = []
        slow_start_held = False
        try:
            url = choice(urls)
            with locks[0]:
                if len(proxies) == 0:
                    proxies.extend(get_proxies())
                proxy = choice(proxies)
                proxies.remove(proxy)
            log('[INFO][%d] Connecting to %s' % (id, proxy))
            user_agent = choice(
                user_agents) if args.user_agent else user_agents(
                    os=('win', 'android'))
            log('[INFO][%d] Setting user agent to %s' % (id, user_agent))
            if args.slow_start:
                # Held until the driver has started.
                locks[1].acquire()
                slow_start_held = True
            if system() == 'Windows':
                executable_dir = path_join(environ['APPDATA'], 'DeBos',
                                           'drivers')
            else:
                executable_dir = path_join(environ['HOME'], '.DeBos',
                                           'drivers')
            seleniumwire_options = {
                'proxy': {
                    'http': 'http://%s' % proxy,
                    'https': 'https://%s' % proxy,
                    'no_proxy': 'localhost,127.0.0.1'
                }
            }
            if args.driver == 'chrome':
                chrome_options = webdriver.ChromeOptions()
                chrome_options.add_argument(
                    '--user-agent={}'.format(user_agent))
                chrome_options.add_argument('--mute-audio')
                chrome_options.add_experimental_option('excludeSwitches',
                                                       ['enable-logging'])
                if args.headless:
                    chrome_options.add_argument('--headless')
                if is_root():
                    chrome_options.add_argument('--no-sandbox')
                if system() == 'Windows':
                    executable_path = path_join(executable_dir,
                                                'chromedriver.exe')
                else:
                    executable_path = path_join(executable_dir, 'chromedriver')
                driver = webdriver.Chrome(
                    options=chrome_options,
                    seleniumwire_options=seleniumwire_options,
                    executable_path=executable_path)
            else:
                firefox_options = webdriver.FirefoxOptions()
                firefox_options.preferences.update({
                    'media.volume_scale':
                    '0.0',
                    'general.useragent.override':
                    user_agent
                })
                if args.headless:
                    firefox_options.add_argument('--headless')
                if system() == 'Windows':
                    executable_path = path_join(executable_dir,
                                                'geckodriver.exe')
                else:
                    executable_path = path_join(executable_dir, 'geckodriver')
                driver = webdriver.Firefox(
                    options=firefox_options,
                    seleniumwire_options=seleniumwire_options,
                    service_log_path=devnull,
                    executable_path=executable_path)
            driver.header_overrides = {'Referer': choice(referers)}
            # Record the driver process and its children in ``drivers``.
            process = driver.service.process
            pid = process.pid
            cpids = [x.pid for x in Process(pid).children()]
            pids = [pid] + cpids
            drivers.extend(pids)
            if slow_start_held:
                locks[1].release()
                slow_start_held = False
            log('[INFO][%d] Successully started webdriver!' % id)
            driver.set_page_load_timeout(45)
            log('[INFO][%d] Opening %s' % (id, url))
            driver.get(url)
            if driver.title.endswith('YouTube'):
                log('[INFO][%d] Video successfully loaded!' % id)
                try:
                    WebDriverWait(driver, 3).until(
                        EC.element_to_be_clickable(
                            (By.CLASS_NAME, 'ytp-large-play-button'))).click()
                except Exception:
                    # Best effort: the play button may not be present.
                    pass
                if args.duration:
                    sleep(args.duration)
                else:
                    video = WebDriverWait(driver, 3).until(
                        EC.presence_of_element_located(
                            (By.CLASS_NAME, 'html5-main-video')))
                    video_duration = driver.execute_script(
                        'return arguments[0].getDuration()', video)
                    sleep(float(video_duration) * uniform(0.35, 0.85))
                log('[INFO][%d] Video successfully viewed!' % id)
                if not args.verbose:
                    watched_videos += 1
            else:
                log('[INFO][%d] Dead proxy eliminated!' % id)
        except WebDriverException as e:
            log('[WARNING][%d] %s' % (id, e.__class__.__name__))
        except NoSuchProcess:
            log('[WARNING][%d] NoSuchProcess' % id)
        except KeyboardInterrupt:
            exit(0)
        except Exception as e:
            # Unexpected failure: report what it was, then stop this worker.
            log('[WARNING][%d] %s' % (id, e.__class__.__name__))
            exit(1)
        finally:
            if slow_start_held:
                # Driver start-up failed while the slow-start lock was held;
                # release it so the other workers are not blocked forever.
                locks[1].release()
            log('[INFO][%d] Quitting webdriver!' % id)
            if driver is not None:
                driver.quit()
            with locks[2]:
                for pid in pids:
                    try:
                        drivers.remove(pid)
                    except ValueError:
                        # Already removed from the shared list.
                        pass
Ejemplo n.º 12
0
def bot(id):
    """Worker loop: open a random URL through a fresh proxy and skip the ad.

    Each iteration takes a proxy from the shared ``proxies`` list (refilled
    with ``get_proxies()`` when empty), starts a Chrome or Firefox driver
    through selenium-wire, opens a random entry of ``urls`` and, when the
    page title is ``Shrink your URLs and get paid!``, waits for the skip
    button and clicks it. The driver is always quit at the end of the
    iteration and its process ids are removed from ``drivers``.

    :param id: worker number, used only in log messages
    """
    global args, locks, urls, user_agents, referers, proxies, drivers, watched_ads
    while True:
        # Per-iteration state. Resetting it here keeps the cleanup below
        # from touching the driver or pids of a previous iteration.
        driver = None
        pids = []
        slow_start_held = False
        try:
            url = choice(urls)
            with locks[0]:
                if len(proxies) == 0:
                    proxies.extend(get_proxies())
                proxy = choice(proxies)
                proxies.remove(proxy)
            log('[INFO][%d] Connecting to %s' % (id, proxy))
            user_agent = choice(
                user_agents) if args.user_agent else user_agents()
            log('[INFO][%d] Setting user agent to %s' % (id, user_agent))
            if args.slow_start:
                # Held until the driver has started.
                locks[1].acquire()
                slow_start_held = True
            if system() == 'Windows':
                executable_dir = path_join(environ['APPDATA'], 'DeBos',
                                           'drivers')
            else:
                executable_dir = path_join(environ['HOME'], '.DeBos',
                                           'drivers')
            seleniumwire_options = {
                'proxy': {
                    'http': 'http://%s' % proxy,
                    'https': 'https://%s' % proxy,
                    'no_proxy': 'localhost,127.0.0.1'
                }
            }
            if args.driver == 'chrome':
                chrome_options = webdriver.ChromeOptions()
                chrome_options.add_argument(
                    '--user-agent={}'.format(user_agent))
                chrome_options.add_argument('--mute-audio')
                chrome_options.add_argument("--disable-extensions")
                chrome_options.add_argument("--disable-gpu")
                chrome_options.add_argument("--disable-dev-shm-usage")
                chrome_options.add_argument("--no-sandbox")
                chrome_options.add_experimental_option('excludeSwitches',
                                                       ['enable-logging'])
                if args.headless:
                    chrome_options.add_argument('--headless')
                if is_root():
                    chrome_options.add_argument('--no-sandbox')
                if system() == 'Windows':
                    executable_path = path_join(executable_dir,
                                                'chromedriver.exe')
                else:
                    executable_path = path_join(executable_dir, 'chromedriver')
                driver = webdriver.Chrome(
                    options=chrome_options,
                    seleniumwire_options=seleniumwire_options,
                    executable_path=executable_path)
            else:
                firefox_options = webdriver.FirefoxOptions()
                firefox_options.preferences.update({
                    'media.volume_scale':
                    '0.0',
                    'general.useragent.override':
                    user_agent
                })
                if args.headless:
                    firefox_options.add_argument('--headless')
                if system() == 'Windows':
                    executable_path = path_join(executable_dir,
                                                'geckodriver.exe')
                else:
                    executable_path = path_join(executable_dir, 'geckodriver')
                driver = webdriver.Firefox(
                    options=firefox_options,
                    seleniumwire_options=seleniumwire_options,
                    service_log_path=devnull,
                    executable_path=executable_path)
            driver.header_overrides = {'Referer': choice(referers)}
            # Record the driver process and its children in ``drivers``.
            process = driver.service.process
            pid = process.pid
            cpids = [x.pid for x in Process(pid).children()]
            pids = [pid] + cpids
            drivers.extend(pids)
            if slow_start_held:
                locks[1].release()
                slow_start_held = False
            log('[INFO][%d] Successully started webdriver!' % id)
            driver.set_page_load_timeout(60)
            log('[INFO][%d] Opening %s' % (id, url))
            driver.get(url)
            if driver.title == 'Shrink your URLs and get paid!':
                log('[INFO][%d] Website successfully loaded!' % id)
                WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable(
                        (By.ID, 'skip_bu2tton'))).click()
                log('[INFO][%d] Ad successfully viewed!' % id)
                if not args.verbose:
                    watched_ads += 1
            else:
                log('[WARNING][%d] Dead proxy eliminated!' % id)
        except WebDriverException as e:
            log('[WARNING][%d] %s' % (id, e.__class__.__name__))
        except KeyboardInterrupt:
            exit(0)
        except Exception as e:
            # Unexpected failure: report what it was, then stop this worker.
            log('[WARNING][%d] %s' % (id, e.__class__.__name__))
            exit(1)
        finally:
            if slow_start_held:
                # Driver start-up failed while the slow-start lock was held;
                # release it so the other workers are not blocked forever.
                locks[1].release()
            log('[INFO][%d] Quitting webdriver!' % id)
            if driver is not None:
                driver.quit()
            with locks[2]:
                for pid in pids:
                    try:
                        drivers.remove(pid)
                    except ValueError:
                        # Already removed from the shared list.
                        pass
Ejemplo n.º 13
0
def _csdn_export_pdf(driver, url):
    """Load *url* in *driver*, expand the article and render it to a PDF.

    The page is opened with overridden request headers, the "read more"
    button (``btn-readmore``) is clicked when it can be found, the
    ``blog-content-box`` element is extracted when present, and the
    resulting HTML is passed to ``pdfkit.from_string``. Every step reports
    its failure on stdout instead of raising.
    """
    driver.implicitly_wait(15)
    driver.header_overrides = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language':
        'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Cache-Control': 'max-age=0',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0',
        'Connection': 'keep-alive',
        'Referer': 'https://blog.csdn.net/haibo0668/article/details/80025077'
    }
    driver.get(url)
    print("a1")

    # Collect the browser cookies as (name, value) pairs for pdfkit.
    cookie_ori = driver.get_cookies()
    print(cookie_ori)
    print(type(cookie_ori))
    print(len(cookie_ori))
    cookie_pairs = []
    for cookie in cookie_ori:
        pair = (cookie['name'], cookie['value'])
        print(pair)
        cookie_pairs.append(pair)
    print(cookie_pairs)

    pdf_options = {
        'encoding':
        'UTF-8',
        'custom-header':
        [('Accept', '*/*'), ('Accept-Language', 'zh-CN,zh;q=0.9'),
         ('Cache-Control', 'max-age=0'),
         ('User-Agent',
          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
          ), ('Connection', 'keep-alive'),
         ('Referer',
          'https://blog.csdn.net/haibo0668/article/details/80025077'),
         ('Accept-Encoding', 'gzip, deflate, br'),
         ('Host', 'img-blog.csdn.net')],
        'cookie':
        cookie_pairs
    }
    locator = (By.ID, "btn-readmore")
    randomInt = random.randint(0, 10)
    fileName = 'pdffile' + str(randomInt) + '.pdf'
    print('filename + ' + str(fileName))

    # Wait up to 15 s (polling every 0.5 s) for the "read more" button.
    try:
        WebDriverWait(driver, 15,
                      0.5).until(EC.presence_of_element_located(locator))
        print('阅读更多按钮找到')
    except Exception as e:
        print(e)
        print("等待错了")

    try:
        time.sleep(2)
        print("开始爬取网页")
        logging.warning("开始爬取网页")
        time.sleep(3)
        html = driver.page_source
        print(type(html))
        print(html)
        time.sleep(2)
        print("找阅读更多按钮")
        # Expand the article; on failure the unexpanded page is used.
        try:
            target = driver.find_element(*locator)
            driver.execute_script("arguments[0].scrollIntoView();", target)
            target.click()
            print("找阅读更多按钮OK")
            html = driver.page_source
            print(type(html))
        except Exception as e:
            print(e)
        # Narrow the HTML down to the article body when it can be found.
        try:
            time.sleep(1)
            print("找main Body")
            content_div = etree.HTML(html).xpath(
                '//div[@class="blog-content-box"]')[0]
            html = etree.tostring(content_div).decode()
            print("找main Body OK")
            time.sleep(1)
        except Exception as e:
            print(e)
        print('test' + str(fileName))
        pdfkit.from_string(html, fileName, options=pdf_options)
        print(fileName)

    except Exception as e:
        print(e)
        print("大概率按钮没找到")
        # Second attempt with whatever HTML was captured before the failure.
        try:
            print('test' + str(fileName))
            print("pdf2")
            pdfkit.from_string(html, fileName, options=pdf_options)
            print(fileName)
            print("pdf")
        except Exception as e:
            print(e)
    print("OK4")


def csdn(url):
    """Save the CSDN article at *url* as ``pdffile<N>.pdf`` using Firefox.

    :param url: address of the article page to load
    """
    print('CSDN')
    option = webdriver.FirefoxOptions()

    time.sleep(3)
    # "options" replaces the deprecated "firefox_options" keyword.
    driver = webdriver.Firefox(options=option)
    try:
        _csdn_export_pdf(driver, url)
    finally:
        # Always shut the browser down, also when loading the page raises.
        driver.quit()
Ejemplo n.º 14
0
def generate_webdriver(headless=False, log_file=None):
    """Create a Firefox webdriver, optionally in headless mode.

    :param headless: when truthy, ``-headless`` is added to the Firefox
        arguments and the fact is reported through ``log_info``
    :param log_file: forwarded to ``log_info`` as ``log_file``
    :return: the started ``webdriver.Firefox`` instance
    """
    options = webdriver.FirefoxOptions()
    if headless:
        log_info("Started in headless mode", log_file=log_file)
        options.add_argument("-headless")
    # "options" replaces the deprecated "firefox_options" keyword.
    return webdriver.Firefox(options=options)