Beispiel #1
0
def main():
    """Scrape startup names from every geolocated sub-site of BASE_URL.

    Aborts if DATA_FILE already exists; otherwise drives a headless
    Chrome through each geosite link found in the page header and saves
    the collected names, one per line, to DATA_FILE.
    """
    data_file_path = os.path.abspath(DATA_FILE)
    if os.path.exists(data_file_path):
        print("Data for {} already downloaded!".format(BASE_URL))
        sys.exit(1)

    print("Initialization...")
    chrome_options = selenium.webdriver.chrome.options.Options()
    chrome_options.add_argument("--headless")
    driver = selenium.webdriver.Chrome(options=chrome_options)
    driver.get(BASE_URL)

    # Every anchor inside the page header points at one geolocated site.
    header = driver.find_element_by_tag_name("header")
    anchors = header.find_elements_by_tag_name("a")
    geosite_urls = [anchor.get_attribute("href") for anchor in anchors]

    print("Scraping {:d} geolocated sites...".format(len(geosite_urls)))
    startups = []
    for url in geosite_urls:
        print("Fetching {}...".format(url))
        names = scrape_geosite(driver, url)
        print("+ {:d} startup names".format(len(names)))
        startups.extend(names)
        # Be polite to the server between page fetches.
        time.sleep(1)
    startups = [name.strip() for name in startups]

    print("Scraped {:d} startup names".format(len(startups)))
    data_content = "\n".join(startups)
    with open(data_file_path, "w", encoding="utf-8") as fp:
        fp.write(data_content)
    print("Result saved to {}".format(data_file_path))
Beispiel #2
0
def find_driver():
    """Build a Chrome driver with English UI and certificate errors ignored.

    Returns:
        A configured ``webdriver.Chrome`` instance.
    """
    # prefs = {"profile.managed_default_content_settings.images": 2}  # image-less mode
    options = webdriver.ChromeOptions()
    # options.add_experimental_option("prefs", prefs)
    options.add_argument("--lang=en")
    # options.add_argument("--headless")
    options.add_experimental_option("excludeSwitches", ["enable-logging"])
    options.add_argument('ignore-certificate-errors')

    # Raw string for the Windows path: the original plain literal relied on
    # "\P", "\G", "\C", "\A" not being recognized escapes, which raises
    # SyntaxWarning (and eventually an error) on modern Python.
    driver = webdriver.Chrome(
        options=options,
        executable_path=
        r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
    return driver
Beispiel #3
0
def get_browser(headless):
    """Return a Chrome WebDriver, optionally headless.

    Args:
        headless: when truthy, run Chrome without a visible window.
    """
    if headless:
        options = selenium.webdriver.chrome.options.Options()
        # Options.set_headless() was removed in Selenium 4; passing the
        # flag directly works on both Selenium 3 and 4.
        options.add_argument("--headless")
        # Disabling scroll bars is important, see
        # https://bugs.chromium.org/p/chromedriver/issues/detail?id=2487.
        options.add_argument("--hide-scrollbars")
        # The Chrome binary is at a nonstandard location on Heroku,
        # see [1].
        #
        # [1]: https://github.com/heroku/heroku-buildpack-google-chrome.
        binary = os.environ.get("GOOGLE_CHROME_SHIM")
        if binary:
            options.binary_location = binary
        # "chrome_options=" was deprecated and later removed in Selenium 4;
        # "options=" is the supported keyword.
        return selenium.webdriver.Chrome(options=options)
    else:
        return selenium.webdriver.Chrome()
Beispiel #4
0
def get_browser():
    """
    Return a Selenium browser object. Whether it is headless is
    controlled by the 'headless' config var.
    """
    if not util.get_env_boolean("headless"):
        return selenium.webdriver.Chrome()
    opts = selenium.webdriver.chrome.options.Options()
    opts.headless = True
    # Scroll bars must be hidden, see
    # <https://bugs.chromium.org/p/chromedriver/issues/detail?id=2487>.
    opts.add_argument("--hide-scrollbars")
    # On Heroku the Chrome binary lives at a nonstandard path, see
    # <https://github.com/heroku/heroku-buildpack-google-chrome>.
    shim = os.environ.get("GOOGLE_CHROME_SHIM")
    if shim:
        opts.binary_location = shim
    return selenium.webdriver.Chrome(options=opts)
Beispiel #5
0
    def __init__(self, headless=False, debug=False, logging_queue=None):
        """Set up the remote Chrome session, exam policy, DB and logging.

        Args:
            headless: run Chrome windowless at 1024x768 with GPU disabled.
            debug: when true, connect to the bare chromedriver endpoint
                instead of the Selenium grid hub.
            logging_queue: optional queue that receives log records; a
                fresh unbounded queue is created when omitted.
        """
        self.logger = logging.getLogger("RiskExamAutomaton")
        options = selenium.webdriver.chrome.options.Options()
        self.debug = debug
        if headless:
            for flag in ("--headless", "--window-size=1024,768",
                         "--disable-gpu"):
                options.add_argument(flag)
        self.is_headless = headless
        # Debug runs talk straight to chromedriver; normal runs go through
        # the Selenium grid hub on the same host.
        endpoint = ("http://10.3.1.181:9515" if self.debug
                    else "http://10.3.1.181:4444/wd/hub")
        self.driver = webdriver.Remote(
            endpoint, desired_capabilities=options.to_capabilities())
        self.policy = exampolicy.ExamPolicy()
        self.skip_list = []

        self.init_sqlite()

        self.logging_queue = logging_queue if logging_queue else queue.Queue(-1)
        self.queue_handler = logging.handlers.QueueHandler(self.logging_queue)
        self.logger.addHandler(self.queue_handler)
Beispiel #6
0
def find_driver():
    """Build a headless, image-less, English-locale Chrome driver."""
    options = webdriver.ChromeOptions()
    # Disable image loading to speed up page fetches.
    options.add_experimental_option(
        "prefs", {"profile.managed_default_content_settings.images": 2})
    for flag in ("--lang=en", "--headless"):
        options.add_argument(flag)
    options.add_experimental_option("excludeSwitches", ["enable-logging"])
    options.add_argument('ignore-certificate-errors')

    return webdriver.Chrome(options=options, executable_path=find_path()[0])
Beispiel #7
0
def chrome_browser():
    """Return a maximized headless Chrome with a 5-second implicit wait."""
    opts = Options()
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        opts.add_argument(flag)

    # ChromeDriverManager downloads a matching chromedriver and returns
    # its local path.
    browser = webdriver.Chrome(ChromeDriverManager().install(), options=opts)
    browser.implicitly_wait(5)
    browser.maximize_window()
    return browser
Beispiel #8
0
def main():
    """Crawl Google Images for a search sentence and save the results.

    Builds a Google Images URL from the command-line options, scrolls the
    page with a headless Chrome until no more results load, then either
    downloads every found image into the output directory or (with
    --do-html true) dumps the <img> tags to an HTML file instead.
    """
    parser = argparse.ArgumentParser(description='GoogleImageCrawler options',
                                     formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('-s', '--sentence', default='google', type=str, help='Sentence what you want to search. '
                                                                             'Default is "google".')
    parser.add_argument('-d', '--delay', default='1', type=int, help='Sets delay for scrolling. Default is "1" '
                                                                     'second.\nBe careful to "0" will put maximum '
                                                                     'burden on the server.')
    parser.add_argument('-o', '--output-directory', default='images', type=str, help='Sets output directory. Default is '
                                                                              '"images".')
    parser.add_argument('-dh', '--do-html', default='false', type=str, help='This option will print result of img tags '
                                                                            'like html.\nIf you set true then images '
                                                                            'will not output.\nDefault is "False".')
    parser.add_argument('-ss', '--scroll-speed', default=2000, type=int, help='For advanced users!\nScroll speed per '
                                                                              'delay.\nDefault is "2000"px.')
    parser.add_argument('-gl', '--geolocation', default='', type=str, help='For advanced users!\nSets geolocation '
                                                                           'code.\nDefault is blank because Google '
                                                                           'guess it from ip or get from your account '
                                                                           'settings.\nIt can affect search '
                                                                           'results.\nCode list is here\n '
                                                                           '"https://developers.google.com/custom'
                                                                           '-search/docs/xml_results_appendices'
                                                                           '#countryCodes".')
    parser.add_argument('-it', '--image-type', default='', type=str, help='For advanced users!\nSets image type.\n'
                                                                          'Default is blank.\nValid types are '
                                                                          '"clipart", "face", "lineart", "stock", '
                                                                          '"photo", "animated".')
    parser.add_argument('-sp', '--safe-parameter', default='off', type=str, help='For advanced users!\nSets safe '
                                                                                 'parameter.\nDefault is '
                                                                                 '"off".\nValid parameters are "off", '
                                                                                 '"medium", "high".')
    argv = parser.parse_args()
    delay = argv.delay
    # Note: do-html is handled as a string; only 'true'/'True' enable it.
    html = argv.do_html
    directory = argv.output_directory
    scroll_speed = argv.scroll_speed
    gl = argv.geolocation
    it = argv.image_type
    safe = argv.safe_parameter
    if not os.path.exists(directory):
        os.makedirs(directory)
    # Join the search words with '+' and percent-encode for the query string.
    query = argv.sentence.split()
    query = '+'.join(query)
    url = 'https://www.google.co.jp/search?tbm=isch&hl=ja&q=' + urllib.parse.quote_plus(query, encoding='utf-8')
    # Optional advanced parameters are appended only when supplied.
    if gl != "":
        url += '&gl=' + gl
    if it != "":
        url += '&imgType=' + it
    if safe != "":
        url += '&safe=' + safe
    print('Starting crawl at "' + url + '".')
    options = selenium.webdriver.chrome.options.Options()
    options.add_argument('--headless')
    options.add_argument('--start-maximized')
    options.add_argument('--no-sandbox')
    options.add_argument("--disable-setuid-sandbox")
    options.add_argument('--disable-extensions')
    driver = selenium.webdriver.Chrome(options=options)
    driver.get(url)
    # Rotating spinner characters for the progress display; the queue is
    # cycled by get-then-put on every iteration.
    progress_icon = queue.Queue()
    progress_icon.put('|')
    progress_icon.put('/')
    progress_icon.put('-')
    progress_icon.put('\\')
    # Scroll until Google signals the end of the result list.
    while True:
        pi = progress_icon.get()
        progress_icon.put(pi)
        print('\r' + pi + ' ' + str(
            len(driver.find_elements_by_xpath('//img[@class="rg_i Q4LuWd"]'))) + ' images found', end='')
        time.sleep(delay)
        # The Japanese button label means "Show more results".
        element = driver.find_element_by_xpath('//input[@value="結果をもっと表示"]')
        if element.is_displayed():
            element.find_element_by_xpath('//input[@value="結果をもっと表示"]').click()
        # This Japanese message ("nothing unread") marks the end of results.
        elif driver.find_element_by_xpath('//div[text()="未読はありません"]').is_displayed():
            break
        else:
            driver.execute_script('window.scrollTo(0,' + str(scroll_speed) + ');')
        # Each pass scrolls 2000px further down the page.
        scroll_speed += 2000
    soup = bs4.BeautifulSoup(driver.page_source, 'html.parser')
    driver.quit()
    images = soup.select('img[class="rg_i Q4LuWd"]')
    print('\rCompleted crawl\nResult:' + str(len(images)) + ' images found')
    if html != 'true' and html != "True":
        print('Starting download images')
    for index, img in enumerate(images):
        # Lazily-loaded thumbnails keep the real URL in data-src.
        if img.get('data-src') is not None:
            img['src'] = img['data-src']
        if html != 'true' and html != "True":
            src = str(img['src'])
            pi = progress_icon.get()
            progress_icon.put(pi)
            count = str(index + 1)
            print('\r' + pi + ' ' + count + '/' + str(len(images)), end='')
            if src.find('data:image/') != 0:
                # Regular URL: fetch over HTTP, then rename with the
                # extension derived from the file's detected MIME type.
                response = requests.get(src)
                image = response.content
                with open(directory + '/' + count, 'wb') as im:
                    im.write(image)
                mime = magic.Magic(mime=True)
                ext = mime.from_file(directory + '/' + count)
                os.rename(directory + '/' + count, directory + '/' + count + '.' + ext[ext.rfind('/') + 1:])
            else:
                # Inline data URI: extension comes from the media type,
                # payload is base64-decoded directly to disk.
                ext = '.' + src[src.find('/') + 1:src.find(';')]
                data = src[src.find(','):]
                with open(directory + '/' + count + ext, 'wb') as im:
                    im.write(base64.b64decode(data))
    imgs = '\n'.join(map(str, images))
    if html != 'true' and html != "True":
        print('\rComplete download images')
    if html == 'true' or html == 'True':
        print('Starting print html')
        with open(query + '.html', mode='w') as f:
            f.write(imgs)
        print('Complete print')
Beispiel #9
0
def browser(config_browser, config_wait_time, request):
    """Pytest fixture: yield a configured headless Chrome WebDriver.

    Args:
        config_browser: browser name from the test configuration; only
            'chrome' is supported.
        config_wait_time: implicit wait (seconds) applied to the driver.
        request: pytest fixture request object (unused here).

    Raises:
        Exception: for an unsupported browser name — raising explicitly is
            clearer than the generator yielding nothing, which pytest would
            report as a confusing fixture error.
    """
    # Initialize WebDriver
    if config_browser == 'chrome':
        options = Options()
        options.add_argument('log-level=3')
        options.add_argument("--window-size=1920,1080")
        options.add_argument("--disable-extensions")
        options.add_argument("--proxy-server='direct://'")
        options.add_argument("--proxy-bypass-list=*")
        options.add_argument("--start-maximized")
        options.add_experimental_option('excludeSwitches', ['enable-logging'])
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--no-sandbox')
        options.add_argument('--ignore-certificate-errors')
        options.add_argument(
            '--user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"'
        )
        driver = Chrome(Browser_path, options=options)
    else:
        raise Exception('"{}" is not a supported browser'.format(config_browser))

    # Wait implicitly for elements to be ready before attempting interactions
    driver.implicitly_wait(config_wait_time)
    driver.maximize_window()

    # Return the driver object at the end of setup
    yield driver

    # For cleanup, quit the driver
    driver.quit()
            'capital-histo-description'
        ).text  # on lui dit de continuer normalement
        capital_social.append(cap_soc)

    except NoSuchElementException:
        capital_social.append('NaN')

# Show the collected share-capital values before moving on.
print(capital_social)

browser.quit()

# Fetch the average price from the Pages Jaunes directory.

# WebDriver options: private browsing, block pop-ups and ads
# (but not cookie banners...).
options = webdriver.ChromeOptions()
options.add_argument("private")
options.add_argument("--start-maximized")
options.add_argument("--ignore-certificate-errors")
options.add_argument("--disable-popup-blocking")
options.add_argument("--incognito")
options.add_argument("--headless")

# NOTE(review): hard-coded local chromedriver path — only works on this
# machine; consider a PATH lookup instead.
browser = webdriver.Chrome(
    executable_path=
    "C:/Users/GUILLOT Robin/Documents/Robin Ensae/Matières/Python//chromedriver",
    options=options)

browser.get('https://www.pagesjaunes.fr/activites')

# Dismiss the Didomi cookie-consent banner by clicking "agree".
cookie = browser.find_element_by_id("didomi-notice-agree-button").click()
Beispiel #11
0

def follow_private_page():
    """Click the profile's follow button, but only if it still reads 'Follow'."""
    follow = browser.find_element_by_xpath(
        '//*[@id="react-root"]/section/main/div/header/section/div[1]/div[1]/div/div/button'
    )
    # Any other label (e.g. already following / requested) is left alone.
    if follow.text == 'Follow':
        follow.click()


#------------------------------------------------------------------------------

options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--ignore-ssl-errors')
# NOTE(review): these options are never passed to a driver in this block;
# open_browser() below presumably builds its own — confirm whether this
# options object is dead code.

# -*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*- main -*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*

# Log in to Instagram with the configured credentials, open a post on the
# target account and load its comments.
browser = open_browser("https://www.instagram.com/")

username_input = type_in_box_by_name("username", user)
password_input = type_in_box_by_name("password", passw)
ClickLoginBox_ClickNotNowNotification()

search_and_select_account('mister_programmer_')
open_post(1)

click_plus_icon_load_comments()
    list_of_dates = [
        datetime.strptime(all_list[x]['buisday'], '%Y-%m-%d')
        for x in range(0, len(all_list))
    ]
    #Find the last made transaction
    latest_transactions = max(list_of_dates)
    diff = datetime.now() - latest_transactions
    # Date between the last input in all transactions
    days_back = diff.days
else:
    print('All list was empty, taking 365 days')
    days_back = 365

# Set headless
options = webdriver.ChromeOptions()
options.add_argument('headless')
# The old add_argument('download.default_directory=...') call had no effect
# (the snippet's own comment noted downloads went to the default folder);
# Chrome only honors the download directory via the "prefs" experimental
# option.
options.add_experimental_option(
    'prefs',
    {'download.default_directory':
     '/Users/albinjonfelt/Documents/programmering/aktier/bin'})
browser = webdriver.Chrome(options=options)
print("Running chrome headless")

# open the browser
browser.get('https://www.nordnet.se/se')

# login: click the first white login button on the page.
browser.find_elements_by_class_name('sv-font-button-white')[0].click()
# Set a 7-second implicit wait for subsequent element lookups.
# NOTE(review): implicitly_wait is a driver-wide polling timeout, not a
# sleep — confirm that was the intent here.
browser.implicitly_wait(7)
open_login_button = browser.find_element_by_xpath(
    "/html/body/div[1]/section/section[2]/section/section/section/div[2]/div/button"