Example #1
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager

def scrapy_books(url):
    # Option 1) Install the Chrome driver for Selenium.
    # Option 1 does not work on some machines.
    # You will know it worked if a blank Chrome window opens.
    driver = webdriver.Chrome(ChromeDriverManager().install())
    # Option 2) Use the Firefox driver, geckodriver.
    # To install it on Windows:
    # 1) Download the Windows build (32 or 64 bit) from https://github.com/mozilla/geckodriver/releases
    # 2) Unzip the file;
    # 3) Add the folder containing the executable to the PATH environment variable.
    #    See how to add it: https://knowledge.autodesk.com/pt-br/support/navisworks-products/troubleshooting/caas/sfdcarticles/sfdcarticles/PTB/Adding-folder-path-to-Windows-PATH-environment-variable.html
    # driver = webdriver.Firefox()
    driver.get(url)
    to_continue = True
    whole_dataset = []
    while to_continue:
        current_items = get_page_data(driver)
        whole_dataset.extend(current_items)
        try:
            next_button = driver.find_element_by_xpath('//li[@class="next"]/a')
        except NoSuchElementException:
            # No "next" link on the last page: stop paginating.
            break
        next_button.click()
        wait_element(driver, '//img[@class="thumbnail"]', by=By.XPATH)
    driver.close()
    return whole_dataset
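
Every example in this listing calls a wait_element helper imported from scrapy.util, but its implementation is never shown. Below is a minimal sketch consistent with how it is called here; the exact signature and the timeout are assumptions.

# Hypothetical reconstruction of scrapy.util.wait_element; the real
# implementation is not included in this listing.
from time import sleep
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_element(driver, locator, by=By.XPATH, timeout=30, to_sleep=0):
    # Block until at least one element matching the locator is present.
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((by, locator)))
    # Some pages keep rendering after the element appears, so callers
    # can ask for an extra fixed delay.
    if to_sleep:
        sleep(to_sleep)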
def scrapy_students(user,
                    password,
                    url=slack_channel_url,
                    exclude_list=None,
                    no_window=True):
    import os.path

    if os.path.isfile(path_to_firefoxdriver):
        options = webdriver.FirefoxOptions()
        if no_window:
            options.add_argument('--headless')
        driver = webdriver.Firefox(options=options,
                                   executable_path=path_to_firefoxdriver)
    elif os.path.isfile(path_to_chromedriver):
        options = webdriver.ChromeOptions()
        if no_window:
            options.add_argument('--headless')
        driver = webdriver.Chrome(options=options,
                                  executable_path=path_to_chromedriver)
    else:
        print("error: no driver found")
        return None

    driver.get(url)
    result = {}

    email_input = driver.find_element_by_xpath('//input[@id="email"]')
    email_input.send_keys(user)
    password_input = driver.find_element_by_xpath('//input[@id="password"]')
    password_input.send_keys(password)
    button = driver.find_element_by_xpath('//button[@id="signin_btn"]')
    button.click()
    wait_element(driver,
                 '//a[@data-qa-channel-sidebar-channel-type="im"]',
                 by=By.XPATH,
                 to_sleep=5)
    tree_items = driver.find_elements_by_xpath(
        '//a[@data-qa-channel-sidebar-channel-type="im"]')
    students = []
    today = (datetime.datetime.today() +
             datetime.timedelta(days=1)).strftime('%Y-%m-%d')
    for i in tree_items:
        name = i.text.replace('(you)', '')
        # The second <i> icon inside the link carries the presence
        # status in its title attribute.
        if i.find_elements_by_tag_name('i')[1].get_attribute(
                'title') == "Active":
            students.append({'nome': name, today: 'Ativo'})
        else:
            students.append({'nome': name, today: 'Inativo'})

    students = [s for s in students if s['nome'] not in (exclude_list or [])]
    driver.close()
    return students
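
A hypothetical call, with placeholder credentials and an invented exclusion list:

# Hypothetical usage; the credentials and excluded names are placeholders.
presence = scrapy_students('me@example.com', 's3cret',
                           exclude_list=['Slackbot'])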
Example #4
import re

def get_page_data(driver):
    a_tags = driver.find_elements_by_xpath('//article/h3/a')
    dataset = []
    # A second browser visits each book's detail page so the listing
    # page held by `driver` keeps its state.
    current_page_driver = webdriver.Firefox()
    for a in a_tags:
        href = a.get_attribute('href')
        current_page_driver.get(href)
        wait_element(current_page_driver, '//tr/td', by=By.XPATH)
        product_main = current_page_driver.find_element_by_xpath(
            '//div[contains(@class,"product_main")]')
        product_main_text = product_main.text.split('\n')
        title = product_main_text[0]
        price = product_main_text[1]
        stock = re.findall(r'\d+', product_main_text[2])  # digit runs, e.g. ['22']
        product_main_ps = product_main.find_elements_by_tag_name('p')
        # Collect the colour of each star icon; the gold ones are "on".
        stars_colors = [
            x.value_of_css_property("color")
            for x in product_main_ps[2].find_elements_by_tag_name('i')
        ]
        stars = stars_colors.count('rgb(230, 206, 49)')
        description = current_page_driver.find_element_by_xpath(
            '//article/p').text
        tds = current_page_driver.find_elements_by_xpath('//tr/td')
        upc = tds[0].text
        product_type = tds[1].text
        price_exc_tax = tds[2].text
        price_inc_tax = tds[3].text
        tax = tds[4].text
        # tds[5] (availability) is skipped; stock was parsed above.
        nreviews = tds[6].text
        record = {
            'title': title,
            'price': price,
            'stars': stars,
            'description': description,
            'stock': stock,
            'upc': upc,
            'type': product_type,
            'price_exc_tax': price_exc_tax,
            'price_inc_tax': price_inc_tax,
            'tax': tax,
            'nreviews': nreviews
        }
        dataset.append(record)
    current_page_driver.close()
    return dataset
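
Putting Examples #1 and #4 together: the XPaths above match the books.toscrape.com demo site, so a plausible end-to-end run (the URL and the pandas step are assumptions about how you want to persist the result) looks like this:

# Hypothetical usage; saving with pandas is an assumption.
import pandas as pd

books = scrapy_books('http://books.toscrape.com/')
pd.DataFrame(books).to_csv('books.csv', index=False)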
Example #5
import fnmatch
import os
from time import sleep

def scrapy_datasus(value, download_path):
    url = 'http://tabnet.datasus.gov.br/cgi/deftohtm.exe?sih/cnv/qiuf.def'
    # A Firefox variant would have to set the download directory through
    # profile preferences (browser.download.folderList / browser.download.dir)
    # rather than a command-line argument.
    options = webdriver.ChromeOptions()
    # Chrome reads the download directory from a preference, not from a
    # command-line argument.
    options.add_experimental_option(
        "prefs", {"download.default_directory": download_path})
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    driver.find_elements_by_xpath("//select[@id='L']/option")[2].click()
    options_I = driver.find_elements_by_xpath("//select[@id='I']/option")
    options_I[0].click()
    for o in options_I:
        o.click()

    driver.find_element_by_xpath("//label[@for='S4']").find_element_by_xpath(
        '../img').click()
    driver.find_element_by_xpath(
        "//select[@id='S4']/option[@value='{0}']".format(value)).click()
    options_A = driver.find_elements_by_xpath("//select[@id='A']/option")
    n_months = len(options_A)
    options_A[0].click()

    for n in range(n_months):
        if n > 0:
            # Toggle the previously selected month off...
            options_A[n - 1].click()
        # ...and toggle the current month on.
        options_A[n].click()
        driver.find_elements_by_xpath("//input[@id='F']")[1].click()
        driver.find_element_by_xpath("//input[@type='submit']").click()
        wait_element(driver, '//tr/td/a', by=By.XPATH)
        sleep(2)
        buttons = driver.find_elements_by_xpath("//tr/td/a")
        current_nfiles = len(
            fnmatch.filter(os.listdir(download_path), '*.{0}'.format('csv')))
        buttons[0].click()
        wait_download(download_path, 'csv', current_nfiles)
        sleep(1)
        buttons[-1].click()
        wait_element(driver, "//select[@id='A']/option", by=By.XPATH)
        options_A = driver.find_elements_by_xpath("//select[@id='A']/option")
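
scrapy_datasus also leans on a wait_download helper that the listing does not include. A plausible sketch, assuming it simply polls the folder until a new file with the given extension appears:

# Hypothetical reconstruction of wait_download; the real implementation
# is not included in this listing.
import fnmatch
import os
from time import sleep

def wait_download(download_path, extension, nfiles_before, timeout=60):
    # Poll the download folder until the number of files with the given
    # extension exceeds the count taken before the click.
    for _ in range(timeout):
        current = fnmatch.filter(os.listdir(download_path),
                                 '*.{0}'.format(extension))
        if len(current) > nfiles_before:
            return
        sleep(1)
    raise TimeoutError('no new .{0} file appeared'.format(extension))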
def scrapy_forbes(url):
    driver = webdriver.Firefox()
    driver.get(url)
    to_continue = True
    # Wait for the page to load, keyed on the ad overlay's ID.
    wait_element(driver, 'piano-wrapper', by=By.ID)
    # Remove the ad element overlaid on the page.
    remove_element(driver, driver.find_element_by_id('piano-wrapper'))
    # Switch the pagination to 100 rows per page.
    wait_element(driver, '//option[@value="100"]', by=By.XPATH)
    o100 = driver.find_element_by_xpath('//option[@value="100"]')
    o100.click()

    whole_dataset = []
    while to_continue:
        current_items = get_page_data(driver)
        whole_dataset.extend(current_items)
        try:
            # A disabled "next" button marks the last page.
            driver.find_element_by_xpath('//div[@class="-next"]/button[@disabled]')
            break
        except NoSuchElementException:
            pass

        next_button = driver.find_element_by_xpath('//div[@class="-next"]/button')
        next_button.click()
        wait_element(driver, '//div[@role="row"]', by=By.XPATH)
    driver.close()
    return whole_dataset
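
remove_element is another scrapy.util helper that is not shown. Assuming it simply deletes the node via JavaScript, a one-line sketch:

# Hypothetical reconstruction of remove_element; the real implementation
# is not included in this listing.
def remove_element(driver, element):
    # Delete the DOM node so it no longer intercepts clicks.
    driver.execute_script("arguments[0].remove();", element)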
import datetime
from time import sleep

from selenium import webdriver
from selenium.webdriver.common.by import By

from scrapy.auth_data import user_github, password_github
from scrapy.util import wait_element

url = 'https://jupyter.enap.gov.br/'
driver = webdriver.Firefox()
driver.get(url)
driver.find_element_by_xpath('//div[@class="service-login"]').click()
driver.find_element_by_xpath('//input[@id="login_field"]').send_keys(
    user_github)
driver.find_element_by_xpath('//input[@id="password"]').send_keys(
    password_github)

button_signin = driver.find_element_by_xpath('//input[@type="submit"]')
button_signin.click()
url_aula1 = 'https://jupyter.enap.gov.br/user/alexlopespereira/notebooks/bootcamp/Aula2/Aula2_Exercicios.ipynb'
wait_element(driver, '//input[@type="checkbox"]', by=By.XPATH)
driver.get(url_aula1)
wait_element(driver, '//div[@class="input_area"]', by=By.XPATH)
div_area = driver.find_elements_by_xpath('//div[@class="prompt_container"]')[0]
# Re-run the selected cell every five minutes to keep the Jupyter session
# alive, logging each run.
while True:
    div_area.click()
    driver.find_element_by_xpath('//button[@title="Run"]').click()
    sleep(5 * 60)
    print('running now at {0}'.format(datetime.datetime.now()))
    with open("./log.txt", "a") as file_object:
        file_object.write('running now at {0}\n'.format(
            datetime.datetime.now()))

driver.close()  # unreachable while the loop above runs forever