Example #1
0
def _dismiss_facebook_login_prompt(driver):
    """Click the "Not Now" button of Facebook's sign-in prompt if it is shown."""
    # find_elements_* returns an empty list when nothing matches, whereas
    # find_element_* raises; a page without the prompt must not abort the run.
    buttons = driver.find_elements_by_id("expanding_cta_close_button")
    if buttons and buttons[0].text == 'Not Now':
        buttons[0].click()


def copy_images_facebook_mohtt(image_dir_path):
    """Download the COVID-19 update images from the MoH TT Facebook page.

    A headless Firefox session opens the page's posts and scrolls until the
    paragraph "#MediaRelease: COVID-19 Update #1" has been loaded. Then, for
    every paragraph whose text contains "update #" or "update no", the first
    link in the following sibling div is opened and the image shown in the
    "spotlight" element is saved as ``<paragraph text>.jpg``.

    Args:
        image_dir_path: Directory the images are written to. It is not
            created by this function.

    Returns:
        0 when the whole page has been processed.

    Raises:
        Exception: Any error from starting the browser, navigating or
            scrolling is logged and re-raised. A failure while downloading
            a single image is only logged; processing continues with the
            next post.
    """
    driver = None
    try:
        # open the browser
        logging.info("Now opening the Firefox browser")
        options = Options()
        options.headless = True
        options.accept_insecure_certs = True
        profile = FirefoxProfile()
        profile.set_preference('security.tls.version.enable-deprecated', True)
        driver = webdriver.Firefox(profile, options=options)
        moh_fb_url = "https://www.facebook.com/pg/MinistryofHealthTT/posts/?ref=page_internal"
        logging.info("Navigating to %s", moh_fb_url)
        driver.get(moh_fb_url)

        # Scroll down until the first report has been loaded.
        # NOTE(review): this loop has no upper bound; if the post is never
        # loaded (renamed, removed, page layout changed) it will not end.
        logging.info("Now trying to scroll to find case #1.")
        report_1_found = False
        while not report_1_found:
            logging.info("First report not found. Scrolling some more.")
            driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")
            _dismiss_facebook_login_prompt(driver)
            for p_element in driver.find_elements_by_tag_name("p"):
                if p_element.text == "#MediaRelease: COVID-19 Update #1":
                    driver.execute_script("arguments[0].scrollIntoView();",
                                          p_element)
                    report_1_found = True
                    logging.info("First report found. Stopped scrolling.")
                    # One match is enough; no need to inspect the rest.
                    break
        logging.info("Found the 1st report from the MoH.")

        logging.info("Now downloading all reports on the page.")
        keywords = ('update #', 'update no')
        for p_element in driver.find_elements_by_tag_name("p"):
            title = p_element.text
            if not any(keyword in title.lower() for keyword in keywords):
                continue
            try:
                logging.info("Now trying to download image from: %s", title)
                # The images live in the div that follows the paragraph's
                # parent; the wanted image is behind its first link.
                div_parent = p_element.find_element_by_xpath("./..")
                following_divs = div_parent.find_elements_by_xpath(
                    "following-sibling::div")
                links = following_divs[0].find_elements_by_tag_name("a")
                links[0].click()
                # Wait for the enlarged image and read its source URL.
                spotlight = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located(
                        (By.CLASS_NAME, "spotlight")))
                image_url = spotlight.get_attribute("src")
                # The paragraph text doubles as the file name.
                img_download_path = os.path.join(image_dir_path,
                                                 title + ".jpg")
                urllib.request.urlretrieve(image_url, img_download_path)
                logging.info("Successfully downloaded image.")
                # Close the image viewer: click the parent of the <u> element
                # labelled "Close".
                for u_element in driver.find_elements_by_tag_name("u"):
                    if u_element.text == "Close":
                        u_element.find_element_by_xpath("./..").click()
                        break
            except Exception:
                # Best effort per post: record the traceback and move on.
                logging.exception("Unable to download image.")
            _dismiss_facebook_login_prompt(driver)
    except Exception:
        logging.exception(
            "Encountered an issue while trying to download the images.")
        raise
    else:
        logging.info(
            "Completed downloading of all images from MoH Facebook page.")
        return 0
    finally:
        if driver is not None:
            # Always close the browser
            driver.quit()
            logging.info("Successfully closed web browser.")
Example #2
0
from pathlib import Path

from selenium import webdriver
from selenium.webdriver import FirefoxProfile, Proxy
import os, time

from selenium.webdriver.common.proxy import ProxyType
from selenium.webdriver.firefox.options import Options


# Module-level proxy description: proxy type MANUAL with the HTTP proxy
# address set to 1.1.1.1:8080.
p = Proxy()
p.proxy_type = ProxyType.MANUAL
p.httpProxy = "1.1.1.1:8080"

# Module-level Firefox options with acceptance of insecure certificates on.
# NOTE(review): the line that would attach `p` to `x` is commented out below,
# and download_pdfs() builds its own Options object, so neither `p` nor `x`
# is used by the code visible here - confirm before relying on or removing them.
x = Options()
x.accept_insecure_certs = True

# Disabled settings for `x`: headless mode, the proxy above, and a custom
# default download folder.
# x.headless = True
# x.proxy = p
# x.accept_insecure_certs = True
# x.set_preference("browser.download.defaultFolder", str(Path(os.getcwd()).parent) + os.path.sep +  "AutomationDownloads")

# Disabled alternative: build the proxy from a dict that uses one address for
# HTTP, FTP and SSL.
# myProxy = "86.111.144.194:3128"
# proxy = Proxy({
#     'proxyType': ProxyType.MANUAL,
#     'httpProxy': myProxy,
#     'ftpProxy': myProxy,
#     'sslProxy': myProxy,
#     'noProxy':''})

def download_pdfs():
    """Download all COVID-19 report PDFs from the PAHO website with Firefox.

    The download folder ``paho_raw_reports_dir`` is created if needed and
    emptied of existing ``.pdf`` files. A headless Firefox session then walks
    the paginated technical-reports listing, starting at page 0. On each
    listing page it opens every link whose text contains "COVID-19 cases"
    and clicks that page's "DOWNLOAD" link. It stops at the first listing
    page that has no such links.

    Returns:
        0 when every listing page has been processed.

    Raises:
        Exception: Any error (file system, browser start-up, navigation, a
            report page without a "DOWNLOAD" link) is logged and re-raised.
    """
    listing_url = ("https://www.paho.org/en/technical-reports"
                   "?topic=4922&d%5Bmin%5D=&d%5Bmax%5D=&page=")
    driver = None
    try:
        # create the download folder if it does not exist already
        Path(paho_raw_reports_dir).mkdir(parents=True, exist_ok=True)
        # remove all current pdfs in the download folder
        for name in os.listdir(paho_raw_reports_dir):
            if name.endswith(".pdf"):
                os.remove(os.path.join(paho_raw_reports_dir, name))

        # open the browser
        logging.info("Now opening the Firefox browser")
        options = Options()
        options.headless = True
        options.accept_insecure_certs = True

        # Set the download location of the pdfs and remove the download
        # prompt. Each preference is listed once.
        preferences = {
            'security.tls.version.enable-deprecated': True,
            "browser.altClickSave": True,
            "browser.download.folderList": 2,
            "browser.download.panel.shown": False,
            "browser.download.manager.showWhenStarting": False,
            "browser.download.dir": paho_raw_reports_dir,
            "browser.download.useDownloadDir": True,
            "browser.helperApps.neverAsk.saveToDisk":
                "application/pdf,application/x-pdf,application/octet-stream,application/x-winzip,application/x-gzip",
            "browser.download.manager.alertOnEXEOpen": False,
            "browser.download.manager.focusWhenStarting": False,
            "browser.helperApps.alwaysAsk.force": False,
            "browser.download.manager.closeWhenDone": True,
            "browser.download.manager.showAlertOnComplete": False,
            "browser.download.manager.useWindow": False,
            "services.sync.prefs.sync.browser.download.manager.showWhenStarting":
                False,
            # presumably so PDFs are saved instead of opened in the built-in
            # viewer - TODO confirm
            "pdfjs.disabled": True,
        }
        profile = FirefoxProfile()
        for preference, value in preferences.items():
            profile.set_preference(preference, value)
        driver = webdriver.Firefox(profile, options=options)

        # Walk the listing pages until one contains no report links.
        page_number = 0
        while True:
            pahoreporturl = listing_url + str(page_number)
            logging.info("Navigating to %s", pahoreporturl)
            driver.get(pahoreporturl)
            # Collect the URLs up front: the elements themselves stop being
            # usable once the browser navigates away from the listing.
            report_links = [
                link.get_attribute('href')
                for link in driver.find_elements_by_partial_link_text(
                    "COVID-19 cases")
            ]
            if not report_links:
                logging.info("No more reports on page. Breaking loop.")
                break
            for report_link in report_links:
                driver.get(report_link)
                # once the page has loaded, click the download link
                download_link = driver.find_element_by_link_text("DOWNLOAD")
                # Read the target before clicking so the log line does not
                # depend on the element still being attached afterwards.
                download_href = download_link.get_attribute('href')
                download_link.click()
                logging.info("File downloaded from: %s", download_href)
            page_number += 1
    except Exception:
        logging.exception(
            "Encountered an issue while trying to download the pdfs.")
        raise
    else:
        # Logged only on success; a failed run must not claim completion.
        logging.info(
            "Completed downloading of all COVID19 pdfs from PAHO website.")
        return 0
    finally:
        # NOTE(review): the browser is closed right after the last click;
        # confirm that downloads still in progress are not cut short.
        if driver is not None:
            # Always close the browser
            driver.quit()
            logging.info("Successfully closed web browser.")
Example #4
0
    # Clean the storage folder
    os.system("rm -rf storage/*")

    # kill any firefox instance
    os.system("pgrep firefox | xargs kill")

    # Base url
    base_url = "https://iwillteachyoualanguage.teachable.com/"
    parent_dir = os.path.dirname(os.path.abspath(__file__)) + "/storage"

    # Login
    url = 'https://sso.teachable.com/secure/136458/users/sign_in?clean_login=true&reset_purchase_session=1'

    options = Options()
    options.headless = True
    options.accept_insecure_certs = True
    browser = webdriver.Firefox(options=options)
    browser.get(url)
    time.sleep(5)

    # Scraping
    page_source = browser.page_source

    # filling the form
    username = browser.find_element_by_id("user_email")
    password = browser.find_element_by_id("user_password")

    username.send_keys("") # user email
    password.send_keys("") #user Password

    browser.find_element_by_name("commit").click()