def _dismiss_login_prompt(driver):
    """Click Facebook's "Not Now" button when the sign-in prompt is showing.

    Uses the plural ``find_elements_by_id`` lookup, which returns an empty
    list when the element is missing, so an absent prompt is ignored rather
    than raising ``NoSuchElementException``.
    """
    buttons = driver.find_elements_by_id("expanding_cta_close_button")
    if buttons and buttons[0].text == 'Not Now':
        buttons[0].click()


def _safe_filename(text):
    """Return *text* with path separators replaced so it is a single file name."""
    for separator in (os.sep, os.altsep):
        if separator:
            text = text.replace(separator, "_")
    return text


def copy_images_facebook_mohtt(image_dir_path):
    """Download the COVID-19 update images from the MoH Facebook posts page.

    Opens a headless Firefox session, scrolls the Ministry of Health posts
    page until the post whose text is exactly
    ``#MediaRelease: COVID-19 Update #1`` is present, then, for every ``<p>``
    element whose text contains ``update #`` or ``update no``
    (case-insensitive), opens the first linked image of that post and saves
    it as ``<post text>.jpg`` inside *image_dir_path*.

    A failure on a single image is logged (with traceback) and skipped.

    NOTE: the scroll loop has no upper bound; it only ends once the first
    report has been found.

    Args:
        image_dir_path: Directory the images are written to. It is not
            created here.

    Returns:
        0 when the run completes.

    Raises:
        Exception: any error outside the per-image handling is logged and
            re-raised.
    """
    driver = None
    try:
        # open the browser
        logging.info("Now opening the Firefox browser")
        options = Options()
        options.headless = True
        options.accept_insecure_certs = True
        profile = FirefoxProfile()
        profile.set_preference('security.tls.version.enable-deprecated', True)
        driver = webdriver.Firefox(profile, options=options)
        moh_fb_url = "https://www.facebook.com/pg/MinistryofHealthTT/posts/?ref=page_internal"
        logging.info("Navigating to " + moh_fb_url)
        driver.get(moh_fb_url)

        # scroll down on the page until the first report is found
        logging.info("Now trying to scroll to find case #1.")
        report_1_found = False
        while not report_1_found:
            logging.info("First report not found. Scrolling some more.")
            # Scroll down to bottom of page so more posts get loaded
            driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")
            # click not now when facebook asks to sign in
            _dismiss_login_prompt(driver)
            # check the text in each p element currently on the page
            for p_element in driver.find_elements_by_tag_name("p"):
                if p_element.text == "#MediaRelease: COVID-19 Update #1":
                    driver.execute_script("arguments[0].scrollIntoView();",
                                          p_element)
                    report_1_found = True
                    logging.info("First report found. Stopped scrolling.")
                    # nothing more to look for on this pass
                    break

        logging.info("Found the 1st report from the MoH.")
        logging.info("Now downloading all reports on the page.")

        keywords = ('update #', 'update no')
        for p_element in driver.find_elements_by_tag_name("p"):
            # read the text once; every .text access is a browser round trip
            caption = p_element.text
            lowered = caption.lower()
            if not any(keyword in lowered for keyword in keywords):
                continue
            try:
                logging.info("Now trying to download image from: " + caption)
                # get the parent node of this element (should be a div)
                div_parent = p_element.find_element_by_xpath("./..")
                # the images are contained in the next div
                following_div = div_parent.find_elements_by_xpath(
                    "following-sibling::div")
                # the image we want should be the first link in that div
                a_elements = following_div[0].find_elements_by_tag_name("a")
                a_elements[0].click()
                # wait for the image viewer, then read the image address
                spotlight = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located(
                        (By.CLASS_NAME, "spotlight")))
                spotlight_image = spotlight.get_attribute("src")
                # the post text is the file name; path separators in it
                # would otherwise point into a non-existent sub-directory
                img_download_path = os.path.join(
                    image_dir_path, _safe_filename(caption) + ".jpg")
                # download the image to the local machine
                urllib.request.urlretrieve(spotlight_image, img_download_path)
                logging.info("Successfully downloaded image.")
                # close the viewer: click the parent of the "Close" label
                for u_element in driver.find_elements_by_tag_name("u"):
                    if u_element.text == "Close":
                        u_element.find_element_by_xpath("./..").click()
                        break
            except Exception:
                # best effort: record why this image failed and carry on
                logging.exception("Unable to download image.")
                # click not now when facebook asks to sign in
                _dismiss_login_prompt(driver)
    except Exception:
        logging.info(
            "Encountered an issue while trying to download the images.")
        raise
    else:
        logging.info(
            "Completed downloading of all images from MoH Facebook page.")
        return 0
    finally:
        if driver is not None:
            # Always close the browser
            driver.quit()
            logging.info("Successfully closed web browser.")
# Module-level Selenium/Firefox setup: imports, a manual proxy description and
# the Firefox options object.
from pathlib import Path
from selenium import webdriver
from selenium.webdriver import FirefoxProfile, Proxy
import os, time
from selenium.webdriver.common.proxy import ProxyType
from selenium.webdriver.firefox.options import Options

# Manually configured HTTP proxy. It is only built here: the line that would
# attach it to the options (x.proxy = p) is commented out below.
p = Proxy()
p.proxy_type = ProxyType.MANUAL
p.httpProxy = "1.1.1.1:8080"

# Firefox options with accept_insecure_certs switched on.
x = Options()
x.accept_insecure_certs = True

# Disabled settings, kept for reference:
# x.headless = True
# x.proxy = p
# x.accept_insecure_certs = True
# x.set_preference("browser.download.defaultFolder", str(Path(os.getcwd()).parent) + os.path.sep + "AutomationDownloads")
# myProxy = "86.111.144.194:3128"
# proxy = Proxy({
#     'proxyType': ProxyType.MANUAL,
#     'httpProxy': myProxy,
#     'ftpProxy': myProxy,
#     'sslProxy': myProxy,
#     'noProxy':''})
def _paho_reports_url(page_number):
    """Return the PAHO technical-reports listing URL for *page_number*."""
    return ("https://www.paho.org/en/technical-reports"
            "?topic=4922&d%5Bmin%5D=&d%5Bmax%5D=&page=" + str(page_number))


def _wait_for_pending_downloads(directory, timeout=60.0, poll_interval=0.5):
    """Wait until *directory* holds no Firefox ``.part`` files.

    Firefox keeps a ``<name>.part`` file next to a download while it is in
    progress. Returns True when none are left, or False (after logging a
    warning) when *timeout* seconds pass first.
    """
    import time

    deadline = time.monotonic() + timeout
    while any(name.endswith(".part") for name in os.listdir(directory)):
        if time.monotonic() >= deadline:
            logging.warning(
                "Timed out waiting for downloads in %s to finish.", directory)
            return False
        time.sleep(poll_interval)
    return True


def download_pdfs():
    """Use selenium and the firefox browser to download all COVID19 reports
    from the PAHO website.

    Existing ``.pdf`` files in ``paho_raw_reports_dir`` are removed first.
    The listing pages are then walked from page 0 upwards; on each page
    every link whose text contains ``COVID-19 cases`` is opened and its
    ``DOWNLOAD`` link clicked. The walk stops at the first listing page
    without such links.

    Before the browser is closed the function waits (bounded) for Firefox's
    in-progress ``.part`` files to disappear, so the last downloads are not
    cut off by ``driver.quit()``.

    Returns:
        0 when the run completes.

    Raises:
        Exception: any error is logged and re-raised; the browser is closed
            in either case.
    """
    driver = None
    try:
        # create the download folder if it does not exist already
        Path(paho_raw_reports_dir).mkdir(parents=True, exist_ok=True)
        # remove all current pdfs in the download folder
        for name in os.listdir(paho_raw_reports_dir):
            if name.endswith(".pdf"):
                os.remove(os.path.join(paho_raw_reports_dir, name))

        # open the browser
        logging.info("Now opening the Firefox browser")
        options = Options()
        options.headless = True
        options.accept_insecure_certs = True
        profile = FirefoxProfile()
        # set the download location of the pdfs and remove the download
        # prompt; each preference is listed once
        preferences = {
            'security.tls.version.enable-deprecated': True,
            "browser.altClickSave": True,
            "browser.download.folderList": 2,
            "browser.download.panel.shown": False,
            "browser.download.manager.showWhenStarting": False,
            "browser.download.dir": paho_raw_reports_dir,
            "browser.download.useDownloadDir": True,
            "browser.helperApps.neverAsk.saveToDisk":
                "application/pdf,application/x-pdf,application/octet-stream,application/x-winzip,application/x-gzip",
            "browser.download.manager.alertOnEXEOpen": False,
            "browser.download.manager.focusWhenStarting": False,
            "browser.helperApps.alwaysAsk.force": False,
            "browser.download.manager.closeWhenDone": True,
            "browser.download.manager.showAlertOnComplete": False,
            "browser.download.manager.useWindow": False,
            "services.sync.prefs.sync.browser.download.manager.showWhenStarting":
                False,
            "pdfjs.disabled": True,
        }
        for pref_name, pref_value in preferences.items():
            profile.set_preference(pref_name, pref_value)
        driver = webdriver.Firefox(profile, options=options)

        # Go through the PAHO listing pages that hold the reports
        page_number = 0
        while True:
            pahoreporturl = _paho_reports_url(page_number)
            logging.info("Navigating to " + pahoreporturl)
            driver.get(pahoreporturl)
            # collect the urls first: the elements go stale as soon as the
            # browser navigates to a report page
            report_links = [
                link_element.get_attribute('href')
                for link_element in driver.find_elements_by_partial_link_text(
                    "COVID-19 cases")
            ]
            if not report_links:
                logging.info("No more reports on page. Breaking loop.")
                break
            for report_link in report_links:
                # navigate to each url
                driver.get(report_link)
                # once the page has loaded, click the download link
                download_link = driver.find_element_by_link_text("DOWNLOAD")
                # read the address before clicking, while the element is
                # certainly still attached to the page
                download_url = download_link.get_attribute('href')
                download_link.click()
                logging.info("File downloaded from: " + download_url)
            page_number += 1

        # give the last downloads a chance to finish before quitting
        _wait_for_pending_downloads(paho_raw_reports_dir)
    except Exception:
        logging.info("Encountered an issue while trying to download the pdfs.")
        raise
    else:
        # only report completion when nothing went wrong
        logging.info(
            "Completed downloading of all COVID19 pdfs from PAHO website.")
        return 0
    finally:
        if driver is not None:
            # Always close the browser
            driver.quit()
            logging.info("Successfully closed web browser.")
# Clean the storage folder os.system("rm -rf storage/*") # kill any firefox instance os.system("pgrep firefox | xargs kill") # Base url base_url = "https://iwillteachyoualanguage.teachable.com/" parent_dir = os.path.dirname(os.path.abspath(__file__)) + "/storage" # Login url = 'https://sso.teachable.com/secure/136458/users/sign_in?clean_login=true&reset_purchase_session=1' options = Options() options.headless = True options.accept_insecure_certs = True browser = webdriver.Firefox(options=options) browser.get(url) time.sleep(5) # Scraping page_source = browser.page_source # filling the form username = browser.find_element_by_id("user_email") password = browser.find_element_by_id("user_password") username.send_keys("") # user email password.send_keys("") #user Password browser.find_element_by_name("commit").click()