def get_logged_in_browser(headless=True):
    """Sometimes the headless browser fails with
    selenium.common.exceptions.ElementClickInterceptedException: Message: element click intercepted.
    In that case, a non-headless browser works fine! Alternatively, see
    https://stackoverflow.com/questions/48665001/can-not-click-on-a-element-elementclickinterceptedexception-in-splinter-selen
    """
    browser = scraping.get_selenium_browser(headless=headless)
    browser.get("http://parankusan.cloudapp.net/Integrated/Texts.aspx")
    username = browser.find_element_by_id("txtUserName")
    username.send_keys(configuration_parankusha["user"])
    browser.find_element_by_id("btnNext").click()
    browser.find_element_by_id("txtPassword").send_keys(configuration_parankusha["pass"])
    browser.find_element_by_id("btnLogin").click()
    browser.get("http://parankusan.cloudapp.net/Integrated/Texts.aspx")
    return browser
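# Hedged sketch, not part of the original module: the stackoverflow workaround referenced in the
# docstring above amounts to falling back to a javascript click when the normal click is
# intercepted. The helper name _click_with_fallback is hypothetical.
def _click_with_fallback(browser, element_id):
    from selenium.common.exceptions import ElementClickInterceptedException
    element = browser.find_element_by_id(element_id)
    try:
        element.click()
    except ElementClickInterceptedException:
        # execute_script clicks via javascript, bypassing whatever element overlaps the target.
        browser.execute_script("arguments[0].click();", element)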
def get_logged_in_browser(headless=True):
    """Sometimes the headless browser fails with
    selenium.common.exceptions.ElementClickInterceptedException: Message: element click intercepted.
    In that case, a non-headless browser works fine! Alternatively, see
    https://stackoverflow.com/questions/48665001/can-not-click-on-a-element-elementclickinterceptedexception-in-splinter-selen
    """
    browser = scraping.get_selenium_browser(headless=headless)
    browser.get("https://vaakya.vedanidhi.in/login/")
    username = browser.find_element_by_id("username")
    username.send_keys(site_configuration["user"])
    browser.find_element_by_id("password").send_keys(site_configuration["pass"])
    browser.find_element_by_id("submit_button").click()
    browser.get("https://vaakya.vedanidhi.in/browse/?lang=En")
    return browser
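# Hedged sketch, not part of the original module: if the vedanidhi login form renders slowly, an
# explicit wait on the username field (id taken from the code above) avoids a NoSuchElementException
# before send_keys. The helper name wait_for_login_form is hypothetical.
def wait_for_login_form(browser, timeout_seconds=10):
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait
    WebDriverWait(browser, timeout_seconds).until(
        EC.presence_of_element_located((By.ID, "username")))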
import codecs
import logging
import os

from selenium.webdriver.remote.remote_connection import LOGGER

from curation_utils import scraping

LOGGER.setLevel(logging.WARNING)
from urllib3.connectionpool import log as urllibLogger
urllibLogger.setLevel(logging.WARNING)

logging.basicConfig(
    level=logging.DEBUG,
    format="%(levelname)s:%(asctime)s:%(module)s:%(lineno)d %(message)s")

browser = scraping.get_selenium_browser(headless=False)


def dump_text(url, out_path, overwrite=True):
    if not overwrite and os.path.exists(out_path):
        logging.info("Not overwriting %s to %s", url, out_path)
        return
    logging.info("Dumping %s to %s", url, out_path)
    browser.get(url)
    text_elements = browser.find_elements_by_css_selector("div.sam")
    if len(text_elements) == 0:
        # Fall back to the last table on the page when no div.sam elements are present.
        text_elements = [browser.find_elements_by_css_selector("table")[-1]]
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with codecs.open(out_path, "w", 'utf-8') as file_out:
        for text_element in text_elements:
            # Add a trailing space before every newline and terminate with a newline.
            text = text_element.text.replace("\n", " \n") + "\n"
            file_out.write(text)
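# Illustrative usage sketch (the URL and output path below are hypothetical placeholders,
# not taken from the original module).
if __name__ == "__main__":
    dump_text(
        url="http://parankusan.cloudapp.net/Integrated/Texts.aspx",
        out_path="/tmp/parankusha/example.md",
        overwrite=False)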
import regex
import logging

from bs4 import BeautifulSoup, Tag
from indic_transliteration import sanscript

from curation_utils import scraping, file_helper
from doc_curation.md.file import MdFile

for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)
logging.basicConfig(
    level=logging.DEBUG,
    format="%(levelname)s:%(asctime)s:%(module)s:%(lineno)d %(message)s")

browser = scraping.get_selenium_browser(headless=True)


def get_title(title_iast_in):
    title_iast = title_iast_in.replace("Part ", "").replace("Volume ", "").replace(
        "volume ", "").replace("Vol. ", "").replace("with ", "").replace(
        "text ", "").replace("version ", "").replace("by ", "").replace(
        "commentary ", "").replace("commentaries ", "").replace(
        "rescension ", "").replace("ṣṃ", "ṣm").strip()
    return title_iast
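# Illustrative usage sketch (the title string below is a hypothetical example): get_title strips
# filler words such as volume/part markers and commentary/authorship phrases from a raw IAST title.
# Here, "taittirīyasaṃhitā Volume 1 with bhāṣya commentary by sāyaṇa" reduces to
# "taittirīyasaṃhitā 1 bhāṣya sāyaṇa".
if __name__ == "__main__":
    print(get_title("taittirīyasaṃhitā Volume 1 with bhāṣya commentary by sāyaṇa"))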