Example no. 1
def get_logged_in_browser(headless=True):
    """Sometimes headless browser fails with selenium.common.exceptions.ElementClickInterceptedException: Message: element click intercepted . Then, non-headless browser works fine! Or can try https://stackoverflow.com/questions/48665001/can-not-click-on-a-element-elementclickinterceptedexception-in-splinter-selen """
    browser = scraping.get_selenium_browser(headless=headless)
    browser.get("http://parankusan.cloudapp.net/Integrated/Texts.aspx")
    username = browser.find_element_by_id("txtUserName")
    username.send_keys(configuration_parankusha["user"])
    browser.find_element_by_id("btnNext").click()
    browser.find_element_by_id("txtPassword").send_keys(configuration_parankusha["pass"])
    browser.find_element_by_id("btnLogin").click()
    browser.get("http://parankusan.cloudapp.net/Integrated/Texts.aspx")
    return browser
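
The workaround referenced in the docstring can also be applied in place: when a click is intercepted in headless mode, fall back to a JavaScript click instead of re-running without headless. A minimal sketch (robust_click and element_id are illustrative names, not part of the original code):

from selenium.common.exceptions import ElementClickInterceptedException

def robust_click(browser, element_id):
    # Try a normal click first; if an overlay intercepts it, click via JavaScript.
    element = browser.find_element_by_id(element_id)
    try:
        element.click()
    except ElementClickInterceptedException:
        browser.execute_script("arguments[0].click();", element)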
Example no. 2
def get_logged_in_browser(headless=True):
    """Sometimes headless browser fails with selenium.common.exceptions.ElementClickInterceptedException: Message: element click intercepted . Then, non-headless browser works fine! Or can try https://stackoverflow.com/questions/48665001/can-not-click-on-a-element-elementclickinterceptedexception-in-splinter-selen """
    browser = scraping.get_selenium_browser(headless=headless)
    browser.get("https://vaakya.vedanidhi.in/login/")
    username = browser.find_element_by_id("username")
    username.send_keys(site_configuration["user"])
    browser.find_element_by_id("password").send_keys(
        site_configuration["pass"])
    browser.find_element_by_id("submit_button").click()
    browser.get("https://vaakya.vedanidhi.in/browse/?lang=En")
    return browser
Example no. 3
import codecs
import logging
import os

from selenium.webdriver.remote.remote_connection import LOGGER

from curation_utils import scraping

LOGGER.setLevel(logging.WARNING)
from urllib3.connectionpool import log as urllibLogger
urllibLogger.setLevel(logging.WARNING)

logging.basicConfig(
    level=logging.DEBUG,
    format="%(levelname)s:%(asctime)s:%(module)s:%(lineno)d %(message)s")


browser = scraping.get_selenium_browser(headless=False)


def dump_text(url, out_path, overwrite=True):
    if not overwrite and os.path.exists(out_path):
        logging.info("Not overwriting %s to %s", url, out_path)
        return
    logging.info("Dumping %s to %s", url, out_path)
    browser.get(url)
    text_elements = browser.find_elements_by_css_selector("div.sam")
    if len(text_elements) == 0:
        text_elements = [browser.find_elements_by_css_selector("table")[-1]]
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with codecs.open(out_path, "w", 'utf-8') as file_out:
        for text_element in text_elements:
            text = text_element.text.replace("\n", "  \n") + "\n"
            file_out.write(text)
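
For reference, a hypothetical invocation (the output path is illustrative only; the URL is the browse page from Example no. 2):

dump_text(
    url="https://vaakya.vedanidhi.in/browse/?lang=En",
    out_path="/tmp/vedanidhi/example.md",
    overwrite=False)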
Example no. 4
import regex
import logging

from bs4 import BeautifulSoup, Tag
from indic_transliteration import sanscript

from curation_utils import scraping, file_helper
from doc_curation.md.file import MdFile

for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)
logging.basicConfig(
    level=logging.DEBUG,
    format="%(levelname)s:%(asctime)s:%(module)s:%(lineno)d %(message)s")

browser = scraping.get_selenium_browser(headless=True)


def get_title(title_iast_in):
    # Strip edition/volume qualifiers (and the "ṣṃ" mis-transliteration) from an IAST title.
    removals = [
        "Part ", "Volume ", "volume ", "Vol. ", "with ", "text ", "version ",
        "by ", "commentary ", "commentaries ", "rescension "
    ]
    title_iast = title_iast_in
    for removal in removals:
        title_iast = title_iast.replace(removal, "")
    title_iast = title_iast.replace("ṣṃ", "ṣm").strip()
    return title_iast
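
As a worked illustration (the title below is made up, not from the original data), get_title drops the qualifiers listed above:

print(get_title("Rāmāyaṇa Vol. 1 with commentary by Someone"))
# -> "Rāmāyaṇa 1 Someone"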