def create_http_session(hostname):
    """
    create_http_session: Builds a requests session with cache-control adapters
    Args:
        hostname: (str) host name that gets appended to 'www.' to form the
            prefixes served by the CacheForeverHeuristic adapter
    Returns: requests.Session with four adapters mounted
    """
    # Both adapters share one FileCache rooted at '.webcache'.
    webcache = FileCache('.webcache')
    default_adapter = CacheControlAdapter(cache=webcache)
    site_adapter = CacheControlAdapter(
        cache=webcache,
        heuristic=CacheForeverHeuristic(),
    )
    schemes = ('http://', 'https://')

    session = requests.Session()
    # Generic prefixes first, then the host-specific ones (same mount order
    # as before, so adapter lookup resolves identically).
    for scheme in schemes:
        session.mount(scheme, default_adapter)
    for scheme in schemes:
        session.mount(scheme + 'www.' + hostname, site_adapter)
    return session
def __init__(self, main_source_domain=None, start_page=None):
    """
    __init__: Sets the crawler's main domain, start page, and cached adapters
    Args:
        main_source_domain: (str) base URL of the site; trailing '/' is
            stripped and the result also becomes START_PAGE (optional)
        start_page: (str) URL to start from; when given it overrides
            START_PAGE, and it is used to derive MAIN_SOURCE_DOMAIN if that
            is still None (optional)
    Raises:
        ValueError: if both main_source_domain and start_page are None
    """
    if main_source_domain is None and start_page is None:
        raise ValueError(
            'Need to specify main_source_domain or start_page.')

    if main_source_domain:
        self.MAIN_SOURCE_DOMAIN = main_source_domain.rstrip('/')
        self.START_PAGE = self.MAIN_SOURCE_DOMAIN

    if self.MAIN_SOURCE_DOMAIN is None:
        # Keep the scheme: the domain is mounted below as a session adapter
        # prefix, and a bare netloc ('example.org') would never match a
        # request URL ('https://example.org/...').
        parsed_start = urlparse(start_page)
        if parsed_start.scheme and parsed_start.netloc:
            self.MAIN_SOURCE_DOMAIN = (
                parsed_start.scheme + '://' + parsed_start.netloc)
        else:
            # No scheme to recover; fall back to the bare netloc.
            self.MAIN_SOURCE_DOMAIN = parsed_start.netloc

    # NOTE(review): SOURCE_DOMAINS is read here without being assigned in
    # this method, so it is presumably a class-level list; append() mutates
    # it in place -- confirm that sharing across instances is intended.
    if self.MAIN_SOURCE_DOMAIN not in self.SOURCE_DOMAINS:
        self.SOURCE_DOMAINS.append(self.MAIN_SOURCE_DOMAIN)

    if start_page:
        self.START_PAGE = start_page

    # keep track of broken links
    self.broken_links = []

    # Route every known source domain through one adapter that uses
    # CacheForeverHeuristic.
    forever_adapter = CacheControlAdapter(
        heuristic=CacheForeverHeuristic(), cache=self.CACHE)
    for source_domain in self.SOURCE_DOMAINS:
        self.SESSION.mount(
            source_domain, forever_adapter
        )  # TODO: change to less aggressive in final version
import requests import time from selenium import webdriver from requests_file import FileAdapter from ricecooker.utils.caching import CacheForeverHeuristic, FileCache, CacheControlAdapter, InvalidatingCacheControlAdapter DOWNLOAD_SESSION = requests.Session( ) # Session for downloading content from urls DOWNLOAD_SESSION.mount('https://', requests.adapters.HTTPAdapter(max_retries=3)) DOWNLOAD_SESSION.mount('file://', FileAdapter()) cache = FileCache('.webcache') forever_adapter = CacheControlAdapter(heuristic=CacheForeverHeuristic(), cache=cache) DOWNLOAD_SESSION.mount('http://', forever_adapter) DOWNLOAD_SESSION.mount('https://', forever_adapter) def read(path, loadjs=False, session=None, driver=None): """ read: Reads from source and returns contents Args: path: (str) url or local path to download loadjs: (boolean) indicates whether to load js (optional) session: (requests.Session) session to use to download (optional) driver: (selenium.webdriver) webdriver to use to download (optional) Returns: str content from file or page """ session = session or DOWNLOAD_SESSION try: if loadjs: # Wait until js loads then return contents
import selenium.webdriver.support.ui as selenium_ui
from distutils.dir_util import copy_tree


# CHEF and CONTENT DEBUG
################################################################################
DEBUG_MODE = False               # when True, print extra-verbose info
DOWNLOAD_ONE_TO_webroot = False  # when True, produce debug webroot/ and skip cheffing
DOWNLOAD_ONLY_N = False          # limit the chef to the first N books; False disables the limit

# Session whose listed prefixes go through an adapter configured with the
# ExpiresAfter(days=1) heuristic and a FileCache in '.webcache'.
cache = FileCache('.webcache')
chefdev_adapter = CacheControlAdapter(
    heuristic=ExpiresAfter(days=1),
    cache=cache,
)
sess = requests.Session()
for _cached_prefix in ('http://3asafeer.com/', 'http://fonts.googleapis.com/'):
    sess.mount(_cached_prefix, chefdev_adapter)
del _cached_prefix  # keep the module namespace identical to the unrolled form

# Request headers defined for this chef.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
}


# PAUSES and DELAYS
################################################################################
LOADING_WAIT_TIME = 10        # long delay
LOADING_WAIT_TIME_SHORT = 7   # short delay
LOADING_WAIT_TIME_MS = 1000 * LOADING_WAIT_TIME  # long delay scaled by 1000
PRADIGI_LICENSE = get_license(licenses.CC_BY_NC_SA, copyright_holder='PraDigi').as_dict() PRADIGI_WEBSITE_LANGUAGES = ['Hindi'] PRADIGI_DESCRIPTION = 'Developed by Pratham, these educational games, videos, ' \ 'and ebooks are designed to teach language learning, math, science, English, ' \ 'health, and vocational training in Hindi, Marathi, Odia, Bengali, Urdu, ' \ 'Punjabi, Kannada, Tamil, Telugu, Gujarati and Assamese. Materials are ' \ 'designed for learners of all ages, including those outside the formal classroom setting.' # In debug mode, only one topic is downloaded. LOGGER.setLevel(logging.DEBUG) DEBUG_MODE = True # source_urls in content desriptions # WebCache logic (downloaded web resources cached for one day -- good for dev) cache = FileCache('.webcache') basic_adapter = CacheControlAdapter(cache=cache) develop_adapter = CacheControlAdapter(heuristic=OneDayCache(), cache=cache) session = requests.Session() session.mount('http://www.' + PRADIGI_DOMAIN, develop_adapter) session.mount('https://www.' + PRADIGI_DOMAIN, develop_adapter) CHEF_DIR = os.path.dirname(os.path.realpath(__file__)) print(CHEF_DIR) # LOCALIZATION AND TRANSLATION STRINGS ################################################################################ PRADIGI_STRINGS = { 'hi': { 'language_en': 'Hindi', 'website_lang': 'hn', 'gamesrepo_suffixes': ['_KKS', '_HI', '_Hi'],