def create_http_session(hostname):
    """Create a requests session that caches responses in the '.webcache' dir."""
    sess = requests.Session()
    cache = FileCache('.webcache')
    basic_adapter = CacheControlAdapter(cache=cache)
    forever_adapter = CacheControlAdapter(heuristic=CacheForeverHeuristic(),
                                          cache=cache)
    # Cache everything according to its Cache-Control headers, but cache the
    # source domain's own pages forever (the longest mounted prefix wins).
    sess.mount('http://', basic_adapter)
    sess.mount('https://', basic_adapter)
    sess.mount('http://www.' + hostname, forever_adapter)
    sess.mount('https://www.' + hostname, forever_adapter)
    return sess
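
# Usage sketch (not from the original example; the hostname and URL are
# illustrative): pages under the mounted domain are cached forever in
# '.webcache', everything else honors normal Cache-Control headers.
session = create_http_session('example.org')
response = session.get('https://www.example.org/index.html')
print(response.status_code)
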
Example #2
    def __init__(self, main_source_domain=None, start_page=None):
        if main_source_domain is None and start_page is None:
            raise ValueError(
                'Need to specify main_source_domain or start_page.')
        if main_source_domain:
            self.MAIN_SOURCE_DOMAIN = main_source_domain.rstrip('/')
            self.START_PAGE = self.MAIN_SOURCE_DOMAIN
        if self.MAIN_SOURCE_DOMAIN is None:
            # Derive the main domain from the start page URL when it wasn't given explicitly
            self.MAIN_SOURCE_DOMAIN = urlparse(start_page).netloc
        if self.MAIN_SOURCE_DOMAIN not in self.SOURCE_DOMAINS:
            self.SOURCE_DOMAINS.append(self.MAIN_SOURCE_DOMAIN)
        if start_page:
            self.START_PAGE = start_page

        # keep track of broken links
        self.broken_links = []

        forever_adapter = CacheControlAdapter(
            heuristic=CacheForeverHeuristic(), cache=self.CACHE)
        for source_domain in self.SOURCE_DOMAINS:
            self.SESSION.mount(
                source_domain, forever_adapter
            )  # TODO: change to less aggressive in final version
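
# Side note (illustrative, not part of the original example): requests hands a
# URL to the adapter with the longest matching mounted prefix, which is why the
# per-domain forever_adapter mounts above take precedence over anything mounted
# on the bare 'http://' prefix.
import requests
from requests.adapters import HTTPAdapter

s = requests.Session()
s.mount('http://', HTTPAdapter())                  # generic fallback
s.mount('http://www.example.org', HTTPAdapter())   # longer prefix wins for this domain
assert s.get_adapter('http://www.example.org/a') is not s.get_adapter('http://other.org/')
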
Example #3
import requests
import time
from selenium import webdriver
from requests_file import FileAdapter
from ricecooker.utils.caching import CacheForeverHeuristic, FileCache, CacheControlAdapter, InvalidatingCacheControlAdapter

DOWNLOAD_SESSION = requests.Session()  # Session for downloading content from urls
DOWNLOAD_SESSION.mount('https://',
                       requests.adapters.HTTPAdapter(max_retries=3))
DOWNLOAD_SESSION.mount('file://', FileAdapter())
cache = FileCache('.webcache')
forever_adapter = CacheControlAdapter(heuristic=CacheForeverHeuristic(),
                                      cache=cache)

# Note: mounting 'https://' again here replaces the HTTPAdapter(max_retries=3)
# registered above, so all http(s) downloads go through the forever cache.
DOWNLOAD_SESSION.mount('http://', forever_adapter)
DOWNLOAD_SESSION.mount('https://', forever_adapter)


def read(path, loadjs=False, session=None, driver=None):
    """ read: Reads from source and returns contents
        Args:
            path: (str) url or local path to download
            loadjs: (boolean) indicates whether to load js (optional)
            session: (requests.Session) session to use to download (optional)
            driver: (selenium.webdriver) webdriver to use to download (optional)
        Returns: str content from file or page
    """
    session = session or DOWNLOAD_SESSION
    try:
        if loadjs:  # Wait until js loads then return contents
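            # NOTE: the example is cut off here; the rest is a minimal sketch
            # (driver choice, wait time, and exception handling are assumptions).
            driver = driver or webdriver.PhantomJS()
            driver.get(path)
            time.sleep(5)  # give client-side js a moment to render
            return driver.page_source
        else:  # Plain fetch through the caching DOWNLOAD_SESSION
            response = session.get(path)
            response.raise_for_status()
            return response.content
    except requests.exceptions.MissingSchema:
        with open(path, 'rb') as fobj:  # `path` is a local file, not a URL
            return fobj.read()
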
Example #4

import selenium.webdriver.support.ui as selenium_ui
from distutils.dir_util import copy_tree

# CHEF and CONTENT DEBUG
################################################################################
DEBUG_MODE = False                    # print extra-verbose info
DOWNLOAD_ONE_TO_webroot = False       # produce debug webroot/ and skip cheffing
DOWNLOAD_ONLY_N = False               # chef only first N books; set to False to disable



sess = requests.Session()
cache = FileCache('.webcache')
# ExpiresAfter(days=1) (a cachecontrol.heuristics heuristic) keeps cached
# responses for one day -- convenient while developing the chef
chefdev_adapter = CacheControlAdapter(heuristic=ExpiresAfter(days=1), cache=cache)
sess.mount('http://3asafeer.com/', chefdev_adapter)
sess.mount('http://fonts.googleapis.com/', chefdev_adapter)


headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive"
}
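
# Illustrative sketch (not part of the original example): the headers dict is
# typically passed per request so the site sees a browser-like User-Agent.
response = sess.get('http://3asafeer.com/', headers=headers)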

# PAUSES and DELAYS
################################################################################
LOADING_WAIT_TIME = 10                                             # long delay
LOADING_WAIT_TIME_MS = LOADING_WAIT_TIME*1000
LOADING_WAIT_TIME_SHORT = 7                                        # short delay
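
# Illustrative sketch (not from the original example): how selenium_ui and the
# LOADING_WAIT_TIME constant above are typically combined to let a js-heavy
# page finish rendering before scraping; the driver choice and readyState
# check are assumptions.
from selenium import webdriver
driver = webdriver.PhantomJS()
driver.get('http://3asafeer.com/')
selenium_ui.WebDriverWait(driver, LOADING_WAIT_TIME).until(
    lambda d: d.execute_script('return document.readyState') == 'complete')
html = driver.page_source
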
Example #5
PRADIGI_LICENSE = get_license(licenses.CC_BY_NC_SA,
                              copyright_holder='PraDigi').as_dict()
PRADIGI_WEBSITE_LANGUAGES = ['Hindi']
PRADIGI_DESCRIPTION = 'Developed by Pratham, these educational games, videos, ' \
                      'and ebooks are designed to teach language learning, math, science, English, ' \
                      'health, and vocational training in Hindi, Marathi, Odia, Bengali, Urdu, ' \
                      'Punjabi, Kannada, Tamil, Telugu, Gujarati and Assamese. Materials are ' \
                      'designed for learners of all ages, including those outside the formal classroom setting.'

# In debug mode, only one topic is downloaded.
LOGGER.setLevel(logging.DEBUG)
DEBUG_MODE = True  # include source_urls in content descriptions

# WebCache logic (downloaded web resources cached for one day -- good for dev)
cache = FileCache('.webcache')
basic_adapter = CacheControlAdapter(cache=cache)
develop_adapter = CacheControlAdapter(heuristic=OneDayCache(), cache=cache)
session = requests.Session()
session.mount('http://www.' + PRADIGI_DOMAIN, develop_adapter)
session.mount('https://www.' + PRADIGI_DOMAIN, develop_adapter)

CHEF_DIR = os.path.dirname(os.path.realpath(__file__))
print(CHEF_DIR)

# LOCALIZATION AND TRANSLATION STRINGS
################################################################################
PRADIGI_STRINGS = {
    'hi': {
        'language_en': 'Hindi',
        'website_lang': 'hn',
        'gamesrepo_suffixes': ['_KKS', '_HI', '_Hi'],