Example #1
    def __init__(self):
        self._API_SERVER = "https://data.nba.net"

        self._cache_control_adapter = CacheControlAdapter(heuristic=LastModified())
        self._requests_session = requests.Session()
        self._requests_session.mount('http://', CacheControlAdapter())
        self._requests_session.mount('https://', CacheControlAdapter())

        self._TEAM_TRICODES = frozenset(('CHA', 'ATL', 'IND', 'MEM', 'DET',
                                         'UTA', 'CHI', 'TOR', 'CLE', 'OKC',
                                         'DAL', 'MIN', 'BOS', 'SAS', 'MIA',
                                         'DEN', 'LAL', 'PHX', 'NOP', 'MIL',
                                         'HOU', 'NYK', 'ORL', 'SAC', 'PHI',
                                         'BKN', 'POR', 'GSW', 'LAC', 'WAS'))

        self._STAT_CATEGORIES = frozenset(('ppg', 'trpg', 'apg', 'fgp', 'ftp',
                                           'tpp', 'bpg', 'spg', 'tpg', 'pfpg'))

        self._CONFERENCES = frozenset(('west', 'east'))

        self._EASTERN_DIVISIONS = frozenset(('southeast', 'atlantic', 'central'))
        self._WESTERN_DIVISIONS = frozenset(('southwest', 'pacific', 'northwest'))
        self._DIVISIONS = {'west': self._WESTERN_DIVISIONS,
                           'east': self._EASTERN_DIVISIONS}

        # Cached dictionaries. Saving these copies avoids having to
        # re-parse JSONs when they are returned from the HTTP cache.
        self._person_ids = None
        self._team_ids_to_tricodes = None
        self._team_tricodes_to_ids = None
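Note that the adapter created with the `LastModified` heuristic at the top of this `__init__` is stored but never mounted; the session ends up using the two bare `CacheControlAdapter()` instances instead. A minimal standalone sketch of wiring the heuristic-backed adapter into the session (illustrative, not the project's actual code):

import requests
from cachecontrol import CacheControlAdapter
from cachecontrol.heuristics import LastModified

# One shared adapter so both schemes reuse the same cache and heuristic.
adapter = CacheControlAdapter(heuristic=LastModified())

session = requests.Session()
session.mount('http://', adapter)
session.mount('https://', adapter)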
Example #2
 def cache_storage(self, value):
     if value is None:
         self.mount('http://', requests.adapters.HTTPAdapter())
         self.mount('https://', requests.adapters.HTTPAdapter())
     else:
         adapter = CacheControlAdapter(cache=value)
         if self.cache_all:
             adapter.heuristic = ExpiresAfter(days=365)
         self.mount('http://', adapter)
         self.mount('https://', adapter)
Example #3
    def __init__(self, main_source_domain=None, start_page=None):
        if main_source_domain is None and start_page is None:
            raise ValueError('Need to specify main_source_domain or start_page.')
        if main_source_domain:
            self.MAIN_SOURCE_DOMAIN = main_source_domain.rstrip('/')
            self.START_PAGE = self.MAIN_SOURCE_DOMAIN
        if self.MAIN_SOURCE_DOMAIN is None:
            parsedurl = urlparse(start_page)
            self.MAIN_SOURCE_DOMAIN = parsedurl.scheme + '://' + parsedurl.netloc
        if self.MAIN_SOURCE_DOMAIN not in self.SOURCE_DOMAINS:
            self.SOURCE_DOMAINS.append(self.MAIN_SOURCE_DOMAIN)
        if start_page:
            self.START_PAGE = start_page

        # resolve any redirects
        #verdict, head_response = self.is_html_file(self.START_PAGE)
        is_new_url, content_type, content_length, return_url = self.get_url_type(self.START_PAGE)
        if content_type == 'text/html':
            self.START_PAGE = return_url
        else:
            raise ValueError('The Starting URL ' + self.START_PAGE + ' did not return any html.')

        forever_adapter = CacheControlAdapter(heuristic=CacheForeverHeuristic(), cache=self.CACHE)
        for source_domain in self.SOURCE_DOMAINS:
            self.SESSION.mount(source_domain, forever_adapter)   # TODO: change to less aggressive in final version
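`CacheForeverHeuristic` is defined elsewhere in that project and is not shown here. A rough stand-in built on cachecontrol's `BaseHeuristic`, which simply pushes the expiry far into the future (the ten-year window is an arbitrary choice for illustration):

import calendar
from datetime import datetime, timedelta
from email.utils import formatdate, parsedate
from cachecontrol.heuristics import BaseHeuristic

class CacheForeverHeuristic(BaseHeuristic):
    """Treat every response as fresh for a very long time."""

    def update_headers(self, response):
        date = parsedate(response.headers.get('date', formatdate()))
        expires = datetime(*date[:6]) + timedelta(days=3650)  # ~10 years, arbitrary
        return {
            'expires': formatdate(calendar.timegm(expires.timetuple())),
            'cache-control': 'public',
        }

    def warning(self, response):
        return '110 - "Response cached aggressively by crawler heuristic."'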
Example #4
def get_events_from_icalendars():
    global now, midnight

    now = localtz.localize(datetime.datetime.now())
    midnight = localtz.localize(datetime.datetime.combine(now, datetime.time(0,0,0)))

    cz = Calzone()

    session = FuturesSession()
    session.mount('https://', CacheControlAdapter(cache=FileCache('.webcache'), heuristic=ForceCacheHeuristic()))

    cals = {k: session.get(u) for k,u in calendars.items()}

    concurrent.futures.wait(cals.values())

    for k,req in cals.items():
        try:
            cz.load(req.result().text)
        except Exception as err:
            print("Failed to load calendar '{}'".format(k))
            print (err)

    try:
        events = cz.get_events(midnight, midnight + datetime.timedelta(days=90))
    except Exception as e:
        print(e)
        events = []

    events.sort(key=lambda e: e.start)

    return events
Example #5
    def __init__(self):
        super(ProjectHolder, self).__init__()

        app_name = __name__.split('.')[0]

        self.cache_dir = user_cache_dir(app_name)
        log.info("Using cache directory: {}.".format(self.cache_dir))
        self.cache = FileCache(self.cache_dir)
        cache_adapter = CacheControlAdapter(cache=self.cache)
        self.mount("http://", cache_adapter)
        self.mount("https://", cache_adapter)

        self.headers.update(
            {'User-Agent': '{}/{}'.format(app_name, __version__)})
        log.info('Created instance of {}'.format(type(self).__name__))
        self.branches = None
        self.only = None
        self.exclude = None
        self.having_asset = None
        self.hostname = None
        # identifies project on a given hostname
        self.repo = None
        # short name for "repo", useful in URLs
        self.name = None
        # in some cases no repo is specified; instead a feed is discovered, so no repo is set
        self.feed_url = None
Example #6
def _build_session():
  """Builds a requests session that caches responses where possible, making redirects faster.

  Returns:
      requests.Session -- A shared session to use for the notebook
  """
  result = requests.session()

  # Set up caching.  Particularly obey and cache 307 redirects to avoid duplicate expensive calls when we already
  # have a result
  cache_adapter = CacheControlAdapter()
  cache_adapter.controller = CacheController(cache=cache_adapter.cache, status_codes=(200, 203, 300, 301, 307))

  result.mount('http://', cache_adapter)
  result.mount('https://', cache_adapter)
  return result
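A short usage sketch, assuming the notebook calls the helper once and reuses the session (the URL is purely illustrative): repeated GETs to the same redirecting endpoint then reuse the cached 301/307 answer instead of re-issuing the expensive upstream call.

session = _build_session()

first = session.get('https://example.com/redirecting/endpoint')
second = session.get('https://example.com/redirecting/endpoint')
# Recent cachecontrol releases mark cached answers with a `from_cache`
# attribute, which can be used to confirm the second call was served locally.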
Example #7
 def __init__(self, filename=""):
     super(BabelNet, self).__init__()
     if filename == "":
         filename = "babelnet_cache"
     self.mount('https://', CacheControlAdapter(cache=FileCache(filename)))
     self.headers.update({'Accept-Encoding': 'gzip'})
     self.params.update({'key': cfg.babelnet_key})
     self.endpoint = "https://babelnet.io/v4/"
Example #8
    def request(self, method, url, headers=None, params=None, proxies=None, cache=True, verify=False, *args, **kwargs):
        if headers is None: headers = {}
        if params is None: params = {}
        if proxies is None: proxies = {}

        headers['Accept-Encoding'] = 'gzip, deflate'
        headers["User-Agent"] = sickrage.srCore.USER_AGENT

        # request session ssl verify
        if sickrage.srCore.srConfig.SSL_VERIFY:
            try:
                verify = certifi.where()
            except:
                pass

        # request session proxies
        if 'Referer' not in headers and sickrage.srCore.srConfig.PROXY_SETTING:
            sickrage.srCore.srLogger.debug("Using global proxy: " + sickrage.srCore.srConfig.PROXY_SETTING)
            scheme, address = urllib2.splittype(sickrage.srCore.srConfig.PROXY_SETTING)
            address = sickrage.srCore.srConfig.PROXY_SETTING if scheme \
                else 'http://{}'.format(sickrage.srCore.srConfig.PROXY_SETTING)
            proxies.update({"http": address, "https": address})
            headers.update({'Referer': address})

        # setup caching adapter
        if cache:
            adapter = CacheControlAdapter(DBCache(os.path.abspath(os.path.join(sickrage.DATA_DIR, 'sessions.db'))))
            self.mount('http://', adapter)
            self.mount('https://', adapter)

        # get web response
        response = super(srSession, self).request(
            method,
            url,
            headers=headers,
            params=params,
            verify=verify,
            proxies=proxies,
            *args, **kwargs
        )

        try:
            # check web response for errors
            response.raise_for_status()
        except requests.exceptions.SSLError as e:
            if ssl.OPENSSL_VERSION_INFO < (1, 0, 1, 5):
                sickrage.srCore.srLogger.info(
                    "SSL Error requesting url: '{}' You have {}, try upgrading OpenSSL to 1.0.1e+".format(
                        e.request.url, ssl.OPENSSL_VERSION))

            if sickrage.srCore.srConfig.SSL_VERIFY:
                sickrage.srCore.srLogger.info(
                    "SSL Error requesting url: '{}', try disabling cert verification in advanced settings".format(
                        e.request.url))
        except Exception:
            pass

        return response
Example #9
def set_cache_expiration_delay(seconds):
    """
    Set a cache for requests with a given expiration time.
    """
    adapter = CacheControlAdapter(
        heuristic=ExpiresAfter(seconds=seconds)
    )
    session.mount('http://', adapter)
    return session
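The helper above mounts the adapter only for plain HTTP and leans on a module-level `session` object that is not shown. A self-contained variant covering both schemes might look like this (the function name is illustrative, not from the original module):

import requests
from cachecontrol import CacheControlAdapter
from cachecontrol.heuristics import ExpiresAfter

def make_expiring_session(seconds):
    """Build a session whose responses are cached for `seconds` seconds."""
    adapter = CacheControlAdapter(heuristic=ExpiresAfter(seconds=seconds))
    session = requests.Session()
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session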
Example #10
 def session(self):
     if self._session is None:
         self._session = real_requests.Session()
         if CacheControlAdapter:
             adapter = CacheControlAdapter(cache=FileCache(".webcache"))
             self._session.mount("http://", adapter)
             self._session.mount("https://", adapter)
             print("Caching to .webcache")
     return self._session
Example #11
def _get_requests_cache_adapter(heuristic):
    """
    Given a heuristic, constructs and returns a
    :class:`cachecontrol.CacheControlAdapter` attached to the instance's
    :data:`requests_cache`.

    """
    return CacheControlAdapter(cache=requests_cache,
                               heuristic=heuristic,
                               cache_etags=False)
Example #12
File: auth.py Project: alin23/spfy
 def get_session(*args, **kwargs):
     session = OAuth2Session(*args, **kwargs)
     cache_adapter = CacheControlAdapter(
         cache=FileCache(CACHE_FILE),
         pool_connections=config.http.connections,
         pool_maxsize=config.http.connections,
         max_retries=config.http.retries,
     )
     session.mount("http://", cache_adapter)
     return session
Example #13
    def __init__(self, headers=None, cookies=None, cache_name=None, delay=1, expire_hours=12, as_string=False):
        '''
        Base class for common scraping tasks
        Args:
            headers: dict of headers
            cookies: cookiejar object
            cache_name: should be full path
            delay: int (be polite!!!)
            expire_hours: int - default 12
            as_string: get string rather than parsed json
        '''
        logging.getLogger(__name__).addHandler(logging.NullHandler())

        if not cookies:
            try:
                import cookielib
                cookies = cookielib.MozillaCookieJar()
            except (NameError, ImportError) as e:
                try:
                    import http.cookiejar
                    cookies = http.cookiejar.MozillaCookieJar()
                except Exception as e:
                    pass

        _s = requests.Session()
        _s.cookies = cookies

        if headers:
            _s.headers.update(headers)
        else:
            _s.headers.update({'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'})

        if cache_name:
            if not '/' in cache_name:
                cache_name = os.path.join('/tmp', cache_name)
            try:
                from cachecontrol import CacheControlAdapter
                from cachecontrol.heuristics import ExpiresAfter
                from cachecontrol.caches import FileCache
                _s.mount('http://', CacheControlAdapter(cache=FileCache(cache_name), cache_etags = False, heuristic=ExpiresAfter(hours=expire_hours)))
            except ImportError as e:
                try:
                    import requests_cache
                    requests_cache.install_cache(cache_name)
                except:
                    pass

        self.s = _s
        self.urls = []
        self.as_string = as_string

        if delay > 0:
            self.delay = delay
        else:
            self.delay = None
Example #14
def getsession(timeframe):
    if timeframe in sessions:
        return sessions[timeframe]
    else:
        sess = requests.Session()
        sess.cookies = __cookie
        if timeframe == 0:
            sess.mount("http://", CacheControlAdapter(cache=__cache))
            sess.mount("https://", CacheControlAdapter(cache=__cache))
        else:
            sess.mount(
                "http://",
                CacheControlAdapter(cache=__cache,
                                    heuristic=timecache(timeframe)))
            sess.mount(
                "https://",
                CacheControlAdapter(cache=__cache,
                                    heuristic=timecache(timeframe)))
        sessions[timeframe] = sess
        return sess
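`timecache` and `__cache` come from elsewhere in that module. Assuming `timeframe` is a number of seconds, a stand-in heuristic can be built from cachecontrol's stock `ExpiresAfter` (a guess at the intent, not the project's implementation):

from cachecontrol.heuristics import ExpiresAfter

def timecache(timeframe):
    # Treat responses as fresh for `timeframe` seconds.
    return ExpiresAfter(seconds=timeframe)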
Example #15
def add_cache_control(session, cache_control_config):
    """Add cache_control adapter to session object."""
    adapter = CacheControlAdapter(
        DictCache(),
        cache_etags=cache_control_config.get('cache_etags', True),
        serializer=cache_control_config.get('serializer', None),
        heuristic=cache_control_config.get('heuristic', None),
    )
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    session.cache_controller = adapter.controller
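The expected shape of `cache_control_config` follows from the `.get()` calls above; a small usage sketch (the one-hour `ExpiresAfter` is just an example value):

import requests
from cachecontrol.heuristics import ExpiresAfter

session = requests.Session()
add_cache_control(session, {
    'cache_etags': True,                 # honour ETag revalidation
    'serializer': None,                  # fall back to cachecontrol's default serializer
    'heuristic': ExpiresAfter(hours=1),  # example freshness policy
})
# The controller is afterwards reachable as session.cache_controller.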
Example #16
 def _create_session(self, max_retries, proxies, backoff_factor, cache):
     sess = Session()
      # Retry only on idempotent methods and only on 429 (Too Many Requests) responses
     retries = Retry(total=max_retries, backoff_factor=backoff_factor, status_forcelist=[429], method_whitelist=['GET', 'UPDATE', 'DELETE'])
     retries_adapter = HTTPAdapter(max_retries=retries)
      sess.mount('http://', retries_adapter)
      if cache:
          # only mount the caching adapter when caching was requested;
          # otherwise cache_adapter would be undefined here
          cache_adapter = CacheControlAdapter(cache_etags=True)
          sess.mount('http://', cache_adapter)
      sess.proxies.update(proxies)
      return sess
Example #17
    def __init__(self, proxies=None, cache=True):
        super(WebSession, self).__init__()

        # setup caching adapter
        if cache:
            adapter = CacheControlAdapter()
            self.mount('http://', adapter)
            self.mount('https://', adapter)

        # add proxies
        self.proxies = proxies or _add_proxies()

        # add hooks
        self.hooks['response'] += [WebHooks.log_url, WebHooks.cloudflare]
Example #18
 def __init__(self, url, max_retries, expires_after_sec):
     self.hasError = False
     self.fromCache = False
     self.url = url
     self.cacheEnabled = expires_after_sec > 0
     self.session = requests.Session()
     retryPolicy = urllib3.util.Retry(max_retries, status_forcelist=[400])
     if self.cacheEnabled:
         self.session.mount(
             url,
             CacheControlAdapter(
                 max_retries=retryPolicy,
                 heuristic=ExpiresAfter(seconds=expires_after_sec)))
     else:
         self.session.mount(
             url, requests.adapters.HTTPAdapter(max_retries=retryPolicy))
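requests resolves mounted adapters by longest matching URL prefix, so mounting on the full `url` above scopes the cache and retry policy to that one endpoint while everything else keeps the default adapter. A tiny illustration (the hostname is made up):

import requests
from cachecontrol import CacheControlAdapter

session = requests.Session()
session.mount('https://api.example.com', CacheControlAdapter())
# Requests to https://api.example.com/... go through the caching adapter;
# requests to other hosts still use the default adapter mounted on 'https://'.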
Example #19
    def __init__(self, proxies=None, cache=True):
        super(WebSession, self).__init__()

        # setup caching adapter
        if cache:
            adapter = CacheControlAdapter()
            self.mount('http://', adapter)
            self.mount('https://', adapter)

        # add proxies
        self.proxies = proxies or _add_proxies()

        # add hooks
        self.hooks['response'] += [WebHooks.log_url, WebHooks.cloudflare]

        # add headers
        self.headers.update({'Accept-Encoding': 'gzip, deflate', 'User-Agent': sickrage.app.user_agent})
Example #20
 def __init__(
     self,
     user=None,
     prefix_url=settings.SITE_URL,
     save_auth=None,
     verify=settings.CERN_SSL_CHAIN,
 ):
     super(Session, self).__init__()
     self.user = user if user else User(save_auth=save_auth)
     self.auth = self._authorize
     self.prefix_url = prefix_url
     self.verify = verify
     # store last call
     self._response = None
     # add caching
     super(Session, self).mount(
         self.prefix_url, CacheControlAdapter(cache=FileCache('.webcache'))
     )
Example #21
    def __init__(self, *args, **kwargs):
        retries = kwargs.pop("retries", 0)
        cache = kwargs.pop("cache", None)
        super(PypiSession, self).__init__(*args, **kwargs)

        # Attach our User Agent to the request
        self.headers["User-Agent"] = user_agent()

        # Attach our Authentication handler to the session
        self.auth = MultiDomainBasicAuth()

        # Create our urllib3.Retry instance which will allow us to customize
        # how we handle retries.
        retries = urllib3.Retry(
            # Set the total number of retries that a particular request can
            # have.
            total=retries,

            # A 503 error from PyPI typically means that the Fastly -> Origin
            # connection got interrupted in some way. A 503 error in general
            # is typically considered a transient error so we'll go ahead and
            # retry it.
            status_forcelist=[503],

            # Add a small amount of back off between failed requests in
            # order to prevent hammering the service.
            backoff_factor=0.25,
        )

        # We want to _only_ cache responses on securely fetched origins. We do
        # this because we can't validate the response of an insecurely fetched
        # origin, and we don't want someone to be able to poison the cache and
        # require manual eviction from the cache to fix it.
        if cache:
            secure_adapter = CacheControlAdapter(
                cache=SafeFileCache(cache, use_dir_lock=True),
                max_retries=retries,
            )
        else:
            secure_adapter = HTTPAdapter(max_retries=retries)

        self.mount("https://", secure_adapter)
        self.mount("file://", LocalFSAdapter())
Example #22
def get_feed(feed_url):
    """
    Return the parsed feed.
    """
    requests_timeout = getattr(settings, 'FEED_TIMOUT', 1)

    cache_adapter = CacheControlAdapter(
        cache=FileCache('.web_cache'),
        heuristic=ExpiresAfter(hours=1),
    )

    session = requests.Session()
    session.mount('http://', cache_adapter)
    session.mount('https://', cache_adapter)

    show_exceptions = getattr(settings, 'DEBUG', True)

    feed_request = session.get(feed_url, timeout=requests_timeout)

    return feedparser.parse(feed_request.text)
Example #23
    def _build_session(self, max_retries):
        from requests.adapters import HTTPAdapter

        if not isinstance(max_retries, int):
            raise ValueError(f'int expected, found {type(max_retries)}.')
        elif max_retries < 1:
            raise ValueError('max_retries should be greater or equal to 1.')

        session = requests.Session()

        # mount retries adapter
        session.mount(
            'http://',
            HTTPAdapter(max_retries=Retry(total=max_retries,
                                          method_whitelist=frozenset(
                                              ['GET', 'POST']))))

        # mount cache adapter
        session.mount('http://',
                      CacheControlAdapter(heuristic=ExpiresAfter(hours=1)))

        session.headers['User-Agent'] = USER_AGENT

        self.session = session
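Both adapters in `_build_session` are mounted on the same 'http://' prefix, so the second mount simply replaces the first and the retry policy is lost. Because `CacheControlAdapter` subclasses requests' `HTTPAdapter` and forwards extra keyword arguments to it (Example #12 relies on the same behaviour), one adapter can carry both concerns; a sketch under that assumption:

import requests
from urllib3.util.retry import Retry
from cachecontrol import CacheControlAdapter
from cachecontrol.heuristics import ExpiresAfter

def build_cached_retrying_session(max_retries=3):
    adapter = CacheControlAdapter(
        heuristic=ExpiresAfter(hours=1),
        # method_whitelist was renamed allowed_methods in urllib3 2.x
        max_retries=Retry(total=max_retries,
                          method_whitelist=frozenset(['GET', 'POST'])),
    )
    session = requests.Session()
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session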
Example #24
    def __init__(self, main_source_domain=None, start_page=None):
        if main_source_domain is None and start_page is None:
            raise ValueError(
                'Need to specify main_source_domain or start_page.')
        if main_source_domain:
            self.MAIN_SOURCE_DOMAIN = main_source_domain.rstrip('/')
            self.START_PAGE = self.MAIN_SOURCE_DOMAIN
        if self.MAIN_SOURCE_DOMAIN is None:
            parsedurl = urlparse(start_page)
            self.MAIN_SOURCE_DOMAIN = parsedurl.scheme + '://' + parsedurl.netloc
        if self.MAIN_SOURCE_DOMAIN not in self.SOURCE_DOMAINS:
            self.SOURCE_DOMAINS.append(self.MAIN_SOURCE_DOMAIN)
        if start_page:
            self.START_PAGE = start_page

        # keep track of broken links
        self.broken_links = []

        forever_adapter = CacheControlAdapter(
            heuristic=CacheForeverHeuristic(), cache=self.CACHE)
        for source_domain in self.SOURCE_DOMAINS:
            self.SESSION.mount(
                source_domain, forever_adapter
            )  # TODO: change to less aggressive in final version
Example #25
import zipfile
from typing import Any, Dict, Optional

import aiohttp
import requests
from cachecontrol import CacheControl, CacheControlAdapter
from cachecontrol.caches.file_cache import FileCache
from cachecontrol.heuristics import ExpiresAfter

from shared import configuration, perf
from shared.pd_exception import OperationalException

SESSION = CacheControl(requests.Session(),
                       cache=FileCache(configuration.get('web_cache')))
SESSION.mount('http://whatsinstandard.com',
              CacheControlAdapter(heuristic=ExpiresAfter(days=14)))

AIOSESSION = aiohttp.ClientSession()


def unzip(url: str, path: str) -> str:
    location = '{scratch_dir}/zip'.format(
        scratch_dir=configuration.get('scratch_dir'))

    def remove_readonly(func, path, _):
        os.chmod(path, stat.S_IWRITE)
        func(path)

    shutil.rmtree(location, True, remove_readonly)
    os.mkdir(location)
    store(url, '{location}/zip.zip'.format(location=location))
Example #26
    def __init__(self, **kwargs):
        '''
        Base class for common scraping tasks

        Args:

        '''
        logging.getLogger(__name__).addHandler(logging.NullHandler())
        self.urls = []

        # use requests HTML to aid parsing
        # has all same methods as requests.Session
        _s = HTMLSession()

        # delay/expire
        if kwargs.get('delay'):
            self.delay = kwargs['delay']
        else:
            self.delay = 2

        if kwargs.get('expire_hours'):
            self.expire_hours = kwargs['expire_hours']
        else:
            self.expire_hours = 168

        # add cookies
        if kwargs.get('cookies'):
            _s.cookies = kwargs['cookies']
        else:
            try:
                import cookielib
                _s.cookies = cookielib.MozillaCookieJar()
            except (NameError, ImportError):
                import http.cookiejar
                _s.cookies = http.cookiejar.MozillaCookieJar()
                
        # add headers
        if kwargs.get('headers'):
            _s.headers = kwargs['headers']
        else:
            ua = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36')
            _s.headers = {'User-Agent': ua}

        # add proxies
        if kwargs.get('proxies'):
            _s.proxies = kwargs['proxies']

        # add cache
        # NOTE: 'scraper_cache' is only a fallback name; the original assumed cache_name was always supplied
        cache_name = kwargs.get('cache_name', 'scraper_cache')
        if '/' not in cache_name:
            cache_name = os.path.join('/tmp', cache_name)
        self.cache_name = cache_name
        try:
            from cachecontrol import CacheControlAdapter
            from cachecontrol.heuristics import ExpiresAfter
            from cachecontrol.caches import FileCache
            _s.mount('http://', CacheControlAdapter(cache=FileCache(self.cache_name), 
                                    cache_etags = False,
                                    heuristic=ExpiresAfter(hours=self.expire_hours)))
        except ImportError as e:
            try:
                import requests_cache
                requests_cache.install_cache(self.cache_name)
            except:
                logging.exception('could not install cache')
        self.s = _s
Example #27
@oauth_authorized.connect_via(github_bp)
def github_logged_in(blueprint, token):
    if not token:
        flash("Failed to log in with Github")
    if "error_reason" in token:
        msg = "Access denied. Reason={reason} error={error}".format(
            reason=request.args["error_reason"],
            error=request.args["error_description"],
        )
        flash(msg)
    else:
        flash("Successfully signed in with Github")


# install CacheControl for github session, so we don't eat up API usage unnecessarily
github_bp.session.mount(github_bp.session.base_url, CacheControlAdapter())

## UTILITY FUNCTIONS ##


def jira_get(*args, **kwargs):
    """
    JIRA sometimes returns an empty response to a perfectly valid GET request,
    so this will retry it a few times if that happens.
    """
    for _ in range(3):
        resp = jira_bp.session.get(*args, **kwargs)
        if resp.content:
            return resp
    return jira_bp.session.get(*args, **kwargs)
Example #28
import os, datetime
import requests
from .data.rfeed import Item, Feed
from flask import Flask, jsonify, request as flask_request
from cachecontrol import CacheControlAdapter
from cachecontrol.heuristics import LastModified

app = Flask(__name__)

adapter = CacheControlAdapter(heuristic=LastModified())

sess = requests.Session()
sess.mount('http://', adapter)
sess.mount('https://', adapter)

SERVICE_NAME = os.path.splitext(os.path.basename(__file__))[0]


@app.route("/rss/summary", methods=['GET'])
def latest_articles():
    if flask_request.method == 'GET':
        response = sess.get('http://localhost/article/collect/10')
        article_collection = []
        if response.status_code == requests.codes.ok:
            articles = response.json()['success']
            for article in articles:
                article_collection.append(
                    Item(
                        title=article['title'],
                        author=article['author'],
                        pubDate=datetime.datetime.strptime(
Example #29
def cli(url, repositories, search, table, rows, minstar, report, description,
        token):
    MODE = os.environ.get("GHTOPDEP_ENV")
    BASE_URL = 'https://437w61gcj1.execute-api.us-west-2.amazonaws.com/api'
    if MODE == "development":
        BASE_URL = 'http://127.0.0.1:8080'

    if report:
        try:
            result = requests.get('{}/repos?url={}'.format(BASE_URL, url))
            if result.status_code != 404:
                sorted_repos = sort_repos(result.json()['deps'], rows)
                repos = readable_stars(sorted_repos)
                click.echo(tabulate(repos, headers="keys", tablefmt="github"))
                sys.exit()
        except requests.exceptions.ConnectionError as e:
            click.echo(e)

    if (description or search) and token:
        gh = github3.login(token=token)
        CacheControl(gh.session,
                     cache=FileCache(CACHE_DIR),
                     heuristic=OneDayHeuristic())
    elif (description or search) and not token:
        click.echo("Please provide token")
        sys.exit()

    destination = "repository"
    destinations = "repositories"
    if not repositories:
        destination = "package"
        destinations = "packages"

    repos = []
    more_than_zero_count = 0
    total_repos_count = 0
    spinner = Halo(text="Fetching information about {0}".format(destinations),
                   spinner="dots")
    spinner.start()

    sess = requests.session()
    retries = Retry(total=15, backoff_factor=15, status_forcelist=[429])
    adapter = CacheControlAdapter(max_retries=retries,
                                  cache=FileCache(CACHE_DIR),
                                  heuristic=OneDayHeuristic())
    sess.mount("http://", adapter)
    sess.mount("https://", adapter)

    page_url = get_page_url(sess, url, destination)

    while True:
        response = sess.get(page_url)
        parsed_node = HTMLParser(response.text)
        dependents = parsed_node.css(ITEM_SELECTOR)
        total_repos_count += len(dependents)
        for dep in dependents:
            repo_stars_list = dep.css(STARS_SELECTOR)
            # only for ghost or private? packages
            if repo_stars_list:
                repo_stars = repo_stars_list[0].text().strip()
                repo_stars_num = int(repo_stars.replace(",", ""))
            else:
                continue

            if repo_stars_num != 0:
                more_than_zero_count += 1
            if repo_stars_num >= minstar:
                relative_repo_url = dep.css(
                    REPO_SELECTOR)[0].attributes["href"]
                repo_url = "{0}{1}".format(GITHUB_URL, relative_repo_url)

                # the same package can be listed more than once
                is_already_added = already_added(repo_url, repos)
                if not is_already_added and repo_url != url:
                    if description:
                        repo_description = fetch_description(
                            gh, relative_repo_url)
                        repos.append({
                            "url": repo_url,
                            "stars": repo_stars_num,
                            "description": repo_description
                        })
                    else:
                        repos.append({
                            "url": repo_url,
                            "stars": repo_stars_num
                        })

        node = parsed_node.css(NEXT_BUTTON_SELECTOR)
        if len(node) == 2:
            page_url = node[1].attributes["href"]
        elif len(node) == 0 or node[0].text() == "Previous":
            spinner.stop()
            break
        elif node[0].text() == "Next":
            page_url = node[0].attributes["href"]

    if report:
        try:
            requests.post('{}/repos'.format(BASE_URL),
                          json={
                              "url": url,
                              "deps": repos
                          })
        except requests.exceptions.ConnectionError as e:
            click.echo(e)

    sorted_repos = sort_repos(repos, rows)

    if search:
        for repo in repos:
            repo_path = urlparse(repo["url"]).path[1:]
            for s in gh.search_code("{0} repo:{1}".format(search, repo_path)):
                click.echo("{0} with {1} stars".format(s.html_url,
                                                       repo["stars"]))
    else:
        show_result(sorted_repos, total_repos_count, more_than_zero_count,
                    destinations, table)
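`OneDayHeuristic` is defined elsewhere in that project. cachecontrol itself ships a `OneDayCache` heuristic that caches responses for one day when no explicit freshness headers are present, which could likely serve as a drop-in here (an assumption about the project's intent, not a confirmed equivalence):

from cachecontrol import CacheControlAdapter
from cachecontrol.caches import FileCache
from cachecontrol.heuristics import OneDayCache

# 'ghtopdep_cache' is an illustrative path; the example uses its CACHE_DIR constant.
adapter = CacheControlAdapter(cache=FileCache('ghtopdep_cache'),
                              heuristic=OneDayCache())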
Example #30
    def __init__(self, **kwargs):
        """
        """
        logging.getLogger(__name__).addHandler(logging.NullHandler())
        self.urls = []

        # use requests HTML to aid parsing
        # has all same methods as requests.Session
        _s = HTMLSession()
        self.delay = kwargs.get("delay", 2)
        self.expire_hours = kwargs.get("expire_hours", 168)

        # add cookies
        if kwargs.get("cookies"):
            _s.cookies = kwargs["cookies"]
        else:
            import http.cookiejar

            _s.cookies = http.cookiejar.MozillaCookieJar()

        # add headers
        default_headers = {
            "User-Agent": random.choice(USER_AGENTS),
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "en-US,en;q=0.9",
            "accept": "application/json, text/plain, */*",
        }
        _s.headers.update(default_headers)
        if kwargs.get("headers"):
            _s.headers.update(kwargs["headers"])

        # add proxies
        if kwargs.get("proxies"):
            _s.proxies = kwargs["proxies"]

        # add cache
        if not kwargs.get("cache_name"):
            self.cache_name = os.path.join("/tmp", random_string(32))
        elif "/" not in kwargs.get("cache_name", ""):
            self.cache_name = os.path.join("/tmp", kwargs["cache_name"])
        else:
            self.cache_name = kwargs.get("cache_name")

        try:
            from cachecontrol import CacheControlAdapter
            from cachecontrol.heuristics import ExpiresAfter
            from cachecontrol.caches import FileCache

            _s.mount(
                "http://",
                CacheControlAdapter(
                    cache=FileCache(self.cache_name),
                    cache_etags=False,
                    heuristic=ExpiresAfter(hours=self.expire_hours),
                ),
            )
        except ImportError:
            try:
                import requests_cache

                requests_cache.install_cache(self.cache_name)
            except BaseException:
                logging.exception("could not install cache")
        self.session = _s
Example #31
TIMEOUT_SECONDS = 10  # Seconds before URL query timeout is raised

PROVIDERS_URLS = [
    "https://providers.optimade.org/v1/links",
    "https://raw.githubusercontent.com/Materials-Consortia/providers/master/src"
    "/links/v1/providers.json",
]

CACHE_DIR = Path(appdirs.user_cache_dir("optimade-client", "CasperWA"))
CACHE_DIR.mkdir(parents=True, exist_ok=True)
CACHED_PROVIDERS = CACHE_DIR / "cached_providers.json"

SESSION = requests.Session()
SESSION_ADAPTER = CacheControlAdapter(
    cache=FileCache(CACHE_DIR / ".requests_cache"), heuristic=ExpiresAfter(days=1)
)
SESSION_ADAPTER_DEBUG = CacheControlAdapter()
SESSION.mount("http://", SESSION_ADAPTER)
SESSION.mount("https://", SESSION_ADAPTER)
SESSION.mount("http://localhost", SESSION_ADAPTER_DEBUG)
SESSION.mount("http://127.0.0.1", SESSION_ADAPTER_DEBUG)

# Currently known providers' development OPTIMADE base URLs
DEVELOPMENT_PROVIDERS = {"mcloud": "https://dev-www.materialscloud.org/optimade"}

try:
    DEVELOPMENT_MODE = bool(int(os.getenv("OPTIMADE_CLIENT_DEVELOPMENT_MODE", "0")))
except ValueError:
    LOGGER.debug(
        (