def request(self, method, url, params=None, headers=None, to_json=True, data=None, **kwargs):
    """ Make request to TC API. """
    url, params, headers, data = self.prepare(url, params, headers, data)

    if self.options['cache']:
        rc.install_cache(self.options['cache'])
    elif type(self).cache_installed:
        rc.uninstall_cache()
    type(self).cache_installed = bool(self.options['cache'])

    try:
        response = rs.api.request(
            method, url, params=params, headers=headers, data=data, **kwargs)
        logger.debug(response.content)
        response.raise_for_status()
        if to_json:
            response = response.json()
    except (ValueError, rs.HTTPError):
        if locals().get('response') is not None:
            message = "%s: %s" % (response.status_code, response.content)
            raise TCException(message)
        raise

    return response

def get_api_keyboards(verbose=False):
    """
    Get Keyboards data from web api.

    Args:
        verbose (bool, default False): verbose output

    Returns:
        dict: Keyboard data
        None: if http request not successful
    """
    api_url = "https://api.keyman.com/cloud/4.0/keyboards?version=10.0"
    headers = {'Content-Type': 'application/json',
               'Accept-Encoding': 'gzip, deflate, br'}
    home = str(Path.home())
    cache_dir = keyman_cache_dir()
    current_dir = os.getcwd()
    expire_after = datetime.timedelta(days=1)
    if not os.path.isdir(cache_dir):
        os.makedirs(cache_dir)
    os.chdir(cache_dir)
    requests_cache.install_cache(cache_name='keyman_cache', backend='sqlite',
                                 expire_after=expire_after)
    now = time.ctime(int(time.time()))
    response = requests.get(api_url, headers=headers)
    if verbose:
        print("Time: {0} / Used Cache: {1}".format(now, response.from_cache))
    os.chdir(current_dir)
    if response.status_code == 200:
        # return json.loads(response.content.decode('utf-8'))
        return response.json()
    else:
        return None

def setup(self):
    defaults = dict(name='Matkailu- ja kongressitoimisto')
    self.data_source, _ = DataSource.objects.get_or_create(id=self.name, defaults=defaults)
    self.tprek_data_source = DataSource.objects.get(id='tprek')

    ytj_ds, _ = DataSource.objects.get_or_create(defaults={'name': 'YTJ'}, id='ytj')

    org_args = dict(origin_id='0586977-6', data_source=ytj_ds)
    defaults = dict(name='Helsingin Markkinointi Oy')
    self.organization, _ = Organization.objects.get_or_create(
        defaults=defaults, **org_args)

    place_list = Place.objects.filter(data_source=self.tprek_data_source, deleted=False)
    deleted_place_list = Place.objects.filter(data_source=self.tprek_data_source, deleted=True)
    # Get only places that have unique names
    place_list = place_list.annotate(count=Count('name_fi')).filter(count=1).values('id', 'origin_id', 'name_fi')
    deleted_place_list = deleted_place_list.annotate(count=Count('name_fi')).\
        filter(count=1).values('id', 'origin_id', 'name_fi', 'replaced_by_id')
    self.tprek_by_name = {p['name_fi'].lower(): (p['id'], p['origin_id']) for p in place_list}
    self.deleted_tprek_by_name = {
        p['name_fi'].lower(): (p['id'], p['origin_id'], p['replaced_by_id']) for p in deleted_place_list}

    if self.options['cached']:
        requests_cache.install_cache('matko')

def reset_cache(self, cache_duration=None):
    """Remove any cached singles or albums charts.

    Because the UK Top40 charts only change once per week, :py:class:`Top40`
    will cache the results of singles and albums. This means that during the
    execution of a program, repeated calls to retrieve singles and albums
    chart information will only actually call the remote API once.

    If, for whatever reason, you need to ensure that an attempt to access
    single or album information actually results in a call to the remote API,
    then calling the :py:meth:`Top40.reset_cache` method will do this, by
    clearing down any existing cached chart information.

    If a cache is in place, then the results will also be cached across
    python runtime executions.

    Params:
        cache_duration (:py:class:`int`): If ``None`` we will uninstall the
            requests cache and the next read from the API will cause a remote
            call to be executed. Otherwise it specifies the number of seconds
            before the persistent cache will expire.
    """
    if cache_duration is None:
        # We are disabling the existing persistent cache
        requests_cache.uninstall_cache()
    else:
        # We are setting a persistent cache, so insert the duration into our cache config
        self.cache_config['expire_after'] = cache_duration
        # and then install the cache with this configuration
        requests_cache.install_cache(**self.cache_config)

    # Remember the new duration
    self.cache_duration = cache_duration

    # Reset the in-memory caches to force a read from the remote site
    self._albums_chart = None
    self._singles_chart = None

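# Usage sketch for the cache behaviour documented above (not from the source:
# the module path and the Top40() constructor call are assumptions; only the
# reset_cache() semantics come from the docstring).
from top40 import Top40  # assumed import path

chart = Top40()
chart.reset_cache(cache_duration=7 * 24 * 3600)  # persist API responses for one week
chart.reset_cache()                              # uninstall the cache; next read hits the remote API
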
def __get_session() -> requests.Session:
    """
    Get or create a requests session for MTGStocks.
    :return: Session data
    """
    if mtgjson4.USE_CACHE.get():
        requests_cache.install_cache(
            "stocks_cache",
            backend="sqlite",
            expire_after=mtgjson4.SESSION_CACHE_EXPIRE_STOCKS,
        )

    session: Optional[requests.Session] = SESSION.get(None)
    if session is None:
        session = requests.Session()

        if mtgjson4.CONFIG_PATH.is_file():
            # Open and read MTGJSON secret properties
            config = configparser.RawConfigParser()
            config.read(mtgjson4.CONFIG_PATH)
            SESSION_TOKEN.set(config.get("MTGStocks", "token"))

        session = util.retryable_session(session)
        SESSION.set(session)
    return session

def handle(self, *args, **options):
    self.logger = logging.getLogger(__name__)
    if options['cached']:
        import requests_cache
        requests_cache.install_cache("update-social")
    self.updater = FeedUpdater(self.logger)
    self.updater.update_feeds()

def __init__(self, acs, tz):
    self.acs = acs
    self.tz = tz

    # Install sqlite cache for celestrak with a 24 hour duration
    # Good enough for celestrak and other data. Cache disabled when appropriate
    requests_cache.install_cache('teeminus10_cache', expire_after=24 * 60 * 60)
    requests_cache.clear()

def main():
    global args
    parser = argparse.ArgumentParser(description='从国家统计局网站下载最新的行政区')
    parser.add_argument('input', const="", default="", type=str, nargs="?")
    parser.add_argument("--sqlite3", type=str, help='SQLite文件位置')
    parser.add_argument("--mysql", type=str, help='mysql dsn')
    parser.add_argument('--mysql-host', type=str, help='mysql host')
    parser.add_argument('--mysql-port', type=str, help='mysql port')
    parser.add_argument('--mysql-user', type=str, help='mysql user')
    parser.add_argument('--mysql-password', type=str, help='mysql password')
    parser.add_argument('--mysql-database', type=str, help='mysql database')
    parser.add_argument('--skip-province', type=int, help='跳过省份的第x个')
    parser.add_argument('--verbose', '-v', action='count', help='打印日志内容')
    parser.add_argument('--dump', action='store', default='txt',
                        help='输出内容的格式 csv txt xml json jsonp')
    parser.add_argument('--dump-children', action='store_true',
                        help='打印子级内容')
    parser.add_argument('--region-type', action='store', default='province',
                        help='')
    parser.add_argument('--requests-cache', action='store',
                        default='/tmp/cnregion_requests_cache.sqlite')
    args = parser.parse_args(sys.argv[1:])

    requests_cache.install_cache(args.requests_cache)

    fetch.VERBOSE_LEVEL = args.verbose
    printer = Printer(args.dump)
    if args.region_type == "city":
        for province in fetch_provinces():
            print printer.province(province)


if "__main__" == __name__:
    main()

def pytest_configure(config):
    if config.getoption('--use-cache'):
        import requests_cache
        requests_cache.install_cache('test_cache')
    api = Api()
    pytest.game_ids = api.GetSeasonGameIDs('2009-10', 'Regular Season')[:2]
    # Hack to carry the gameids to tests
    pytest.game_ids = ['0020900292']

def __init__(self, cache, http_cfg):
    default_cfg = dict(stream=True, timeout=30.1)
    for it in default_cfg.items():
        http_cfg.setdefault(*it)
    self.config = DictLike(http_cfg)
    if cache:
        requests_cache.install_cache(**cache)

def get_vhosts(ip, first=1, no_cache=False):
    """Returns a list of webs hosted on IP (checks bing.com)

    >>> 'www.bing.com' in get_vhosts('204.79.197.200')
    True
    """
    if not no_cache:
        homedir = pwd.getpwuid(os.getuid()).pw_dir
        requests_cache.install_cache(homedir + '/.habu_requests_cache')

    url = "http://www.bing.com/search?q=ip:{ip}&first={first}".format(ip=ip, first=first)
    response = requests.get(url)

    soup = BeautifulSoup(response.text, "html.parser")

    vhosts = set()
    for h2 in soup.find_all('h2'):
        for link in h2.find_all('a'):
            href = link.get('href')
            if href.startswith('http://') or href.startswith('https://'):
                vhost = href.split('/')[2]
                vhosts.add(vhost)

    return list(vhosts)

def setUp(self):
    requests_cache.install_cache(
        cache_name=os.path.join(os.path.dirname(__file__), "test"),
        allowable_methods=('GET', 'POST')
    )

    self.ts_beg = datetime.datetime(2015, 3, 5, 0)
    self.ts_end = datetime.datetime(2015, 3, 5, 3)

def command(self):
    self._load_config()
    import ckan.model as model

    # Cache all HTTP requests for 24 hours
    requests_cache.install_cache('opennames_cache', expire_after=86400)

    rev = model.repo.new_revision()

    print "Processing Organisations"
    for entities in self.opennames_entity_generator():
        for entity in self.closed_generator(entities):
            group = model.Group.get(entity)
            if not group:
                print "Group {} does not exist".format(entity)
                continue
            print "Updating {}".format(entity)
            group.extras['closed'] = True
            model.Session.add(group)
            model.Session.commit()

    print "Processing PCTs"
    for trust in self.pcts():
        print "Updating {}".format(trust.name)
        trust.extras['closed'] = True
        trust.extras['replaced_by'] = "national-health-service"
        model.Session.add(trust)
        model.Session.commit()

    model.repo.commit_and_remove()

def enable_cache(fileprefix, cachetype, expiry):
    """
    If the requests_cache package is available, install a cache and
    begin using it globally. Returns True if caching was successfully
    enabled, and False otherwise (failed to enable, or enabled already).
    """

    global _CACHE_INSTALLED

    if _CACHE_INSTALLED:
        return False

    try:
        from requests_cache import install_cache
        from requests_cache.core import remove_expired_responses

        install_cache(fileprefix, cachetype, expire_after=expiry)
        remove_expired_responses()

    except ImportError:
        return False

    else:
        _CACHE_INSTALLED = True
        return True

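# Usage sketch for enable_cache() above (not from the source: the cache name,
# backend and expiry values here are illustrative assumptions).
if enable_cache('api_cache', 'sqlite', 3600):
    print('requests responses are now cached globally for one hour')
else:
    print('caching unavailable (requests_cache missing) or a cache was already enabled')
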
def cmd_crtsh(domain, no_cache, no_validate, verbose):
    """Downloads the certificate transparency logs for a domain
    and checks with DNS queries if each subdomain exists.

    Uses multithreading to improve the performance of the DNS queries.

    Example:

    \b
    $ sudo habu.crtsh securetia.com
    [
        "karma.securetia.com.",
        "www.securetia.com."
    ]
    """

    if verbose:
        logging.basicConfig(level=logging.INFO, format='%(message)s')

    if not no_cache:
        homedir = pwd.getpwuid(os.getuid()).pw_dir
        requests_cache.install_cache(homedir + '/.habu_requests_cache', expire_after=3600)

    subdomains = set()

    if verbose:
        print("Downloading subdomain list from https://crt.sh ...", file=sys.stderr)

    req = requests.get("https://crt.sh/?q=%.{d}&output=json".format(d=domain))

    if req.status_code != 200:
        print("[X] Information not available!")
        exit(1)

    json_data = json.loads(req.text)

    for data in json_data:
        name = data['name_value'].lower()
        if '*' not in name:
            subdomains.add(name)

    subdomains = list(subdomains)

    if no_validate:
        print(json.dumps(sorted(subdomains), indent=4))
        return True

    if verbose:
        print("Validating subdomains against DNS servers ...", file=sys.stderr)

    answers = query_bulk(subdomains)

    validated = []

    for answer in answers:
        if answer:
            validated.append(str(answer.qname))

    print(json.dumps(sorted(validated), indent=4))

    return True

def crawl_command(args):
    requests_cache.install_cache('builder_stats')

    CBE_BASE = 'https://chrome-build-extract.appspot.com'
    MASTERS_URL = 'https://chrome-infra-stats.appspot.com/_ah/api/stats/v1/masters'
    master_names = requests.get(MASTERS_URL).json()['masters']

    builder_stats = []

    for master_name in master_names:
        cbe_master_url = '%s/get_master/%s' % (CBE_BASE, master_name)
        master_json = requests.get(cbe_master_url).json()
        # print master_json['slaves'].keys()

        for builder_name, builder_json in master_json['builders'].items():
            cbe_builds_url = '%s/get_builds' % CBE_BASE
            params = {'master': master_name, 'builder': builder_name}
            response_json = requests.get(cbe_builds_url, params=params).json()
            builds = response_json['builds']
            if builds:
                finished_build = next(b for b in builds if b['eta'] is None)
                first_step_name = finished_build['steps'][0]['name']
            else:
                first_step_name = None
            builder_tuple = (master_name, builder_name, first_step_name, builder_json['slaves'])
            print builder_tuple
            builder_stats.append(builder_tuple)

    with open('builder_stats.json', 'w') as stats_file:
        json.dump(builder_stats, stats_file)

def test_fred():
    filename = "fred"

    if expire_after >= 0:
        requests_cache.install_cache(filename, backend='sqlite', expire_after=expire_after)  # expiration seconds
        logging.info("Installing cache '%s.sqlite' with expire_after=%d (seconds)" % (filename, expire_after))
    if expire_after == 0:
        logging.warning("expire_after==0 no cache expiration!")

    start = datetime.datetime(2010, 1, 1)
    end = datetime.datetime(2013, 1, 27)

    #name = "GDP"
    #name = "CPIAUCSL"
    #name = "CPILFESL"
    name = ["CPIAUCSL", "CPILFESL"]
    #name = ["CPIAUCSL", "CPILFESL", "ERROR"]

    data = MyDataReader("FRED").get(name, start, end)
    print(data)

    gdp = web.DataReader(name, "fred", start, end)
    print(gdp)
    print(type(gdp))
    print(gdp.ix['2013-01-01'])
    print(gdp.dtypes)

    diff = gdp - data
    assert(diff.sum().sum() == 0)

def cmd_cymon_ip_timeline(ip, no_cache, verbose, output, pretty):
    """Simple cymon API client.

    Prints the JSON result of a cymon IP timeline query.

    Example:

    \b
    $ habu.cymon.ip.timeline 8.8.8.8
    {
        "timeline": [
            {
                "time_label": "Aug. 18, 2018",
                "events": [
                    {
                        "description": "Posted: 2018-08-18 23:37:39 CEST IDS Alerts: 0 URLQuery Alerts: 1 ...",
                        "created": "2018-08-18T21:39:07Z",
                        "title": "Malicious activity reported by urlquery.net",
                        "details_url": "http://urlquery.net/report/b1393866-9b1f-4a8e-b02b-9636989050f3",
                        "tag": "malicious activity"
                    }
                ]
            },
            ...
    """

    habucfg = loadcfg()

    if 'CYMON_APIKEY' not in habucfg:
        print('You must provide a cymon apikey. Use the ~/.habu.json file (variable CYMON_APIKEY), or export the variable HABU_CYMON_APIKEY')
        print('Get your API key from https://www.cymon.io/')
        sys.exit(1)

    if verbose:
        logging.basicConfig(level=logging.INFO, format='%(message)s')

    if not no_cache:
        homedir = pwd.getpwuid(os.getuid()).pw_dir
        requests_cache.install_cache(homedir + '/.habu_requests_cache')

    url = 'https://www.cymon.io:443/api/nexus/v1/ip/{}/timeline/'.format(ip)
    headers = {'Authorization': 'Token {}'.format(habucfg['CYMON_APIKEY'])}

    r = requests.get(url, headers=headers)

    if r.status_code not in [200, 404]:
        print('ERROR', r)
        return False

    if r.status_code == 404:
        print("Not Found")
        return False

    data = r.json()

    if pretty:
        output.write(pretty_print(data))
    else:
        output.write(json.dumps(data, indent=4))

    output.write('\n')

def _query_api(self):
    """
    Get data from MTA Service api endpoint.
    :return:
    """
    endpoint = "http://web.mta.info/status/serviceStatus.txt"
    requests_cache.install_cache('transit-cache', backend='sqlite', expire_after=180)
    raw_xml_data = requests.get(endpoint).text
    data = xmltodict.parse(raw_xml_data, dict_constructor=dict)
    response_code = data['service']['responsecode']
    if int(response_code) == 0:
        payload = {
            'Subway': {"name": "Subway", "status": self._parse_transit(data['service']['subway'])},
            'MTA Buses': {"name": "MTA Buses", "status": self._parse_transit(data['service']['bus'])},
            'Bridges & Tunnels': {"name": 'Bridges & Tunnels', "status": self._parse_transit(data['service']['BT'])},
            'LIRR': {"name": 'LIRR', "status": self._parse_transit(data['service']['LIRR'])},
            'Metro North': {"name": 'Metro North', "status": self._parse_transit(data['service']['MetroNorth'])},
        }
    else:
        payload = None  # TODO: Raise a warning that response code was non-zero
    return payload

def test_expire_after_installed(self):
    requests_cache.install_cache(name=CACHE_NAME, backend=CACHE_BACKEND)
    requests_cache.expire_after('http://httpbin.org/get', 2)

    r = requests.get('http://httpbin.org/get')
    self.assertFalse(r.from_cache)
    r = requests.get('http://httpbin.org/get')
    self.assertTrue(r.from_cache)

def run(self, cache=True):
    """Run application."""
    self._query()

    # configure `requests` cache
    if cache:
        cache_dir = appdirs.user_cache_dir('craigslist')
        os.makedirs(cache_dir, exist_ok=True)
        requests_cache.install_cache(
            cache_name=os.path.join(cache_dir, 'craigslist'),
            expire_after=timedelta(hours=0.5))

    print('Running query...\n')

    # record the start time
    start = time.time()

    self.prices = self._getprices()

    # determine elapsed time of queries
    self.duration = time.time() - start

    # remove expired cache entries
    if cache:
        requests_cache.core.remove_expired_responses()

    # print statistics (if any price data exists)
    if self.prices:
        self._print()
    else:
        print('Nothing found for that search.')

def perform_command(command, service, instance, cluster, verbose, soa_dir, app_id=None, delta=None):
    """Performs a start/stop/restart/status/scale on an instance

    :param command: String of start, stop, restart, status or scale
    :param service: service name
    :param instance: instance name, like "main" or "canary"
    :param cluster: cluster name
    :param verbose: bool if the output should be verbose or not
    :returns: A unix-style return code
    """
    marathon_config = marathon_tools.load_marathon_config()
    job_config = marathon_tools.load_marathon_service_config(service, instance, cluster, soa_dir=soa_dir)
    if not app_id:
        try:
            app_id = marathon_tools.create_complete_config(service, instance, marathon_config, soa_dir=soa_dir)['id']
        except NoDockerImageError:
            job_id = compose_job_id(service, instance)
            print "Docker image for %s not in deployments.json. Exiting. Has Jenkins deployed it?" % job_id
            return 1

    normal_instance_count = job_config.get_instances()
    normal_smartstack_count = marathon_tools.get_expected_instance_count_for_namespace(service, instance)
    proxy_port = marathon_tools.get_proxy_port_for_instance(service, instance, soa_dir=soa_dir)

    client = marathon_tools.get_marathon_client(marathon_config.get_url(),
                                                marathon_config.get_username(),
                                                marathon_config.get_password())
    if command == 'start':
        start_marathon_job(service, instance, app_id, normal_instance_count, client, cluster)
    elif command == 'stop':
        stop_marathon_job(service, instance, app_id, client, cluster)
    elif command == 'restart':
        restart_marathon_job(service, instance, app_id, normal_instance_count, client, cluster)
    elif command == 'status':
        # Setting up transparent cache for http API calls
        requests_cache.install_cache('paasta_serviceinit', backend='memory')

        print status_desired_state(service, instance, client, job_config)
        print status_marathon_job(service, instance, app_id, normal_instance_count, client)
        tasks, out = status_marathon_job_verbose(service, instance, client)
        if verbose:
            print out
        print status_mesos_tasks(service, instance, normal_instance_count)
        if verbose:
            print status_mesos_tasks_verbose(app_id, get_short_task_id)
        if proxy_port is not None:
            print status_smartstack_backends(
                service=service,
                instance=instance,
                cluster=cluster,
                job_config=job_config,
                tasks=tasks,
                expected_count=normal_smartstack_count,
                soa_dir=soa_dir,
                verbose=verbose,
            )
    elif command == 'scale':
        scale_marathon_job(service, instance, app_id, delta, client, cluster)
    else:
        # The command parser shouldn't have let us get this far...
        raise NotImplementedError("Command %s is not implemented!" % command)
    return 0

def install_cache(expire_after=12 * 3600):
    """
    Patches the requests library with requests_cache.
    """
    requests_cache.install_cache(
        expire_after=expire_after,
        allowable_methods=('GET',))

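# Usage sketch for the wrapper above (not from the source; example.com is a
# placeholder URL). After install_cache() runs, plain requests.get() calls are
# served from the cache for 12 hours, and only GET requests are cached.
import requests

install_cache()                                # patch requests globally
first = requests.get('https://example.com')    # hits the network
second = requests.get('https://example.com')   # served from the cache
print(getattr(second, 'from_cache', False))
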
def main(): requests_cache.install_cache("british-library-catalog") session = requests_cache.CachedSession() session.hooks = {'response': make_throttle_hook(0.5)} # Be polite - less than 2 req/sec columns = '\t'.join(['Print ID', 'Scan ID', 'DOM IDs']) if FETCH_ARKS: columns += '\tARKs' print(columns) with open('metadata/booklist.tsv') as infile: for line in infile: if line.startswith('Aleph'): #skip header continue line = line.rstrip('\n') digitalId = line.split('\t')[0] originalId = getPrintId(session, digitalId) # This (disabled) section of code will translate lsid to ARK # in case we ever need it. Right now the BL Viewer accepts raw # lsids, so it's unnecessary lsids = line.split('\t')[-1].split(' -- ') arks = [] if FETCH_ARKS: for lsid in lsids: ark = getARK(session, lsid) arks.append(ark) print('\t'.join([str(originalId), digitalId,','.join(lsids),','.join(arks)]))
def install_cache_requests():
    requests_cache.install_cache(**{
        'allowable_methods': ('GET', 'HEAD'),
        'cache_name': conf.REQUESTS_CACHE,
        'backend': 'sqlite',
        'fast_save': conf.ASYNC_CACHE_WRITES,
        'extension': '.sqlite3'})

def __init__(self, api_key, response_format='json'):
    super(OMIM, self).__init__()
    self.base_url = 'http://api.omim.org/api'
    self.format = response_format
    self.api_key = api_key
    requests_cache.install_cache('omim_cache', backend='sqlite', expire_after=8460000)

def get_keyboard_data(keyboardID, weekCache=False):
    """
    Get Keyboard or package data from web api.

    Args:
        keyboardID (str): Keyboard or package ID
        weekCache (bool): cache data for 1 week, default is 1 day

    Returns:
        dict: Keyboard data
    """
    logging.info("Getting data for keyboard %s", keyboardID)
    api_url = "https://api.keyman.com/keyboard/" + keyboardID
    logging.debug("At URL %s", api_url)
    home = str(Path.home())
    cache_dir = keyman_cache_dir()
    current_dir = os.getcwd()
    if weekCache:
        expire_after = datetime.timedelta(days=7)
    else:
        expire_after = datetime.timedelta(days=1)
    os.chdir(cache_dir)
    requests_cache.install_cache(cache_name='keyman_cache', backend='sqlite',
                                 expire_after=expire_after)
    now = time.ctime(int(time.time()))
    response = requests.get(api_url)
    logging.debug("Time: {0} / Used Cache: {1}".format(now, response.from_cache))
    os.chdir(current_dir)
    requests_cache.core.uninstall_cache()
    if response.status_code == 200:
        return response.json()
    else:
        return None

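# Usage sketch for get_keyboard_data() above (not from the source: the keyboard ID
# and the printed fields are illustrative assumptions about the returned dict).
data = get_keyboard_data('sil_euro_latin', weekCache=True)
if data is not None:
    print(data.get('id'), data.get('version'))
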
def main(argv=None):
    args = parse_paasta_api_args()
    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    if args.soa_dir:
        settings.soa_dir = args.soa_dir

    # Exit on exceptions while loading settings
    settings.cluster = load_system_paasta_config().get_cluster()

    marathon_config = marathon_tools.load_marathon_config()
    settings.marathon_client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password()
    )

    # Set up transparent cache for http API calls. With expire_after, responses
    # are removed only when the same request is made. Expired storage is not a
    # concern here. Thus remove_expired_responses is not needed.
    requests_cache.install_cache("paasta-api", backend="memory", expire_after=30)

    server = WSGIServer(('', int(args.port)), make_app())
    log.info("paasta-api started on port %d with soa_dir %s" % (args.port, settings.soa_dir))

    try:
        server.serve_forever()
    except KeyboardInterrupt:
        sys.exit(0)

def handle(self, *args, **options):
    if options['cached']:
        requests_cache.install_cache('resources_import')

    importers = get_importers()
    imp_list = ', '.join(sorted(importers.keys()))
    imp_name = options.get('module')
    if not imp_name:
        raise CommandError("Enter the name of the importer module. Valid importers: %s" % imp_list)
    if imp_name not in importers:
        raise CommandError("Importer %s not found. Valid importers: %s" % (imp_name, imp_list))
    imp_class = importers[imp_name]
    importer = imp_class(options)

    # Activate the default language for the duration of the import
    # to make sure translated fields are populated correctly.
    default_language = settings.LANGUAGES[0][0]

    for imp_type in self.importer_types:
        name = "import_%s" % imp_type
        method = getattr(importer, name, None)
        if options[imp_type]:
            if not method:
                raise CommandError("Importer %s does not support importing %s" % (name, imp_type))
        else:
            if not options['all']:
                continue
        if method:
            with override(default_language), transaction.atomic():
                kwargs = {}
                url = options.pop('url', None)
                if url:
                    kwargs['url'] = url
                method(**kwargs)

def cli(debug):
    log_level = logging.INFO
    requests_cache.install_cache('fr_cache', expire_after=60 * 60 * 24 * 3)  # 3 days
    if debug:
        log_level = logging.DEBUG
        sys.excepthook = lambda t, v, tb: ipdb.post_mortem(tb)
    coloredlogs.install(level=log_level, fmt="%(levelname)s %(message)s")

pass

import os
import re
import sys
import requests
import getpass
import json

try:
    import requests_cache
except ImportError:
    print("no cache")
else:
    requests_cache.install_cache("gh_api")

# Keyring stores passwords by a 'username', but we're not storing a username and
# password
fake_username = '******'


class Obj(dict):
    """Dictionary with attribute access to names."""
    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)

    def __setattr__(self, name, val):

# -*- coding: utf-8 -*-
import json
import math
import os
import random
import re
import time
import urllib.parse

import requests
import requests_cache
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz, process

requests_cache.install_cache('worldstate_cache', expire_after=60)


def get_worldstate():
    """Get world state json.

    Return a very complicated nested array
    """
    wsurl = 'http://content.warframe.com/dynamic/worldState.php'
    ws = requests.get(wsurl, timeout=30).json()
    return ws


data_files = {
    'solNodes.json': 'S',
    'languages.json': 'L',

def license_check(f):
    requests_cache.install_cache('github_cache', backend='sqlite', expire_after=3600000)
    token = get_token()
    list_all_repos = all_repositories(token, f)
    list_repos_without_license_file = []
    list_without_licence = []
    for every_repo in list_all_repos:
        print('REPO: ', every_repo, file=f)
        contents_url = json_parsing(
            'https://api.github.com/repos/apiaryio/' + every_repo + '/contents', token, f)
        list_licenses_compare = []
        found_license_file = False
        found_license = False
        for filename in contents_url:
            if re_findall(filename['name'], r'\blicen[sc]e[sd]?'):
                found_license_file = True
                r = search_license_type_in_license_file(filename, token, f)
                list_licenses_compare.append((filename['name'], r))
                print(filename['name'], ': ', r, file=f)
                if r:
                    found_license = True
            if re_findall(filename['name'], r'\breadme') or filename['name'] == 'package.json':
                v = search_license_type_in_readme_packagejson(filename, token, f)
                list_licenses_compare.append((filename['name'], v))
                print(filename['name'], ': ', v, file=f)
                if v:
                    found_license = True
                package_json_dependencies(every_repo, f, filename, token)
        if not found_license_file:
            list_repos_without_license_file.append(every_repo)
        if not found_license:
            list_without_licence.append(every_repo)
        compare_pull_requests_and_master(every_repo, f, list_licenses_compare, token)
        print('---------------------------------------', file=f)
    print()
    print('REPOSITORIES TOTAL: {}'.format(len(list_all_repos)))
    print('REPOSITORIES TOTAL: {}'.format(len(list_all_repos)), file=f)
    print()
    print('REPOSITORIES WITHOUT LICENSE FILE: {}'.format(len(list_repos_without_license_file)),
          ':', list_repos_without_license_file)
    print('REPOSITORIES WITHOUT LICENSE FILE: {}'.format(len(list_repos_without_license_file)),
          ':', list_repos_without_license_file, file=f)
    print()
    print('REPOSITORIES WITHOUT LICENSE: {}'.format(len(list_without_licence)),
          ':', list_without_licence)
    print('REPOSITORIES WITHOUT LICENSE: {}'.format(len(list_without_licence)),
          ':', list_without_licence, file=f)
    print()
    f.close()

import requests_cache, imghdr
from validators import validate_raw_files
from create_csvs import create_csvs
from ers import all_keywords_aus as keywords, fpath_namer, mh_brands, clean_url, headers
from matcher import BrandMatcher
from ers import COLLECTION_DATE, file_hash, img_path_namer
import shutil
from parse import parse
from custom_browser import CustomDriver

# Init variables and assets
shop_id = 'liquor_land'
root_url = 'https://www.liquorland.com.au'
requests_cache.install_cache(fpath_namer(shop_id, 'requests_cache'))
country = 'AUS'
searches, categories, products = {}, {}, {}

driver = CustomDriver(headless=True, download_images=False)


def getprice(pricestr):
    if not pricestr:
        return
    pricestr = pricestr.replace('$', '')
    price = parse('{pound:d}', pricestr)
    if price:
        return price.named['pound'] * 100
    price = parse('{pound:d}.{pence:d}', pricestr)
    if price:
        return price.named['pound'] * 100 + price.named['pence']

from collections import OrderedDict
from http.cookiejar import LWPCookieJar
from http.cookiejar import Cookie
import platform
import time

import requests
import requests_cache

from config import Config
from const import Constant
from storage import Storage
from encrypt import encrypted_request
import logger

requests_cache.install_cache(Constant.cache_path, expire_after=3600)

log = logger.getLogger(__name__)

# Song chart list IDs (歌曲榜单地址)
TOP_LIST_ALL = {
    0: ["云音乐新歌榜", "3779629"],
    1: ["云音乐热歌榜", "3778678"],
    2: ["网易原创歌曲榜", "2884035"],
    3: ["云音乐飙升榜", "19723756"],
    4: ["云音乐电音榜", "10520166"],
    5: ["UK排行榜周榜", "180106"],
    6: ["美国Billboard周榜", "60198"],
    7: ["KTV嗨榜", "21845217"],
    8: ["iTunes榜", "11641012"],
    9: ["Hit FM Top榜", "120001"],

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.

from BTG import BTG
from lib.io import display
from requests import get
from re import findall
import config
import os
from platform import system

if system() != "Windows":
    import requests_cache
    requests_cache.install_cache('%sBTG' % config.sqlite_path)


class Malekal:
    """
        This module allows you to search for IOCs on the Malekal website (HTTP requests)
        or in the local directory specified in the BTG configuration file.
    """
    def __init__(self, ioc, type):
        if config.malekal_enabled:
            self.module_name = __name__.split(".")[1]
            if config.malekal_local and not config.malekal_remote:
                self.types = ["MD5"]
            else:
                self.types = [
                    "MD5", "SHA1", "SHA256", "SHA512", "URL",
                    "IPv4", "IPv6",

import io
import re

import pandas as pd
import requests
import requests_cache

KEGG_HOST = 'http://rest.kegg.jp'

KEGG_PATHWAY_URL = '/list/pathway/{species}'
KEGG_GENE_URL = '/list/{species}'
KEGG_MAPPING_URL = '/link/pathway/{species}'
KEGG_CONV_URL = '/conv/ncbi-geneid/{species}'

requests_cache.install_cache('_kegg_cache')


def get_gene_sets(species, use_kegg_ids=False, use_name=True):
    """Gets mapping from gene ids to pathways as genesets.

    Args:
        species (str): Name of the species to query. Use for example 'hsa'
            for human or 'mmu' for mouse.
        use_kegg_ids (bool): Whether to return gene ids as entrez ids (False),
            or as KEGG gene ids (True).
        use_name (bool): Whether to use the names of the pathways in the
            geneset dict (True), or the pathway ids (False).

    Returns:
        dict of sets: Dict mapping pathways (the keys) to gene ids (the sets).

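# Usage sketch for get_gene_sets() above (hedged: 'hsa' is the human species code
# taken from the docstring; the loop assumes the documented dict-of-sets return shape).
gene_sets = get_gene_sets('hsa', use_kegg_ids=False, use_name=True)
for pathway, genes in list(gene_sets.items())[:3]:
    print(pathway, len(genes))
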
from collections import Counter
from collections import OrderedDict
import csv
import functools
from functools import reduce
from joblib import Parallel, delayed
import json
import multiprocessing
from networkx import *
import operator
import pandas as pd
import requests
import requests_cache

pd.options.mode.chained_assignment = None
requests_cache.install_cache('demo_cache')


def Mass2Motif_2_Network(edges, motifs, prob=0.01, overlap=0.3, top=5):
    """Map Mass2Motifs onto a mass spectral molecular network

    :param edges: An edges file downloaded from GNPS
    :type edges: pandas.core.frame.DataFrame
    :param motifs: A motif summary file downloaded from MS2LDA
    :type motifs: pandas.core.frame.DataFrame
    :param prob: Minimal probability score for a Mass2Motif to be included
    :type prob: float
    :param overlap: Minimal overlap score for a Mass2Motif to be included
    :type overlap: float
    :param top: Specifies how many most shared motifs per molecular family
        (network component index) should be shown
    :type top: int

def install_cache():
    requests_cache.install_cache('pbr_mlb')

import sys
from Orangeboard import Orangeboard
from BioNetExpander import BioNetExpander
from QueryNCBIeUtils import QueryNCBIeUtils
from QuerySciGraph import QuerySciGraph
from QueryDisont import QueryDisont
from ParsePhenont import ParsePhenont
from QueryChEMBL import QueryChEMBL
from QueryPubChem import QueryPubChem
import pandas
import timeit
import argparse
import requests_cache

# configure requests package to use the "orangeboard.sqlite" cache
requests_cache.install_cache('orangeboard')

# create an Orangeboard object
ob = Orangeboard(debug=True)

# configure the Orangeboard for Neo4j connectivity
ob.neo4j_set_url()
ob.neo4j_set_auth()

bne = BioNetExpander(ob)


def add_pc2_to_kg():
    sif_data = pandas.read_csv(
        '../../../data/pc2/PathwayCommons9.All.hgnc.sif',
        sep='\t',

def enable_cache():
    if not os.path.exists(CACHE_DIR):
        os.makedirs(CACHE_DIR)
    requests_cache.install_cache(CACHE_FILE)

from django.core.management.base import BaseCommand, CommandError
from pokesearch.models import Pokemon, PokemonSpecies, Type, PokemonType

import requests
import requests_cache

requests_cache.install_cache('pokesearch_cache')


class Command(BaseCommand):
    help = "Get the list of Pokemons from PokeAPI"

    def add_arguments(self, parser):
        parser.add_argument("-f", "--from", type=int, default=1)
        parser.add_argument("-t", "--to", type=int, default=899)

    def handle(self, *args, **options):
        _from = options["from"]
        _to = options["to"]
        for pokemon_id in range(_from, _to):
            pokemon = requests.get(
                f"https://pokeapi.co/api/v2/pokemon/{pokemon_id}")
            pokemon_obj, pokemon_created = Pokemon.objects.get_or_create(
                id=pokemon.json()["id"], name=pokemon.json()["name"])
            species = requests.get(pokemon.json()["species"]['url'])
            species_obj, species_created = PokemonSpecies.objects.get_or_create(
                id=species.json()["id"], name=species.json()["name"])
            pokemon_obj.species = species_obj
            for pokemon_type in pokemon.json()["types"]:
                type_ = requests.get(pokemon_type["type"]["url"])
                type_obj, type_created = Type.objects.get_or_create(

from flask import Flask, render_template, request, jsonify
import plotly.graph_objs as go
from plotly.utils import PlotlyJSONEncoder
import json
import requests
import requests_cache

requests_cache.install_cache('crime_api_cache', backend='sqlite', expire_after=36000)

app = Flask(__name__)

crime_url_template = 'https://data.police.uk/api/crimes-street/all-crime?lat={lat}&lng={lng}&date={data}'
categories_url_template = 'https://data.police.uk/api/crime-categories?date={date}'


@app.route('/crimestat', methods=['GET'])
def crimechart():
    my_latitude = request.args.get('lat', '51.52369')
    my_longitude = request.args.get('lng', '-0.0395857')
    my_date = request.args.get('date', '2018-11')

    categories_url_template = ' https://data.police.uk/api/crime-categories?date={date} '
    resp = requests.get(categories_url_template.format(date=my_date))
    if resp.ok:
        categories_json = resp.json()
    else:
        print(resp.reason)

    categories = {categ["url"]: categ["name"] for categ in categories_json}
    crime_category_stats = dict.fromkeys(categories.keys(), 0)

from json import loads
from datetime import datetime

from requests import get
from requests_cache import install_cache

from .constants import VALID_POSITIONS, BASE_URL, ONE_HOUR

install_cache('nfl_api_cache', expire_after=ONE_HOUR)


def gather_json(week=None, season=None, position=None):
    nfl_api_url = _format_url(week, season, position)
    try:
        response = get(nfl_api_url)
        return loads(response.text)['players']
    except:
        raise Exception('Error retrieving data from NFL api')


def _format_url(week, season, position):
    type = 'weekStats'
    if not season:
        season = _get_default_season()
    if not week:
        type = 'seasonStats'
        week_string = ''
    else:
        week_string = '&week={}'.format(week)

    nfl_api_url = BASE_URL.format(type, season, week_string)

    if position in VALID_POSITIONS:

from flask import Flask, jsonify, request, make_response
import requests
from app.post import Post, PostSchema
from operator import attrgetter
from pprint import pprint
import time, json
from threading import Thread
import queue
import requests_cache

app = Flask(__name__)
requests_cache.install_cache('api_cache', backend='sqlite', expire_after=2 * 60)


# Route 1
@app.route('/api/ping', methods=['GET'])
def ping():
    return jsonify(success=True), 200


# Route 2
@app.route('/api/posts', methods=['GET'])
def get_posts():
    # Get URL arguments to pass along
    tags = request.args.get('tags')
    # Set default if arg is None
    sortBy = request.args.get('sortBy') or 'id'

def set_request_cache():
    if not os.path.exists('_cache'):
        os.mkdir('_cache')
    requests_cache.install_cache('_cache/page_cache',
                                 backend='sqlite',
                                 expire_after=10800)

import requests
import requests_cache

requests_cache.install_cache('m2m_cache', expire_after=86400)


class MachineToMachine(object):
    def __init__(self, base_url, api_user, api_key):
        self.base_url = base_url
        self.api_user = api_user
        self.api_key = api_key
        self.inv_url = self.base_url + '/api/m2m/12576/sensor/inv'
        cache_name = 'm2m_%s_cache' % base_url.replace('https://', '')
        requests_cache.install_cache(cache_name, expire_after=86400)

    def toc(self):
        url = self.inv_url + '/toc'
        return requests.get(url, auth=(self.api_user, self.api_key)).json()

    def node_inventory(self, subsite, node):
        url = '/'.join((self.inv_url, subsite, node))
        return ['-'.join((subsite, node, sensor)) for sensor in
                requests.get(url, auth=(self.api_user, self.api_key)).json()]

    def streams(self):
        toc = self.toc()
        stream_map = {}

stores = [1966, 1923, 1857, 1886, 1871, 1767, 1838, 1790, 1823, 1917]

import requests
import requests_cache
import json
from pprint import pprint

requests_cache.install_cache('datagram')

url = "https://datagram-products-v1.p.mashape.com/stores/1870/products/"  # 1871

headers = {
    'x-mashape-key': "vJpcBxsOd0mshQhiA5WzWt780Qx0p1ZR1vzjsnl3zHA9dKPcuf",
    'accept': "application/json",
    'cache-control': "no-cache",
    'postman-token': "6d56d562-8fa1-fa04-da55-42ad69a4a23e"
}


def get_chains():
    url = "https://datagram-products-v1.p.mashape.com/chains/"
    r = requests.get(url, headers=headers)
    return json.loads(r.text)


def get_stores(chain):
    url = "https://datagram-products-v1.p.mashape.com/chains/" + str(chain['id']) + "/stores/"
    r = requests.get(url, headers=headers)
    res = json.loads(r.text)
    if r.status_code != 200:
        print(r.text)
    print(r.from_cache, r.status_code, chain['name'], len(res))
    return res

def main():
    args = parse_args()
    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    instances = []
    return_codes = []
    command = args.command
    if (args.service_instance):
        service_instance = args.service_instance
        service, instance, _, __ = decompose_job_id(service_instance)
        instances.append(instance)
    elif (args.service and args.instances):
        service = args.service
        instances = args.instances.split(',')
    else:
        log.error("The name of service or the name of instance to inspect is missing. Exiting.")
        sys.exit(1)

    # Setting up transparent cache for http API calls
    requests_cache.install_cache("paasta_serviceinit", backend="memory")

    cluster = load_system_paasta_config().get_cluster()
    actual_deployments = get_actual_deployments(service, args.soa_dir)
    clients = PaastaClients(cached=(command == 'status'))

    instance_types = ['marathon', 'chronos', 'paasta_native', 'adhoc']
    instance_types_map = {it: [] for it in instance_types}
    for instance in instances:
        try:
            instance_type = validate_service_instance(
                service, instance, cluster, args.soa_dir,
            )
        except Exception:
            log.error(
                (
                    'Exception raised while looking at service {} instance {}:'
                ).format(service, instance),
            )
            log.error(traceback.format_exc())
            return_codes.append(1)
            continue

        if instance_type not in instance_types:
            log.error(
                (
                    "I calculated an instance_type of {} for {} which I don't "
                    "know how to handle."
                ).format(
                    instance_type, compose_job_id(service, instance),
                ),
            )
            return_codes.append(1)
        else:
            instance_types_map[instance_type].append(instance)

    remote_run_frameworks = None
    if len(instance_types_map['adhoc']) > 0:
        remote_run_frameworks = paasta_remote_run.remote_run_frameworks()

    for instance_type in instance_types:
        for instance in instance_types_map[instance_type]:
            try:
                version = get_deployment_version(actual_deployments, cluster, instance)
                paasta_print('instance: %s' % PaastaColors.blue(instance))
                paasta_print('Git sha: %s (desired)' % version)

                if instance_type == 'marathon':
                    return_code = marathon_serviceinit.perform_command(
                        command=command,
                        service=service,
                        instance=instance,
                        cluster=cluster,
                        verbose=args.verbose,
                        soa_dir=args.soa_dir,
                        app_id=args.app_id,
                        delta=args.delta,
                        client=clients.marathon(),
                    )
                elif instance_type == 'chronos':
                    return_code = chronos_serviceinit.perform_command(
                        command=command,
                        service=service,
                        instance=instance,
                        cluster=cluster,
                        verbose=args.verbose,
                        soa_dir=args.soa_dir,
                        client=clients.chronos(),
                    )
                elif instance_type == 'paasta_native':
                    return_code = paasta_native_serviceinit.perform_command(
                        command=command,
                        service=service,
                        instance=instance,
                        cluster=cluster,
                        verbose=args.verbose,
                        soa_dir=args.soa_dir,
                    )
                elif instance_type == 'adhoc':
                    if instance == 'interactive':
                        raise NotImplementedError
                    if command != 'status':
                        raise NotImplementedError
                    paasta_remote_run.remote_run_list_report(
                        service=service,
                        instance=instance,
                        cluster=cluster,
                        frameworks=remote_run_frameworks,
                    )
                    return_code = 0
            except Exception:
                log.error(
                    (
                        'Exception raised while looking at service {} '
                        'instance {}:'
                    ).format(service, instance),
                )
                log.error(traceback.format_exc())
                return_code = 1

            return_codes.append(return_code)

    sys.exit(max(return_codes))

from feedparser.feedFetch import feedToJSON
import requests_cache
import time
import concurrent.futures
from dateutil.parser import parse
import threading

import configs as cf

# to use configs.json from external source
# import requests

requests_cache.install_cache('feedscache', backend='sqlite', expire_after=1200)

# using configs from configs.py
configs = cf.configs

###########
# using configs.json from external source
# r = requests.get("CONFIGS.JSON URL")
# configs = r.json()

urllist = configs['sources'].values()

# feedList refreshed every 660 seconds
newlist = []

# final feed List (updates with newlist)
finallist = []


def getTimestamp(pubdate):

import apis
import json
import requests
import requests_cache
import settings

requests_cache.install_cache(cache_name='hive_api', backend='sqlite', expire_after=60)


def get_hive_sessionId():
    payload = "{\r\n \"sessions\": [{\r\n \"username\": \"" + apis.username + "\",\r\n \"password\": \"" + apis.password + "\",\r\n \"caller\": \"WEB\"\r\n }]\r\n}"
    headers = {
        'Content-Type': "application/json",
        'Accept': "application/vnd.alertme.zoo-6.1+json",
        'X-Omnia-Client': "Hive Web Dashboard",
        'User-Agent': "PostmanRuntime/7.18.0",
        'Cache-Control': "no-cache",
        'Host': "api.prod.bgchprod.info:443",
        'Accept-Encoding': "gzip, deflate",
        'Content-Length': "139",
        'Connection': "keep-alive",
        'cache-control': "no-cache"
    }

    response = requests.request("POST", apis.hive_login_url, data=payload, headers=headers)

app = Flask(__name__)
Markdown(app)

# some links for jenkins
URL_METADATA = "https://repo.codemc.io/repository/maven-releases/world/bentobox/{module}/maven-metadata.xml"
URL_VERSION_INFO = "https://repo.codemc.io/repository/maven-releases/world/bentobox/{module}/{version}/{module}-{version}.pom"
URL_JAR_DOWNLOAD = "https://repo.codemc.io/repository/maven-releases/world/bentobox/{module}/{version}/{module}-{version}.jar"

# bentobox static addon list
BENTOBOX_ADDONS = open('addons.txt', 'r').read().splitlines()

CACHE_FILE_SECONDS = 60 * 10
requests_cache.install_cache('nexus_cache', backend='sqlite', expire_after=CACHE_FILE_SECONDS)

mongodb = pymongo.MongoClient(os.environ["MONGODB_URI"])["bmj0hz1bfryjijw"]


@app.route('/')
def index():
    return render_template('index.html', addons=dict(
        map(lambda e: (e["artifactId"], e["version"]), get_valid_addons())))


@app.route('/custom')
def custom():

import sys
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup
import requests
import requests_cache

cachefile = "summaries"
requests_cache.install_cache(cachefile)

root_url = sys.argv[1]

nextpageurl = root_url
while nextpageurl is not None:
    r = requests.get(nextpageurl)
    soup = BeautifulSoup(r.text, 'html5lib')
    for link in soup.findAll('a'):
        thislink = link.get('href')
        thisurl = urljoin(root_url, thislink)
        print(thisurl)
    nextpageurl = None
    for item in soup.findAll('li', {'class': 'next'}):

import unittest
import os
import sys
import pprint

import lxml
import requests_cache

sys.path.insert(
    0, os.path.join(os.path.dirname(os.path.dirname(__file__)), "yyy"))

import scrapers

requests_cache.install_cache("tests_cached_requests", expire_after=60 * 60 * 24)


class BaseTest(unittest.TestCase):
    pass


class TestScrapers(BaseTest):
    """Groups scraper tests. The individual tests are added dynamically"""
    pass


for name, Scraper in scrapers.all_scrapers.items():

    def wrapper(name: str, Scraper: type):
        def test(self):
            scraper = Scraper()

def scrape_albums(genre_name, genre_id):
    req = requests.Session()
    requests_cache.install_cache('allmusic')

    headers = {
        'referer': 'http://www.allmusic.com/advanced-search',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.59 Safari/537.36'
    }
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap['phantomjs.page.settings.userAgent'] = (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53 (KHTML, like Gecko) Chrome/15.0.87'
    )

    #payload = {'filters[]': 'subgenreid:MA0000002451', 'sort': ''}
    #link = 'http://www.allmusic.com/advanced-search/results/{0}'

    albums = []
    albums_mood = []
    albums_url = []
    artists = []
    artists_url = []
    rating = []
    years = []
    item_ids = []

    page_no = 0
    album_num = 0
    print('Start Scraping {} ...'.format(genre_name))

    payload = {'filters[]': genre_id, 'sort': ''}
    link = 'http://www.allmusic.com/advanced-search/results/{0}'

    while True:
        print('page no', page_no)
        site = req.post(link.format(str(page_no) if page_no > 0 else ''),
                        data=payload, headers=headers).text
        if 'desktop-results' not in site:
            print('nothing for page number', page_no)
            break
        if 'http://www.allmusic.com/album/' not in site:
            print('nothing for page number', page_no)
            break
        page_no += 1

        table = site.split('<tbody>')[1].split('/tbody>')[0]
        for row in tqdm(table.split('<tr>')[1:]):
            album = row.split('"title">', 1)[1].split('">', 1)[1].split('</a', 1)[0]
            albums.append(album)
            album_url = row.split('"title">', 1)[1].split(
                '">', 1)[0].split('<a ')[1].split('="', 1)[1]
            albums_url.append(album_url)

            while True:
                try:
                    client = webdriver.PhantomJS(desired_capabilities=dcap)
                    client.get(album_url)
                    page = client.page_source
                    client.quit()
                    break
                except:
                    print('Re-connect to {}'.format(album_url))
                    time.sleep(1)

            soup = bs(page, "lxml")

            # Moods
            moods = []
            try:
                for mood in soup.findAll('section', {"class": "moods"})[0].find_all('a'):
                    moods.append(mood.text)
            except:
                moods.append('None')
            albums_mood.append(moods)

            # Year
            try:
                year = row.split('class="year">')[1].split('</td', 1)[0].strip()
                years.append(year)
            except:
                print(album)
                years.append('None')

            # Artist
            try:
                artist = row.split('artist">')[1].split(
                    '</td', 1)[0].strip().split('">', 1)[1].split('</a', 1)[0]
                artists.append(artist)
            except:
                print(album, year)
                artists.append('Various Artists')

            # Artist URL
            try:
                artist_url = row.split('artist">')[1].split(
                    '</td', 1)[0].strip().split('">', 1)[0].split('<a ', 1)[1].split('="', 1)[1]
                artists_url.append(artist_url)
            except:
                print(album, year)
                artists_url.append('None')

            time.sleep(1)
            album_num += 1

    print('Done')
    print('{0} albums under {1}'.format(album_num, genre_name))

    df = pd.DataFrame({
        'album': albums,
        'artist': artists,
        'year': years,
        'album_mood': albums_mood,
        'album_url': albums_url,
        'artist_url': artists_url
    })
    file_name = "_".join(genre_name.lower().split())
    df.to_csv('data/{}.csv'.format(file_name))
    print('Done. Saved to data/{}.csv'.format(file_name))

"""Functions for GitHub API requests.""" import getpass import json import os import re import sys import requests try: import requests_cache except ImportError: print("no cache", file=sys.stderr) else: requests_cache.install_cache("gh_api", expire_after=3600) # Keyring stores passwords by a 'username', but we're not storing a username and # password fake_username = '******' class Obj(dict): """Dictionary with attribute access to names.""" def __getattr__(self, name): try: return self[name] except KeyError: raise AttributeError(name) def __setattr__(self, name, val):
# alias btc="python $HOME/crypto-scripts/wci.py"
#
# Start using:
# $ source ~/.bashrc
# $ btc

import json
import requests
import requests_cache
import os
import sys

from si_prefix import si_format

os.chdir(os.path.dirname(__file__))
requests_cache.install_cache('test_cache', backend='sqlite', expire_after=5 * 60)

url = ('https://www.worldcoinindex.com/apiservice/getmarkets' +
       '?key=%(WORLD_COIN_INDEX_API_KEY)s&fiat=USD') % os.environ
max_items = int(os.environ.get('WORLD_COIN_INDEX_MAX_ITEMS', "40"))

ticker = requests.get(url).json()

if not 'Markets' in ticker:
    print ticker
    sys.exit(1)

data = ticker['Markets'][0]
data = sorted(data, key=lambda item: -item['Volume_24h'])

def cmd_crtsh(domain, no_cache, no_validate, verbose):
    """Downloads the certificate transparency logs for a domain
    and checks with DNS queries if each subdomain exists.

    Uses multithreading to improve the performance of the DNS queries.

    Example:

    \b
    $ sudo habu.crtsh securetia.com
    [
        "karma.securetia.com.",
        "www.securetia.com."
    ]
    """

    if verbose:
        logging.basicConfig(level=logging.INFO, format='%(message)s')

    if not no_cache:
        homedir = Path(os.path.expanduser('~'))
        requests_cache.install_cache(str((homedir / '.habu_requests_cache')), expire_after=3600)

    subdomains = set()

    if verbose:
        print("Downloading subdomain list from https://crt.sh ...", file=sys.stderr)

    req = requests.get("https://crt.sh/?q=%.{d}&output=json".format(d=domain))

    if req.status_code != 200:
        print("[X] Information not available!")
        exit(1)

    json_data = json.loads(req.text)

    for data in json_data:
        name = data['name_value'].lower()
        if '*' not in name:
            subdomains.add(name)

    subdomains = list(subdomains)

    if no_validate:
        print(json.dumps(sorted(subdomains), indent=4))
        return True

    if verbose:
        print("Validating subdomains against DNS servers ...", file=sys.stderr)

    answers = query_bulk(subdomains)

    validated = []

    for answer in answers:
        if answer:
            validated.append(str(answer.qname))

    print(json.dumps(sorted(validated), indent=4))

    return True

import alexa
from bs4 import BeautifulSoup
import requests
import requests_cache
import json

# Using cache
requests_cache.install_cache('cache')

JS_STATS_FILE = 'js_stats.json'
X_XSS_STATS_FILE = 'x_xss_stats.json'
RANK_FILE = 'rank.json'

EXTERNAL_JAVASCRIPTS = [
    'jquery', 'react', 'bootstrap', 'angular', 'moment', 'socket.io',
    'ember', 'backbone', 'reveal', 'underscore', 'lodash', 'mocha',
    'meteor', 'mercury', 'dojo', 'ext-core', 'hammer', 'mootools',
    'prototype', 'scriptaculous', 'swfobject', 'three', 'webfont'
]


def parse_websites(websites):
    js_stats = {}
    x_xss_stats = {}
    for website in websites:
        print(website[1])
        try:
            response = requests.get('http://' + website[1])
            soup = BeautifulSoup(response.text, 'lxml')

            # Extracting script file sources

def __init__(self, headers=None, cookies=None, cache_name=None, delay=1, expire_hours=12, as_string=False):
    '''
    Base class for common scraping tasks

    Args:
        headers: dict of headers
        cookies: cookiejar object
        cache_name: should be full path
        delay: int (be polite!!!)
        expire_hours: int - default 12
        as_string: get string rather than parsed json
    '''
    logging.getLogger(__name__).addHandler(logging.NullHandler())

    if not cookies:
        try:
            import cookielib
            cookies = cookielib.MozillaCookieJar()
        except (NameError, ImportError) as e:
            try:
                import http.cookiejar
                cookies = http.cookiejar.MozillaCookieJar()
            except Exception as e:
                pass

    _s = requests.Session()
    _s.cookies = cookies

    if headers:
        _s.headers.update(headers)
    else:
        _s.headers.update({
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
        })

    if cache_name:
        if not '/' in cache_name:
            cache_name = os.path.join('/tmp', cache_name)
        try:
            from cachecontrol import CacheControlAdapter
            from cachecontrol.heuristics import ExpiresAfter
            from cachecontrol.caches import FileCache
            _s.mount(
                'http://',
                CacheControlAdapter(
                    cache=FileCache(cache_name),
                    cache_etags=False,
                    heuristic=ExpiresAfter(hours=expire_hours)))
        except ImportError as e:
            try:
                import requests_cache
                requests_cache.install_cache(cache_name)
            except:
                pass

    self.s = _s
    self.urls = []
    self.as_string = as_string

    if delay > 0:
        self.delay = delay
    else:
        self.delay = None
