import time

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

import cache

def fetch_webpage_text(url, use_cache=True):
    # set up a session that retries failed connections with backoff
    session = requests.Session()
    retry = Retry(connect=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)

    # use session to retrieve data
    # site = "https://xtools.wmflabs.org/articleinfo/en.wikipedia.org/Black%20Lives%20Matter"
    hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
           'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
           'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
           'Accept-Encoding': 'none',
           'Accept-Language': 'en-US,en;q=0.8',
           'Connection': 'keep-alive'}

    if use_cache and cache.contains(url):
        return cache.get(url)

    req = session.get(url, headers=hdr)
    content = req.text
    cache.put(url, content)
    time.sleep(1)  # throttle so we don't hammer the server
    return content
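# Quick usage sketch (mine, not from the original snippet): the commented-out
# xtools URL above, fetched twice; the second call should come straight back
# from the cache without sleeping.
if __name__ == '__main__':
    site = "https://xtools.wmflabs.org/articleinfo/en.wikipedia.org/Black%20Lives%20Matter"
    first = fetch_webpage_text(site)   # network fetch, then cached
    second = fetch_webpage_text(site)  # served from the cache, no sleep
    assert first == second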
import requests

import cache

def fetch_webpage(url, use_cache=True):
    # hash the URL so it can be used safely as a cache key
    key = cache.md5_key(url)
    if use_cache and cache.contains(key):
        return cache.get(key)
    content = requests.get(url).text
    cache.put(key, content)
    return content
import requests

import cache

def fetch_webpage_text(url, use_cache=True):
    if use_cache and cache.contains(url):
        return cache.get(url)
    # cache miss, download it
    content = requests.get(url).text
    cache.put(url, content)
    return content
import time

import requests

import cache

def fetch_webpage_text(url, use_cache=True):
    if use_cache and cache.contains(url):
        return cache.get(url)
    # if cache miss, download it and sleep one second to prevent too-frequent calls
    content = requests.get(url).text
    cache.put(url, content)
    time.sleep(1)
    return content
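# All of the snippets here lean on a small `cache` module. The sketch below is
# a hypothetical file-per-key implementation written to match the calls the
# snippets make (init/contains/get/put/set/add/size/md5_key); it is not the
# original module, and the slightly different call names suggest several
# variants of it were in use.
import hashlib
import os

CACHE_DIR = "cache"  # assumption: one file per entry in this directory

def init():
    os.makedirs(CACHE_DIR, exist_ok=True)

def md5_key(key):
    # hash the key so arbitrary URLs become safe filenames
    return hashlib.md5(str(key).encode("utf-8")).hexdigest()

def _path(key):
    return os.path.join(CACHE_DIR, md5_key(key))

def contains(key):
    return os.path.exists(_path(key))

def get(key):
    with open(_path(key), encoding="utf-8") as f:
        return f.read()

def put(key, value):
    init()
    if isinstance(value, bytes):  # e.g. response.content in the geonames snippet
        value = value.decode("utf-8", errors="replace")
    with open(_path(key), "w", encoding="utf-8") as f:
        f.write(str(value))

set = put  # some snippets call it set() ...
add = put  # ... or add(); the vk snippet caches a list, which the real module
           # presumably serialized (json/pickle); str() is a stand-in here

def size():
    return len(os.listdir(CACHE_DIR)) if os.path.isdir(CACHE_DIR) else 0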
import vk_api

import cache

def get_friends(session=None, target=None):
    starget = str(target)
    if cache.contains(starget):
        return cache.get(starget)
    try:
        friends = session.friends.get(user_id=target)["items"]
    except vk_api.exceptions.ApiError:
        print("[!] ID " + starget + " is private. Skipping")
        return -1
    cache.add(starget, friends)
    return friends
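# Hypothetical driver for get_friends(), following the usual vk_api idiom;
# the token and the target user id are placeholders, not from the original.
import vk_api

vk_session = vk_api.VkApi(token="YOUR_ACCESS_TOKEN")
vk = vk_session.get_api()

friends = get_friends(session=vk, target=1)
if friends != -1:
    print("[+] found {} friends".format(len(friends)))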
import configparser
import json
from operator import itemgetter

import iso3166
import requests

import cache

config = configparser.ConfigParser()
config.read("settings.cfg")
geonames_username = config.get("geonames", "username")
print("Talking to Geonames as " + geonames_username)

GEONAMES_API_URL = "http://api.geonames.org/searchJSON"

print("Finding largest cities in country:")
alpha3_to_city = {}
for country in iso3166.countries:
    print(" " + country.name)
    cache_key = country.alpha3 + "-geocode"
    results_text = None
    if cache.contains(cache_key):
        results_text = cache.get(cache_key)
    else:
        response = requests.get(GEONAMES_API_URL, params={
            'country': country.alpha2,
            'q': country.name.split(",")[0],
            'username': geonames_username})
        results_text = response.content
        cache.put(cache_key, results_text)
    results = json.loads(results_text)
    try:
        # keep only populated places (fcode PPL*), largest population first
        cities = sorted([place for place in results['geonames'] if "PPL" in place['fcode']],
                        key=itemgetter('population'), reverse=True)
    except KeyError:
        print("Error! Couldn't find any fcodes")
        continue
    if len(cities) > 0:
        # record the largest city for this country
        alpha3_to_city[country.alpha3] = cities[0]
# test_cache.py
import random
import string

import cache

def random_string(length):
    s = ''
    for i in range(length):
        s = s + random.choice(string.ascii_letters)
    return s

cache.init()
for n in range(1000):
    # keep generating keys until we find one not already in the cache
    while True:
        key = random_string(20)
        if cache.contains(key):
            continue
        else:
            break
    value = random_string(20)
    cache.set(key, value)
    print("After {} iterations, cache has {} entries".format(n + 1, cache.size()))
# (fragment: tail of a column-index -> field-name mapping)
        4: 'duration',
        5: 'summary',
        6: 'date_posted'
    }.get(x, 0)

# set up logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# let's scrape
url = BASE_URL + START_PAGE
logger.info("Scraping UFO reports from %s" % url)

# first grab the index page
if not cache.contains(url):
    index_page = requests.get(url)
    logger.debug("\tadded to cache from %s" % url)
    cache.put(url, index_page.text)
content = cache.get(url)

# now pull out all the links to reports
dom = BeautifulSoup(content, "html.parser")
# /html/body/p/table/tbody/tr[1]/td[1]/font/a
link_tags = dom.select("td a")
logger.debug("\tfound %d link tags" % len(link_tags))
links = set([tag['href'] for tag in link_tags])  # get all the unique urls
logger.info("\tfound %d links to UFO shapes" % len(links))
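# A plausible next step, sketched by analogy with the caching pattern above
# (not part of the original fragment): fetch each report page through the
# cache before parsing. Assumes the hrefs are relative to BASE_URL.
for link in links:
    page_url = BASE_URL + link
    if not cache.contains(page_url):
        cache.put(page_url, requests.get(page_url).text)
        logger.debug("\tcached report page %s" % page_url)
    report_html = cache.get(page_url)
    # ...parse report_html with BeautifulSoup here...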