def _get_raw_data(self):
    docktor_config = providers_config.providers['docktor']
    apps = []
    session = FuturesSession(max_workers=CONCURRENT_JOBS_LIMIT)
    session.mount('https://', self.__requests_http_adapter)
    session.mount('http://', self.__requests_http_adapter)
    for stage in docktor_config:
        for zone in docktor_config[stage]:
            apps_uri = '{uri}/apps/'.format(uri=docktor_config[stage][zone]['uri'])
            try:
                r = session.get(apps_uri, timeout=REQUEST_TIMEOUT).result()
                r.raise_for_status()
                apps_list = r.json()
            except ValueError as e:
                logger.error("Non json response {} from {}-{} docktor".format(r.content, stage, zone))
                raise e
            except Exception as e:
                logger.error("Exception raised on {}-{} docktor".format(stage, zone))
                raise e

            future_apps_details = [
                session.get('{apps_uri}{app}'.format(apps_uri=apps_uri, app=app), timeout=REQUEST_TIMEOUT)
                for app in apps_list
            ]

            try:
                apps_details = [a.result() for a in future_apps_details]
            except Exception as e:
                logger.error("Exception raised on {}-{} docktor".format(stage, zone))
                raise e

            partial_get_app_info = partial(self.get_app_info, stage, zone)
            apps.extend(map(partial_get_app_info, apps_details))
    return apps
class CustomStreamListener(tweepy.StreamListener):
    def __init__(self, socketio, track):
        super(CustomStreamListener, self).__init__()
        self.socketio = socketio
        self.room = track
        self.session = FuturesSession()

    def get_geonames_username(self):
        return "yasyf{}".format(random.randint(1, 5))

    def on_status(self, status):
        if status.coordinates or status.author.location:
            data = {'text': status.text.encode('utf-8')}
            data.update({k: getattr(status.author, k) for k in ['time_zone', 'location']})
            data.update({k: getattr(status, k) for k in ['lang', 'coordinates']})

            def add_sentiment(session, response):
                data['sentiment'] = response.json()['results']
                self.socketio.emit('status', data, room=self.room)

            def add_country_code(session, response):
                try:
                    json = response.json()
                    if json['totalResultsCount'] > 0:
                        result = json['geonames'][0]
                        data['country'] = result['countryCode']
                        data['coordinates'] = {'coordinates': [float(result['lng']), float(result['lat'])]}
                    else:
                        return
                except:
                    data['country'] = response.text.strip()

                if TEST_MODE:
                    data['sentiment'] = random.random()
                    self.socketio.emit('status', data, room=self.room)
                else:
                    url = "http://apiv2.indico.io/sentiment"
                    args = {'key': os.getenv('INDICOIO_API_KEY')}
                    self.session.post(url, data={'data': data['text']}, params=args,
                                      background_callback=add_sentiment)

            if status.coordinates:
                url = "http://ws.geonames.org/countryCode"
                args = {'lat': status.coordinates['coordinates'][1],
                        'lng': status.coordinates['coordinates'][0],
                        'username': self.get_geonames_username()}
                self.session.get(url, params=args, background_callback=add_country_code)
            else:
                url = "http://api.geonames.org/search"
                args = {'q': status.author.location,
                        'username': self.get_geonames_username(),
                        'maxRows': 1,
                        'type': 'json'}
                self.session.get(url, params=args, background_callback=add_country_code)
        return True

    def on_error(self, status_code):
        print 'Encountered error with status code:', status_code
        self.socketio.emit('error', {'status_code': status_code}, room=self.room)
        return True

    def on_timeout(self):
        print 'Timeout...'
        return True
def add_list_new() -> None:
    requester = FuturesSession(executor=ProcessPoolExecutor(30), session=requests.session())
    api_key = settings.TBA_API_HEADERS

    team_list_get = lambda p: requester.get(team_by_page_url_template(page=p), headers=api_key)
    team_participation_get = lambda tn: requester.get(team_participation_url_template(team=tn), headers=api_key)

    page_range = get_page_range()

    print("\nStarting %d HTTP requests for team lists, split between %d processes..." % (
        page_range[1] - page_range[0], requester.executor._max_workers))
    team_list_futures = [team_list_get(p) for p in range(*page_range)]
    print("Waiting...")
    wait(team_list_futures)
    print("Done!\n")

    teams_lists = map(lambda f: f.result().json(), team_list_futures)
    teams_data = [item for page_data in teams_lists for item in page_data]
    team_numbers = [*map(lambda t: t['team_number'], teams_data)]

    print("Starting %d HTTP requests for team participation data, split between %d processes..." % (
        len(team_numbers), requester.executor._max_workers))
    team_participation_futures = [team_participation_get(tn) for tn in team_numbers]
    print("Waiting...")
    wait(team_participation_futures)
    print("Done!\n")

    team_participations = map(lambda f: f.result().json(), team_participation_futures)

    arg_list = zip(team_numbers, teams_data, team_participations)
    for args in arg_list:
        add_team(*args)
def test_futures_session(self):
    # basic futures get
    sess = FuturesSession()
    future = sess.get(httpbin('get'))
    self.assertIsInstance(future, Future)
    resp = future.result()
    self.assertIsInstance(resp, Response)
    self.assertEqual(200, resp.status_code)

    # non-200, 404
    future = sess.get(httpbin('status/404'))
    resp = future.result()
    self.assertEqual(404, resp.status_code)

    def cb(s, r):
        self.assertIsInstance(s, FuturesSession)
        self.assertIsInstance(r, Response)
        # add the parsed json data to the response
        r.data = r.json()

    future = sess.get(httpbin('get'), background_callback=cb)
    # this should block until complete
    resp = future.result()
    self.assertEqual(200, resp.status_code)
    # make sure the callback was invoked
    self.assertTrue(hasattr(resp, 'data'))

    def raising_cb(s, r):
        raise Exception('boom')

    future = sess.get(httpbin('get'), background_callback=raising_cb)
    with self.assertRaises(Exception) as cm:
        resp = future.result()
    self.assertEqual('boom', cm.exception.args[0])
def bench_requests_futures_async(number_reqs, nb_worker):
    # https://pypi.python.org/pypi/requests-futures
    l = []
    start = datetime.datetime.now()
    print('Start : ', start)

    def bg_cb(sess, resp):
        # resp.text
        if resp.status_code != requests.codes.ok:
            print(resp.status_code)
            resp.raise_for_status()
        # print(dir(resp))
        l.append(1)
        l_size = len(l)
        print(l_size)
        # print(len(response.body))
        if l_size == number_reqs:
            tornado.ioloop.IOLoop.instance().stop()
        # also stop after 60 seconds (compare against a timedelta, not a bare int)
        if datetime.datetime.now() - start >= datetime.timedelta(seconds=60):
            tornado.ioloop.IOLoop.instance().stop()

    session = FuturesSession(max_workers=10)
    for elem in range(int(number_reqs / nb_worker)):
        for e in range(nb_worker):
            session.get("http://www.leboncoin.fr/", background_callback=bg_cb)
        time.sleep(1)
    print('[Rq TURFU] Done :', datetime.datetime.now() - start)
def fetchReviews(unique_id):
    s = FuturesSession()

    # Hand shake proc. to figure out how many calls we send to server
    api_format = 'https://watcha.net/comment/list?unique_id={unique_id}&start_index={start_index}&count=10&type=like'
    handshake = api_format.format(unique_id=unique_id, start_index=str(0))
    hs = s.get(handshake).result().content
    json_hs = json.loads(hs)
    total_count = int(json_hs['meta']['total_count'])
    how_many_queries = total_count / 10 + 1

    query_urls = [api_format.format(unique_id=unique_id, start_index=str(i * 10))
                  for i in xrange(0, how_many_queries, 1)]
    reviews = [
        {
            'movie_title': r['movie_title'],
            'rating': r['rating'],
            'text': r['text'],
            'updated_at': time.mktime(dateutil.parser.parse(r['updated_at']).timetuple()),
            'comment_id': r['comment_id']
        }
        for qu in query_urls
        for r in json.loads(s.get(qu).result().content)['data']
    ]
    return reviews
def test_redirect(self):
    """ Tests for the ability to cleanly handle redirects. """
    sess = FuturesSession()
    future = sess.get(httpbin('redirect-to?url=get'))
    self.assertIsInstance(future, Future)
    resp = future.result()
    self.assertIsInstance(resp, Response)
    self.assertEqual(200, resp.status_code)

    future = sess.get(httpbin('redirect-to?url=status/404'))
    resp = future.result()
    self.assertEqual(404, resp.status_code)
def asyncDepartureBoards(self, request_list):
    header = {"Authorization": self.auth.token}
    url = "https://api.vasttrafik.se/bin/rest.exe/v2/departureBoard"

    # Start a session for the async requests
    session = FuturesSession()
    reqs = []
    for req in request_list:
        # Send the requests
        req["format"] = "json"
        future = session.get(url, headers=header, params=req)
        reqs.append(future)
        time.sleep(0.02)  # Without this everything breaks

    responses = []
    for req in reqs:
        # Get the results
        r = req.result()
        responses.append(r)

    # Check for errors
    resp = self.auth.checkResponses(responses)
    output = []
    for response in resp:
        output.append(response.json())
    return output
def _chapter_pages(self, soup, html):
    # For webtoons, all pages are shown in a single page.
    # When that's the case, there's this element that asks if you want to
    # view page-by-page instead. Let's use this element to check if we're
    # parsing a webtoon chapter.
    webtoon = soup.find('a', href='?supress_webtoon=t')
    if webtoon is not None:
        img_tags = soup.find_all(_page_img_tag)
        return [tag['src'] for tag in img_tags]

    # a <select> tag has options that each points to a page
    opts = soup.find('select', id='page_select').find_all('option')
    urls = [opt['value'] for opt in opts]

    # Page 1 has already been fetched (stored in this html param, duh!)
    # so let's save ourselves an http request
    pages_htmls = [html]
    urls = urls[1:]

    session = FuturesSession()
    for order, url in enumerate(urls):
        res = session.get(url).result()
        if res.status_code != 200:
            raise HtmlError('cannot fetch')
        pages_htmls.append(res.content)

    returns = []
    for page_html in pages_htmls:
        soup = BeautifulSoup(page_html)
        img_url = soup.find('img', id='comic_page')['src']
        returns.append(img_url)
    return returns
def parse_feeds():
    session = FuturesSession(max_workers=256)
    feeds = RssFeed.query.all()
    urls = []
    for feed in feeds:
        urls.append(feed.link)
    responses = []
    delete_entries()
    for url in urls:
        responses.append(session.get(url))
    for i in range(len(urls)):
        try:
            feed = feeds[i]
            data = responses[i].result()
            data = feedparser.parse(data.text)
            try:
                base_site = get_site_from_link(data.feed.link)
                favicon = base_site + "/favicon.ico"
                feed.favicon = favicon
                db.session.commit()
            except:
                db.session.rollback()
            add_new_entries(data, feed, base_site)
            # logging.info(str(time.time()) + " " + str(i))
        except Exception as e:
            print(e)
            logging.info(e)
    # delete_entries() uncomment this and comment the previous to delete every
    # entry not in the date, also the ones in the feed file
    logging.info("feeds parsed")
    logging.info(datetime.now())
def getVenue(self, venueId):
    dictres = None
    if check_cache('venue_' + venueId + '.json', False):
        cache = retrieve_cache('venue_' + venueId + '.json', False)
        dictres = json.loads(cache)
    else:
        session = FuturesSession(executor=ThreadPoolExecutor(max_workers=1))
        rawtime = date.today() - timedelta(days=1)
        parsedtime = rawtime.strftime('%Y%m%d')
        params = dict(client_id=config.FOURSQUARE_CLIENT_ID,
                      client_secret=config.FOURSQUARE_CLIENT_SECRET,
                      v=parsedtime)
        # keep the future locally so its result can be awaited below
        future = session.get('https://api.foursquare.com/v2/venues/' + venueId,
                             params=params)
        response = future.result()
        if response.status_code != 200:
            return None
        content = response.text
        dictres = json.loads(content)
        vid = dictres['response']['venue']['id']
        store_cache(content, 'venue_' + vid + '.json')
    if dictres is None:
        return None
    return mp.f_location(dictres)
def _chapter_pages(self, soup, html):
    # For webtoons, all pages are shown in a single page.
    # When that's the case, there's this element that asks if you want to
    # view page-by-page instead. Let's use this element to check if we're
    # parsing a webtoon chapter.
    webtoon = soup.find("a", href="?supress_webtoon=t")
    if webtoon is not None:
        img_tags = soup.find_all(_page_img_tag)
        return [tag["src"] for tag in img_tags]

    # a <select> tag has options that each points to a page
    opts = soup.find("select", id="page_select").find_all("option")
    urls = [opt["value"] for opt in opts]

    # Page 1 has already been fetched (stored in this html param, duh!)
    # so let's save ourselves an http request
    pages_htmls = [html]
    urls = urls[1:]

    session = FuturesSession()
    for order, url in enumerate(urls):
        res = session.get(url).result()
        if res.status_code != 200:
            raise HtmlError("cannot fetch")
        pages_htmls.append(res.content)

    returns = []
    for page_html in pages_htmls:
        soup = BeautifulSoup(page_html)
        img_url = soup.find("img", id="comic_page")["src"]
        returns.append(img_url)
    return returns
def load_services(bootstrap_url):
    session = FuturesSession(max_workers=10)
    jobs = {
        url: session.get(url, headers={"Accept": "application/json"},
                         params={"type": "service"})
        for url in load_sls_hosts(bootstrap_url)
    }
    all_responses = {}
    for url, job in jobs.items():
        try:
            rsp = job.result()
        except requests.ConnectionError as e:
            logger.error(str(e))
            continue
        if rsp.status_code == 200:
            all_responses[url] = rsp.json()
        else:
            logger.error("'%s' returned status code %d" % (url, rsp.status_code))
            all_responses[url] = []
    return all_responses
def asyncDepartureBoards(self, stops, **kwargs):
    token, scope = self.auth.get_token()
    header = {"Authorization": token}
    url = "https://api.vasttrafik.se/bin/rest.exe/v2/departureBoard"
    kwargs["format"] = "json"

    # Start a session for the async requests
    session = FuturesSession()
    reqs = []
    for stop in stops:
        # Send the requests
        params = kwargs
        params["id"] = stop
        future = session.get(url, headers=header, params=params)
        reqs.append(future)
        time.sleep(0.01)  # Without this everything breaks

    responses = []
    for req in reqs:
        # Get the results
        r = req.result()
        responses.append(r)

    # Check for errors
    resp = self.auth.check_responses(responses, scope)
    output = []
    for response in resp:
        output.append(response.json())
    return output
def get_blocks(*heights):
    urls = [get_block_coinsecrets_url(h) for h in heights]
    session = FuturesSession()
    reqs = [session.get(url) for url in urls]
    responses = [r.result() for r in reqs]
    resps_json = [json.loads(r.content.decode()) for r in responses]
    return resps_json
def check_responses(self, response_list, scope):
    fine = True
    for resp in response_list:
        # Check for any errors
        if resp.status_code != 200:
            fine = False
    if fine:
        return response_list
    else:
        print("Renewing token " + str(scope))
        token = self.__renew_token(scope)
        header = {"Authorization": token}

        # Retry!
        session = FuturesSession()
        reqs = []
        for resp in response_list:
            # Send the new requests
            url = resp.url
            reqs.append(session.get(url, headers=header))
            time.sleep(0.01)

        # Get the results
        resps = []
        for req in reqs:
            resps.append(req.result())
        if resps[0].status_code != 200:
            raise requests.exceptions.HTTPError(f'{resps[0].status_code} {resps[0].reason}')
        return resps
def _chapter_pages(self, soup, html):
    # a <select> tag has options that each points to a page
    neighbour = soup.find('select', id='combobox').find_next_sibling('select')
    opts = neighbour.find_all('option')
    urls = [opt['value'] for opt in opts]

    # Page 1 has already been fetched (stored in this html param, duh!)
    # so let's save ourselves an http request
    pages_htmls = [html]
    urls = urls[1:]

    session = FuturesSession()
    for order, url in enumerate(urls):
        uri = self.netlocs[2] + url
        print(uri)
        res = session.get(uri).result()
        if res.status_code != 200:
            raise HtmlError('cannot fetch')
        pages_htmls.append(res.content)

    returns = []
    for page_html in pages_htmls:
        soup = BeautifulSoup(page_html)
        img_url = soup.find('img', id='mainImg')['src']
        returns.append(img_url)
    return returns
def get_frames(self, count):
    """Get a list of images from Environment Canada."""
    soup = BeautifulSoup(requests.get(self.IMAGES_URL.format(self.station_code)).text, 'html.parser')
    image_links = [tag['href'] for tag in soup.find_all('a') if '.gif' in tag['href']]

    if len([i for i in image_links[:8] if 'COMP' in i]) > 4:
        image_string = '_'.join([self.station_code, 'COMP_PRECIPET', self.get_precip_type() + '.gif'])
    else:
        image_string = '_'.join([self.station_code, 'PRECIPET', self.get_precip_type() + '.gif'])

    images = [tag['href'] for tag in soup.find_all('a') if image_string in tag['href']]

    futures = []
    session = FuturesSession(max_workers=count)

    for i in reversed(images[:count]):
        url = self.FRAME_URL.format(self.station_code, i)
        futures.append(session.get(url=url).result().content)

    def add_layers(frame):
        frame_bytesio = BytesIO()
        base = Image.open(BytesIO(frame)).convert('RGBA')
        base.alpha_composite(self.roads)
        base.alpha_composite(self.cities)
        base.save(frame_bytesio, 'GIF')
        frame_bytesio.seek(0)
        return frame_bytesio.read()

    frames = [add_layers(f) for f in futures if f[0:3] == b'GIF']

    """Repeat last frame."""
    for i in range(0, 2):  # pylint: disable=unused-variable
        frames.append(frames[count - 1])

    return frames
class BlueFloodMetricsClient(object):
    def __init__(self, token, project_id, executors):
        self.token = token
        self.project_id = project_id
        self.session = FuturesSession(max_workers=executors)
        self.headers = {
            'X-Project-ID': self.project_id
        }
        if self.token:
            self.headers.update({
                'X-Auth-Token': self.token
            })
        self.session.headers.update(self.headers)

    def async_requests(self, urls):
        futures_results = []
        for url in urls:
            LOG.info("Request made to URL: {0}".format(url))
            futures_results.append(self.session.get(url))

        responses = []
        for future in futures.as_completed(fs=futures_results):
            resp = future.result()
            LOG.info("Request completed to URL: {0}".format(resp.url))
            responses.append(resp)
        return responses
def get_usgs_nearby_cities(self, earthquake):
    """
    performs request on local earthquake nearby cities url and returns the data
    """
    try:
        nearest_cities_object = earthquake["properties"]["products"]["nearby-cities"]
        nearest_cities_url = nearest_cities_object[0]["contents"]["nearby-cities.json"]["url"]
    except:
        nearest_cities_url = None

    if nearest_cities_url:
        session = FuturesSession(max_workers=1)
        nearest_cities_response = session.get(
            nearest_cities_url, headers=app.config["API_MANAGER_HEADERS"])
        nearest_cities_details = nearest_cities_response.result().json()
        list_of_nearby_cities = []
        for item in nearest_cities_details:
            city = NearestCity(
                id=None,
                distance=item["distance"],
                direction=item["direction"],
                name=item["name"],
                latitude=item["latitude"],
                longitude=item["longitude"],
                population=item["population"],
                earthquake_id=None
            )
            list_of_nearby_cities.append(city)
        earthquake["properties"]["nearest_cities_url"] = nearest_cities_url
        earthquake["properties"]["nearest_cities"] = list_of_nearby_cities
    else:
        earthquake["properties"]["nearest_cities_url"] = None
        earthquake["properties"]["nearest_cities"] = []
    return earthquake
def search(self, q='', cat='', indexer='all', **kwargs):
    self.logger.debug("Searching for %s category %s on indexer %s" % (q, cat, indexer))
    if cat:
        cat = '&cat=' + cat

    sess = FuturesSession(max_workers=8)
    job_list = []

    if indexer == 'all':
        for i in NewznabIndexers.select():
            cmd = 'search&q=' + urllib2.quote(q.encode(encoding="UTF-8")) + cat + '&extended=1'
            u = i.apiurl
            u += cmd
            u = u.replace('o=json', 'o=xml')
            job_list.append(u)
    else:
        for i in NewznabIndexers.select():
            if i.name == indexer:
                cmd = 'search&q=' + urllib2.quote(q.encode(encoding="UTF-8")) + cat + '&extended=1'
                u = i.apiurl
                u += cmd
                u = u.replace('o=json', 'o=xml')
                job_list.append(u)

    result = []
    future = []

    for url in job_list:
        try:
            self.logger.debug('Fetching search results from %s' % url)
            t = sess.get(url, timeout=60, headers=self.headers)
        except Exception as e:
            self.logger.error('%s when fetching %s' % (e, url))
            continue
        future.append(t)

    for future in cf.as_completed(future):
        if future.exception() is not None:
            self.logger.error('Failed to fetch results %s' % (future.exception()))
        else:
            f = []
            res = future.result()
            try:
                provider_res = xmltodict.parse(res.content, attr_prefix='')
                if provider_res:
                    if 'rss' in provider_res:
                        if 'channel' in provider_res['rss']:
                            if 'item' in provider_res['rss']['channel']:
                                f.append(provider_res['rss']['channel'])
                    if 'error' in provider_res:
                        self.logger.debug('%s %s' % (provider_res['rss']['channel']['title'],
                                                     provider_res['error']['description']))
            except Exception as e:
                self.logger.error(res.url, e, exc_info=True)
            result.append(f)

    return result
class RemoteTTS(TTS):
    """
    Abstract class for a Remote TTS engine implementation.

    It provides a common logic to perform multiple requests by splitting the
    whole sentence into small ones.
    """

    def __init__(self, lang, voice, url, api_path, validator):
        super(RemoteTTS, self).__init__(lang, voice, validator)
        self.api_path = api_path
        self.url = remove_last_slash(url)
        self.session = FuturesSession()

    def execute(self, sentence):
        phrases = self.__get_phrases(sentence)
        if len(phrases) > 0:
            for req in self.__requests(phrases):
                try:
                    self.__play(req)
                except Exception as e:
                    LOGGER.error(e.message)

    @staticmethod
    def __get_phrases(sentence):
        phrases = re.split(r'\.+[\s+|\n]', sentence)
        phrases = [p.replace('\n', '').strip() for p in phrases]
        phrases = [p for p in phrases if len(p) > 0]
        return phrases

    def __requests(self, phrases):
        reqs = []
        for p in phrases:
            reqs.append(self.__request(p))
        return reqs

    def __request(self, p):
        return self.session.get(
            self.url + self.api_path,
            params=self.build_request_params(p),
            timeout=10, verify=False)

    @abc.abstractmethod
    def build_request_params(self, sentence):
        pass

    def __play(self, req):
        resp = req.result()
        if resp.status_code == 200:
            self.__save(resp.content)
            play_wav(self.filename).communicate()
        else:
            LOGGER.error(
                '%s Http Error: %s for url: %s' %
                (resp.status_code, resp.reason, resp.url))

    def __save(self, data):
        with open(self.filename, 'wb') as f:
            f.write(data)
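# The snippet below is a minimal, hypothetical sketch of how RemoteTTS above is meant to
# be subclassed: the only abstract hook is build_request_params(), which turns one phrase
# into the query parameters sent to the remote engine. The MaryRemoteTTS name, the
# '/process' path and the parameter names are illustrative assumptions, not part of the
# original code; they only show the shape of a concrete implementation.
class MaryRemoteTTS(RemoteTTS):
    def __init__(self, lang, voice, url, validator):
        # '/process' is a placeholder path for a generic HTTP TTS endpoint returning WAV
        super(MaryRemoteTTS, self).__init__(lang, voice, url, '/process', validator)

    def build_request_params(self, sentence):
        # assumes the TTS base class stores self.lang and self.voice
        return {
            'INPUT_TEXT': sentence,
            'LOCALE': self.lang,
            'VOICE': self.voice,
            'AUDIO': 'WAVE_FILE',
        }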
def request_product(auth_token, duns, product_code, version='3.1'):
    session = FuturesSession()
    url = 'https://maxcvservices.dnb.com/V' + version + '/organizations/' + duns + '/products/' + product_code
    url += "?OrderReasonCode=6332"
    print url
    headers = {'Authorization': auth_token}
    future = session.get(url, headers=headers)
    return future
def async_requests(locations, site=None):
    session = FuturesSession()
    check_date = datetime.now() + timedelta(hours=-4)
    for location in locations:
        gig = Gigs.select().where(Gigs.location.contains(location)).order_by(Gigs.datetime.desc()).first()
        if (gig is None) or (datetime.strptime(gig.datetime, '%Y-%m-%d %H:%M') < check_date):
            url = "https://{}.craigslist.org/search/{}/".format(location, (site or CRAIGSLIST_SITE))
            future = session.get(url, background_callback=insert_callback)
def get_games_chessdotcom(username: str, limit: PositiveInt = 10) -> dict:
    """ Get all the games from a chess.com user. """
    session = FuturesSession()

    # get the list of monthly archives
    url = urljoin(CHESSDOTCOM_API_HOST, f"pub/player/{username}/games/archives")
    archives = session.get(url).result()
    archives.raise_for_status()

    # fetch all the games
    games = []
    with FuturesSession() as session:
        archive_urls = list(reversed(archives.json()["archives"]))
        futures = [session.get(archive_url) for archive_url in archive_urls]
        for archive_url, future in zip(archive_urls, futures):
            monthly_games = future.result()
            sleep = 1
            while monthly_games.status_code == 429:  # pragma: no cover
                msg = (f"Sleeping for {sleep}s while "
                       f"getting chess.com games for {username}.")
                logger.debug(msg)
                time.sleep(2 * sleep)
                # retry the same monthly archive, not the archives index
                monthly_games = session.get(archive_url).result()
            monthly_games.raise_for_status()

            for game in monthly_games.json()["games"]:
                game["white"]["name"] = game["white"].pop("username")
                game["white"]["url"] = game["white"].pop("@id")
                game["black"]["name"] = game["black"].pop("username")
                game["black"]["url"] = game["black"].pop("@id")
                game["eco_url"] = game.pop("eco", None)
                game["tournament_url"] = game.pop("tournament", None)
                game["match_url"] = game.pop("match", None)
                games.append(game)
                if len(games) >= limit:
                    break
            if len(games) >= limit:
                break

    [Game(**game) for game in games[:limit]]  # just to push through model validation
    return make_response(200, games[:limit])
def get_subscriptions(feeds, workers):
    session = FuturesSession(max_workers=workers)
    futures = [session.get(f, hooks={'response': feed_to_dicts}) for f in feeds]
    entry_lists = [f.result().data for f in futures]
    subscriptions = sorted([i for s in entry_lists for i in s],
                           key=lambda x: x['published'], reverse=True)
    return subscriptions
def requestPool(parameters, url):
    """
    Generator that asynchronously processes profile requests and yields profile futures.
    """
    session = FuturesSession(max_workers=10)
    for parameter in parameters:
        future = session.get(url, params=parameter)
        yield future
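# A minimal usage sketch for requestPool() above: because it yields futures lazily, a
# caller can drain them in submission order and handle each response as it resolves.
# The endpoint and parameter dicts below are hypothetical and only illustrate the
# calling pattern, not any API from the original code.
def fetch_profiles_example():
    parameters = [{'user': 'alice'}, {'user': 'bob'}]
    for future in requestPool(parameters, 'https://example.com/api/profile'):
        response = future.result()  # blocks until this particular request finishes
        print(response.status_code, len(response.content))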
def async_next(self, list_url):
    '''utility to download multiple urls asynchronously and hand each response to extract_nexts'''
    session = FuturesSession(max_workers=5)
    for url in list_url:
        future = session.get(url)
        future.add_done_callback(self.extract_nexts)
def requestsAsync(urls, header, pageSize):
    '''helper function to make async HTTP requests'''
    session = FuturesSession(executor=cf.ThreadPoolExecutor(max_workers=10))
    responses = {}
    for url in urls:
        request = session.get(url['url'], headers=header)
        responses[request] = {'cHash': url['cHash']}
    return responses
def fetch(self):
    session = FuturesSession()
    dat = session.get(self.url).result().json()
    if self.field:
        dat = dat[self.field]
    if self.records is False:
        dat = [dat]
    raise Return(dat)
def send_requests(self, urls):
    session = FuturesSession()
    futures = [session.get(u, headers=self.headers) for u in urls]
    result = []
    for f in futures:
        res = json.loads(f.result().text)['result']['match']  # [{},{},...]
        result += res
    return result
def download_all_sites(sites):
    session = FuturesSession()
    req_list = []
    for url in sites:
        req_list.append(session.get(url))
    for req, url in zip(req_list, sites):
        response = req.result()
        print(f"Read {len(response.content)} from {url}")
def asyncRequest(self, reqUrl=None, urlParams=None, targetArr=None):
    taskUrlname = reqUrl.split("/")[3]
    t1 = time.time()
    while True:
        if self.future_req:
            session = FuturesSession(max_workers=self.chunksize)
            if urlParams == None:
                rs = (session.get(reqUrl.format(i)) for i in targetArr)
            else:
                rs = (session.get(reqUrl.format(i, *urlParams)) for i in targetArr)
            try:
                myresponse = list(map(lambda x: x.result(), rs))
            except:
                continue
        else:
            if not self.grequests_imported:
                import grequests
                grequests_imported = True
            if urlParams == None:
                rs = (grequests.get(reqUrl.format(i), proxies=self.proxy, timeout=10) for i in targetArr)
            else:
                rs = (grequests.get(reqUrl.format(i, *urlParams), proxies=self.proxy, timeout=10) for i in targetArr)
            try:
                myresponse = grequests.map(rs)
            except:
                continue

        status = [int(i.status_code == 200) for i in myresponse if i != None]
        httpstatus = sum(status)
        print("sum of http200Status {0} : {1}".format(taskUrlname, httpstatus))
        if taskUrlname == "ISteamUserStats":
            httpstatus += 1
        self.proxySetting()
        if len(status) != 0 and httpstatus != 0:
            break
    # print("t1", time.time() - t1)
    return myresponse, httpstatus
def get(request):
    session = FuturesSession(max_workers=1)
    future = next(as_completed([session.get(
        request.url, headers=request.headers, timeout=request.timeout)]))
    if future.exception() is not None:
        return DownloadError(request, future.exception())
    else:
        resp = future.result()
        return HtmlDocument(resp.url, resp.content)
def fetch_pe_ratios(self):
    self.pe_ratios = MSNMoney(self.ticker_symbol)
    session = FuturesSession()
    rpc = session.get(self.pe_ratios.url, allow_redirects=True, hooks={
        'response': self.parse_pe_ratios,
    })
    self.rpcs.append(rpc)
def search_restaurants(loc_id, res_names, cat_ids, cu_ids, establ_ids, connection_session=None):
    url = 'https://developers.zomato.com/api/v2.1/search'
    start = 0
    headers = {'user_key': '85955a2247d2beb1f5ecadf80fbc4666'}
    session = FuturesSession()
    futures = []
    ids = []

    for res_name in res_names:
        params = {
            'entity_id': loc_id,
            'q': res_name,
            'cuisine': list_to_string(cu_ids),
            'establishment_type': list_to_string(establ_ids),
            'category': list_to_string(cat_ids),
            'entity_type': 'city',
            'start': start
        }
        response = connection_session.get(
            url, headers=headers, params=params) if connection_session else requests.get(
            url, headers=headers, params=params)
        if response:
            body = response.json()
            while start < body['results_found']:
                params = {
                    'entity_id': loc_id,
                    'q': res_name,
                    'cuisine': list_to_string(cu_ids),
                    'establishment_type': list_to_string(establ_ids),
                    'category': list_to_string(cat_ids),
                    'entity_type': 'city',
                    'start': start
                }
                futures.append(session.get(url, headers=headers, params=params))
                start += 20
            start = 0

    for future in cf.as_completed(futures):
        response = future.result()
        if response:
            body = response.json()
            for restaurant in body['restaurants']:
                ids.append(restaurant['restaurant']['R']['res_id'])
    return ids
def check_on_site(entries):
    """ Checks those entries on gepetto's website """
    session = FuturesSession(executor=ThreadPoolExecutor(max_workers=40))
    for future in [session.get(GEPETTO_URL % entry['ID']) for entry in entries]:
        response = future.result()
        if b'Invalid bibtex entry' in response.content:
            print('INVALID', response.url)
def fetch_yahoo_finance_quote(self):
    self.yahoo_finance_quote = YahooFinanceQuote(self.ticker_symbol)
    session = FuturesSession()
    rpc = session.get(self.yahoo_finance_quote.url, allow_redirects=True, hooks={
        'response': self.parse_yahoo_finance_quote,
    })
    self.rpcs.append(rpc)
def shopee(keyword):
    url = "https://shopee.tw/api/v2/search_items/?by=ctime&keyword={}&limit=50&newest=0&order=desc&page_type=search".format(
        urllib.parse.quote_plus(keyword))
    title = "蝦皮搜尋 - {}".format(keyword)

    feed = feedgen.feed.FeedGenerator()
    feed.author({"name": "Feed Generator"})
    feed.id(url)
    feed.link(href=url, rel="alternate")
    feed.title(title)

    r = requests.get(url, headers={"User-agent": user_agent}, timeout=5)
    body = json.loads(r.text)

    session = FuturesSession(executor=ThreadPoolExecutor(max_workers=10))
    futures = []
    for item in body["items"]:
        itemid = item["itemid"]
        name = item["name"]
        shopid = item["shopid"]
        itemapi_url = "https://shopee.tw/api/v2/item/get?itemid=%d&shopid=%d" % (
            itemid,
            shopid,
        )
        futures.append(
            session.get(itemapi_url, headers={"User-agent": user_agent}, timeout=5))

    for f in futures:
        r = f.result()
        item = json.loads(r.text)["item"]
        itemid = item["itemid"]
        name = item["name"]
        shopid = item["shopid"]
        prod_url = "https://shopee.tw/product/%d/%d" % (shopid, itemid)
        img_url = "https://cf.shopee.tw/file/%s" % (item["image"])
        content = '{}<br/><img alt="{}" src="{}"/>'.format(
            html.escape(name), html.escape(name), html.escape(img_url))

        entry = feed.add_entry()
        entry.content(content, type="xhtml")
        entry.id(prod_url)
        entry.link(href=prod_url)
        entry.title(name)

    bottle.response.set_header("Cache-Control", "max-age=300,public")
    bottle.response.set_header("Content-Type", "application/atom+xml")
    return feed.atom_str()
class HFStory(Story):
    publisher = "hentai-foundry.com"
    author = chapters = title = None

    def __init__(self, url):
        self.url = url
        # need to be slightly careful around cookies/etc
        self.session = requests.Session()
        self.session.headers["User-Agent"] = "Mozilla/5"
        self.futures = FuturesSession(session=self.session, max_workers=5)

        r = self.session.get(self.url)
        if not r.ok:
            raise IOError("Error: {}".format(r.status_code))
        self.soup = soupify(r.content)

        a = self.soup.find(id="frontPage_link")
        if a:
            r = self.session.get(urljoin(self.url, a["href"] + "&size=1000"),
                                 cookies=r.cookies)
            if not r.ok:
                raise IOError("Error: {}".format(r.status_code))
            self.soup = soupify(r.content)

        if self.soup.find(id="viewChapter"):
            self.url = url = urljoin(
                url, self.soup.select_one(".storyRead a:not(.pdfLink)")["href"])
            self.soup = soupify_request(self.futures.get(url))

        self.author = self.soup.select_one(
            ".storyInfo a[href^='/user']").text.strip()
        self.title = self.soup.select_one(
            ".titlebar a[href^='/stories']").text.strip()

        box = self.soup.find("h2", text="Chapters").parent.find(class_="boxbody")
        self.chapters = [
            HFChapter(self.futures.get(urljoin(self.url, p.find("a")["href"])))
            for p in box.find_all("p")
        ]
def fetch_raw_habr_pages_requests_futures(pages=10):
    '''
    Fetch raw page data from Habr
    '''
    session = FuturesSession(executor=ThreadPoolExecutor(max_workers=20), session=Session())
    pages_habr = []
    for page_number in range(1, pages + 1):
        r = session.get('https://habr.com/all/page%d/' % page_number)
        pages_habr.append(r)
    pages_habr = [r.result().text for r in pages_habr]
    return pages_habr
def collect_html(btags):
    session = FuturesSession()
    url_prefix = 'http://playoverwatch.com/en-us/career/pc/'
    btag_urls = [url_prefix + quote(btag) for btag in btags]
    btag_htmls = [
        session.get(url).result().text[241780:241850]
        for url in btag_urls
    ]
    return btag_htmls[0].find('u-align-center h5">3685<')
def load_formats():
    session = FuturesSession()

    def bg_cb(sess, resp):
        resp.data = utils.load_js_obj_literal(resp.text)

    future = session.get(
        'https://raw.githubusercontent.com/Zarel/Pokemon-Showdown/master/data/formats-data.js',
        background_callback=bg_cb)
    r = future.result()
    return r.data
def retrieve_users_status(contest_id, handles):
    session = FuturesSession(max_workers=1)
    futures = {}
    for handle in handles:
        futures[handle] = session.get(
            "http://codeforces.com/api/contest.status?contestId=%d&handle=%s" % (contest_id, handle))
    ret = {}
    for handle, future in futures.items():
        response = future.result()
        ret[handle] = response.json()['result']
    return ret
def create_html_request(word: str, old_id: int, session: FuturesSession):
    """Create a request to get the HTML of a word definition page."""
    print(f"Creating request for '{word}-{old_id}' HTML.")

    def on_load(response, *args, **kwargs):
        save_html_test_file(word, old_id, response)

    return session.get(wiktionary_base_url + word,
                       params={'oldid': old_id},
                       hooks={'response': on_load})
def get_url_batch(url_list, use_ssl=False, callback='', threads=5):
    """
    Processes a list of URLs, sending the results back to the calling
    function in real-time via the `callback` parameter
    """

    # Start a counter for a status message
    tick = {}
    tick['total'] = len(url_list)
    tick['current'] = 0

    # Break the url list into smaller lists based on thread size
    queue = [url_list[x:x + threads] for x in range(0, len(url_list), threads)]

    # Define the protocol
    if use_ssl:
        proto = 'https://'
    else:
        proto = 'http://'

    # Start a requests object
    session = FuturesSession(executor=ThreadPoolExecutor(max_workers=threads))

    # Using the async requests-futures module, work in batches based on
    # the 'queue' list created above. Call each URL, sending the results
    # back to the callback function.
    for batch in queue:
        batch_pending = {}
        batch_results = {}

        # First, grab the pending async request and store it in a dict
        for url in batch:
            batch_pending[url] = session.get(proto + url)

        # Then, grab all the results from the queue
        for url in batch_pending:
            batch_results[url] = batch_pending[url].result()

        # Now, send all the results to the callback function for analysis
        # We need a way to stop processing unnecessary brute-forces, so the
        # callback may tell us to bail out.
        for url in batch_results:
            check = callback(batch_results[url])
            if check == 'breakout':
                return

        # Refresh a status message
        tick['current'] += threads
        sys.stdout.flush()
        sys.stdout.write(" {}/{} complete...".format(tick['current'], tick['total']))
        sys.stdout.write('\r')

    # Clear the status message
    sys.stdout.write('                            \r')
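# A minimal sketch of a callback compatible with get_url_batch() above: it receives each
# requests.Response and may return the string 'breakout' to stop further batches. The
# example_callback name and the status-code rules are illustrative assumptions, not part
# of the original tool.
def example_callback(response):
    if response.status_code == 200:
        print("Found: {}".format(response.url))
    elif response.status_code == 429:
        # rate-limited: tell get_url_batch() to stop issuing further batches
        return 'breakout'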
class Fetcher:
    def __init__(self, pages_packet):
        """
        Pages structure:
        {
            URL: (String base_url->[String individual_url]),
            ...
        }
        """
        # Singleton
        self.session = FuturesSession()
        self.pages_packet = pages_packet

    def _format_urls(self):
        """
        Using the mapping functions to create all variants of the urls,
        create a list of all url variants.

        :return: all url variants.
        :rtype: list
        """
        output = list()
        for url in Urls:
            mapper = self.pages_packet.get(url)
            pages = mapper()
            output += pages
        return output

    def get_results(self):
        """
        Fetch all HTML pages asynchronously.

        Return object structure:
        {
            "URL_NAME": [HTML(page)],
            ...
        }
        Use get() to get the HTML of a given page.

        :return: each page name with its corresponding result.
        :rtype: dict
        """
        results = list()
        urls = self._format_urls()
        for url in urls:
            request = self.session.get(url)
            results.append(request.result())
        output = dict(zip([url.name for url in Urls], results))
        return output
def add_all_matches(*years: Iterable[int], new: bool) -> None:
    """
    Given a list of years, analyzes all matches from those years.

    :param years: Sequence of one or more years
    :param new: Whether or not to only add new matches
    """
    years = [*years]
    print("Executing for years: %s" % years)

    requester = FuturesSession(executor=ProcessPoolExecutor(30), session=requests.Session())

    event_get = lambda e: requester.get(event_url_template(event=e.key), headers=__api_key)
    teams_get = lambda e: requester.get(event_teams_url_template(event=e.key), headers=__api_key)
    matches_get = lambda e: requester.get(event_matches_url_template(event=e.key), headers=__api_key)

    if new:
        events = Event.objects.prefetch_related('alliances').filter(
            year__in=years, end_date__lt=datetime.now()).annotate(match_count=Count('match')).filter(
            match_count=0).order_by('end_date').all()
    else:
        events = Event.objects.prefetch_related('alliances').filter(
            year__in=years, end_date__lt=datetime.now()).order_by('end_date').all()

    print("Starting {} HTTP requests split between {} processes.".format(
        3 * len(events), requester.executor._max_workers))
    matches_futures = [matches_get(e) for e in events]  # type: List[Future]
    event_futures = [event_get(e) for e in events]  # type: List[Future]
    event_teams_futures = [teams_get(e) for e in events]  # type: List[Future]

    print("Waiting on HTTP requests.")
    wait(matches_futures + event_futures + event_teams_futures)
    requester.executor.shutdown(wait=True)

    arg_list = zip(events,
                   [list_of_matches_json_converter(f.result().json()) for f in matches_futures],
                   event_futures,
                   event_teams_futures)

    for args in arg_list:
        _add_matches_from_event(*args)
def _async_requests(urls):
    """
    Sends multiple non-blocking requests. Returns
    a list of responses.

    :param urls:
        List of urls
    """
    session = FuturesSession(max_workers=30)
    futures = [session.get(url) for url in urls]
    return [future.result() for future in futures]
def test_supplied_session(self):
    """ Tests the `session` keyword argument. """
    requests_session = session()
    requests_session.headers['Foo'] = 'bar'
    sess = FuturesSession(session=requests_session)
    future = sess.get(httpbin('headers'))
    self.assertIsInstance(future, Future)
    resp = future.result()
    self.assertIsInstance(resp, Response)
    self.assertEqual(200, resp.status_code)
    self.assertEqual(resp.json()['headers']['Foo'], 'bar')
def main():
    urls = {}
    requests = []
    session = FuturesSession(max_workers=10)

    for year in YEARS_TO_PARSE:
        landing_page = SEARCH_LANDING + '&year=' + str(year)
        landing_res = session.get(landing_page).result()
        landing_bs = BS(landing_res.content, 'html5lib')
        number_span = landing_bs.select('li.ep_tag_selected span')[0].text
        number_of_question = int(re.findall(r'\d+', number_span)[0])
        number_of_pages = math.ceil(number_of_question / 10)  # change to per page
        for page_num in range(1, number_of_pages + 1):
            res = session.get(landing_page + '&currentPage=' + str(page_num))
            requests.append(res)

    for request in tqdm(requests):
        try:
            request_result = request.result()
        except ConnectionError:
            print('Due to the ConnectionError page {} hasn\'t been parsed'.format(page_num))
            continue
        page = BS(request_result.content, "html5lib")
        if page:
            for notice in page.select('.results div.notice'):
                for url in notice.select('ul.documents li a'):
                    title_text = notice.select('p.title a.result_details_link')[0].text
                    title_date = notice.select('div.date_reference span.date')[0].text
                    question_format = url.get('href').split('.')[-1]
                    title = '{} ({}).{}'.format(title_text, title_date, question_format)
                    title = re.sub(r'[\n\r\t]', '', title)
                    title = title.replace('/', '-')
                    urls[url.get('href')] = title
        else:
            break

    if not os.path.exists(FOLDER_TO_DOWNLOAD):
        os.mkdir(FOLDER_TO_DOWNLOAD)
    download(urls, FOLDER_TO_DOWNLOAD)
def testMakePosts(self):
    s = FuturesSession()
    new_post = json.dumps({'content': 'testing'})
    print new_post
    p = s.post('https://cs242project.herokuapp.com/submitPost', data=new_post)
    res = p.result()
    print res
    print res.content
    r = s.get('https://cs242project.herokuapp.com/getPosts')
    res2 = r.result()
    print res2.content
    self.assertEqual("test", "test")
def add_all(*years: Iterable[int]):
    years = [*years]
    requester = FuturesSession(executor=ProcessPoolExecutor(MAX_WORKERS), session=requests.Session())
    api_key = settings.TBA_API_HEADERS

    event_list_get = lambda y: requester.get(event_by_year_url_template(year=y), headers=api_key)
    event_get = lambda key: requester.get(event_url_template(event=key), headers=api_key)
    event_teams_get = lambda key: requester.get(event_teams_url_template(event=key), headers=api_key)

    print("Getting event lists for years: %s" % years)
    event_list_futures = [event_list_get(y) for y in years]
    print("Waiting on %d requests..." % len(years))
    wait(event_list_futures)
    print("Done!\n")

    event_lists = [f.result().json() for f in event_list_futures]
    event_data_jsons = [item for year_data in event_lists for item in year_data]

    print("Grabbing event keys...")
    event_keys = [event_data['key'] for event_data in event_data_jsons]

    print("Starting {} requests for event data and teams-by-event data, split between {} processes...".format(
        2 * len(event_keys), MAX_WORKERS))
    event_json_futures = [event_get(key) for key in event_keys]
    event_team_json_futures = [event_teams_get(key) for key in event_keys]
    print("Waiting...")
    wait(event_json_futures + event_team_json_futures)
    requester.executor.shutdown()
    print("Done!\n")

    event_jsons = [f.result().json() for f in event_json_futures]
    event_team_json = [f.result().json() for f in event_team_json_futures]

    print("Adding teams data to event data under 'teams' field...")
    event_jsons = [dict(e, teams=t) for e, t in zip(event_jsons, event_team_json)]

    arg_list = zip(event_keys, event_jsons)
    for args in arg_list:
        add_event(*args)
def listings(base_url, needles):
    """
    Takes the needles as a ||-separated list and returns a map of
    needles to a list of dictionaries for matches
    """
    needles = [kw.strip() for kw in needles.split("||")]

    # Prepare the URL for requests
    url = base_url + "/tv/getProgInfo?major={}"
    session = FuturesSession(max_workers=30)

    # initialize our matches
    matches = {}
    for needle in needles:
        matches[needle] = []

    # Check each channel concurrently
    responses = {}
    for i in SCAN_RANGE:
        responses[i] = session.get(url.format(i))

    # Wait on all responses
    for i in SCAN_RANGE:
        responses[i] = responses[i].result()
        log.debug("channel {} has responded".format(i))

    # Filter out non-200 responses
    responses_200 = []
    for i in SCAN_RANGE:
        if responses[i].status_code == 200:
            responses_200.append(responses[i].text)

    # Make nice JSON of listings
    listings = []
    for response in responses_200:
        tmp = json.loads(response)
        tmp = {
            "title": tmp["title"],
            "major": tmp["major"],
            "callsign": tmp["callsign"],
            "duration": tmp["duration"],
            "startTime": tmp["startTime"],
            "isRecording": tmp["isRecording"],
        }
        listings.append(tmp)

    # Map listings to matching needles
    for listing in listings:
        for needle in needles:
            if needle.lower() in listing["title"].lower():
                log.info("Match for {} with {}".format(needle, listing["title"]))
                matches[needle].append(listing)

    return matches
def extract_all(name):
    spider = SPIDERS[name]
    session = FuturesSession(max_workers=10)
    futures = [(pid, session.get(url)) for pid, url in get_urls()]
    future_responses = [(pid, future.result()) for pid, future in futures]
    for pid, response in future_responses:
        spider.extract(pid, response)
def get_usgs_details_response(self, url):
    """
    performs request on local earthquake details url and returns the data
    """
    session = FuturesSession(max_workers=1)
    usgs_api_details = session.get(
        url, headers=app.config["API_MANAGER_HEADERS"])
    try:
        earthquake_details = usgs_api_details.result().json()
        return earthquake_details
    except requests.exceptions.RequestException as exception:
        logger.error("%s" % exception)
        return False
def resps_from_urls(potential_urls):
    """ Gather valid responses from the list of potential urls """
    resps = []
    session = FuturesSession(max_workers=30)
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0"}

    for url in potential_urls:
        resp = session.get(url, timeout=25, verify=False, headers=headers)
        resps.append(resp)

    return resps