def main():
    config = get_config()
    geocode_function = get_geocode_function(config)
    writer = None
    try:
        with CacheHandler(config['cache_location'], enabled=config['cache_enabled'],
                          size_limit=config['cache_size'], eviction_policy=config['cache_eviction']) as cache:
            for current_df in config['input_ds'].iter_dataframes(chunksize=max(10000, config['batch_size'])):
                columns = current_df.columns.tolist()
                columns_to_append = [f['column'] for f in config['features'] if f['column'] not in columns]
                if columns_to_append:
                    index = max(columns.index(config['lat_column']), columns.index(config['lng_column']))
                    current_df = current_df.reindex(columns=columns[:index + 1] + columns_to_append + columns[index + 1:], copy=False)
                if not config['batch_enabled']:
                    results = zip(*current_df.apply(perform_geocode, axis=1, args=(config, geocode_function, cache)))
                    for feature, result in zip(config['features'], results):
                        current_df[feature['column']] = result
                else:
                    batch = []
                    for i, row in current_df.iterrows():
                        if len(batch) == config['batch_size']:
                            perform_geocode_batch(current_df, config, geocode_function, cache, batch)
                            batch = []
                        lat = row[config['lat_column']]
                        lng = row[config['lng_column']]
                        try:
                            if any(is_empty(row[f['column']]) for f in config['features']):
                                res = cache[(lat, lng)]
                            else:
                                res = {f['name']: row[f['column']] for f in config['features']}
                            for feature in config['features']:
                                current_df.loc[i, feature['column']] = res[feature['name']]
                        except KeyError:
                            batch.append((i, (lat, lng)))
                    if batch:
                        perform_geocode_batch(current_df, config, geocode_function, cache, batch)
                # First loop: write the schema before creating the dataset writer
                if writer is None:
                    config['output_ds'].write_schema_from_dataframe(current_df)
                    writer = config['output_ds'].get_writer()
                writer.write_dataframe(current_df)
    finally:
        if writer is not None:
            writer.close()
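# A minimal sketch (assumption, not the project's real class): the script above
# only relies on CacheHandler being a context manager with dict-style access that
# always misses when caching is disabled. The real implementation presumably
# persists to `location` and honors size_limit/eviction_policy (e.g. via a
# diskcache-style backend); this stub just documents the contract.
class SketchCacheHandler(object):
    def __init__(self, location, enabled=True, size_limit=None, eviction_policy=None):
        self.enabled = enabled
        self._store = {}  # in-memory stand-in for the on-disk cache at `location`

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        return False  # propagate exceptions; a real cache would also flush/close here

    def __getitem__(self, key):
        if not self.enabled:
            raise KeyError(key)  # a disabled cache behaves as an always-miss cache
        return self._store[key]

    def __setitem__(self, key, value):
        if self.enabled:
            self._store[key] = value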
def cache_check(self):
    # Check for an already cached response and, if present, return it
    self.cache_handler = CacheHandler(self.application.cache_dir,
                                      self.request,
                                      self.application.cookie_regex,
                                      self.application.cookie_blacklist)
    self.cached_response = self.cache_handler.load()
    self.process_request()
def main():
    config = get_config()
    geocode_function = get_geocode_function(config)
    writer = None
    try:
        # Create a fake or a real cache, depending on the user's choice
        with CacheHandler(config['cache_location'], enabled=config['cache_enabled'],
                          size_limit=config['cache_size'], eviction_policy=config['cache_eviction']) as cache:
            for current_df in config['input_ds'].iter_dataframes(chunksize=max(10000, config['batch_size'])):
                columns = current_df.columns.tolist()
                # Add the output columns to the schema, right after the address column
                columns_to_append = [config[c] for c in ('latitude', 'longitude') if config[c] not in columns]
                if columns_to_append:
                    index = columns.index(config['address_column'])
                    current_df = current_df.reindex(columns=columns[:index + 1] + columns_to_append + columns[index + 1:], copy=False)
                if not config['batch_enabled']:
                    # Plain one-by-one geocoding when batching is not enabled/available
                    current_df[config['latitude']], current_df[config['longitude']] = \
                        zip(*current_df.apply(perform_geocode, axis=1, args=(config, geocode_function, cache)))
                else:
                    # Otherwise, build batches and geocode them
                    batch = []
                    for i, row in current_df.iterrows():
                        if len(batch) == config['batch_size']:
                            perform_geocode_batch(current_df, config, geocode_function, cache, batch)
                            batch = []
                        address = row[config['address_column']]
                        try:
                            if any(is_empty(row[config[c]]) for c in ('latitude', 'longitude')):
                                res = cache[address]
                            else:
                                res = [row[config[c]] for c in ('latitude', 'longitude')]
                            current_df.loc[i, config['latitude']] = res[0]
                            current_df.loc[i, config['longitude']] = res[1]
                        except KeyError:
                            batch.append((i, address))
                    if batch:
                        perform_geocode_batch(current_df, config, geocode_function, cache, batch)
                # First loop: write the schema before creating the dataset writer
                if writer is None:
                    config['output_ds'].write_schema_from_dataframe(current_df)
                    writer = config['output_ds'].get_writer()
                writer.write_dataframe(current_df)
    finally:
        if writer is not None:
            writer.close()
def __init__(self, cache_file_name, root, client):
    self.path = ''
    self.id = "0"
    self.type = self.BOX_FOLDER
    self.modified_at = None
    self.size = 0
    self.cache = CacheHandler(cache_file_name)
    self.root = root
    self.client = client
def on_close(self):
    """
    Called when the websocket is closed, so that the handshake request-response
    pair is saved along with the websocket data as the response body
    """
    # Build an HTTPResponse object, as required by cache_handler
    self.handshake_response = tornado.httpclient.HTTPResponse(
        self.handshake_request,
        self.upstream_connection.code,
        headers=self.upstream_connection.headers,
        request_time=0)
    # Procedure for dumping a tornado request-response pair
    self.cache_handler = CacheHandler(self.application.cache_dir,
                                      self.handshake_request,
                                      self.application.cookie_regex,
                                      self.application.cookie_blacklist)
    self.cached_response = self.cache_handler.load()
    self.cache_handler.dump(self.handshake_response)
def get(self):
    """
    * This function handles all requests except the connect request.
    * Once the ssl stream is formed between browser and proxy, the requests are
      processed by this function
    """
    # The flow starts here
    self.request.local_timestamp = datetime.datetime.now()
    self.request.response_buffer = ''

    # The requests that come through ssl streams are relative requests, so transparent
    # proxying is required. The following snippet decides the url that should be passed
    # to the async client
    if self.request.uri.startswith(self.request.protocol):
        # Normal proxy request
        self.request.url = self.request.uri
    else:
        # Transparent proxy request
        self.request.url = self.request.protocol + "://" + self.request.host
        if self.request.uri != '/':  # Add the uri only if needed
            self.request.url += self.request.uri

    # Check for an already cached response and, if present, return it
    self.cache_handler = CacheHandler(self.application.cache_dir,
                                      self.request,
                                      self.application.cookie_regex,
                                      self.application.cookie_blacklist)
    request_hash = yield tornado.gen.Task(self.cache_handler.calculate_hash)
    self.cached_response = self.cache_handler.load()

    if self.cached_response:
        if self.cached_response.body:
            self.write(self.cached_response.body)
        self.finish_response(self.cached_response)
    else:
        # Request header cleaning
        for header in restricted_request_headers:
            try:
                del self.request.headers[header]
            except KeyError:
                continue

        # HTTP auth, if it exists
        http_auth_username = None
        http_auth_password = None
        http_auth_mode = None
        if self.application.http_auth:
            # If a default port is not provided, add it
            host = self.request.host
            if ':' not in self.request.host:
                default_ports = {'http': '80', 'https': '443'}
                try:
                    host = self.request.host + ':' + default_ports[self.request.protocol]
                except KeyError:
                    pass
            # Check whether auth is provided for that host
            try:
                index = self.application.http_auth_hosts.index(host)
                http_auth_username = self.application.http_auth_usernames[index]
                http_auth_password = self.application.http_auth_passwords[index]
                http_auth_mode = self.application.http_auth_modes[index]
            except ValueError:
                pass

        # pycurl is needed for the curl client
        async_client = tornado.curl_httpclient.CurlAsyncHTTPClient()

        success_response = False  # used to check the response in botnet mode
        while not success_response:
            # Proxy switching (botnet mode)
            if self.application.proxy_manager:
                proxy = self.application.proxy_manager.get_next_available_proxy()
                self.application.outbound_ip = proxy["proxy"][0]
                self.application.outbound_port = int(proxy["proxy"][1])

            # An HTTPRequest object is created and then passed to the async client
            request = tornado.httpclient.HTTPRequest(
                url=self.request.url,
                method=self.request.method,
                body=self.request.body if self.request.body else None,
                headers=self.request.headers,
                auth_username=http_auth_username,
                auth_password=http_auth_password,
                auth_mode=http_auth_mode,
                follow_redirects=False,
                use_gzip=True,
                streaming_callback=self.handle_data_chunk,
                header_callback=None,
                proxy_host=self.application.outbound_ip,
                proxy_port=self.application.outbound_port,
                proxy_username=self.application.outbound_username,
                proxy_password=self.application.outbound_password,
                allow_nonstandard_methods=True,
                # socks proxies need a pycurl setup callback
                prepare_curl_callback=(prepare_curl_callback
                                       if self.application.outbound_proxy_type == "socks"
                                       else None),
                validate_cert=False)
            try:
                response = yield tornado.gen.Task(async_client.fetch, request)
            except Exception:
                response = None

            # Request retries
            for i in range(3):
                if response is None or response.code in [408, 599]:
                    self.request.response_buffer = ''
                    response = yield tornado.gen.Task(async_client.fetch, request)
                else:
                    success_response = True
                    break

            # Botnet mode (proxy switching): check the status of the proxy asynchronously
            if self.application.proxy_manager and not success_response:
                proxy_check_req = tornado.httpclient.HTTPRequest(
                    url=self.application.proxy_manager.testing_url,  # testing url is google.com
                    use_gzip=True,
                    proxy_host=self.application.outbound_ip,
                    proxy_port=self.application.outbound_port,
                    proxy_username=self.application.outbound_username,
                    proxy_password=self.application.outbound_password,
                    prepare_curl_callback=(prepare_curl_callback
                                           if self.application.outbound_proxy_type == "socks"
                                           else None),
                    validate_cert=False)
                proxy_check_resp = None
                try:
                    proxy_check_resp = yield tornado.gen.Task(async_client.fetch, proxy_check_req)
                except Exception:
                    pass
                if proxy_check_resp is None or proxy_check_resp.code != 200:
                    self.application.proxy_manager.remove_proxy(proxy["index"])
                else:
                    success_response = True
            else:
                success_response = True

        self.finish_response(response)
        # Cache the response after finishing it, so caching time is not included in the response time
        self.cache_handler.dump(response)
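# Hedged sketch (assumption): `prepare_curl_callback`, referenced above, is not
# shown in this snippet; for SOCKS proxying it is presumably a pycurl setup hook
# along these lines.
import pycurl

def prepare_curl_callback(curl):
    # Tell libcurl to treat the configured outbound proxy as a SOCKS5 proxy
    curl.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS5)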
            for feature in config['features']:
                df.loc[i, feature['column']] = res[feature['name']]
        except Exception as e:
            logging.error("Failed to geocode %s (%s)", loc, e)


if __name__ == '__main__':
    config = get_config()
    geocode_function = get_geocode_function(config)
    writer = None
    try:
        with CacheHandler(config['cache_location'], enabled=config['cache_enabled'],
                          size_limit=config['cache_size'], eviction_policy=config['cache_eviction']) as cache:
            for current_df in config['input_ds'].iter_dataframes(chunksize=max(10000, config['batch_size'])):
                columns = current_df.columns.tolist()
                columns_to_append = [f['column'] for f in config['features'] if f['column'] not in columns]
                if columns_to_append:
                    index = max(columns.index(config['lat_column']), columns.index(config['lng_column']))
                    current_df = current_df.reindex(columns=columns[:index + 1] + columns_to_append + columns[index + 1:], copy=False)
                if not config['batch_enabled']:
                    results = zip(*current_df.apply(perform_geocode, axis=1, args=(config, geocode_function, cache)))
                    for feature, result in zip(config['features'], results):
                        current_df[feature['column']] = result
                else:
try:
    from throttle import Throttle
    throttle = Throttle()
except ImportError:
    throttle = None

app = Flask(__name__)
app.config.from_object('config.ConfigProduction')
cache = Cache(app, config={'CACHE_TYPE': 'simple'})
sentry = Sentry(app)
# db.init_app(app)  # JL HACK ~ disable mysql

# Optional Redis cache, for caching Google spreadsheet campaign overrides
cache_handler = CacheHandler(app.config['REDIS_URL'])

# FFTF leaderboard handler, only used if FFTF leaderboard params are passed in
leaderboard = FFTFLeaderboard(app.debug,
                              app.config['FFTF_LB_ASYNC_POOL_SIZE'],
                              app.config['FFTF_CALL_LOG_API_KEY'])

call_methods = ['GET', 'POST']

data = PoliticalData(cache_handler, app.debug)

print("Call Congress is starting up!")


def make_cache_key(*args, **kwargs):
    path = request.path
    args = str(hash(frozenset(request.args.items())))
import json

from flask import Flask, request, jsonify
from flask_cors import CORS
from slugify import slugify

from cache_handler import CacheHandler
from db_controller import DbController

app = Flask(__name__)
db = CacheHandler()
app.config["DEBUG"] = True
CORS(app)


@app.route("/test")
def test():
    return "test"


@app.route("/search")
def get_data():
    keys = db.get_keys()
    source = slugify(request.args.get("source", ""), separator="-")
    destination = slugify(request.args.get("destination", ""), separator="-")
    price = int(request.args.get("price", "9999999999999"))
    if source:
        keys = filter(lambda key: key.find(source) == 9, keys)
    if destination:
        keys = filter(lambda key: key.find(destination) > 9, keys)
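# Hedged usage sketch: the offsets in the filters above (find(source) == 9,
# find(destination) > 9) suggest cache keys with a fixed 9-character prefix
# followed by "<source-slug>-<destination-slug>...". Assuming the app runs
# locally on Flask's default port (URL and key format are guesses):
import requests

resp = requests.get("http://127.0.0.1:5000/search",
                    params={"source": "Split", "destination": "Zagreb", "price": 150})
print(resp.json())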
import fire
from requests_html import HTMLSession

from cache_handler import CacheHandler
from journey import Journey
from website_parser import WebsiteParser

session = HTMLSession()
cache_handler = CacheHandler()


class Connections:
    session = HTMLSession()
    data = {
        "post-type": "shop",
        "currentstepnumber": "1",
        "search-from": "Split",
        "search-to": "Zagreb",
        "search-datetime": "21.10.2018.",
        "ticket-type": "oneway",
    }
    url = "https://www.arriva.com.hr/hr-hr/odabir-polaska"

    def cache(self, output):
        cache_handler.add_journeys(
            self.data["search-from"],
            self.data["search-to"],
            self.data["search-datetime"],
            output,
        )
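# Hedged sketch (assumption): with `fire` imported above, the scraper is
# presumably exposed as a command-line tool, so e.g. `python connections.py cache`
# would dispatch to Connections.cache:
if __name__ == "__main__":
    fire.Fire(Connections)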
def get(self):
    """
    * This function handles all requests except the connect request.
    * Once the ssl stream is formed between browser and proxy, the requests are
      processed by this function
    """
    self.request.response_buffer = ''

    # Headers that must not be copied verbatim from the upstream response,
    # since they are handled by the streaming callback or by tornado itself
    restricted_response_headers = [
        'Content-Length',
        'Content-Encoding',
        'Etag',
        'Transfer-Encoding',
        'Connection',
        'Vary',
        'Accept-Ranges',
        'Pragma']

    # Callback run after the async client gets the full response.
    # This method will be improved with more headers from original responses
    def handle_response(response):
        self.set_status(response.code)
        del self._headers['Server']
        for header, value in list(response.headers.items()):
            if header == "Set-Cookie":
                self.add_header(header, value)
            elif header not in restricted_response_headers:
                self.set_header(header, value)
        if self.request.response_buffer:
            self.cache_handler.dump(response)
        self.finish()

    def handle_cached_response(response):
        self.set_status(response.code)
        for header, value in list(response.headers.items()):
            if header == "Set-Cookie":
                self.add_header(header, value)
            elif header not in restricted_response_headers:
                self.set_header(header, value)
        self.write(response.body)
        self.finish()

    # Callback run each time a small chunk is received
    def handle_data_chunk(data):
        if data:
            self.write(data)
            self.request.response_buffer += data

    # More headers to be removed from the request
    for header in ('Connection', 'Pragma', 'Cache-Control', 'If-Modified-Since'):
        try:
            del self.request.headers[header]
        except KeyError:
            continue

    # The requests that come through ssl streams are relative requests, so transparent
    # proxying is required. The following snippet decides the url that should be passed
    # to the async client
    if self.request.host in self.request.uri.split('/'):
        # Normal proxy request
        self.request.url = self.request.uri
    else:
        # Transparent proxy request
        self.request.url = self.request.protocol + "://" + self.request.host + self.request.uri

    # Check for an already cached response and, if present, return it
    self.cache_handler = CacheHandler(
        self.application.cache_dir,
        self.request,
        self.application.cookie_regex,
        self.application.cookie_blacklist)
    cached_response = self.cache_handler.load()

    if cached_response:
        handle_cached_response(cached_response)
    else:
        # An HTTPRequest object is created and passed to the async client with a
        # callback; pycurl is needed for the curl client
        async_client = tornado.curl_httpclient.CurlAsyncHTTPClient()
        request = tornado.httpclient.HTTPRequest(
            url=self.request.url,
            method=self.request.method,
            body=self.request.body,
            headers=self.request.headers,
            follow_redirects=False,
            use_gzip=True,
            streaming_callback=handle_data_chunk,
            header_callback=None,
            proxy_host=self.application.outbound_ip,
            proxy_port=self.application.outbound_port,
            proxy_username=self.application.outbound_username,
            proxy_password=self.application.outbound_password,
            allow_nonstandard_methods=True,
            validate_cert=False)
        try:
            async_client.fetch(request, callback=handle_response)
        except Exception:
            pass
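# Hedged sketch (assumption, not the project's real implementation): the proxy
# handlers above only rely on this much of CacheHandler's contract -- load()
# returns a previously cached response (or None on a miss) and dump(response)
# persists one, keyed on a hash of the request, with cookies filtered through
# cookie_regex/cookie_blacklist before hashing. All names below are illustrative.
import hashlib
import os
import pickle
import types

class SketchProxyCacheHandler(object):
    def __init__(self, cache_dir, request, cookie_regex=None, cookie_blacklist=None):
        # The real key presumably also covers the body and filtered cookies
        key = "%s %s" % (request.method, request.url)
        self._path = os.path.join(cache_dir, hashlib.md5(key.encode("utf-8")).hexdigest())

    def load(self):
        # Return a response-like object on a cache hit, None on a miss
        if not os.path.exists(self._path):
            return None
        with open(self._path, "rb") as f:
            return types.SimpleNamespace(**pickle.load(f))

    def dump(self, response):
        # Persist only the picklable parts the handlers actually read back
        with open(self._path, "wb") as f:
            pickle.dump({"code": response.code,
                         "headers": dict(response.headers),
                         "body": response.body}, f)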