Example #1
def main():
    config = get_config()
    geocode_function = get_geocode_function(config)

    writer = None

    try:
        with CacheHandler(config['cache_location'], enabled=config['cache_enabled'],
                          size_limit=config['cache_size'], eviction_policy=config['cache_eviction']) as cache:
            for current_df in config['input_ds'].iter_dataframes(chunksize=max(10000, config['batch_size'])):
                columns = current_df.columns.tolist()

                columns_to_append = [f['column'] for f in config['features'] if f['column'] not in columns]
                if columns_to_append:
                    index = max(columns.index(config['lat_column']), columns.index(config['lng_column']))
                    current_df = current_df.reindex(columns=columns[:index + 1] + columns_to_append + columns[index + 1:], copy=False)

                if not config['batch_enabled']:
                    results = zip(*current_df.apply(perform_geocode, axis=1, args=(config, geocode_function, cache)))

                    for feature, result in zip(config['features'], results):
                        current_df[feature['column']] = result

                else:
                    batch = []

                    for i, row in current_df.iterrows():
                        if len(batch) == config['batch_size']:
                            perform_geocode_batch(current_df, config, geocode_function, cache, batch)
                            batch = []

                        lat = row[config['lat_column']]
                        lng = row[config['lng_column']]

                        try:
                            if any(is_empty(row[f['column']]) for f in config['features']):
                                res = cache[(lat, lng)]
                            else:
                                res = {}
                                for f in config['features']:
                                    res[f['name']] = row[f['column']]

                            for feature in config['features']:
                                current_df.loc[i, feature['column']] = res[feature['name']]

                        except KeyError:
                            # Cache miss: geocode this row as part of the next batch
                            batch.append((i, (lat, lng)))

                    if batch:
                        perform_geocode_batch(current_df, config, geocode_function, cache, batch)

                # On the first chunk, write the schema before creating the dataset writer
                if writer is None:
                    config['output_ds'].write_schema_from_dataframe(current_df)
                    writer = config['output_ds'].get_writer()

                writer.write_dataframe(current_df)
    finally:
        if writer is not None:
            writer.close()
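
For reference: the examples on this page construct CacheHandler with several unrelated signatures, because they come from different projects. For the geocoding examples (#1, #3 and #7), a minimal in-memory sketch that satisfies the call sites might look like the following; the constructor parameters and the dict-style access are inferred from the calls above, not taken from the actual plugin code.

class CacheHandler(object):
    """In-memory stand-in; a real implementation would persist to `location`,
    honour `size_limit` and apply the named `eviction_policy`."""

    def __init__(self, location, enabled=True, size_limit=None, eviction_policy=None):
        self.enabled = enabled
        self._store = {}

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._store.clear()
        return False  # do not suppress exceptions

    def __getitem__(self, key):
        if not self.enabled:
            raise KeyError(key)  # a disabled cache always misses
        return self._store[key]

    def __setitem__(self, key, value):
        if self.enabled:
            self._store[key] = value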
Example #2
    def cache_check(self):
        # Check for an already-cached response and return it if one is present
        self.cache_handler = CacheHandler(self.application.cache_dir,
                                          self.request,
                                          self.application.cookie_regex,
                                          self.application.cookie_blacklist)
        self.cached_response = self.cache_handler.load()
        self.process_request()
Example #3
def main():
    config = get_config()
    geocode_function = get_geocode_function(config)

    writer = None

    try:
        # Creating a fake or real cache depending on user's choice
        with CacheHandler(config['cache_location'], enabled=config['cache_enabled'],
                          size_limit=config['cache_size'], eviction_policy=config['cache_eviction']) as cache:
            for current_df in config['input_ds'].iter_dataframes(chunksize=max(10000, config['batch_size'])):
                columns = current_df.columns.tolist()

                # Adding columns to the schema
                columns_to_append = [config[c] for c in ['latitude', 'longitude'] if config[c] not in columns]
                if columns_to_append:
                    index = columns.index(config['address_column'])
                    current_df = current_df.reindex(columns=columns[:index + 1] + columns_to_append + columns[index + 1:], copy=False)

                # Normal, 1 by 1 geocoding when batch is not enabled/available
                if not config['batch_enabled']:
                    current_df[config['latitude']], current_df[config['longitude']] = \
                        zip(*current_df.apply(perform_geocode, axis=1, args=(config, geocode_function, cache)))

                # Batch creation and geocoding otherwise
                else:
                    batch = []

                    for i, row in current_df.iterrows():
                        if len(batch) == config['batch_size']:
                            perform_geocode_batch(current_df, config, geocode_function, cache, batch)
                            batch = []

                        address = row[config['address_column']]
                        try:
                            if any(is_empty(row[config[c]]) for c in ['latitude', 'longitude']):
                                res = cache[address]
                            else:
                                res = [row[config[c]] for c in ['latitude', 'longitude']]

                            current_df.loc[i, config['latitude']] = res[0]
                            current_df.loc[i, config['longitude']] = res[1]
                        except KeyError:
                            batch.append((i, address))
                    
                    if batch:
                        perform_geocode_batch(current_df, config, geocode_function, cache, batch)

                # On the first chunk, write the schema before creating the dataset writer
                if writer is None:
                    config['output_ds'].write_schema_from_dataframe(current_df)
                    writer = config['output_ds'].get_writer()

                writer.write_dataframe(current_df)
    finally:
        if writer is not None:
            writer.close()
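
The batch branch above is an instance of the generic accumulate-and-flush idiom, which is easy to test in isolation. A sketch, with `flush` standing in for perform_geocode_batch (flushing right after appending is equivalent to the check-before-append used above):

def flush_in_batches(items, batch_size, flush):
    batch = []
    for item in items:
        batch.append(item)
        if len(batch) == batch_size:
            flush(batch)
            batch = []
    if batch:  # the final, partially filled batch must also be flushed
        flush(batch)

flush_in_batches(range(10), 4, print)  # prints [0..3], [4..7], then [8, 9]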
Example #4
    def __init__(self, cache_file_name, root, client):
        self.path = ''
        self.id = "0"
        self.type = self.BOX_FOLDER
        self.modified_at = None
        self.size = 0
        self.cache = CacheHandler(cache_file_name)
        self.root = root
        self.client = client
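
Note that this CacheHandler takes a single file name, unlike the ones in Examples #1 to #3: it is a different class from a different project. Below is a minimal file-backed sketch consistent with that constructor; the get/set API is an assumption, since only the constructor call is visible here.

import json
import os

class CacheHandler(object):
    def __init__(self, cache_file_name):
        self.cache_file_name = cache_file_name
        self._data = {}
        if os.path.exists(cache_file_name):
            with open(cache_file_name) as f:
                self._data = json.load(f)  # assumed on-disk format: one JSON object

    def get(self, key, default=None):
        return self._data.get(key, default)

    def set(self, key, value):
        self._data[key] = value
        with open(self.cache_file_name, 'w') as f:
            json.dump(self._data, f)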
Example #5
    def on_close(self):
        """
        Called when the websocket is closed, so the handshake request-response
        pair is saved with the websocket data as the response body
        """
        # Required for cache_handler
        self.handshake_response = tornado.httpclient.HTTPResponse(
            self.handshake_request,
            self.upstream_connection.code,
            headers=self.upstream_connection.headers,
            request_time=0)
        # Procedure for dumping a tornado request-response pair
        self.cache_handler = CacheHandler(self.application.cache_dir,
                                          self.handshake_request,
                                          self.application.cookie_regex,
                                          self.application.cookie_blacklist)
        self.cached_response = self.cache_handler.load()
        self.cache_handler.dump(self.handshake_response)
Example #6
    def get(self):
        """
        * This function handles all requests except the connect request.
        * Once ssl stream is formed between browser and proxy, the requests are
          then processed by this function
        """
        # The flow starts here
        self.request.local_timestamp = datetime.datetime.now()
        self.request.response_buffer = ''

        # The requests that come through ssl streams are relative requests, so transparent
        # proxying is required. The following snippet decides the url that should be passed
        # to the async client
        if self.request.uri.startswith(self.request.protocol):  # Normal Proxy Request
            self.request.url = self.request.uri
        else:  # Transparent Proxy Request
            self.request.url = self.request.protocol + "://" + self.request.host
            if self.request.uri != '/':  # Add uri only if needed
                self.request.url += self.request.uri

        # Check for an already-cached response and return it if one is present
        self.cache_handler = CacheHandler(self.application.cache_dir,
                                          self.request,
                                          self.application.cookie_regex,
                                          self.application.cookie_blacklist)
        request_hash = yield tornado.gen.Task(
            self.cache_handler.calculate_hash)
        self.cached_response = self.cache_handler.load()

        if self.cached_response:

            if self.cached_response.body:
                self.write(self.cached_response.body)
            self.finish_response(self.cached_response)

        else:

            # Request header cleaning
            for header in restricted_request_headers:
                try:
                    del self.request.headers[header]
                except KeyError:
                    continue

            # HTTP auth if exists
            http_auth_username = None
            http_auth_password = None
            http_auth_mode = None
            if self.application.http_auth:
                # HTTP AUTH settings
                host = self.request.host
                # If no port is given, append the default port for the scheme
                if ':' not in self.request.host:
                    default_ports = {'http': '80', 'https': '443'}
                    try:
                        host = self.request.host + ':' + default_ports[
                            self.request.protocol]
                    except KeyError:
                        pass
                # Check if auth is provided for that host
                try:
                    index = self.application.http_auth_hosts.index(host)
                    http_auth_username = self.application.http_auth_usernames[
                        index]
                    http_auth_password = self.application.http_auth_passwords[
                        index]
                    http_auth_mode = self.application.http_auth_modes[index]
                except ValueError:
                    pass

            # pycurl is needed for curl client
            async_client = tornado.curl_httpclient.CurlAsyncHTTPClient()
            success_response = False  # used to check the response in botnet mode

            while not success_response:
                # Proxy switching (botnet mode)
                if self.application.proxy_manager:
                    proxy = self.application.proxy_manager.get_next_available_proxy()
                    self.application.outbound_ip = proxy["proxy"][0]
                    self.application.outbound_port = int(proxy["proxy"][1])

                #  httprequest object is created and then passed to async client with a callback
                request = tornado.httpclient.HTTPRequest(
                        url=self.request.url,
                        method=self.request.method,
                        body=self.request.body if self.request.body else None,
                        headers=self.request.headers,
                        auth_username=http_auth_username,
                        auth_password=http_auth_password,
                        auth_mode=http_auth_mode,
                        follow_redirects=False,
                        use_gzip=True,
                        streaming_callback=self.handle_data_chunk,
                        header_callback=None,
                        proxy_host=self.application.outbound_ip,
                        proxy_port=self.application.outbound_port,
                        proxy_username=self.application.outbound_username,
                        proxy_password=self.application.outbound_password,
                        allow_nonstandard_methods=True,
                        prepare_curl_callback=(prepare_curl_callback
                                               if self.application.outbound_proxy_type == "socks"
                                               else None),  # socks callback function
                        validate_cert=False)
                try:
                    response = yield tornado.gen.Task(async_client.fetch,
                                                      request)
                except Exception:
                    response = None
                # Request retries
                for _ in range(3):
                    if (response is None) or response.code in [408, 599]:
                        self.request.response_buffer = ''
                        response = yield tornado.gen.Task(
                            async_client.fetch, request)
                    else:
                        success_response = True
                        break

                #botnet mode code (proxy switching)
                #checking the status of the proxy (asynchronous)
                if self.application.proxy_manager and not success_response:
                    proxy_check_req = tornado.httpclient.HTTPRequest(
                        url=self.application.proxy_manager.testing_url,  # testing url is google.com
                        use_gzip=True,
                        proxy_host=self.application.outbound_ip,
                        proxy_port=self.application.outbound_port,
                        proxy_username=self.application.outbound_username,
                        proxy_password=self.application.outbound_password,
                        prepare_curl_callback=(prepare_curl_callback
                                               if self.application.outbound_proxy_type == "socks"
                                               else None),  # socks callback function
                        validate_cert=False)
                    proxy_check_resp = None
                    try:
                        proxy_check_resp = yield tornado.gen.Task(
                            async_client.fetch, proxy_check_req)
                    except Exception:
                        pass

                    if proxy_check_resp is None or proxy_check_resp.code != 200:
                        self.application.proxy_manager.remove_proxy(
                            proxy["index"])
                    else:
                        success_response = True
                else:
                    success_response = True

            self.finish_response(response)
            # Cache the response after finishing the response, so caching time is not included in response time
            self.cache_handler.dump(response)
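
The retry loop above (up to three extra attempts when the fetch failed outright or returned a 408/599 status) can be summarised as a small synchronous helper. This is a sketch rather than the original code: `fetch` is any callable returning an object with a `code` attribute, and the exponential backoff is an addition, since the original retries immediately.

import time

def fetch_with_retries(fetch, request, retries=3, retry_codes=(408, 599)):
    response = None
    for attempt in range(retries):
        try:
            response = fetch(request)
        except Exception:
            response = None
        if response is not None and response.code not in retry_codes:
            break  # usable response; stop retrying
        time.sleep(2 ** attempt)  # back off before the next attempt
    return response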
Example #7
            for feature in config['features']:
                df.loc[i, feature['column']] = res[feature['name']]

        except Exception as e:
            logging.error("Failed to geocode %s (%s)" % (loc, e))
    
    
if __name__ == '__main__':
    config = get_config()
    geocode_function = get_geocode_function(config)

    writer = None

    try:
        with CacheHandler(config['cache_location'], enabled=config['cache_enabled'],
                          size_limit=config['cache_size'], eviction_policy=config['cache_eviction']) as cache:
            for current_df in config['input_ds'].iter_dataframes(chunksize=max(10000, config['batch_size'])):
                columns = current_df.columns.tolist()

                columns_to_append = [f['column'] for f in config['features'] if f['column'] not in columns]
                if columns_to_append:
                    index = max(columns.index(config['lat_column']), columns.index(config['lng_column']))
                    current_df = current_df.reindex(columns=columns[:index + 1] + columns_to_append + columns[index + 1:], copy=False)

                if not config['batch_enabled']:
                    results = zip(*current_df.apply(perform_geocode, axis=1, args=(config, geocode_function, cache)))

                    for feature, result in zip(config['features'], results):
                        current_df[feature['column']] = result

                else:
Example #8
try:
    from throttle import Throttle
    throttle = Throttle()
except ImportError:
    throttle = None

app = Flask(__name__)

app.config.from_object('config.ConfigProduction')

cache = Cache(app, config={'CACHE_TYPE': 'simple'})
sentry = Sentry(app)

# db.init_app(app) # JL HACK ~ disable mysql

# Optional Redis cache, for caching Google spreadsheet campaign overrides
cache_handler = CacheHandler(app.config['REDIS_URL'])

# FFTF Leaderboard handler. Only used if FFTF Leadboard params are passed in
leaderboard = FFTFLeaderboard(app.debug, app.config['FFTF_LB_ASYNC_POOL_SIZE'],
                              app.config['FFTF_CALL_LOG_API_KEY'])

call_methods = ['GET', 'POST']

data = PoliticalData(cache_handler, app.debug)

print "Call Congress is starting up!"


def make_cache_key(*args, **kwargs):
    path = request.path
    args = str(hash(frozenset(request.args.items())))
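
make_cache_key is truncated above. A hedged completion in the common Flask-Cache style, keying each cache entry on the request path plus a hash of the query arguments; the final line is an assumption, not the original body.

from flask import request

def make_cache_key(*args, **kwargs):
    path = request.path
    args = str(hash(frozenset(request.args.items())))
    return (path + args).encode('utf-8')  # assumed: path plus hashed query args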
Example #9
import json
from flask import Flask, request, jsonify
from flask_cors import CORS
from cache_handler import CacheHandler
from slugify import slugify
from db_controller import DbController

app = Flask(__name__)
db = CacheHandler()
app.config["DEBUG"] = True
CORS(app)


@app.route("/test")
def test():
    return "test"


@app.route("/search")
def get_data():
    keys = db.get_keys()
    source = slugify(request.args.get("source", ""), separator="-")
    destination = slugify(request.args.get("destination", ""), separator="-")
    price = int(request.args.get("price", "9999999999999"))

    if source:
        keys = filter(lambda key: key.find(source) == 9, keys)

    if destination:
        keys = filter(lambda key: key.find(destination) > 9, keys)
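
The magic number 9 in the two filters above implies that every cache key carries a fixed 9-character prefix (for example "journeys-", which would match the add_journeys call in Example #10), with the source slug placed immediately after the prefix and the destination slug somewhere later. An equivalent, more explicit version under that assumption:

PREFIX_LEN = 9  # assumed length of a fixed key prefix such as "journeys-"

def key_matches(key, source, destination):
    if source and key.find(source) != PREFIX_LEN:
        return False  # source slug must sit right after the prefix
    if destination and key.find(destination) <= PREFIX_LEN:
        return False  # destination slug must come after the source slug
    return True

keys = [key for key in keys if key_matches(key, source, destination)]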
Example #10
from requests_html import HTMLSession
from journey import Journey
from cache_handler import CacheHandler
from website_parser import WebsiteParser
import fire

session = HTMLSession()

cache_handler = CacheHandler()


class Connections:

    session = HTMLSession()
    data = {
        "post-type": "shop",
        "currentstepnumber": "1",
        "search-from": "Split",
        "search-to": "Zagreb",
        "search-datetime": "21.10.2018.",
        "ticket-type": "oneway",
    }

    url = "https://www.arriva.com.hr/hr-hr/odabir-polaska"

    def cache(self, output):
        cache_handler.add_journeys(
            self.data["search-from"],
            self.data["search-to"],
            self.data["search-datetime"],
            output,
Example #11
    def get(self):
        """
        * This function handles all requests except the connect request.
        * Once ssl stream is formed between browser and proxy, the requests are
          then processed by this function
        """
        self.request.response_buffer = ''
        # Response headers that must not be copied verbatim from the upstream
        # response: the proxy re-encodes the body and manages its own connection
        restricted_response_headers = ['Content-Length',
                            'Content-Encoding',
                            'Etag',
                            'Transfer-Encoding',
                            'Connection',
                            'Vary',
                            'Accept-Ranges',
                            'Pragma']

        # This function is a callback run after the async client gets the full response
        # It will be improved to carry over more headers from the original responses
        def handle_response(response):
            self.set_status(response.code)
            del self._headers['Server']
            for header, value in list(response.headers.items()):
                if header == "Set-Cookie":
                    self.add_header(header, value)
                else:
                    if header not in restricted_response_headers:
                        self.set_header(header, value)
            if self.request.response_buffer:
                self.cache_handler.dump(response)
            self.finish()
            
        def handle_cached_response(response):
            self.set_status(response.code)
            for header, value in list(response.headers.items()):
                if header == "Set-Cookie":
                    self.add_header(header, value)
                else:
                    if header not in restricted_response_headers:
                        self.set_header(header, value)
            self.write(response.body)
            self.finish()            

        # This function is a callback when a small chunk is received
        def handle_data_chunk(data):
            if data:
                self.write(data)
                self.request.response_buffer += data

        # More headers are to be removed
        for header in ('Connection', 'Pragma', 'Cache-Control', 'If-Modified-Since'):
            try:
                del self.request.headers[header]
            except KeyError:
                continue

        # The requests that come through ssl streams are relative requests, so transparent
        # proxying is required. The following snippet decides the url that should be passed
        # to the async client
        if self.request.host in self.request.uri.split('/'):  # Normal Proxy Request
            self.request.url = self.request.uri
        else:  # Transparent Proxy Request
            self.request.url = self.request.protocol + "://" + self.request.host + self.request.uri

        # Check for an already-cached response and return it if one is present
        self.cache_handler = CacheHandler(self.application.cache_dir,
                                          self.request,
                                          self.application.cookie_regex,
                                          self.application.cookie_blacklist)
        cached_response = self.cache_handler.load()
        
        if cached_response:
            handle_cached_response(cached_response)
        else:
            # httprequest object is created and then passed to async client with a callback
            # pycurl is needed for curl client
            async_client = tornado.curl_httpclient.CurlAsyncHTTPClient()
            request = tornado.httpclient.HTTPRequest(
                    url=self.request.url,
                    method=self.request.method,
                    body=self.request.body if self.request.body else None,
                    headers=self.request.headers,
                    follow_redirects=False,
                    use_gzip=True,
                    streaming_callback=handle_data_chunk,
                    header_callback=None,
                    proxy_host=self.application.outbound_ip,
                    proxy_port=self.application.outbound_port,
                    proxy_username=self.application.outbound_username,
                    proxy_password=self.application.outbound_password,
                    allow_nonstandard_methods=True,
                    validate_cert=False)

            try:
                async_client.fetch(request, callback=handle_response)
            except Exception:
                pass
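
handle_response and handle_cached_response above duplicate the header-copying loop. A sketch of how it could be factored out; `handler` is the tornado RequestHandler instance:

def copy_response_headers(handler, response, restricted):
    for header, value in list(response.headers.items()):
        if header == "Set-Cookie":
            handler.add_header(header, value)  # Set-Cookie may repeat, so append
        elif header not in restricted:
            handler.set_header(header, value)  # everything else overwrites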