def run():
    yt_json_source = "./creds/client_secret.json"
    sp_creds_source = "./creds/secrets2.txt"
    initializer = Glue(yt_json_source, sp_creds_source)
    initializer.generate("Summer Mix 2017")

def __get_partitions():
    # Despite its name, this helper deletes every partition in the table,
    # in batches of at most 25 (the BatchDeletePartition limit).
    db_name = ""
    table_name = ""
    glue = Glue()
    token = None
    partitions_to_delete = []
    while True:
        response = glue.get_partitions(db_name, table_name, token)
        token = response.get("NextToken")
        for partition in response["Partitions"]:
            partitions_to_delete.append({"Values": partition["Values"]})
            # Flush the batch when it fills; counting by batch length (rather
            # than a per-page counter) keeps the batch under the limit even
            # when partitions accumulate across result pages.
            if len(partitions_to_delete) >= 25:
                glue.delete_partitions(db_name, table_name, partitions_to_delete)
                partitions_to_delete = []
        if not token:
            break
    # Delete whatever remains in the final, partial batch.
    if partitions_to_delete:
        glue.delete_partitions(db_name, table_name, partitions_to_delete)

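For reference, a minimal sketch of the raw boto3 calls the `Glue` wrapper presumably delegates to. The wrapper mapping is an assumption; `get_partitions`, `batch_delete_partition`, and the 25-item `PartitionsToDelete` cap are part of the AWS Glue API.

import boto3

def delete_all_partitions(db_name, table_name):
    client = boto3.client('glue')
    batch = []
    token = None
    while True:
        kwargs = {'DatabaseName': db_name, 'TableName': table_name}
        if token:
            kwargs['NextToken'] = token
        response = client.get_partitions(**kwargs)
        for partition in response['Partitions']:
            batch.append({'Values': partition['Values']})
            if len(batch) == 25:  # BatchDeletePartition accepts at most 25
                client.batch_delete_partition(
                    DatabaseName=db_name, TableName=table_name,
                    PartitionsToDelete=batch)
                batch = []
        token = response.get('NextToken')
        if not token:
            break
    if batch:
        client.batch_delete_partition(
            DatabaseName=db_name, TableName=table_name,
            PartitionsToDelete=batch)
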
def update_glue_crawler_datastores(context, datastores):
    global glue_crawler_response
    glue = Glue()
    crawler_name = glue.get_crawler_name(context[c.KEY_LAMBDA_FUNCTION])
    if not glue_crawler_response:
        glue_crawler_response = glue.get_crawler(crawler_name)
    if glue_crawler_response is not None:
        bucket = "s3://{}/".format(os.environ[c.RES_S3_STORAGE])
        path_format = "s3://{}/{}".format(os.environ[c.RES_S3_STORAGE], "{}")
        srcs = []
        # Keep the crawler's existing S3 targets and drop any datastore that
        # is already covered by one of them.
        for s3target in glue_crawler_response['Crawler']['Targets']['S3Targets']:
            table = s3target['Path'].replace(bucket, '').lower()
            if table in datastores:
                del datastores[table]
            srcs.append(s3target)
        if len(datastores) == 0:
            return
        # Add a new S3 target for each remaining datastore.
        for table in datastores:
            srcs.append({'Path': path_format.format(table), 'Exclusions': []})
        print("Defining GLUE datastores")
        db_name = athena.get_database_name(os.environ[c.ENV_S3_STORAGE])
        table_prefix = athena.get_table_prefix(os.environ[c.ENV_S3_STORAGE])
        glue.update_crawler(crawler_name, os.environ[c.ENV_SERVICE_ROLE],
                            db_name, table_prefix, srcs=srcs)

def handler(event, context):
    print("Start Glue")
    stack_id = event[c.ENV_STACK_ID]
    resources = util.get_stack_resources(stack_id)
    request_type = event['RequestType']
    db_name = athena.get_database_name(stack_id, False)
    glue = Glue()
    # Initialize both names so the missing-resource checks below don't raise
    # NameError when a resource is absent from the stack.
    role_name = None
    storage_physical_id = None
    for resource in resources:
        if resource.logical_id == c.RES_SERVICE_ROLE:
            role_name = resource.physical_id
        if resource.logical_id == c.RES_S3_STORAGE:
            storage_physical_id = resource.physical_id
    if role_name is None:
        raise errors.ClientError(
            "The logical resource '{}' was not found. "
            "Is the resource in the cloud formation stack?".format(c.RES_SERVICE_ROLE))
    if storage_physical_id is None:
        raise errors.ClientError(
            "The logical resource '{}' was not found. "
            "Is the resource in the cloud formation stack?".format(c.RES_S3_STORAGE))
    crawler_id_1 = glue.get_crawler_name(stack_id)
    srcs = [
        {
            'Path': "{}/{}{}".format(storage_physical_id, "table=", DEFAULT_EVENTS.CLIENTINITCOMPLETE),
            'Exclusions': []
        },
        {
            'Path': "{}/{}{}".format(storage_physical_id, "table=", DEFAULT_EVENTS.SESSIONSTART),
            'Exclusions': []
        }
    ]
    print(request_type, db_name, crawler_id_1, "role:", role_name, "s3:", storage_physical_id)
    if request_type.lower() == 'delete':
        if glue.get_crawler(crawler_id_1) is not None:
            glue.stop_crawler(crawler_id_1)
            glue.delete_crawler(crawler_id_1)
        if glue.database_exists(db_name):
            glue.delete_database(db_name)
    elif request_type.lower() == 'create':
        if not glue.database_exists(db_name):
            glue.create_database(db_name)
        if glue.get_crawler(crawler_id_1) is None:
            glue.create_crawler(crawler_id_1, role_name, db_name,
                                athena.get_table_prefix(stack_id), srcs=srcs)
    else:
        # Update: create the crawler if it is missing, otherwise refresh it.
        if glue.get_crawler(crawler_id_1) is None:
            glue.create_crawler(crawler_id_1, role_name, db_name,
                                athena.get_table_prefix(stack_id), srcs=srcs)
        else:
            glue.stop_crawler(crawler_id_1)
            glue.update_crawler(crawler_id_1, role_name, db_name,
                                athena.get_table_prefix(stack_id))
    return custom_resource_response.success_response({}, "*")

def main(event, request):
    context = dict({})
    context[c.KEY_LAMBDA_FUNCTION] = request.function_name if hasattr(
        request, 'function_name') else None
    context[c.KEY_REQUEST_ID] = request.aws_request_id if hasattr(
        request, 'aws_request_id') else None
    stackid = os.environ[c.ENV_DEPLOYMENT_STACK_ARN]
    context[c.KEY_DB] = DynamoDb(context)
    context[c.KEY_ATHENA_QUERY] = Query(stackid)
    context[c.KEY_GLUE_CRAWLER] = Glue()
    thread_pool = ThreadPool(size=3)
    crawler_name = context[c.KEY_GLUE_CRAWLER].get_crawler_name(stackid)
    crawler = Crawler(context, os.environ[c.ENV_S3_STORAGE])
    glue = Glue()
    events = glue.get_events()
    # Scan the last two hours of S3 partitions, hour by hour, for new events.
    start = datetime.datetime.utcnow() - datetime.timedelta(hours=2)
    now = datetime.datetime.utcnow()
    found = False
    for event_type in events:
        dt = start
        while dt <= now:
            prefix = metric_schema.s3_key_format().format(
                context[c.KEY_SEPERATOR_PARTITION], dt.year, dt.month, dt.day,
                dt.hour, event_type, dt.strftime(util.partition_date_format()))
            found = crawler.exists(prefix)
            if found:
                print("FOUND new events=>", prefix)
                break
            dt += timedelta(hours=1)
        if found:
            break
    if found:
        thread_pool.add(crawl, context, crawler_name,
                        context[c.KEY_ATHENA_QUERY].execute_with_format)
        thread_pool.wait()
    return custom_resource_response.success_response({}, "*")

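The hour-by-hour prefix scan in `main` (and again in `launch` below) follows one pattern: step a UTC timestamp through a bounded window one hour at a time and stop at the first S3 prefix that exists. A self-contained sketch of that pattern, where `prefix_exists` and `make_prefix` are hypothetical stand-ins for `crawler.exists` and the `metric_schema` formatting:

import datetime

def find_recent_prefix(prefix_exists, make_prefix, hours_back=2):
    # Step an hour at a time from `hours_back` hours ago up to now,
    # returning the first prefix that exists, or None if none do.
    now = datetime.datetime.utcnow()
    dt = now - datetime.timedelta(hours=hours_back)
    while dt <= now:
        prefix = make_prefix(dt)
        if prefix_exists(prefix):
            return prefix
        dt += datetime.timedelta(hours=1)
    return None
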
def __update_partitions2(paths):
    db_name = ""
    table_name = ""
    to_delete = []
    # Only the first path is used; the loop exits after one iteration.
    for path in paths:
        values = [
            path.source,
            path.buildid,
            str(path.year),
            str(path.month),
            str(path.day),
            str(path.hour),
            path.platform,
            path.event,
            path.schema
        ]
        # delete_partitions expects the {"Values": [...]} shape used in
        # __get_partitions above.
        to_delete.append({"Values": values})
        break
    glue = Glue()
    print(glue.delete_partitions(db_name, table_name, to_delete))

def launch(event, lambdacontext):
    print("Start")
    hours_delta = 36
    context = dict({})
    context[c.KEY_LAMBDA_FUNCTION] = lambdacontext.function_name if hasattr(
        lambdacontext, 'function_name') else None
    context[c.KEY_REQUEST_ID] = lambdacontext.aws_request_id if hasattr(
        lambdacontext, 'aws_request_id') else None
    global threadpool
    global is_lambda
    threadpool = ThreadPool(context, 8)
    is_lambda = context[c.KEY_REQUEST_ID] is not None
    available_amoeba_lambdas = [
        c.ENV_AMOEBA_1,
        c.ENV_AMOEBA_2,
        c.ENV_AMOEBA_3,
        c.ENV_AMOEBA_4,
        c.ENV_AMOEBA_5
    ]
    db = DynamoDb(context)
    crawler = Crawler(context, os.environ[c.ENV_S3_STORAGE])
    glue = Glue()
    events = glue.get_events()
    # TODO: adjust the amoeba tree depth so that we have fully utilized all
    # available amoebas; len(available_amoeba_lambdas) * 1000
    # Since the number of leaf nodes for the metric partitions can quickly get
    # very large, we use a 5-lambda pool to ensure we don't hit the 1000
    # invocation limit.
    start = datetime.datetime.utcnow() - datetime.timedelta(hours=hours_delta)
    now = datetime.datetime.utcnow()
    for event_type in events:
        dt = start
        while dt <= now:
            prefix = metric_schema.s3_key_format().format(
                context[c.KEY_SEPERATOR_PARTITION], dt.year, dt.month, dt.day,
                dt.hour, event_type, dt.strftime(util.partition_date_format()))
            threadpool.add(crawler.crawl, prefix, available_amoeba_lambdas, invoke_lambda)
            dt += timedelta(hours=1)
    threadpool.wait()
    return custom_resource_response.success_response({"StatusCode": 200}, "*")

# -*- coding: utf-8 -*-
import os, sys, time
import numpy as np

sys.path.append(os.path.split(os.getcwd())[0])
from glue import Glue

glue = Glue()
glue.start()

def get_crawler_status(request, name):
    glue_crawler = Glue()
    response = glue_crawler.get_crawler(name.replace('-', '_'))
    return custom_resource_response.success_response(
        {"State": response['Crawler']['State']}, "*")

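The `get_crawler` call here mirrors boto3's `get_crawler` response, whose `Crawler.State` field reports `READY`, `RUNNING`, or `STOPPING`. A minimal polling helper built on the same wrapper could look like the following; `wait_for_crawler` and its `interval` parameter are hypothetical additions, not part of the original code:

import time

def wait_for_crawler(glue_crawler, name, interval=10):
    # Poll until the crawler returns to the READY state.
    # READY / RUNNING / STOPPING are the states the AWS Glue API reports.
    while True:
        state = glue_crawler.get_crawler(name)['Crawler']['State']
        if state == 'READY':
            return
        time.sleep(interval)
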
def events(request):
    glue = Glue()
    return glue.get_events()

def start_crawler(event, context):
    glue = Glue()
    crawler_id_1 = glue.get_crawler_name(event)
    thread_pool = ThreadPool()
    thread_pool.add(glue.start_crawler, crawler_id_1)
    thread_pool.wait()

def on_message(socket, data, channel):
    """onMessage handler for the main channel."""
    print('Received data:', data)
    socket.sendToMain('Hello world!')

def on_message_test(chan, data):
    print('Channel: %s' % chan.channel, 'Data: %s' % data)
    chan.send(data)

# Register the handler after it is defined; referencing on_message_test
# before its def would raise a NameError at module load.
channel = socket.Channel('test')
channel.on_message(on_message_test)

if __name__ == '__main__':
    import sys
    from twisted.python import log
    from twisted.internet import reactor

    log.startLogging(sys.stdout)
    glue = Glue(u"ws://127.0.0.1:8081/glue/ws")
    glue.factory.protocol = CustomGlueProtocol
    reactor.run()

import os
import sys
import re

import db
from glue import Glue, unquote_gluestring
from config import Config

if len(sys.argv) < 2:
    print('No config file provided')
    exit(1)

cfg = Config(sys.argv[1])
glue = Glue(staging=cfg.staging)
glue.login(cfg.username, cfg.password)
glue.connect_to_host(cfg.company_id)
company = glue.get_company(cfg.company_id)
project = glue.get_project(cfg.project_name)

if cfg.dry:
    print('Info: dry run enabled')

# Get the project's notification settings and keep only the enabled ones,
# joined into a single semicolon-separated string.
project_settings = glue.get_project_user_notification_settings(
    project.project_id)
notification_settings = ';'.join([
    x for x in project_settings['notification_settings']
    if project_settings['notification_settings'][x] == '1'
])