Beispiel #1
0
def run():
    """Build the playlist converter and generate the "Summer Mix 2017" playlist.

    Credential files are read from the local ./creds directory.
    """
    youtube_secret_path = "./creds/client_secret.json"
    spotify_secret_path = "./creds/secrets2.txt"

    converter = Glue(youtube_secret_path, spotify_secret_path)
    converter.generate("Summer Mix 2017")
Beispiel #2
0
def __get_partitions():
    """Page through all partitions of a Glue table and delete them in batches.

    AWS BatchDeletePartition accepts at most 25 partitions per call, so the
    pending list is flushed every 25 partitions and once more at the end for
    any remainder.

    Fixes vs. the original:
    - removed the unused ``to_delete`` local;
    - the original appended AFTER the flush check, so it actually flushed
      batches of 24 — now a batch is flushed exactly when it reaches 25;
    - the final delete is skipped entirely when no partitions are pending.
    """
    db_name = ""
    table_name = ""

    glue = Glue()
    token = None
    partitions_to_delete = []
    while True:
        response = glue.get_partitions(db_name, table_name, token)
        token = response.get("NextToken")
        for partition in response["Partitions"]:
            partitions_to_delete.append({"Values": partition["Values"]})
            # Flush a full batch of 25 (the BatchDeletePartition limit).
            if len(partitions_to_delete) >= 25:
                glue.delete_partitions(db_name, table_name,
                                       partitions_to_delete)
                partitions_to_delete = []
        if token is None or len(token) == 0:
            break
    # Flush the remainder, if any.
    if partitions_to_delete:
        glue.delete_partitions(db_name, table_name, partitions_to_delete)
Beispiel #3
0
def update_glue_crawler_datastores(context, datastores):
    global glue_crawler_response
    glue = Glue()
    crawler_name = glue.get_crawler_name(context[c.KEY_LAMBDA_FUNCTION])
    if not glue_crawler_response:
        glue_crawler_response = glue.get_crawler(crawler_name)
    if glue_crawler_response is not None:
        bucket = "s3://{}/".format(os.environ[c.RES_S3_STORAGE])
        path_format = "s3://{}/{}".format(os.environ[c.RES_S3_STORAGE], "{}")
        srcs = []
        if len(glue_crawler_response['Crawler']['Targets']['S3Targets']) > 0:
            for s3target in glue_crawler_response['Crawler']['Targets'][
                    'S3Targets']:
                table = s3target['Path'].replace(bucket, '').lower()
                if table in datastores:
                    del datastores[table]
                srcs.append(s3target)

        if len(datastores) == 0:
            return

        for table in datastores:
            srcs.append({'Path': path_format.format(table), 'Exclusions': []})
        print "Defining GLUE datastores"
        db_name = athena.get_database_name(os.environ[c.ENV_S3_STORAGE])
        table_prefix = athena.get_table_prefix(os.environ[c.ENV_S3_STORAGE])
        glue.update_crawler(crawler_name,
                            os.environ[c.ENV_SERVICE_ROLE],
                            db_name,
                            table_prefix,
                            srcs=srcs)
Beispiel #4
0
def handler(event, context):
    print "Start Glue"  
    stack_id = event[c.ENV_STACK_ID]
    resources = util.get_stack_resources(stack_id)  
    request_type = event['RequestType']
    db_name = athena.get_database_name(stack_id, False) 
    glue = Glue()  

    for resource in resources:        
        if resource.logical_id == c.RES_SERVICE_ROLE:
           role_name = resource.physical_id
        if resource.logical_id == c.RES_S3_STORAGE:
           storage_physical_id = resource.physical_id
    
    if role_name is None:
        raise errors.ClientError("The logical resource '{}' was not found.  Is the resource in the cloud formation stack?".format(c.RES_SERVICE_ROLE))   

    if storage_physical_id is None:
        raise errors.ClientError("The logical resource '{}' was not found.  Is the resource in the cloud formation stack?".format(c.RES_S3_STORAGE))           
    crawler_id_1 =  glue.get_crawler_name(stack_id)    
    srcs = [
                {
                    'Path': "{}/{}{}".format(storage_physical_id, "table=", DEFAULT_EVENTS.CLIENTINITCOMPLETE),
                    'Exclusions': []
                },
                {
                    'Path': "{}/{}{}".format(storage_physical_id, "table=", DEFAULT_EVENTS.SESSIONSTART),
                    'Exclusions': []
                }
            ]
      

    print request_type, db_name, crawler_id_1, "role: ", role_name, "s3: ", storage_physical_id
    if request_type.lower() == 'delete':
        if glue.get_crawler(crawler_id_1) is not None:       
            glue.stop_crawler(crawler_id_1) 
            glue.delete_crawler(crawler_id_1)

        if glue.database_exists(db_name):
            glue.delete_database(db_name)
    elif request_type.lower() == 'create':   
        if not glue.database_exists(db_name):
            glue.create_database(db_name)

        if glue.get_crawler(crawler_id_1) is None:
            glue.create_crawler(crawler_id_1, role_name, db_name, athena.get_table_prefix(stack_id), srcs=srcs )

    else:                   
        if glue.get_crawler(crawler_id_1) is None:
            glue.create_crawler(crawler_id_1, role_name, db_name, athena.get_table_prefix(stack_id), srcs=srcs )
        else:
            glue.stop_crawler(crawler_id_1) 
            glue.update_crawler(crawler_id_1, role_name, db_name, athena.get_table_prefix(stack_id) )
        
    return custom_resource_response.success_response({}, "*")
Beispiel #5
0
def main(event, request):
    context = dict({})
    context[c.KEY_LAMBDA_FUNCTION] = request.function_name if hasattr(
        request, 'function_name') else None
    context[c.KEY_REQUEST_ID] = request.aws_request_id if hasattr(
        request, 'aws_request_id') else None
    stackid = os.environ[c.ENV_DEPLOYMENT_STACK_ARN]

    context[c.KEY_DB] = DynamoDb(context)
    context[c.KEY_ATHENA_QUERY] = Query(stackid)
    context[c.KEY_GLUE_CRAWLER] = Glue()
    thread_pool = ThreadPool(size=3)
    crawler_name = context[c.KEY_GLUE_CRAWLER].get_crawler_name(stackid)
    crawler = Crawler(context, os.environ[c.ENV_S3_STORAGE])
    glue = Glue()
    events = glue.get_events()

    start = datetime.datetime.utcnow() - datetime.timedelta(hours=2)
    now = datetime.datetime.utcnow()

    found = False
    for type in events:
        dt = start
        while dt <= now:
            prefix = metric_schema.s3_key_format().format(
                context[c.KEY_SEPERATOR_PARTITION], dt.year, dt.month, dt.day,
                dt.hour, type, dt.strftime(util.partition_date_format()))
            found = crawler.exists(prefix)
            if found:
                print "FOUND new events=>", prefix
                break
            dt += timedelta(hours=1)
        if found:
            break

    if found:
        thread_pool.add(crawl, context, crawler_name,
                        context[c.KEY_ATHENA_QUERY].execute_with_format)
        thread_pool.wait()

    return custom_resource_response.success_response({}, "*")
Beispiel #6
0
def __update_partitions2(paths):
    """Delete Glue partitions built from the first entry of *paths*.

    Flattens one path object's attributes into a list of partition value
    strings and passes it to Glue.delete_partitions, printing the response.
    """
    to_delete = []
    db_name = ""
    table_name = ""
    for path in paths:
        # Partition key values in storage order: source, build id, date
        # parts, platform, event and schema.
        to_delete.append(path.source)
        to_delete.append(path.buildid)
        to_delete.append(str(path.year))
        to_delete.append(str(path.month))
        to_delete.append(str(path.day))
        to_delete.append(str(path.hour))
        to_delete.append(path.platform)
        to_delete.append(path.event)
        to_delete.append(path.schema)
        # NOTE(review): the unconditional break means only the first path is
        # ever processed — confirm whether the remaining paths are meant to
        # be included.
        break

    glue = Glue()
    # NOTE(review): other call sites pass [{"Values": [...]}] dicts to
    # delete_partitions; here a flat list of strings is passed — verify that
    # delete_partitions accepts this form.
    print glue.delete_partitions(db_name, table_name, to_delete)
Beispiel #7
0
def launch(event, lambdacontext):
    print "Start"
    hours_delta = 36
    context = dict({})
    context[c.KEY_LAMBDA_FUNCTION] = lambdacontext.function_name if hasattr(
        lambdacontext, 'function_name') else None
    context[c.KEY_REQUEST_ID] = lambdacontext.aws_request_id if hasattr(
        lambdacontext, 'aws_request_id') else None
    global threadpool
    global is_lambda
    threadpool = ThreadPool(context, 8)
    is_lambda = context[c.KEY_REQUEST_ID] is not None
    available_amoeba_lambdas = []
    available_amoeba_lambdas.append(c.ENV_AMOEBA_1)
    available_amoeba_lambdas.append(c.ENV_AMOEBA_2)
    available_amoeba_lambdas.append(c.ENV_AMOEBA_3)
    available_amoeba_lambdas.append(c.ENV_AMOEBA_4)
    available_amoeba_lambdas.append(c.ENV_AMOEBA_5)
    db = DynamoDb(context)
    crawler = Crawler(context, os.environ[c.ENV_S3_STORAGE])
    glue = Glue()

    events = glue.get_events()
    #TODO: adjust the amoeba tree depth so that we have fully utilized all available amoebas; len(available_amoeba_lambdas) * 1000
    #since the number of leaf nodes for the metric partitions can quickly get very large we use a 5 lambda pool to ensure we don't hit the 1000 invocation limit.

    start = datetime.datetime.utcnow() - datetime.timedelta(hours=hours_delta)
    now = datetime.datetime.utcnow()

    for type in events:
        dt = start
        while dt <= now:
            prefix = metric_schema.s3_key_format().format(
                context[c.KEY_SEPERATOR_PARTITION], dt.year, dt.month, dt.day,
                dt.hour, type, dt.strftime(util.partition_date_format()))
            threadpool.add(crawler.crawl, prefix, available_amoeba_lambdas,
                           invoke_lambda)
            dt += timedelta(hours=1)

    threadpool.wait()
    return custom_resource_response.success_response({"StatusCode": 200}, "*")
Beispiel #8
0
# -*- coding: utf-8 -*-
"""Entry script: put the parent directory on sys.path and start Glue."""
# PEP 8: one import per line, stdlib and third-party groups separated.
import os
import sys
import time

import numpy as np

# The glue module lives in the parent directory, so extend sys.path first —
# this must happen before the `from glue import Glue` below.
sys.path.append(os.path.split(os.getcwd())[0])
from glue import Glue

glue = Glue()
glue.start()
Beispiel #9
0
def get_crawler_status(request, name):
    """Report the current state of the Glue crawler identified by *name*.

    Dashes in *name* are normalized to underscores before the lookup.
    """
    crawler = Glue().get_crawler(name.replace('-', '_'))
    state = crawler['Crawler']['State']
    return custom_resource_response.success_response({"State": state}, "*")
Beispiel #10
0
def events(request):
    """Return the list of known Glue event types."""
    return Glue().get_events()
Beispiel #11
0
def start_crawler(event, context):
    """Start the Glue crawler for *event* on a worker thread and wait for it."""
    glue = Glue()
    crawler_name = glue.get_crawler_name(event)
    pool = ThreadPool()
    pool.add(glue.start_crawler, crawler_name)
    pool.wait()
Beispiel #12
0
        channel = socket.Channel('test')

        channel.on_message(on_message_test)

    def on_message(socket, data, channel):
        """
        onMessage for main channel: log the received payload and reply to
        the main channel with a fixed greeting.

        NOTE(review): the first parameter is the socket itself (there is no
        ``self``) — presumably this is registered as a plain callback by the
        Glue protocol; confirm against the enclosing class.
        """
        print('Received data:', data)
        socket.sendToMain('Hello world!')


def on_message_test(chan, data):
    """Echo handler for the test channel: log the payload, then send it back."""
    channel_label = 'Channel: %s' % chan.channel
    data_label = 'Data: %s' % data
    print(channel_label, data_label)
    chan.send(data)


# Script entry point: wire a custom protocol into a Glue websocket client
# and hand control to the twisted reactor.
if __name__ == '__main__':

    import sys

    # Third-party: twisted supplies the logging hook and the event loop.
    from twisted.python import log
    from twisted.internet import reactor

    log.startLogging(sys.stdout)

    # Connect to the local Glue websocket endpoint and swap in the custom
    # protocol class before the event loop starts.
    glue = Glue(u"ws://127.0.0.1:8081/glue/ws")
    glue.factory.protocol = CustomGlueProtocol

    reactor.run()
Beispiel #13
0
import os
import sys
import db
from glue import Glue, unquote_gluestring
from config import Config
import re

# Script body: load config, log in to Glue, and collect the names of the
# project's enabled notification settings.
if len(sys.argv) < 2:
    print('No config file provided')
    # sys.exit is the correct way to terminate a script; the bare exit()
    # builtin is a site.py convenience intended for interactive use.
    sys.exit(1)

cfg = Config(sys.argv[1])

glue = Glue(staging=cfg.staging)
glue.login(cfg.username, cfg.password)

glue.connect_to_host(cfg.company_id)
company = glue.get_company(cfg.company_id)
project = glue.get_project(cfg.project_name)

if cfg.dry:
    print('Info: dry run enabled')

# get project notification settings
project_settings = glue.get_project_user_notification_settings(
    project.project_id)

# Join the names of all enabled ('1') settings with semicolons; iterating
# items() avoids re-looking up each key as the original comprehension did.
notification_settings = ';'.join([
    name for name, enabled in project_settings['notification_settings'].items()
    if enabled == '1'
])