Ejemplo n.º 1
0
def getEntity(identity, entityid):
    """Return an entity as a flat dict, marking it READY when its crawler allows.

    The entity and its parent bot are loaded first.  While the entity is in
    the PARSING state and has a crawler name recorded, that crawler is looked
    up; if its ``State`` is ``READY`` or ``STOPPING`` the table columns are
    fetched with ``Crawler.get_table`` and stored on the entity together with
    the READY status.

    Args:
        identity: Not used by this function.
        entityid: Identifier of the entity to load.

    Returns:
        The dict produced by ``entity.json()`` with the contents of its
        ``"doc"`` entry merged into the top level and the ``"doc"`` key
        removed.

    Raises:
        errors.ObjectNotFound: If loading the entity or its bot fails for
            any reason.
    """
    try:
        entity = Item.get("entity", entityid)
        bot = Item.get("bot", entity.parent)
    except Exception:
        # Every lookup failure is reported to the caller as "not found".
        raise errors.ObjectNotFound(entityid)

    status = entity.doc.status
    logger().debug("entity %s status: %s (parsing state is %s)", entityid,
                   status, ENTITYSTATES.PARSING)
    if status == ENTITYSTATES.PARSING:
        logger().info("entity is parsing")
        crawler_name = entity.doc.crawler
        # Truthiness instead of len(): an unset (None) crawler is skipped
        # rather than raising TypeError.
        if crawler_name:
            crawler = Crawler.get(name=crawler_name)
            if crawler["State"] in ("READY", "STOPPING"):
                columns = Crawler.get_table(bot.doc.database,
                                            entity.doc.tablename)
                logger().debug("columns %s", columns)
                entity.update(actions=[
                    Item.doc.columns.set(columns),
                    Item.doc.status.set(ENTITYSTATES.READY),
                ])

    entity_doc = entity.json()
    logger().debug("entity_doc %s", pprint.pformat(entity_doc))
    # Flatten: lift the nested "doc" fields to the top level.
    entity_doc.update(entity_doc["doc"])
    del entity_doc["doc"]
    return entity_doc
Ejemplo n.º 2
0
    def create(cls, **kwargs):
        """Create a crawler over an S3 location using the client from ``cls.glue()``.

        Keyword Args:
            name: Name of the crawler to create.
            bucket: S3 bucket to crawl.
            prefix: Optional key prefix appended to the bucket path.
            database: Database name passed as ``DatabaseName`` and used in
                the crawler description.

        Returns:
            The ``create_crawler`` response on success.  If anything raises
            while logging or creating the crawler, the error is logged and
            ``{"success": False, "data": <exception>}`` is returned instead.

        Raises:
            KeyError: If ``bucket`` is missing (the S3 path is built outside
                the guarded section).
        """
        client = cls.glue()
        s3_path = 's3://%(bucket)s' % kwargs
        if "prefix" in kwargs:
            s3_path += "/%(prefix)s" % kwargs
        targets = {'S3Targets': [{'Path': s3_path}]}
        try:
            logger().info(
                "Creating crawler `%s` for bucket `%s` path `%s` and updating database `%s`",
                kwargs["name"], kwargs["bucket"], kwargs.get("prefix", "*"),
                kwargs["database"])
            response = client.create_crawler(
                Name=kwargs["name"],
                Role=
                "arn:aws:iam::950130011294:role/service-role/AWSGlueServiceRole-airbot",
                DatabaseName=kwargs["database"],
                Description='Crawler for database %(database)s' % kwargs,
                Targets=targets)
            return response
        except Exception as e:
            # Failures are reported through the return value, not raised.
            logger().error("Following error occured when creating crawler %s",
                           str(e))
            return {"success": False, "data": e}
Ejemplo n.º 3
0
def refreshSchema(entityid, identity):
    """Launch a crawler for an entity and mark the entity as PARSING.

    Args:
        entityid: Identifier of the entity whose schema should be refreshed.
        identity: Not used by this function.

    Returns:
        ``False`` without doing anything if the entity is already in the
        PARSING state; ``True`` once the crawler has been launched through
        ``Crawler.crawl_bucket`` and the entity has been updated with the
        PARSING status and the crawler name.
    """
    entity = Item.get("entity", entityid)
    logger().debug("entity %s attributes: %s", entityid,
                   entity.doc.attribute_values)
    logger().debug("entity %s status: %s", entityid, entity.doc.status)
    if entity.doc.status == ENTITYSTATES.PARSING:
        return False

    logger().info("Creating  crawler ")
    bot = Item.get("bot", entity.parent)
    doc = bot.doc
    # The crawl targets the bot's bucket under "<bot prefix>/<entity name>".
    name = Crawler.crawl_bucket(database=doc["database"],
                                bucket=doc["bucket"],
                                prefix=doc["prefix"] + "/" + entity.name)
    entity.update(actions=[
        Item.doc.status.set(ENTITYSTATES.PARSING),
        Item.doc.crawler.set(name),
    ])
    return True
Ejemplo n.º 4
0
def preview(entityid, identity):
    """Return up to 100 rows of an entity's table as a JSON string.

    Args:
        entityid: Identifier of the entity to preview.
        identity: Not used by this function.

    Returns:
        ``json.dumps`` of ``response["data"]["records"]``, where ``response``
        is the result of ``AthenaQuery.run`` for the generated query.
    """
    logger().info("Preview of %s", entityid)
    entity = Item.get("entity", entityid)
    bot = Item.get("bot", entity.parent)
    logger().info("     Querying ")
    database = bot.doc.database
    tablename = entity.doc.tablename
    # NOTE(review): identifiers are interpolated directly into the SQL text;
    # confirm that database/table names can never hold untrusted input.
    sql = 'select *  from "%s"."%s" limit 100;' % (database, tablename)
    logger().info("SQL = %s", sql)
    response = AthenaQuery.run(sql=sql)
    logger().info("Response = %s", pprint.pformat(response))
    return json.dumps(response["data"]["records"])
Ejemplo n.º 5
0
def preview(entityid, identity):
    """Run a 100-row preview query against a parsed entity's table.

    Args:
        entityid: Identifier of the entity to preview.
        identity: Not used by this function.

    Returns:
        The unmodified result of ``AthenaQuery.run`` for the generated query.

    Raises:
        errors.FileStatusError: If the entity's status is not READY.
    """
    logger().info("Preview of %s", entityid)
    entity = Item.get("entity", entityid)
    if entity.doc.status != ENTITYSTATES.READY:
        raise errors.FileStatusError("File has not been parsed")

    bot = Item.get("bot", entity.parent)
    logger().info("     Querying ")
    database = bot.doc.database
    tablename = entity.doc.tablename
    # NOTE(review): identifiers are interpolated directly into the SQL text;
    # confirm that database/table names can never hold untrusted input.
    sql = 'select *  from "%s"."%s" limit 100;' % (database, tablename)
    logger().info("SQL = %s", sql)
    response = AthenaQuery.run(sql=sql)
    logger().info("Response = %s,", pprint.pformat(response))
    return response
Ejemplo n.º 6
0
def createOrUpdateLexBot(identity, botid):
    """Assemble a bot's slot/intent graph and push it through ``Bot``.

    The graph is built from stored items: every entity of the bot becomes an
    "entity" slot; every variable of an entity contributes three slots (the
    variable itself, a ``<name>op`` operator slot and a ``<name>val`` value
    slot filled from ``get_sample_values``); every intent maps its rule names
    to the rules' replacements.  Each intent is then converted with
    ``get_intent_config(..., "lex")``, its slot types and the intent are
    registered, and finally the bot is built, aliased and versioned.

    Args:
        identity: Not used by this function.
        botid: Identifier of the bot to publish.

    Returns:
        The graph dict ``{"slots": {...}, "intents": {...}}``.
    """
    bot = Item.get("bot", botid)

    # --- slots -----------------------------------------------------------
    slots = {}
    intent_rules = {}
    botgraph = {"slots": slots, "intents": intent_rules}

    for entity in Item.query("entity", Item.parent.__eq__(botid)):
        slots[entity.name] = {
            "name": entity.name,
            "ref": "{%(name)s}" % entity.attribute_values,
            "values": entity.doc.aliases,
            "type": "entity"
        }
        for variable in Item.query("variable",
                                   Item.parent.__eq__(entity.ID)):
            var_slot = variable.name
            slots[var_slot] = {
                "name": var_slot,
                "ref": "{%(name)s}" % variable.attribute_values,
                "values": variable.doc.aliases,
                "type": "variable"
            }

            operator_slot = variable.name + "op"
            slots[operator_slot] = {
                "name": operator_slot,
                "ref": "{%(name)sop}" % variable.attribute_values,
                "type": "operator",
                "values": [
                    "in", "outside", "greater", "equals", "different",
                    "smaller", "bigger", "taller"
                ]
            }

            value_slot = variable.name + "val"
            slots[value_slot] = {
                "type": "value",
                "name": value_slot,
                "ref": "{%(name)sval}" % variable.attribute_values,
                "values": list(get_sample_values(variable.ID))
            }

    # --- intents ---------------------------------------------------------
    for intent_item in Item.query("intent", Item.parent.__eq__(botid)):
        rule_map = {}
        intent_rules[intent_item.name] = rule_map
        for rule in Item.query("rule", Item.parent.__eq__(intent_item.ID)):
            rule_map[rule.name] = rule.doc.replacements

    # --- publish ---------------------------------------------------------
    logger().info("Creating bot %s", botid)
    intent_configs = []
    # Iterate over a snapshot of the intent names.
    for intent_name in list(intent_rules):
        logger().critical("Adding Intent %s", intent_name)
        config = get_intent_config(intent_name, botgraph, "lex")
        intent_configs.append(config)

        for slot in config["slots"]:
            logger().critical("Adding Slot Type `%s`", slot["name"])
            Bot.add_slot_type(slot["name"], slot["enumerationValues"])

        logger().critical("Saving Intent `%s` to Lex", intent_name)
        Bot.add_intent(config)

    logger().info("Putting Bot %s", bot.name)
    published_names = [cfg["name"] for cfg in intent_configs]
    Bot.build(bot.name, intents=published_names)
    logger().info("Putting Bot Alias %s %s", bot.name, "demo")
    Bot.put_alias(bot.name)
    Bot.put_bot_version(bot.name)
    return botgraph
Ejemplo n.º 7
0
 def start(cls, **kwargs):
     """Log and start the crawler named by ``kwargs["name"]``.

     Returns whatever ``start_crawler`` on the ``cls.glue()`` client returns.
     """
     crawler_name = kwargs["name"]
     logger().info("Starting Crawler %s ", crawler_name)
     return cls.glue().start_crawler(Name=crawler_name)
Ejemplo n.º 8
0
            return {"success": False, "data": e}

    @classmethod
    def start(cls, **kwargs):
        """Start an existing crawler.

        Keyword Args:
            name: Name of the crawler to start (required).

        Returns:
            The result of ``start_crawler`` on the client from ``cls.glue()``.
        """
        target = kwargs["name"]
        logger().info("Starting Crawler %s ", target)
        glue_client = cls.glue()
        response = glue_client.start_crawler(Name=target)
        return response

    @classmethod
    def crawl_bucket(cls, **kwargs):
        """Create and start a freshly named crawler, returning its name.

        The name is ``"airbot"`` followed by the ``database`` keyword value
        and the current local time formatted as ``YYYYmmddHHMMSS``.  It is
        added to ``kwargs`` as ``name`` before the remaining keyword
        arguments are forwarded to ``create`` and then ``start``.  The value
        returned by ``create`` is not inspected.

        Keyword Args:
            database: Database name (required); other keys are passed through.

        Returns:
            The generated crawler name.
        """
        database = kwargs["database"]
        stamp = datetime.now().strftime("%Y%m%d%H%M%S")
        name = "airbot%s%s" % (database, stamp)
        kwargs["name"] = name
        cls.create(**kwargs)
        cls.start(**kwargs)
        return name


if __name__ == "__main__":
    # Manual smoke run: create and start a crawler, then log a crawler lookup.
    k = Crawler.crawl_bucket(database="mybot",
                             bucket="airbot2018",
                             force=True)
    # Parenthesised single-argument form prints the same on Python 2 and 3.
    print(k)
    # NOTE(review): this looks up a hard-coded crawler name, not the crawler
    # `k` created above -- confirm whether `name=k` was intended.
    logger().info("Crawler %s",
                  str(Crawler.get(name="airbotmybot20180922185001")))

    #Crawler.delete(name=k)