def getEntity(identity, entityid):
    # Fetch the entity and its parent bot; any lookup failure maps to a 404.
    try:
        entity = Item.get("entity", entityid)
        bot = Item.get("bot", entity.parent)
    except Exception:
        raise errors.ObjectNotFound(entityid)
    logger().debug("status: expected=%s actual=%s",
                   ENTITYSTATES.PARSING, entity.doc.status)
    if entity.doc.status == ENTITYSTATES.PARSING:
        logger().info("entity is parsing")
        if entity.doc.crawler:
            # A crawler was started for this entity; if it has finished
            # (READY/STOPPING), pull the discovered columns from Glue and
            # mark the entity as ready.
            crawler = Crawler.get(name=entity.doc.crawler)
            if crawler["State"] in ["READY", "STOPPING"]:
                database = bot.doc.database
                tablename = entity.doc.tablename
                columns = Crawler.get_table(database, tablename)
                logger().debug("columns %s", columns)
                entity.update(actions=[
                    Item.doc.columns.set(columns),
                    Item.doc.status.set(ENTITYSTATES.READY)
                ])
    # Flatten the nested `doc` map into the top-level dict for the response.
    entity_doc = entity.json()
    logger().debug("entity_doc %s", pprint.pformat(entity_doc))
    entity_doc.update(entity_doc["doc"])
    del entity_doc["doc"]
    return entity_doc
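# Illustration only: a plausible shape of the flattened dict returned by
# getEntity above, based on the fields this module touches (status, crawler,
# tablename and columns live under `doc` and get merged to the top level).
# All values below are made up; the column record shape is assumed from
# Glue's table schema, not confirmed by this module.
EXAMPLE_ENTITY_RESPONSE = {
    "ID": "entity-123",                     # hypothetical id
    "name": "orders",                       # entity name
    "parent": "bot-456",                    # owning bot id
    "status": "READY",                      # one of the ENTITYSTATES values
    "crawler": "airbotmybot20180922185001", # Glue crawler recorded by refreshSchema
    "tablename": "orders",
    "columns": [{"Name": "order_id", "Type": "bigint"}],
}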
    @classmethod
    def create(cls, **kwargs):
        client = cls.glue()
        # Crawl the whole bucket, or only `prefix` when one is given.
        path = "s3://%(bucket)s" % kwargs
        if "prefix" in kwargs:
            path += "/%(prefix)s" % kwargs
        targets = {"S3Targets": [{"Path": path}]}
        try:
            logger().info(
                "Creating crawler `%s` for bucket `%s` path `%s` and updating database `%s`",
                kwargs["name"], kwargs["bucket"], kwargs.get("prefix", "*"),
                kwargs["database"])
            response = client.create_crawler(
                Name=kwargs["name"],
                Role="arn:aws:iam::950130011294:role/service-role/AWSGlueServiceRole-airbot",
                DatabaseName=kwargs["database"],
                Description="Crawler for database %(database)s" % kwargs,
                Targets=targets)
            return response
        except Exception as e:
            logger().error("Following error occurred when creating crawler: %s", str(e))
            return {"success": False, "data": e}
def refreshSchema(entityid, identity):
    entity = Item.get("entity", entityid)
    logger().debug("entity attributes: %s", entity.doc.attribute_values)
    # A crawl is already in flight; don't start a second one.
    if entity.doc.status == ENTITYSTATES.PARSING:
        return False
    logger().info("Creating crawler")
    bot = Item.get("bot", entity.parent)
    doc = bot.doc
    name = Crawler.crawl_bucket(
        database=doc["database"],
        bucket=doc["bucket"],
        prefix=doc["prefix"] + "/" + entity.name)
    # Record the crawler name so getEntity can poll its state later.
    entity.update(actions=[
        Item.doc.status.set(ENTITYSTATES.PARSING),
        Item.doc.crawler.set(name)
    ])
    return True
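# Sketch, not part of the original module: refreshSchema returns immediately
# and getEntity polls the crawler on demand. If a blocking wait were wanted
# instead (the original's unused `done`/`nbtry` locals hint at one), it could
# look like this. `wait_until_ready` is a hypothetical helper; it relies only
# on Crawler.get(name=...)["State"], which this module already uses.
import time

def wait_until_ready(name, retries=10, delay=30):
    """Poll a Glue crawler until it stops running, up to `retries` attempts."""
    for _ in range(retries):
        state = Crawler.get(name=name)["State"]
        if state in ["READY", "STOPPING"]:
            return True
        time.sleep(delay)
    return False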
def preview(entityid, identity):
    logger().info("Preview of %s", entityid)
    entity = Item.get("entity", entityid)
    if entity.doc.status != ENTITYSTATES.READY:
        raise errors.FileStatusError("File has not been parsed")
    bot = Item.get("bot", entity.parent)
    database = bot.doc.database
    tablename = entity.doc.tablename
    # Pull a 100-row sample of the parsed table through Athena.
    sql = 'select * from "%(database)s"."%(tablename)s" limit 100;' % vars()
    logger().info("SQL = %s", sql)
    response = AthenaQuery.run(sql=sql)
    logger().info("Response = %s", pprint.pformat(response))
    return response
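# Illustration only: the response envelope preview relies on, inferred from
# how it is consumed in this module (AthenaQuery is internal, so the exact
# shape is an assumption). Records are dicts keyed by column name, which is
# why a schema can be derived from records[0].keys().
EXAMPLE_PREVIEW_RESPONSE = {
    "data": {
        "records": [
            {"order_id": "1", "amount": "42.50"},  # made-up row
        ]
    }
}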
def createOrUpdateLexBot(identity, botid):
    bot = Item.get("bot", botid)
    entities = Item.query("entity", Item.parent == botid)
    botgraph = {"slots": {}, "intents": {}}
    for e in entities:
        # One slot per entity, plus three slots per variable: the variable
        # itself, a comparison operator, and a comparison value.
        botgraph["slots"][e.name] = {
            "name": e.name,
            "ref": "{%(name)s}" % e.attribute_values,
            "values": e.doc.aliases,
            "type": "entity"
        }
        variables = Item.query("variable", Item.parent == e.ID)
        for v in variables:
            nameslot = v.name
            botgraph["slots"][nameslot] = {
                "name": nameslot,
                "ref": "{%(name)s}" % v.attribute_values,
                "values": v.doc.aliases,
                "type": "variable"
            }
            opslot = v.name + "op"
            botgraph["slots"][opslot] = {
                "name": opslot,
                "ref": "{%(name)sop}" % v.attribute_values,
                "type": "operator",
                "values": [
                    "in", "outside", "greater", "equals", "different",
                    "smaller", "bigger", "taller"
                ]
            }
            valslot = v.name + "val"
            botgraph["slots"][valslot] = {
                "type": "value",
                "name": valslot,
                "ref": "{%(name)sval}" % v.attribute_values,
                "values": list(get_sample_values(v.ID))
            }
    intents = Item.query("intent", Item.parent == botid)
    for i in intents:
        botgraph["intents"][i.name] = {}
        rules = Item.query("rule", Item.parent == i.ID)
        for r in rules:
            botgraph["intents"][i.name][r.name] = r.doc.replacements

    logger().info("Creating bot %s", botid)
    intents = []
    for intent in botgraph["intents"]:
        logger().info("Adding Intent %s", intent)
        intent_config = get_intent_config(intent, botgraph, "lex")
        intents.append(intent_config)
        # Register every slot type the intent references before saving it.
        for slot in intent_config["slots"]:
            logger().info("Adding Slot Type `%s`", slot["name"])
            Bot.add_slot_type(slot["name"], slot["enumerationValues"])
        logger().info("Saving Intent `%s` to Lex", intent)
        Bot.add_intent(intent_config)
    logger().info("Putting Bot %s", bot.name)
    Bot.build(bot.name, intents=[i["name"] for i in intents])
    logger().info("Putting Bot Alias %s %s", bot.name, "demo")
    Bot.put_alias(bot.name)
    Bot.put_bot_version(bot.name)
    return botgraph
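# Illustration only: the botgraph built above for a bot with one entity
# ("orders") carrying one variable ("amount"). Aliases, sample values and the
# rule payload shape are made up; the module does not pin down what
# r.doc.replacements contains, so a list of utterance templates is assumed.
EXAMPLE_BOTGRAPH = {
    "slots": {
        "orders": {"name": "orders", "ref": "{orders}",
                   "values": ["orders", "sales"], "type": "entity"},
        "amount": {"name": "amount", "ref": "{amount}",
                   "values": ["amount", "total"], "type": "variable"},
        "amountop": {"name": "amountop", "ref": "{amountop}", "type": "operator",
                     "values": ["in", "outside", "greater", "equals",
                                "different", "smaller", "bigger", "taller"]},
        "amountval": {"name": "amountval", "ref": "{amountval}",
                      "type": "value", "values": ["10", "42.5"]},
    },
    "intents": {
        "ShowOrders": {
            "rule1": ["show me {orders} where {amount} {amountop} {amountval}"],
        },
    },
}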
    @classmethod
    def start(cls, **kwargs):
        logger().info("Starting Crawler %s", kwargs["name"])
        client = cls.glue()
        return client.start_crawler(Name=kwargs["name"])
return {"success": False, "data": e} @classmethod def start(cls, **kwargs): logger().info("Starting Crawler %s ", kwargs["name"]) client = cls.glue() return client.start_crawler(Name=kwargs["name"]) @classmethod def crawl_bucket(cls, **kwargs): db = kwargs["database"] ts = datetime.now().strftime("%Y%m%d%H%M%S") name = "airbot%(db)s%(ts)s" % vars() kwargs.update({"name": name}) cls.create(**kwargs) cls.start(**kwargs) return name if __name__ == "__main__": k = Crawler.crawl_bucket(**{ "database": "mybot", "bucket": "airbot2018", "force": True }) print k logger().info("Crawler %s", str(Crawler.get(name="airbotmybot20180922185001"))) #Crawler.delete(name=k)