def orchestrator_function(context: df.DurableOrchestrationContext):
    orc_input = context.get_input()
    logger.info("Orchestration Input : {}".format(orc_input))
    result1 = yield context.call_activity('pandas_transform', orc_input)
    return result1
async def main(mytimer: azure.functions.TimerRequest, starter: str):
    try:
        orchestrator_name = "F_orchestrator"
        client = df.DurableOrchestrationClient(starter)
        req_params = {
            'trigger': 'scheduled',
            'source': 'prestashop',
            'last_days': '1',
            'model': None,
            'action': 'full'
        }
        req_body = {'status': 'TODO'}
        orc_input = {'params': req_params, 'body': req_body}
        instance_id = await client.start_new(orchestrator_name, None, req_params)
        logger.info(f"Started orchestration with ID = '{instance_id}'.")
    except Exception as e:
        # use format_exc() so the stack trace ends up in the log message
        # (print_exc() only writes to stderr and returns None)
        logger.error("F_prestashop_timer :: {}".format(traceback.format_exc()))
def execute_query(self, query_name, query_conf):
    model_name, queryPipeline = build_mongo_query(query_conf)
    collection = self.db[model_name]
    logger.info("Executing mongo Query on Collection {}: {}".format(
        model_name, queryPipeline))
    results = collection.aggregate(queryPipeline)
    results_list = list(results)
    logger.debug(results_list)
    result_dataset = {
        "header": {
            "schema": SCHEMA_NAME,
            "model": model_name,
            "query_name": query_name,
            "query_conf": query_conf,
            "count": len(results_list),
        },
        "data": results_list
    }
    return result_dataset
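# Illustrative sketch (not part of the connector): the shape of the dataset
# returned by execute_query. The header echoes the query configuration and the
# result count; 'data' holds the raw documents from the aggregation.
# All names and values below are hypothetical.
EXAMPLE_RESULT_DATASET = {
    "header": {
        "schema": "prestashop",          # SCHEMA_NAME in the real connector
        "model": "orders",
        "query_name": "orders_by_day",
        "query_conf": {"dump_csv": True},
        "count": 2,
    },
    "data": [
        {"day": "2021-01-01", "total": 3},
        {"day": "2021-01-02", "total": 5},
    ],
}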
def fetch():
    orc_input = {'params': params, 'body': 'TODO'}
    result = fetch_data.main(params)
    logger.info(result)
    return result
def create_db(self, schemas=CONNECTOR_MAP.keys()):
    """Creates all Table Metadata and db tables corresponding to the given connectors' models definitions"""
    for schema in schemas:
        self.create_models(schema)
    AutoBase.metadata.create_all(self.engine)
    tables_list = list(x.name for x in AutoBase.metadata.sorted_tables)
    logger.info(
        "Successfully created database. Models: {}".format(tables_list))
    result = {'schemas': schemas, 'created': tables_list}
    return result
def apply_transforms(self, transforms):
    source = transforms['Source']
    table_list = transforms['Tables']
    dataframes = self.load_tables(source, table_list)
    df = None
    for step in transforms['Steps']:
        step_name = step['Step']
        logger.debug("STEP: {}".format(step))
        try:
            logger.info("{}::{} - Executing Step".format(source, step_name))
            operation = step['type']
            params = step['params']
            output_name = step['output']
            # replace the dataframe names by the actual dataframes in the params
            input_name = step['input']
            params['origin_df'] = dataframes[input_name]
            if 'right_input' in step.keys():
                right_name = step['right_input']
                params['right_df'] = dataframes[right_name]
            logger.debug("STEP PARAMS: {}".format(params))
            # retrieve the right function to apply and pass the parameters as dict
            function = getattr(self, operation)
            df = function(**params)
            logger.debug(df.head(10))
            # store the output in the buffer_dfs for further chaining
            dataframes[output_name] = df
            if 'save' in step.keys() and (step['save']):
                logger.info("Saving dataframe {}::{}".format(source, output_name))
                self.save(df, source, output_name)
        except Exception as e:
            errmsg = "{}::{} error: {}".format(source, step_name, e)
            logger.error(errmsg)
            continue
    return df
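# Hypothetical transforms manifest, shown only to illustrate the keys that
# apply_transforms reads ('Source', 'Tables', 'Steps', and per-step
# 'Step'/'type'/'params'/'input'/'right_input'/'output'/'save').
# Table, column and operation names are made up; real manifests are the YAML
# files loaded from TRANSFORMS_DIR.
EXAMPLE_TRANSFORMS = {
    'Source': 'prestashop',
    'Tables': ['orders', 'customers'],
    'Steps': [
        {
            'Step': 'join_orders_customers',   # free-text step name used in logs
            'type': 'merge',                   # must match a method on the connector
            'input': 'orders',                 # left dataframe, looked up in the buffer
            'right_input': 'customers',        # optional right dataframe for joins
            'params': {'on': 'customer_id'},   # forwarded as **kwargs to the method
            'output': 'orders_extended',       # key under which the result is buffered
            'save': True                       # persist the output via self.save(...)
        }
    ]
}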
def insert_dataset(self, dataset):
    header = dataset['header']
    schema_name = header['schema']
    model_name = header['model']
    line_count = header['count']
    data = dataset['data']
    logger.info(
        "Inserting dataset to Mongo Collection: {}".format(model_name))
    collection = self.db[model_name]
    result = collection.insert_many(data)
    return result
def delete_tables(self, schema, to_delete):
    AutoBase.prepare(engine=self.engine, schema=schema, reflect=True)
    tables_list = list(x.__table__ for x in AutoBase.classes
                       if x.__table__.name in to_delete)
    logger.info("DROPPING tables from schema {}: {}".format(schema, to_delete))
    AutoBase.metadata.drop_all(bind=self.engine, tables=tables_list)
    logger.info("Successfully dropped tables : {}".format(to_delete))
    result = {'schema': schema, 'deleted': to_delete}
    return result
def create_models(self, schema, models_list=None):
    """Creates Table Metadata and db tables corresponding to the given connectors' models definitions"""
    connector = import_module(CONNECTOR_MAP[schema])
    if models_list is None:
        models_list = connector.MODELS_LIST
    logger.info(
        "Creating MetadataClasses in schema {} from models: {}".format(
            schema, models_list))
    for model_name in models_list:
        ORMclass = create_ORM_class(schema, model_name,
                                    connector.MODELS[model_name],
                                    connector.UNPACKING)
    AutoBase.metadata.create_all(bind=self.engine)
async def main(req: func.HttpRequest, starter: str) -> func.HttpResponse:
    try:
        client = df.DurableOrchestrationClient(starter)
        logger.info("request parameters: {}".format(req.params))
        expected_params = ['last_days', 'source', 'model', 'action']
        # req_params = dict(req.params)
        params = {}
        # req_body = req.get_body()
        req_body = {'status': 'TODO'}
        for key in expected_params:
            params[key] = (req.params[key] if key in req.params.keys() else None)
        params['trigger'] = 'http'
        models_raw = params['model']
        params['model'] = (models_raw.split(',') if models_raw else None)
        orc_input = {'params': params, 'body': req_body}
        instance_id = await client.start_new(req.route_params["functionName"], None, params)
        logger.info(f"Started orchestration with ID = '{instance_id}'.")
        return client.create_check_status_response(req, instance_id)
    except Exception as e:
        logger.error("F_starter :: {}".format(e))
def orchestrator_function(context: df.DurableOrchestrationContext):
    orc_input = context.get_input()
    logger.info("Orchestration Input : {}".format(orc_input))
    action = orc_input['action']
    result = {}
    if (action == 'fetch') or (action == 'full'):
        fetch_result = yield context.call_activity('fetch_data', orc_input)
        result['fetch_data'] = fetch_result
    if (action == 'transform') or (action == 'full'):
        transform_result = yield context.call_activity('pandas_transform', orc_input)
        result['pandas_transform'] = transform_result
    logger.info("Orchestration Results : {}".format(result))
    return result
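# Hypothetical orchestration input: the orchestrator only inspects the 'action'
# key ('fetch', 'transform', or 'full') and forwards the whole dict to the
# activities. The other keys mirror what the HTTP starter builds from the
# request parameters; values here are illustrative.
EXAMPLE_ORC_INPUT = {
    'trigger': 'http',
    'source': 'prestashop',
    'model': ['orders'],
    'last_days': '1',
    'action': 'full'     # run both fetch_data and pandas_transform
}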
def extract():
    # # import the right connector
    # package_name = 'Connectors.{}'.format(CONNECTOR_MAP[source]['package'])
    # connector_name = CONNECTOR_MAP[source]['connector']
    # connector = import_module(connector_name, package_name)
    # # instantiate a connector client
    # client = getattr(connector, connector_name)
    client = get_client(source)
    full_results = []
    for model_name in models:
        logger.info("Extracting schema: {} - model: {}".format(source, model_name))
        jsonpath, dataset = client.get_data(model_name, last_days=params['last_days'])
        full_results.append({'jsonpath': jsonpath, 'dataset': dataset})
    return full_results
def execute_queries(self, query_names=None):
    queries = {}
    # extract a subset of the MONGO_QUERIES dictionary if query names were explicitly provided
    if query_names:
        queries = {
            query_name: MONGO_QUERIES[query_name]
            for query_name in set(query_names)
        }
    else:
        queries = MONGO_QUERIES
    for query_name, query_conf in queries.items():
        logger.info("Preparing Mongo Query {}: {}".format(query_name, query_conf))
        result_dataset = self.execute_query(query_name, query_conf)
        if DUMP_JSON:
            json_dump(result_dataset, APP_NAME, query_name)
        if query_conf['dump_csv']:
            csv_dump(result_dataset['data'], APP_NAME, query_name)
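# Hypothetical MONGO_QUERIES entry: execute_queries only relies on the dict key
# (the query name) and on query_conf['dump_csv']; any other fields are whatever
# build_mongo_query expects, and the placeholder below does not document them.
EXAMPLE_MONGO_QUERIES = {
    'orders_by_day': {
        'dump_csv': True,   # also write the results to a CSV via csv_dump
        # ... connector-specific keys consumed by build_mongo_query ...
    }
}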
def main(params: dict) -> dict: returnStr = "" try: # params = orc_input['params'] pdconn = PandasSQLConnector.load_default() schema = params['source'] trigger = params['trigger'] results = {} initStr = "Extend Data Table operation started. Trigger : {} - Schema: {}".format(trigger,schema) logger.info(initStr) for filename in os.listdir(TRANSFORMS_DIR): transform_def = load_conf(filename, subfolder='transforms') if transform_def['Source'] == schema: logger.info("Applying pandas transforms from manifest: {}".format(filename)) df = pdconn.apply_transforms(transform_def) results[filename] = 'applied' else: logger.info("Skipping filtered schema : {}".format(transform_def)) results[filename] = 'skipped' returnStr = "Extend Data Table ended. Results: {}".format(results) logger.info(returnStr) output_results = { 'params': params, 'results': results } except Exception as e: returnStr = '{}'.format(e) logger.error(e) output_results = { 'params': params, 'results': returnStr } return output_results
def apply_changes(self, plan):
    """Applies the changes specified in a given 'plan' JSON file.
    This approach is pretty much inspired by Terraform, but applied to SQLAlchemy db models :)"""
    returnmsg = ""
    result = {}
    schema = plan['schema']
    to_delete = plan['delete']
    to_create = plan['create']
    deletion = len(to_delete) > 0
    creation = len(to_create) > 0
    if deletion or creation:
        logger.info("DB CHANGE: Applying change plan: {}".format(plan))
        try:
            # for documentation on this : refer to https://docs.sqlalchemy.org/en/14/orm/extensions/automap.html
            AutoBase.prepare(engine=self.engine, schema=schema, reflect=True)
            # if tables need to be dropped, use SQLAlchemy to drop them
            if deletion:
                # delete_tables = list(x.__table__ for x in AutoBase.classes if x.__table__.name in to_delete)
                self.delete_tables(schema, to_delete)
                AutoBase.metadata.clear()
            # if tables need to be (re)-created, create them from the connector's manifest definition
            if creation:
                self.create_models(schema, to_create)
                AutoBase.metadata.clear()
            returnmsg = "Successfully applied changes to the DB."
            logger.info(returnmsg)
            result['status'] = 'success'
        except Exception as e:
            returnmsg = "DB CHANGE: Error {}".format(e)
            logger.error(returnmsg)
            result['status'] = 'error'
    else:
        returnmsg = "DB CHANGE: Nothing to change in the current plan. No action will be applied on the db."
        logger.info(returnmsg)
        result['status'] = 'not applied'
    result['message'] = returnmsg
    result['plan'] = plan
    return result
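# Hypothetical change plan: apply_changes only reads 'schema', 'delete' and
# 'create'. The schema and model names below are placeholders.
EXAMPLE_PLAN = {
    'schema': 'prestashop',
    'delete': ['obsolete_model'],        # tables to drop from the schema
    'create': ['orders', 'customers']    # models to (re)create from the connector manifest
}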
def main(params: dict) -> dict: returnStr = "" try: # params = orc_input['params'] source, last_days, models = format_params(params) trigger = params['trigger'] results = {} azconn = AzureSQLConnector.load_default() initStr = "Fetch operation started. Trigger: {} Source: {} - Models: {} - LAST_DAYS={}".format( trigger, source, models, last_days) logger.info(initStr) client = get_client(source) for model_name in models: logger.info('Extracting data from Model: {}'.format(model_name)) jsonpath, dataset = client.get_data(model_name, last_days=last_days) # push to Azure SQL result = azconn.insert_dataset(dataset) results[model_name] = result returnStr = "Fetch operation ended. Trigger: {} - Source: {} - LAST_DAYS={}\nRESULTS: {}".format( trigger, source, last_days, results) logger.info(returnStr) output_results = {'params': params, 'results': results} except Exception as e: returnStr = 'F_fetch_data.fetch_data :: {}'.format( traceback.print_exc()) logger.error(e) output_results = {'params': params, 'results': returnStr} return output_results
def expand():
    result = extend_data.main(params)
    logger.info(result)
def compare_schema(self, schema):
    """Loads all table definitions from the db schema and compares them with the connector's model definitions taken from the connector's YAML manifest.
    Returns:
    - new_models: model names given in the connector's manifest but not found in the database
    - deleted_models: model names found in the database but absent from the connector's current manifest
    - intersect_models: model names found both in the database and in the connector's manifest
    - changed_models: intersecting model names whose fields differ between manifest and database
    - model_changes: dict describing, per intersecting model, whether it changed (i.e. if fields were added or deleted in the manifest compared to the current database state), with:
        - new_fields: field names found in the model manifest but not in the corresponding database table
        - deleted_fields: field names present in the db table but absent from the model manifest
        - intersect_fields: field names present in both
    """
    connector = import_module(CONNECTOR_MAP[schema])
    logger.info("Comparing DB schema {} with connector models: {}".format(
        schema, connector.__name__))
    # for documentation on this : refer to https://docs.sqlalchemy.org/en/14/orm/extensions/automap.html
    # AutoBase = automap_base()
    AutoBase.prepare(engine=self.engine, schema=schema, reflect=True)
    table_names = set(x.__table__.name for x in AutoBase.classes)
    model_names = set(connector.MODELS_LIST)
    new_models = model_names - table_names
    deleted_models = table_names - model_names
    intersect_models = table_names & model_names
    changed_models = set()
    logger.info("NEW models: {}".format(new_models))
    logger.info("DELETED models: {}".format(deleted_models))
    logger.info("Intersecting models: {}".format(intersect_models))
    model_changes = {}
    for model_name in intersect_models:
        # logger.debug("Comparing Model: {}".format(model_name))
        model = connector.MODELS[model_name]
        table_obj = getattr(AutoBase.classes, model_name)
        table_fields = set(x.name for x in table_obj.__table__.columns)
        table_field_objects = set(x for x in table_obj.__table__.columns)
        table_field_dict = {}
        for field in table_field_objects:
            table_field_dict[field.name] = {'dbname'}
        model_fields = get_all_model_fields(connector, model_name)
        new_fields = model_fields - table_fields
        deleted_fields = table_fields - model_fields
        intersect_fields = table_fields & model_fields
        has_changed = len(new_fields) > 0 or len(deleted_fields) > 0
        if has_changed:
            changed_models.add(model_name)
        # logger.debug("HAS CHANGED: {}".format(has_changed))
        # logger.debug("NEW Fields: {}".format(new_fields))
        # logger.debug("DELETED Fields: {}".format(deleted_fields))
        # logger.debug("MATCHING Fields: {}".format(intersect_fields))
        model_changes[model_name] = {
            'has_changed': has_changed,
            'new_fields': list(new_fields),
            'deleted_fields': list(deleted_fields),
            'intersect_fields': list(intersect_fields)
        }
    logger.info("Models Comparison: {}".format(model_changes))
    return new_models, deleted_models, intersect_models, changed_models, model_changes
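# Sketch (not part of the connector): one possible way to turn the output of
# compare_schema into a change plan consumable by apply_changes. Changed models
# are both dropped and recreated, which loses their current table contents, so
# treat this purely as an illustration of how the two methods fit together.
def build_plan(sql_connector, schema):
    new_models, deleted_models, _, changed_models, _ = sql_connector.compare_schema(schema)
    return {
        'schema': schema,
        'delete': list(deleted_models | changed_models),
        'create': list(new_models | changed_models)
    }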
def update_from_json(self, dataset):
    header = dataset['header']
    schema = header['schema']
    model_name = header['model']
    result = None
    logger.info("Loading DB schema: {}".format(schema))
    # for documentation on this : refer to https://docs.sqlalchemy.org/en/14/orm/extensions/automap.html
    AutoBase = automap_base()
    AutoBase.prepare(engine=self.engine, schema=schema, reflect=True)
    logger.debug("loading modelObject")
    modelObject = getattr(AutoBase.classes, model_name)
    logger.debug("Opening Session")
    session = self.SessionFactory()
    # This is very important, so the data is inserted in the right schema
    session.connection(
        execution_options={"schema_translate_map": {schema: schema}})
    logger.info("Saving JSON file to {}".format(self.dbname))
    logger.debug("JSON Header: {}".format(header))
    try:
        for dict_item in dataset['data']:
            id = dict_item['Id']
            objectInstance = session.query(modelObject).filter(
                modelObject.Id == id).first()
            # if object not found in the db, create it
            if objectInstance is None:
                logger.debug(
                    "Object {} with ID={} not found in DB. Creating.".format(model_name, id))
                objectInstance = modelObject(**dict_item)
                session.add(objectInstance)
            # if already present, update all its fields
            else:
                logger.debug(
                    "Object {} with ID={} found in DB. Updating.".format(model_name, id))
                id = dict_item.pop('Id')
                for key, value in dict_item.items():
                    setattr(objectInstance, key, value)
            logger.debug("inserted record {}".format(dict_item.values()))
        logger.info("Committing...")
        session.commit()
        result = 'committed'
    except Exception as e:
        logger.error("SQL connector update_from_json: {}".format(e))
        session.rollback()
        result = 'rolled back'
    finally:
        session.close()
    return result