def exit(self, message, isThread=False):
    message = "\n<<<<<ERROR!>>>>>\n\n" + message
    Log.Instance().appendFinalReport(message)
    Log.Instance().save(success=False)
    # if Config.SEND_REPORT:
    #     MailReport.Instance().run(message, " - FAILED")
    # os._exit skips interpreter cleanup, so worker threads cannot
    # block shutdown the way sys.exit() would.
    os._exit(1)

def _show_message(self, arrayjson_kpis):
    kpis = []
    for kpi in arrayjson_kpis:
        kpi_name = kpi['etl_meta']['label']
        if kpi_name not in kpis:
            kpis.append(kpi_name)
    Log.Instance().appendFinalReport("\nKPIs to be loaded: " + ", ".join(kpis))
    Log.Instance().appendFinalReport("\nDates to be loaded: " +
                                     ", ".join(self.daterange))

def _process_raw_results(self, results):
    Log.Instance().appendFinalReport(
        "\n++++++ PROCESS RESULTS STAGE ++++++\n")
    final_results = []
    while len(results) > 0:
        # Iterate over a snapshot: removing entries from the list being
        # iterated would silently skip every other result.
        for result in list(results):
            query_result = result.get()
            results.remove(result)
            final_results += query_result.to_json_collections()
            if Config.VERBOSE_MODE:
                Log.Instance().append(query_result.generate_message())
    return final_results

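# `results` holds the AsyncResult handles produced by
# _get_raw_results() below via thread_pool.apply_async(); result.get()
# unwraps the OutputFormatter returned by Job.results(), and
# to_json_collections() flattens it into the collections the
# postprocessing and WRITING stages consume.
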
def post_kpi(self, path, data):
    # user = Config.TOPN_API['auth']['user']
    # psw = Config.TOPN_API['auth']['pwd']
    # Skip empty payloads; this check must happen before wrapping,
    # since the wrapper dict is always truthy.
    if not data:
        return
    Log.Instance().appendFinalReport("POST: " + path)
    headers = {'content-type': 'application/json'}
    r = requests.post(path, data=json.dumps({"data": data}),
                      headers=headers)
    Log.Instance().appendFinalReport("Result (" + str(r.status_code) +
                                     "): " + str(r.content) + "\n")
    if r.status_code != 200:
        raise RequestException

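# A minimal usage sketch (the host and payload are hypothetical; the
# path shape follows Loader.run() below):
#
#   self.post_kpi(
#       "https://api.example.com/content-results/volume/2020-01-31",
#       {"value": 42})
#
# sends the JSON body {"data": {"value": 42}} and raises
# RequestException on any non-200 response.
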
def run(self):
    Log.Instance().appendFinalReport(
        "\nStarting EXTRACT/TRANSFORM stage...\n================")
    jobs_to_run = self._generate_jobs_to_run()
    if not Config.SKIP_TO_RESULTS:
        execution_graph = Graph(jobs_to_run)
        jobs_to_get_results = self._execute_queries(
            jobs_to_run, execution_graph)
    else:
        jobs_to_get_results = self._generate_jobs_to_get_results(
            jobs_to_run)
    final_results = self._get_queries_results(jobs_to_get_results)
    Log.Instance().appendFinalReport(
        "================\nEXTRACT/TRANSFORM stage ended.")
    return final_results

def execute(self, method, retries):
    # Call the named method, retrying every 60 seconds until it
    # succeeds or `retries` attempts have failed.
    test = False
    cont = 1
    while test is False:
        test = True
        try:
            return getattr(self, method)()
        except Exception as e:
            Log.Instance().appendFinalReport(
                "THREAD EXCEPTION at " + method + ", " + self.name +
                ": " + str(e) + ", type: " + str(type(e)) +
                ", RETRYING (" + str(retries) + " times), retry: " +
                str(cont))
            test = False
            cont += 1
            if cont > retries:
                message = "<<< Tried " + str(retries) + \
                    " times and failed, ABORTING THREAD! >>>\n\n"
                message += self.name + " - " + str(e) + ", type: " + \
                    str(type(e))
                message += JobsExecutionInfo.Instance().report()
                SystemExiter.Instance().exit(message, isThread=True)
            else:
                time.sleep(60)

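# execute() is designed to run inside a worker pool; the RUNNING and
# GET RESULTS stages below dispatch it as, e.g.:
#
#   pool_execution.apply_async(job.execute, (job.action, Config.JOB_RETRIES))
#   thread_pool.apply_async(job.execute, ("results", 100))
#
# A failure that exhausts the retries aborts the whole process through
# SystemExiter (os._exit), not just the worker thread.
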
def _show_config_message(self):
    Log.Instance().appendFinalReport(self._configuration_message(self.args))
    if not self.args.force_config:
        print('# Start ETL with this configuration? (y/n)')
        option = sys.stdin.readline().strip().lower()
        if option not in ['y', 'yes']:
            raise SystemExit

def run(self, collections):
    Log.Instance().appendFinalReport(
        "\nStarting POSTPROCESSING stage...\n===================")
    for collection in collections:
        Log.Instance().append("Postprocessing " +
                              collection['etl_meta']['label'] + "...")
        lentemp = len(collection['etl_data'])
        collection['etl_data'] = self._process_to_api_final(
            collection['etl_data'],
            kpi_name=collection['etl_meta']['kpi_name'])
        # Log record counts before and after postprocessing.
        result_log = "( " + collection['etl_meta']['label'] + ":" + \
            str(lentemp)
        result_log += " = " + collection['etl_meta']['label'] + \
            ":" + str(len(collection['etl_data'])) + " )"
        Log.Instance().append(result_log)
    Log.Instance().appendFinalReport(
        "===================\nPOSTPROCESSING stage ended.")
    return collections

def insert(self):
    Log.Instance().appendFinalReport("Executing: " + self.table_name + " ")
    start_time = time.time()
    if Config.VERBOSE_MODE:
        Log.Instance().append("\nExecuting:\n" + self._get_insert_statement())
        Log.Instance().append("\n---------------\n")
    MysqlConnection.Instance().execute(self._get_remove_statement(),
                                       self.table_name)
    MysqlConnection.Instance().execute(self._get_insert_statement(),
                                       self.table_name)
    elapsed_seconds = time.time() - start_time
    # Convert to minutes, truncated to two decimal places.
    self.execution_time = int(elapsed_seconds / float(60) * 100) / 100.0
    self.executed = True
    Log.Instance().appendFinalReport("...finished executing: " +
                                     self.table_name + " (" +
                                     str(self.execution_time) + " mins) ")

def run(self):
    Log.Instance().appendFinalReport("Starting LOAD stage...\n===================")
    arrayjson_kpis = self._load_json_files()
    self._show_message(arrayjson_kpis)
    for kpi in arrayjson_kpis:
        # Both endpoints share the same path shape, and the path does
        # not depend on the item, so build it once per KPI.
        if kpi['etl_meta']['api'] == 'ambev-mip':
            endpoint = 'feedstock-results'
        else:
            endpoint = 'content-results'
        path = '{host}/{endpoint}/{kpi_name}/{date}'.format(
            host=Config.CURRENT_ENV['api'],
            endpoint=endpoint,
            kpi_name=kpi['etl_meta']['kpi_name'],
            date=kpi['etl_meta']['timestamp'])
        for item in kpi['etl_data']:
            self.post_kpi(path, item)
    Log.Instance().appendFinalReport("===================\nLOAD stage ended.")

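# Assuming a hypothetical Config.CURRENT_ENV['api'] of
# "https://api.example.com", a KPI named "volume" with timestamp
# "2020-01-31" is posted item by item to:
#
#   https://api.example.com/feedstock-results/volume/2020-01-31   (api == 'ambev-mip')
#   https://api.example.com/content-results/volume/2020-01-31     (all other apis)
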
def _get_raw_results(self, thread_pool, jobs_to_get_results):
    Log.Instance().appendFinalReport("\n++++++ GET RESULTS STAGE ++++++")
    results = []
    while len(jobs_to_get_results) > 0:
        # Iterate over a snapshot so removing jobs does not skip entries.
        for job in list(jobs_to_get_results):
            results.append(
                thread_pool.apply_async(job.execute, ("results", 100)))
            jobs_to_get_results.remove(job)
    # Block until every "results" task has finished.
    for result in results:
        result.wait()
    return results

def run(self, collections):
    Log.Instance().appendFinalReport(
        "\nStarting WRITING stage...\n===================")
    for collection in collections:
        Log.Instance().append("Writing " + collection['etl_meta']['label'] +
                              " for " + collection['etl_meta']['timestamp'] +
                              "...")
        if collection['etl_meta']['is_kpi']:
            filepath = Config.WORKDIRECTORY_FOR_KPIS
        else:
            filepath = Config.WORKDIRECTORY_FOR_TEMPS
        filepath = filepath.format(
            date=collection['etl_meta']['timestamp'][0:10])
        FileManager.create_if_dont_exist(filepath)
        FileManager.write_json_to_file(filepath,
                                       collection['etl_meta']['label'],
                                       collection)
    Log.Instance().appendFinalReport(
        "===================\nWRITING stage ended.")

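# The work directories are templates with a {date} placeholder; with a
# hypothetical WORKDIRECTORY_FOR_KPIS of "/data/kpis/{date}/" and a
# timestamp of "2020-01-31 00:00:00", files land in
# "/data/kpis/2020-01-31/" (only the first 10 characters, the date
# part, are used).
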
def _execute_transform(self):
    execution_info = ETLExecutionInfo("TRANSFORM")
    loaded_jobs = JobsLoader.Instance().loaded_jobs
    job_manager = JobsManager(loaded_jobs)
    results = job_manager.run()
    results = PosProcessor.Instance().run(results)
    Writer.Instance().run(results)
    execution_info.end()
    Log.Instance().appendFinalReport(
        "[TRANSFORM executed in: " +
        str(execution_info.execution_data['value']) + " minutes ]")
    return execution_info.execution_data

def results(self):
    Log.Instance().appendFinalReport("Getting result: " +
                                     self._table_name() + " ")
    result_statement = self.SHOW_RESULT_STATEMENT_TEMPLATE.format(
        table_name=self._table_name())
    schema, data = MysqlConnection.Instance().execute(result_statement,
                                                      self._table_name(),
                                                      return_result=True)
    # Strip the "table_name." prefix from the returned column names.
    schema = [x.replace(self._table_name() + ".", "") for x in schema]
    Log.Instance().appendFinalReport("...finished getting result: " +
                                     self._table_name() + " ")
    return OutputFormatter(
        self.name,
        self.kpi_name,
        self.api,
        self.execution_time,
        schema,
        data,
        self.is_kpi(),
        self.datatype,
    )

def _execute(self):
    etl_execution_info = ETLExecutionInfo("ETL")
    transform_execution_data = None
    if self.args.transform:
        transform_execution_data = self._execute_transform()
    load_execution_data = None
    if self.args.load:
        load_execution_data = self._execute_load()
    etl_execution_info.end()
    Log.Instance().appendFinalReport(
        "[ETL executed in: " +
        str(etl_execution_info.execution_data['value']) + " minutes ]")

def _execute_load(self):
    execution_info = ETLExecutionInfo("LOAD")
    test = False
    cont = 1
    while test is False:
        test = True
        try:
            Loader.Instance().run()
        except RequestException:
            if cont == 21:
                SystemExiter.Instance().exit(
                    "<<< Tried 20 times to LOAD and failed, ABORTING! >>>")
            Log.Instance().appendFinalReport(
                "<<< EXCEPTION RequestException, RETRY! (" + str(cont) +
                " time)>>>")
            test = False
            cont += 1
            time.sleep(60)
    execution_info.end()
    Log.Instance().appendFinalReport(
        "[LOAD executed in: " +
        str(execution_info.execution_data['value']) + " minutes ]")
    return execution_info.execution_data

def _create_graph(self):
    Log.Instance().appendFinalReport("Generating graph for jobs " +
                                     str(self.jobs_to_run))
    dag = digraph()
    for job in self.jobs_to_run:
        self._add_node(dag, job.name,
                       attrs=[("shape", "none"),
                              ("label", self._tabulated_label(job.name))])
        if len(job.previous_jobs) > 0:
            self._add_edge(dag, job.name, job.previous_jobs)
    # Highlight sink nodes (no outgoing edges) green and source nodes
    # (no incoming edges) blue.
    for final_node in [node for node in dag.nodes()
                       if not dag.neighbors(node)]:
        self._set_node_attr(dag, final_node, [("color", "green")])
    for start_node in [node for node in dag.nodes()
                       if not dag.incidents(node)]:
        self._set_node_attr(dag, start_node, [("color", "blue")])
    self.graph = dag
    Log.Instance().appendFinalReport(
        "Execution graph generated, a PDF was saved at " +
        self.filename + "\n")

def _execute_queries(self, jobs_to_run, graph):
    Log.Instance().appendFinalReport("++++++ RUNNING STAGE ++++++")
    pool_execution = ThreadPool(Config.THREAD_POOL)
    JobsExecutionInfo.Instance().initialize(jobs_to_run, self.loaded_jobs)
    jobs_to_get_results = []
    jobs_to_prometheus = []
    while len(jobs_to_run) > 0:
        # Iterate over a snapshot: jobs are removed as they are dispatched.
        for job in list(jobs_to_run):
            if self._job_is_ready(job, graph) and not job.executed:
                pool_execution.apply_async(
                    job.execute, (job.action, Config.JOB_RETRIES))
                jobs_to_run.remove(job)
                jobs_to_prometheus.append(job)
                if job.is_kpi() or \
                        (Config.DEBUG_MODE and
                         job.name not in self.jobs_dont_get_results):
                    jobs_to_get_results.append(job)
        time.sleep(5)  # poll until every job's dependencies are satisfied
    pool_execution.close()
    pool_execution.join()
    return jobs_to_get_results

def run(self, message, title_label=""):
    print("\nSending mail report...")
    full_message = self._generate_message(message)
    for dest in Config.SEND_REPORT_TO:
        test = False
        cont = 1
        while test is False:
            test = True
            try:
                self._send_mail(full_message, dest, title_label)
            except SMTPException:
                if cont == 21:
                    raise Exception(
                        "<<< Tried 20 times to SEND MAIL REPORT and "
                        "failed, ABORTING! >>>")
                Log.Instance().appendFinalReport(
                    "<<< EXCEPTION SMTPException, RETRY! (" + str(cont) +
                    " time)>>>")
                test = False
                cont += 1
                time.sleep(60)
    print("Mail report sent.")

def _add_date_field(self, collection):
    results = []
    count_errors = 0
    for item in collection:
        if 'created_at' in item:
            date = item['created_at']
            item['created_at'] = date.strftime("%Y-%m-%d")
        else:
            try:
                item_date = datetime(int(item['year']),
                                     int(item['month']),
                                     int(item['day']))
                item['created_at'] = item_date.strftime("%Y-%m-%d")
            except KeyError:
                item['created_at'] = "temp"
            except (TypeError, ValueError):
                # Unparseable date fields: drop the item and count it.
                count_errors += 1
                continue
        results.append(item)
    # Only report when something actually failed to parse.
    if count_errors > 0:
        Log.Instance().appendFinalReport(
            "Error parsing created_at field. Count: %s" % count_errors)
    return results

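# A sketch of the normalization (field values are hypothetical):
#
#   {'year': '2020', 'month': '1', 'day': '31'}  -> created_at = "2020-01-31"
#   {'created_at': datetime(2020, 1, 31)}        -> created_at = "2020-01-31"
#
# Items missing the year/month/day keys get created_at = "temp"; items
# whose values cannot be parsed are dropped and counted.
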
def _validate_jobs(self):
    Log.Instance().appendFinalReport('\nValidating kpis/jobs...\n')
    for job in list(self.loaded_jobs.keys()):
        if self.loaded_jobs[job].action == 'insert' and \
                self.loaded_jobs[job].table_name is None:
            SystemExiter.Instance().exit(
                'Error: ' + job + ' needs table name to insert data')
    if len(Config.JOBS_NAMES) == 0:
        all_final_jobs = [
            job_name for job_name in list(self.loaded_jobs.keys())
            if self.loaded_jobs[job_name].is_kpi()
        ]
        if Config.RUN_JOBS:
            Config.JOBS_NAMES += all_final_jobs
    for job_name in Config.JOBS_NAMES:
        if job_name not in self.loaded_jobs:
            SystemExiter.Instance().exit(
                'Error: ' + job_name + ' not found in jobs definitions')