Ejemplo n.º 1
0
 def exit(self, message, isThread=False):
     """Record a fatal error in the final report and terminate the process.

     Args:
         message: Description of the failure; an error banner is prepended
             before it is written to the final report.
         isThread: Accepted for callers that pass it; not read in this body.

     This method does not return: the log is saved as unsuccessful and the
     process is then ended with ``os._exit(1)``.
     """
     banner = "\n<<<<<ERROR!>>>>>\n\n"
     report = banner + message
     Log.Instance().appendFinalReport(report)
     Log.Instance().save(success=False)
     # NOTE: sending a "- FAILED" mail report from here is currently disabled.
     os._exit(1)
Ejemplo n.º 2
0
		def _show_message(self, arrayjson_kpis):
			"""Report which KPIs and which dates are about to be loaded.

			Args:
				arrayjson_kpis: Iterable of KPI documents; each one is read at
					``['etl_meta']['label']``.

			Labels are listed once each, in order of first appearance.
			"""
			labels = []
			for entry in arrayjson_kpis:
				label = entry['etl_meta']['label']
				if label in labels:
					# Already listed; keep only the first occurrence.
					continue
				labels.append(label)
			kpis_line = "\nKPIs to be loaded: " + ", ".join(labels)
			Log.Instance().appendFinalReport(kpis_line)
			dates_line = "\nDates to be loaded: " + ', '.join(self.daterange)
			Log.Instance().appendFinalReport(dates_line)
Ejemplo n.º 3
0
 def _process_raw_results(self, results):
     """Collect the JSON collections produced by every pending result.

     Args:
         results: Mutable list of handles exposing ``get()``; it is drained
             by this call and is empty when the method returns.

     Returns:
         A flat list with the ``to_json_collections()`` output of every
         result, in the same order as the handles were given.
     """
     Log.Instance().appendFinalReport(
         "\n++++++ PROCESS RESULTS STAGE ++++++\n")
     final_results = []
     # Drain from the front instead of removing items from the list while a
     # ``for`` loop iterates it: that pattern skips the element following
     # each removal and only finished thanks to repeated outer passes,
     # producing a scrambled processing order.
     while results:
         query_result = results[0].get()
         # Drop the handle only after ``get()`` succeeded, as before.
         del results[0]
         final_results.extend(query_result.to_json_collections())
         if Config.VERBOSE_MODE:
             Log.Instance().append(query_result.generate_message())
     return final_results
Ejemplo n.º 4
0
		def post_kpi(self, path, data):
			"""POST one KPI item to ``path`` as JSON, wrapped under a ``"data"`` key.

			Args:
				path: Full URL to post to.
				data: JSON-serializable payload for the item.

			Raises:
				RequestException: If the response status code is not 200.
			"""
			# The previous ``if data:`` guard tested this wrapper dict, which always
			# has one key and is therefore always truthy; the request was sent
			# unconditionally. The dead guard is removed and that behavior kept.
			# NOTE(review): if empty items were meant to be skipped, guard on the
			# ``data`` argument before wrapping it - confirm with the API owner.
			payload = {"data": data}
			Log.Instance().appendFinalReport("POST: " + path)
			headers = {'content-type': 'application/json'}
			response = requests.post(path, data=json.dumps(payload), headers=headers)
			Log.Instance().appendFinalReport(
				"Result (" + str(response.status_code) + "): " + str(response.content) + "\n")
			if response.status_code != 200:
				# Same exception type as before, now carrying what failed.
				raise RequestException(
					"POST " + path + " returned status " + str(response.status_code))
Ejemplo n.º 5
0
 def run(self):
     """Run the EXTRACT/TRANSFORM stage and return its final results.

     When ``Config.SKIP_TO_RESULTS`` is set, queries are not executed and
     only the jobs whose results must be fetched are derived; otherwise the
     jobs are executed following an execution graph built from them.

     Returns:
         Whatever ``_get_queries_results`` produces for the selected jobs.
     """
     Log.Instance().appendFinalReport(
         "\nStarting EXTRACT/TRANSFORM stage...\n================")
     pending_jobs = self._generate_jobs_to_run()
     if Config.SKIP_TO_RESULTS:
         result_jobs = self._generate_jobs_to_get_results(pending_jobs)
     else:
         dependency_graph = Graph(pending_jobs)
         result_jobs = self._execute_queries(pending_jobs, dependency_graph)
     stage_results = self._get_queries_results(result_jobs)
     Log.Instance().appendFinalReport(
         "================\nEXTRACT/TRANSFORM stage ended.")
     return stage_results
Ejemplo n.º 6
0
 def execute(self, method, retries):
     """Call the method named ``method`` on this object, retrying on failure.

     Args:
         method: Name of the attribute to call with no arguments.
         retries: Number of failed attempts after which the failure is
             handed to ``SystemExiter``.

     Returns:
         The return value of the first attempt that does not raise.

     Every failed attempt is written to the final report. Between attempts
     the thread sleeps 60 seconds. Once the attempt counter exceeds
     ``retries``, a summary is passed to ``SystemExiter.Instance().exit``;
     should that call ever return, the next attempt starts without sleeping.
     """
     attempt = 1
     while True:
         try:
             return getattr(self, method)()
         except Exception as e:
             error_text = str(e) + ", type: " + str(type(e))
             Log.Instance().appendFinalReport(
                 "THREAD EXCEPTION at " + method + ", " + self.name + ": " +
                 error_text + ", RETRYING (" + str(retries) +
                 " times), retry: " + str(attempt))
             attempt += 1
             if attempt <= retries:
                 time.sleep(60)
                 continue
             summary = "<<< Tried " + str(retries) + \
                 " times and failed, ABORTING THREAD! >>>\n\n"
             summary += self.name + " - " + str(e) + ", type: " + str(type(e))
             summary += JobsExecutionInfo.Instance().report()
             SystemExiter.Instance().exit(summary, isThread=True)
Ejemplo n.º 7
0
 def _show_config_message(self):
     """Report the active configuration and, unless forced, ask to proceed.

     Raises:
         SystemExit: If confirmation is required and the line read from
             standard input is neither ``y`` nor ``yes`` (case-insensitive,
             surrounding whitespace ignored).
     """
     summary = self._configuration_message(self.args)
     Log.Instance().appendFinalReport(summary)
     if self.args.force_config:
         # Configuration was forced; no interactive confirmation.
         return
     print('#  Start ETL with this configuration? (y/n)')
     answer = sys.stdin.readline().strip().lower()
     if answer in ('y', 'yes'):
         return
     raise SystemExit
Ejemplo n.º 8
0
 def run(self, collections):
     """Run the POSPROCESSING stage over every collection.

     Args:
         collections: Iterable of dicts with ``etl_meta`` (``label``,
             ``kpi_name``) and ``etl_data`` entries.

     Returns:
         The same ``collections`` object; each item's ``etl_data`` has been
         replaced by the output of ``_process_to_api_final``.

     For each collection the item count before and after processing is
     appended to the log.
     """
     Log.Instance().appendFinalReport(
         "\nStarting POSPROCESSING stage...\n===================")
     for collection in collections:
         meta = collection['etl_meta']
         Log.Instance().append("Posprocessing " + meta['label'] + "...")
         count_before = len(collection['etl_data'])
         processed = self._process_to_api_final(
             collection['etl_data'], kpi_name=meta['kpi_name'])
         collection['etl_data'] = processed
         before_part = "( " + meta['label'] + ":" + str(count_before)
         after_part = " = " + meta['label'] + ":" + \
             str(len(collection['etl_data'])) + " )"
         Log.Instance().append(before_part + after_part)
     Log.Instance().appendFinalReport(
         "===================\nPOSPROCESSING stage ended.")
     return collections
Ejemplo n.º 9
0
 def insert(self):
     """Run this job's remove statement followed by its insert statement.

     Side effects:
         Sets ``self.execution_time`` to the elapsed time in minutes,
         truncated to two decimals, and ``self.executed`` to ``True``.
         Start and finish are written to the final report; in verbose mode
         the insert statement is also appended to the log.
     """
     Log.Instance().appendFinalReport(
         "Executing: " + self.table_name + "  ")
     started_at = time.time()
     if Config.VERBOSE_MODE:
         Log.Instance().append(
             "\nExecuting:\n" + self._get_insert_statement())
         Log.Instance().append("\n---------------\n")
     # Remove first, then insert, both against the same table.
     for build_statement in (self._get_remove_statement,
                             self._get_insert_statement):
         MysqlConnection.Instance().execute(build_statement(),
                                            self.table_name)
     elapsed_minutes = (time.time() - started_at) / float(60)
     # Truncate (not round) to two decimal places.
     self.execution_time = int(elapsed_minutes * 100) / 100.0
     self.executed = True
     Log.Instance().appendFinalReport(
         "...finished executing: " + self.table_name + " (" +
         str(self.execution_time) + " mins)  ")
Ejemplo n.º 10
0
		def run(self):
			"""Run the LOAD stage: post every item of every loaded KPI document.

			Documents whose ``['etl_meta']['api']`` equals ``'ambev-mip'`` are posted
			to the ``feedstock-results`` path; all others to ``content-results``.
			Each item in ``['etl_data']`` is sent with its own ``post_kpi`` call.
			"""
			Log.Instance().appendFinalReport("Starting LOAD stage...\n===================")
			loaded_kpis = self._load_json_files()
			self._show_message(loaded_kpis)
			for kpi in loaded_kpis:
				# Only the path template differs between the two API kinds.
				if kpi['etl_meta']['api'] == 'ambev-mip':
					path_template = '{host}/feedstock-results/{kpi_name}/{date}'
				else:
					path_template = '{host}/content-results/{kpi_name}/{date}'
				for item in kpi['etl_data']:
					path = path_template.format(
						host=Config.CURRENT_ENV['api'],
						kpi_name=kpi['etl_meta']['kpi_name'],
						date=kpi['etl_meta']['timestamp'])
					self.post_kpi(path, item)
			Log.Instance().appendFinalReport("===================\nLOAD stage ended.")
Ejemplo n.º 11
0
 def _get_raw_results(self, thread_pool, jobs_to_get_results):
     """Submit a "results" fetch for every job and wait for all of them.

     Args:
         thread_pool: Pool exposing ``apply_async``.
         jobs_to_get_results: Mutable list of jobs; it is drained by this
             call and is empty when the method returns.

     Returns:
         The list of async result handles, one per job, in the same order
         as the jobs were given. Every handle has been waited on.
     """
     Log.Instance().appendFinalReport("\n++++++ GET RESULTS STAGE ++++++")
     results = []
     # Drain from the front instead of removing items from the list while a
     # ``for`` loop iterates it: that pattern skips the job following each
     # removal, so jobs were submitted out of order over several passes.
     while jobs_to_get_results:
         job = jobs_to_get_results[0]
         # Each job runs ``job.execute("results", 100)`` on the pool.
         results.append(
             thread_pool.apply_async(job.execute, ("results", 100)))
         del jobs_to_get_results[0]
     for result in results:
         result.wait()
     return results
Ejemplo n.º 12
0
 def run(self, collections):
     """Run the WRITING stage: dump every collection to a JSON file.

     Args:
         collections: Iterable of dicts whose ``etl_meta`` holds ``label``,
             ``timestamp`` and ``is_kpi``.

     The target directory template is ``Config.WORKDIRECTORY_FOR_KPIS`` for
     KPI collections and ``Config.WORKDIRECTORY_FOR_TEMPS`` otherwise; it is
     formatted with the first 10 characters of the timestamp as ``date``.
     Each collection is also printed to standard output before writing.
     """
     Log.Instance().appendFinalReport(
         "\nStarting WRITING stage...\n===================")
     for collection in collections:
         meta = collection['etl_meta']
         Log.Instance().append(
             "Writing " + meta['label'] + " for " + meta['timestamp'] + "...")
         directory_template = (Config.WORKDIRECTORY_FOR_KPIS
                               if meta['is_kpi']
                               else Config.WORKDIRECTORY_FOR_TEMPS)
         target_dir = directory_template.format(date=meta['timestamp'][:10])
         FileManager.create_if_dont_exist(target_dir)
         print(collection)
         FileManager.write_json_to_file(target_dir, meta['label'], collection)
     Log.Instance().appendFinalReport(
         "===================\nWRITING stage ended.")
Ejemplo n.º 13
0
 def _execute_transform(self):
     """Run the TRANSFORM pipeline: jobs, posprocessing, then writing.

     Returns:
         The ``execution_data`` of the ``ETLExecutionInfo`` that tracked
         this stage; its ``'value'`` entry is reported as minutes.
     """
     stage_info = ETLExecutionInfo("TRANSFORM")
     manager = JobsManager(JobsLoader.Instance().loaded_jobs)
     raw_results = manager.run()
     processed_results = PosProcessor.Instance().run(raw_results)
     Writer.Instance().run(processed_results)
     stage_info.end()
     elapsed = str(stage_info.execution_data['value'])
     Log.Instance().appendFinalReport(
         "[TRANSFORM executed in: " + elapsed + " minutes ]")
     return stage_info.execution_data
Ejemplo n.º 14
0
 def results(self):
     """Fetch this job's result table and wrap it in an ``OutputFormatter``.

     The statement is built from ``SHOW_RESULT_STATEMENT_TEMPLATE`` with the
     job's table name. Column names come back from the connection and have
     the ``<table_name>.`` text removed before being passed on.

     Returns:
         An ``OutputFormatter`` built from the job's identifying attributes,
         its execution time, the cleaned column names and the fetched data.
     """
     Log.Instance().appendFinalReport(
         "Getting result: " + self._table_name() + "  ")
     statement = self.SHOW_RESULT_STATEMENT_TEMPLATE.format(
         table_name=self._table_name())
     raw_schema, rows = MysqlConnection.Instance().execute(
         statement, self._table_name(), return_result=True)
     columns = []
     for column in raw_schema:
         # Drop the table qualifier from each column name.
         columns.append(column.replace(self._table_name() + ".", ""))
     Log.Instance().appendFinalReport(
         "...finished getting result: " + self._table_name() + "  ")
     return OutputFormatter(self.name, self.kpi_name, self.api,
                            self.execution_time, columns, rows,
                            self.is_kpi(), self.datatype)
Ejemplo n.º 15
0
    def _execute(self):
        """Run the stages selected in ``self.args`` and report total ETL time.

        TRANSFORM runs when ``self.args.transform`` is truthy and LOAD when
        ``self.args.load`` is truthy, in that order. The data returned by
        each stage is not used here.
        """
        overall_info = ETLExecutionInfo("ETL")
        if self.args.transform:
            self._execute_transform()
        if self.args.load:
            self._execute_load()
        overall_info.end()
        elapsed = str(overall_info.execution_data['value'])
        Log.Instance().appendFinalReport(
            "[ETL executed in: " + elapsed + " minutes ]")
Ejemplo n.º 16
0
 def _execute_load(self):
     """Run the LOAD stage, retrying whenever a ``RequestException`` occurs.

     After each failure the attempt number is written to the final report
     and the method sleeps 60 seconds before trying again. When the failing
     attempt is number 21, the failure is handed to ``SystemExiter`` first.

     Returns:
         The ``execution_data`` of the ``ETLExecutionInfo`` that tracked
         this stage; its ``'value'`` entry is reported as minutes.
     """
     stage_info = ETLExecutionInfo("LOAD")
     attempt = 1
     while True:
         try:
             Loader.Instance().run()
         except RequestException:
             if attempt == 21:
                 SystemExiter.Instance().exit(
                     "<<< Tried 20 times to LOAD and failed, ABORTING! >>>")
             Log.Instance().appendFinalReport(
                 "<<< EXCEPTION RequestException, RETRY! (" + str(attempt) +
                 " time)>>>")
             attempt += 1
             time.sleep(60)
         else:
             # Loaded without a request error; stop retrying.
             break
     stage_info.end()
     elapsed = str(stage_info.execution_data['value'])
     Log.Instance().appendFinalReport(
         "[LOAD executed in: " + elapsed + " minutes ]")
     return stage_info.execution_data
Ejemplo n.º 17
0
 def _create_graph(self):
     """Build the dependency digraph for ``self.jobs_to_run``.

     One node is added per job (shape ``none``, label from
     ``_tabulated_label``), plus edges for jobs that list previous jobs.
     Nodes with no neighbors are then colored green and nodes with no
     incidents blue, in that order, so a node matching both ends up blue.
     The graph is stored in ``self.graph``.
     """
     Log.Instance().appendFinalReport(
         "Generating graph for jobs " + str(self.jobs_to_run))
     dag = digraph()
     for job in self.jobs_to_run:
         node_attrs = [("shape", "none"),
                       ("label", self._tabulated_label(job.name))]
         self._add_node(dag, job.name, attrs=node_attrs)
         if len(job.previous_jobs) > 0:
             self._add_edge(dag, job.name, job.previous_jobs)
     # First pass: nodes without neighbors; second pass: nodes without
     # incidents. Each selection is fully computed before coloring starts.
     for linked_nodes, color in ((dag.neighbors, "green"),
                                 (dag.incidents, "blue")):
         selected = [
             node for node in dag.nodes() if not linked_nodes(node)
         ]
         for node in selected:
             self._set_node_attr(dag, node, [("color", color)])
     self.graph = dag
     # NOTE(review): the message mentions a saved PDF, but no file is
     # written in this method - presumably done elsewhere; confirm.
     Log.Instance().appendFinalReport(
         "Execution graph generated, a PDF was saved at " + self.filename +
         "\n")
Ejemplo n.º 18
0
 def _execute_queries(self, jobs_to_run, graph):
     """Dispatch every job to a thread pool once it is ready to run.

     Args:
         jobs_to_run: Mutable list of jobs; each job is removed from it when
             it is submitted, so the list is empty on return.
         graph: Execution graph passed to ``_job_is_ready``.

     Returns:
         The submitted jobs whose results must be fetched afterwards: KPI
         jobs, plus (in debug mode) any job whose name is not listed in
         ``self.jobs_dont_get_results``.

     The method blocks until the pool has been closed and joined.
     """
     Log.Instance().appendFinalReport("++++++ RUNNING STAGE ++++++")
     pool_execution = ThreadPool(Config.THREAD_POOL)
     JobsExecutionInfo.Instance().initialize(jobs_to_run, self.loaded_jobs)
     jobs_to_get_results = []
     while jobs_to_run:
         # Walk a snapshot: removing from the list being iterated made the
         # loop skip the job right after every submitted one.
         for job in list(jobs_to_run):
             if self._job_is_ready(job, graph) and not job.executed:
                 pool_execution.apply_async(
                     job.execute, (job.action, Config.JOB_RETRIES))
                 jobs_to_run.remove(job)
                 wants_debug_results = (
                     Config.DEBUG_MODE and
                     job.name not in self.jobs_dont_get_results)
                 if job.is_kpi() or wants_debug_results:
                     jobs_to_get_results.append(job)
             # NOTE(review): this pause happens after every job inspected,
             # not once per pass - kept as found; confirm it is intended.
             time.sleep(5)
     pool_execution.close()
     pool_execution.join()
     return jobs_to_get_results
Ejemplo n.º 19
0
 def run(self, message, title_label=""):
     """Send the mail report to every address in ``Config.SEND_REPORT_TO``.

     Args:
         message: Report content, expanded through ``_generate_message``.
         title_label: Extra text forwarded to ``_send_mail``.

     Raises:
         Exception: If sending to one recipient fails with
             ``SMTPException`` on attempt number 21.

     After each earlier failure the attempt number is written to the final
     report and the method sleeps 60 seconds before retrying.
     """
     print("\nSending mail report...")
     full_message = self._generate_message(message)
     for dest in Config.SEND_REPORT_TO:
         attempt = 1
         while True:
             try:
                 self._send_mail(full_message, dest, title_label)
             except SMTPException:
                 if attempt == 21:
                     raise Exception(
                         "<<< Tried 20 times to SEND MAIL REPORT and failed, ABORTING! >>>"
                     )
                 Log.Instance().appendFinalReport(
                     "<<< EXCEPTION SMTPException, RETRY! (" +
                     str(attempt) + " time)>>>")
                 attempt += 1
                 time.sleep(60)
             else:
                 # Delivered to this recipient; move on to the next one.
                 break
     print("Mail report sent.")
Ejemplo n.º 20
0
 def _add_date_field(self, collection):
     """Normalize ``created_at`` on every item to a ``YYYY-MM-DD`` string.

     Items are modified in place:

     * an existing ``created_at`` is replaced by its ``strftime`` output;
     * otherwise the date is built from ``year``/``month``/``day``;
     * if one of those keys is missing, ``created_at`` becomes ``"temp"``;
     * if they cannot form a date (``TypeError``/``ValueError``), the item
       is dropped and counted.

     Returns:
         A list with the items that were kept, in their original order.

     The number of dropped items is always written to the final report.
     """
     kept = []
     failures = 0
     for item in collection:
         if 'created_at' in item:
             item['created_at'] = item['created_at'].strftime("%Y-%m-%d")
             kept.append(item)
             continue
         try:
             parsed = datetime(int(item['year']), int(item['month']),
                               int(item['day']))
             item['created_at'] = parsed.strftime("%Y-%m-%d")
         except KeyError:
             item['created_at'] = "temp"
         except (TypeError, ValueError):
             failures += 1
             continue
         kept.append(item)
     Log.Instance().appendFinalReport(
         "Error parsing created_at field. Count: %s" % failures)
     return kept
Ejemplo n.º 21
0
 def _validate_jobs(self):
     """Check loaded jobs and the requested job names for consistency.

     * Every job whose action is ``'insert'`` must have a table name.
     * When ``Config.JOBS_NAMES`` is empty and ``Config.RUN_JOBS`` is set,
       it is filled with the names of all loaded KPI jobs.
     * Every name in ``Config.JOBS_NAMES`` must exist in the loaded jobs.

     Each violation is reported through ``SystemExiter.Instance().exit``.
     """
     Log.Instance().appendFinalReport('\nValidating kpis/jobs...\n')
     for name, job in list(self.loaded_jobs.items()):
         if job.action == 'insert' and job.table_name is None:
             SystemExiter.Instance().exit(
                 'Error: ' + name + ' needs table name to insert data')
     if not Config.JOBS_NAMES:
         kpi_job_names = [
             name for name in list(self.loaded_jobs.keys())
             if self.loaded_jobs[name].is_kpi()
         ]
         if Config.RUN_JOBS:
             Config.JOBS_NAMES += list(kpi_job_names)
     for job_name in Config.JOBS_NAMES:
         if job_name in self.loaded_jobs:
             continue
         # The two messages differ only by a trailing space.
         if Config.RUN_JOBS:
             SystemExiter.Instance().exit(
                 'Error: ' + job_name + ' not found in jobs definitions')
         else:
             SystemExiter.Instance().exit(
                 'Error: ' + job_name + ' not found in jobs definitions ')