def complete_process(self) -> None:
    """Run the post-load finalization steps for the Elasticsearch ETL.

    Depending on flags in ``self.config`` this will: swap index aliases onto
    the freshly built index, snapshot the index to the configured repository,
    and record the processing start datetime so the next incremental load
    knows where to resume.
    """
    client = self.elasticsearch_client
    index = self.config["index_name"]

    if self.config["create_new_index"]:
        printf({"msg": "Closing old indices and adding aliases"})
        set_final_index_config(client, index)
        swap_aliases(client, index, self.config["load_type"])

    if self.config["snapshot"]:
        printf({"msg": "Taking snapshot"})
        take_snapshot(client, index, settings.ES_REPOSITORY)

    if self.config["is_incremental_load"]:
        # Persist the start time (not the end time) so records written while
        # this load ran are re-examined by the next incremental pass.
        start_dt = self.config["processing_start_datetime"]
        msg = "Storing datetime {} for next incremental load"
        printf({"msg": msg.format(start_dt)})
        update_last_load_date("es_transactions", start_dt)
def complete_process(self) -> None:
    """Finalize an Elasticsearch load.

    Config-driven steps, in order:
      * ``create_new_index``  — lock in final index settings, then either swap
        aliases onto the new index or (``skip_delete_index``) leave old indices
        untouched;
      * ``snapshot``          — snapshot the index into ``settings.ES_REPOSITORY``;
      * ``is_incremental_load`` — re-enable index refresh and store the
        processing start datetime keyed by load type for the next delta run.
    """
    client = self.elasticsearch_client
    index = self.config["index_name"]

    if self.config["create_new_index"]:
        set_final_index_config(client, index)
        if not self.config["skip_delete_index"]:
            printf({"msg": "Closing old indices and adding aliases"})
            swap_aliases(client, index, self.config["load_type"])
        else:
            printf({"msg": "Skipping deletion of old indices"})

    if self.config["snapshot"]:
        printf({"msg": "Taking snapshot"})
        take_snapshot(client, index, settings.ES_REPOSITORY)

    if self.config["is_incremental_load"]:
        # Refresh was presumably disabled for bulk-load speed; turn it back on.
        toggle_refresh_on(client, index)
        start_dt = self.config["processing_start_datetime"]
        printf({"msg": f"Storing datetime {start_dt} for next incremental load"})
        update_last_load_date(f"es_{self.config['load_type']}", start_dt)
def controller(self):
    """Drive the transaction ETL pipeline with one download job per fiscal year.

    Spawns a CSV-download worker and an Elasticsearch-ingest worker (plus an
    optional S3 deleted-records worker), monitors them until completion, then
    optionally swaps aliases and takes a snapshot.
    """
    csv_queue = Queue()        # jobs which still need their CSV downloaded
    ingest_queue = Queue(20)   # jobs whose CSV exists and are ready for ES ingest

    job_count = 0
    for fiscal_year in self.config["fiscal_years"]:
        job_count += 1
        csv_path = "{dir}{fy}_transactions.csv".format(dir=self.config["directory"], fy=fiscal_year)
        job = DataJob(job_count, self.config["index_name"], fiscal_year, csv_path)
        # Always start from a fresh download; stale CSVs are discarded.
        if os.path.exists(csv_path):
            os.remove(csv_path)
        csv_queue.put(job)

    printf({"msg": "There are {} jobs to process".format(job_count)})

    workers = [
        Process(
            name="Download Proccess",
            target=download_db_records,
            args=(csv_queue, ingest_queue, self.config),
        ),
        Process(
            name="ES Index Process",
            target=es_data_loader,
            args=(ES, csv_queue, ingest_queue, self.config),
        ),
    ]
    workers[0].start()  # downloads begin immediately

    if self.config["provide_deleted"]:
        workers.append(
            Process(
                name="S3 Deleted Records Scrapper Process",
                target=deleted_transactions,
                args=(ES, self.config),
            )
        )
        workers[-1].start()
        # Deletes must finish before ingest starts, or removed records
        # could be re-indexed.
        while workers[-1].is_alive():
            printf({"msg": "Waiting to start ES ingest until S3 deletes are complete"})
            sleep(7)

    workers[1].start()  # safe to ingest now

    # Poll until every worker exits cleanly; bail hard on any failure.
    while True:
        sleep(10)
        if process_guarddog(workers):
            raise SystemExit(1)
        if all(not worker.is_alive() for worker in workers):
            printf({"msg": "All ETL processes completed execution with no error codes"})
            break

    if self.config["reload_all"]:
        printf({"msg": "Closing old indices and adding aliases"})
        swap_aliases(ES, self.config["index_name"])

    if self.config["snapshot"]:
        printf({"msg": "Taking snapshot"})
        take_snapshot(ES, self.config["index_name"], settings.ES_REPOSITORY)
def controller(self):
    """Drive the transaction ETL with one job per fiscal year and award category.

    Builds the job list (optionally reusing leftover CSVs when ``stale`` is
    set), runs download / ES-ingest workers plus an optional S3
    deleted-records worker, waits for them all, then optionally swaps
    aliases and snapshots the index.
    """
    download_queue = Queue()     # jobs which still need a CSV downloaded
    es_ingest_queue = Queue(20)  # jobs whose CSV is ready for ES ingest

    job_id = 0
    for fy in self.config['fiscal_years']:
        for awd_cat_idx in AWARD_DESC_CATEGORIES.keys():
            job_id += 1
            filename = '{dir}{fy}_transactions_{type}.csv'.format(
                dir=self.config['directory'], fy=fy, type=awd_cat_idx.replace(' ', ''))
            new_job = DataJob(job_id, self.config['index_name'], fy, awd_cat_idx, filename)

            if os.path.exists(filename) and self.config['stale']:
                # Mostly a testing convenience: reuse the existing CSV and
                # skip straight to the Elasticsearch ingest queue.
                new_job.count = count_rows_in_csv_file(filename, has_header=True, safe=False)
                printf({
                    'msg': 'Using existing file: {} | count {}'.format(filename, new_job.count),
                    'job': new_job.name,
                    'f': 'Download',
                })
                es_ingest_queue.put(new_job)
                continue
            if os.path.exists(filename):
                os.remove(filename)
            download_queue.put(new_job)

    printf({'msg': 'There are {} jobs to process'.format(job_id)})

    process_list = [
        Process(
            name='Download Proccess',
            target=download_db_records,
            args=(download_queue, es_ingest_queue, self.config),
        ),
        Process(
            name='ES Index Process',
            target=es_data_loader,
            args=(ES, download_queue, es_ingest_queue, self.config),
        ),
    ]
    process_list[0].start()  # downloads begin immediately

    if self.config['provide_deleted']:
        process_list.append(
            Process(
                name='S3 Deleted Records Scrapper Process',
                target=deleted_transactions,
                args=(ES, self.config),
            )
        )
        process_list[-1].start()
        # Hold off ingest until deletes finish so removed records are not
        # re-indexed.
        while process_list[-1].is_alive():
            printf({'msg': 'Waiting to start ES ingest until S3 deletes are complete'})
            sleep(7)

    process_list[1].start()  # safe to begin ES ingest

    # Poll until every worker exits cleanly; abort hard on any failure.
    while True:
        sleep(10)
        if process_guarddog(process_list):
            raise SystemExit(1)
        if all(not proc.is_alive() for proc in process_list):
            printf({'msg': 'All ETL processes completed execution with no error codes'})
            break

    if self.config['swap']:
        printf({'msg': 'Closing old indices and adding aliases'})
        swap_aliases(ES, self.config['index_name'])

    if self.config['snapshot']:
        printf({'msg': 'Taking snapshot'})
        take_snapshot(ES, self.config['index_name'], settings.ES_REPOSITORY)
def controller(self):
    """Run the full transaction ETL: enqueue one job per (fiscal year,
    award category) pair, fan out to download and Elasticsearch-ingest
    worker processes (plus an optional S3 deleted-records worker), wait
    for completion, then perform optional alias swap and snapshot steps.
    """
    pending_downloads = Queue()   # jobs that still need a CSV downloaded
    ready_for_es = Queue(20)      # jobs with a CSV on disk, awaiting ingest

    job_id = 0
    for fy in self.config['fiscal_years']:
        for awd_cat_idx in AWARD_DESC_CATEGORIES.keys():
            job_id += 1
            filename = '{dir}{fy}_transactions_{type}.csv'.format(
                dir=self.config['directory'],
                fy=fy,
                type=awd_cat_idx.replace(' ', ''),
            )
            new_job = DataJob(job_id, self.config['index_name'], fy, awd_cat_idx, filename)

            if os.path.exists(filename):
                if self.config['stale']:
                    # Testing shortcut: an existing CSV is trusted as-is and
                    # handed directly to the ingest queue.
                    new_job.count = count_rows_in_csv_file(filename, has_header=True, safe=False)
                    printf({
                        'msg': 'Using existing file: {} | count {}'.format(filename, new_job.count),
                        'job': new_job.name,
                        'f': 'Download',
                    })
                    ready_for_es.put(new_job)
                    continue
                os.remove(filename)
            pending_downloads.put(new_job)

    printf({'msg': 'There are {} jobs to process'.format(job_id)})

    downloader = Process(
        name='Download Proccess',
        target=download_db_records,
        args=(pending_downloads, ready_for_es, self.config),
    )
    ingester = Process(
        name='ES Index Process',
        target=es_data_loader,
        args=(ES, pending_downloads, ready_for_es, self.config),
    )
    process_list = [downloader, ingester]
    downloader.start()

    if self.config['provide_deleted']:
        deleter = Process(
            name='S3 Deleted Records Scrapper Process',
            target=deleted_transactions,
            args=(ES, self.config),
        )
        process_list.append(deleter)
        deleter.start()
        # Deletes must complete before ingest so removed records are not
        # re-indexed.
        while deleter.is_alive():
            printf({'msg': 'Waiting to start ES ingest until S3 deletes are complete'})
            sleep(7)

    ingester.start()

    # Supervise the workers until all exit; any failure aborts the run.
    while True:
        sleep(10)
        if process_guarddog(process_list):
            raise SystemExit(1)
        if not any(p.is_alive() for p in process_list):
            printf({'msg': 'All ETL processes completed execution with no error codes'})
            break

    if self.config['swap']:
        printf({'msg': 'Closing old indices and adding aliases'})
        swap_aliases(ES, self.config['index_name'])

    if self.config['snapshot']:
        printf({'msg': 'Taking snapshot'})
        take_snapshot(ES, self.config['index_name'], settings.ES_REPOSITORY)