Example #1
    def complete_process(self) -> None:
        if self.config["create_new_index"]:
            printf({"msg": "Closing old indices and adding aliases"})
            set_final_index_config(self.elasticsearch_client, self.config["index_name"])
            swap_aliases(self.elasticsearch_client, self.config["index_name"], self.config["load_type"])

        if self.config["snapshot"]:
            printf({"msg": "Taking snapshot"})
            take_snapshot(self.elasticsearch_client, self.config["index_name"], settings.ES_REPOSITORY)

        if self.config["is_incremental_load"]:
            msg = "Storing datetime {} for next incremental load"
            printf({"msg": msg.format(self.config["processing_start_datetime"])})
            update_last_load_date("es_transactions", self.config["processing_start_datetime"])
Example #2
    def complete_process(self) -> None:
        if self.config["create_new_index"]:
            set_final_index_config(self.elasticsearch_client, self.config["index_name"])
            if self.config["skip_delete_index"]:
                printf({"msg": "Skipping deletion of old indices"})
            else:
                printf({"msg": "Closing old indices and adding aliases"})
                swap_aliases(self.elasticsearch_client, self.config["index_name"], self.config["load_type"])

        if self.config["snapshot"]:
            printf({"msg": "Taking snapshot"})
            take_snapshot(self.elasticsearch_client, self.config["index_name"], settings.ES_REPOSITORY)

        if self.config["is_incremental_load"]:
            toggle_refresh_on(self.elasticsearch_client, self.config["index_name"])
            printf({"msg": f"Storing datetime {self.config['processing_start_datetime']} for next incremental load"})
            update_last_load_date(f"es_{self.config['load_type']}", self.config["processing_start_datetime"])
Example #3
    def controller(self):

        download_queue = Queue()  # Queue for jobs which need a csv downloaded
        es_ingest_queue = Queue(
            20)  # Queue for jobs which have a csv and are ready for ES ingest

        job_number = 0
        for fy in self.config["fiscal_years"]:
            job_number += 1
            index = self.config["index_name"]
            filename = "{dir}{fy}_transactions.csv".format(
                dir=self.config["directory"], fy=fy)

            new_job = DataJob(job_number, index, fy, filename)

            if os.path.exists(filename):
                os.remove(filename)
            download_queue.put(new_job)

        printf({"msg": "There are {} jobs to process".format(job_number)})

        process_list = []
        process_list.append(
            Process(
                name="Download Proccess",
                target=download_db_records,
                args=(download_queue, es_ingest_queue, self.config),
            ))
        process_list.append(
            Process(name="ES Index Process",
                    target=es_data_loader,
                    args=(ES, download_queue, es_ingest_queue, self.config)))

        process_list[0].start()  # Start Download process

        if self.config["provide_deleted"]:
            process_list.append(
                Process(name="S3 Deleted Records Scrapper Process",
                        target=deleted_transactions,
                        args=(ES, self.config)))
            process_list[-1].start()  # start S3 CSV fetch process
            while process_list[-1].is_alive():
                printf({
                    "msg":
                    "Waiting to start ES ingest until S3 deletes are complete"
                })
                sleep(7)

        process_list[1].start()  # start ES ingest process

        while True:
            sleep(10)
            if process_guarddog(process_list):
                raise SystemExit(1)
            elif all([not x.is_alive() for x in process_list]):
                printf({
                    "msg":
                    "All ETL processes completed execution with no error codes"
                })
                break

        if self.config["reload_all"]:
            printf({"msg": "Closing old indices and adding aliases"})
            swap_aliases(ES, self.config["index_name"])

        if self.config["snapshot"]:
            printf({"msg": "Taking snapshot"})
            take_snapshot(ES, self.config["index_name"],
                          settings.ES_REPOSITORY)
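
The monitoring loop above polls process_guarddog(process_list) every ten seconds and aborts with SystemExit(1) when it returns a truthy value, but the helper itself is not included in these examples. The sketch below is a plausible stand-in based only on that calling pattern, not the project's actual implementation.

def process_guarddog(process_list):
    """Plausible stand-in: report whether any worker process died abnormally.

    The real helper is not shown in these examples; this sketch only checks
    multiprocessing.Process.exitcode, which stays None while a process runs.
    """
    for proc in process_list:
        if proc.exitcode is not None and proc.exitcode != 0:
            return True  # caller raises SystemExit(1)
    return False
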
Example #4
    def controller(self):

        download_queue = Queue()  # Queue for jobs which need a csv downloaded
        es_ingest_queue = Queue(
            20)  # Queue for jobs which have a csv and are ready for ES ingest

        job_id = 0
        for fy in self.config['fiscal_years']:
            for awd_cat_idx in AWARD_DESC_CATEGORIES.keys():
                job_id += 1
                index = self.config['index_name']
                filename = '{dir}{fy}_transactions_{type}.csv'.format(
                    dir=self.config['directory'],
                    fy=fy,
                    type=awd_cat_idx.replace(' ', ''))

                new_job = DataJob(job_id, index, fy, awd_cat_idx, filename)

                if os.path.exists(filename):
                    # This is mostly for testing. If previous CSVs still exist, skip the download for that file
                    if self.config['stale']:
                        new_job.count = count_rows_in_csv_file(filename,
                                                               has_header=True,
                                                               safe=False)
                        printf({
                            'msg':
                            'Using existing file: {} | count {}'.format(
                                filename, new_job.count),
                            'job':
                            new_job.name,
                            'f':
                            'Download'
                        })
                        # Add job directly to the Elasticsearch ingest queue since the CSV exists
                        es_ingest_queue.put(new_job)
                        continue
                    else:
                        os.remove(filename)
                download_queue.put(new_job)

        printf({'msg': 'There are {} jobs to process'.format(job_id)})

        process_list = []
        process_list.append(
            Process(name='Download Process',
                    target=download_db_records,
                    args=(download_queue, es_ingest_queue, self.config)))
        process_list.append(
            Process(name='ES Index Process',
                    target=es_data_loader,
                    args=(ES, download_queue, es_ingest_queue, self.config)))

        process_list[0].start()  # Start Download process

        if self.config['provide_deleted']:
            process_list.append(
                Process(name='S3 Deleted Records Scraper Process',
                        target=deleted_transactions,
                        args=(ES, self.config)))
            process_list[-1].start()  # start S3 CSV fetch process
            while process_list[-1].is_alive():
                printf({
                    'msg':
                    'Waiting to start ES ingest until S3 deletes are complete'
                })
                sleep(7)

        process_list[1].start()  # start ES ingest process

        while True:
            sleep(10)
            if process_guarddog(process_list):
                raise SystemExit(1)
            elif all([not x.is_alive() for x in process_list]):
                printf({
                    'msg':
                    'All ETL processes completed execution with no error codes'
                })
                break

        if self.config['swap']:
            printf({'msg': 'Closing old indices and adding aliases'})
            swap_aliases(ES, self.config['index_name'])

        if self.config['snapshot']:
            printf({'msg': 'Taking snapshot'})
            take_snapshot(ES, self.config['index_name'],
                          settings.ES_REPOSITORY)
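
When an existing CSV is reused via the stale option, the job's count comes from count_rows_in_csv_file(filename, has_header=True, safe=False). That helper is also not part of the snippet; the sketch below is only a hypothetical stand-in matching the call signature (the safe flag is accepted but ignored here).

import csv

def count_rows_in_csv_file(filename, has_header=True, safe=False):
    """Hypothetical sketch: count data rows in a CSV file.

    `safe` is accepted only to match the call site above; this sketch ignores it.
    """
    with open(filename, newline="") as f:
        row_count = sum(1 for _ in csv.reader(f))
    # Exclude the header row from the count when present
    return row_count - 1 if has_header and row_count else row_count
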
Example #5
    def controller(self):

        download_queue = Queue()  # Queue for jobs which need a csv downloaded
        es_ingest_queue = Queue(20)  # Queue for jobs which have a csv and are ready for ES ingest

        job_id = 0
        for fy in self.config['fiscal_years']:
            for awd_cat_idx in AWARD_DESC_CATEGORIES.keys():
                job_id += 1
                index = self.config['index_name']
                filename = '{dir}{fy}_transactions_{type}.csv'.format(
                    dir=self.config['directory'],
                    fy=fy,
                    type=awd_cat_idx.replace(' ', ''))

                new_job = DataJob(job_id, index, fy, awd_cat_idx, filename)

                if os.path.exists(filename):
                    # This is mostly for testing. If previous CSVs still exist, skip the download for that file
                    if self.config['stale']:
                        new_job.count = count_rows_in_csv_file(filename, has_header=True, safe=False)
                        printf({
                            'msg': 'Using existing file: {} | count {}'.format(filename, new_job.count),
                            'job': new_job.name,
                            'f': 'Download'})
                        # Add job directly to the Elasticsearch ingest queue since the CSV exists
                        es_ingest_queue.put(new_job)
                        continue
                    else:
                        os.remove(filename)
                download_queue.put(new_job)

        printf({'msg': 'There are {} jobs to process'.format(job_id)})

        process_list = []
        process_list.append(Process(
            name='Download Process',
            target=download_db_records,
            args=(download_queue, es_ingest_queue, self.config)))
        process_list.append(Process(
            name='ES Index Process',
            target=es_data_loader,
            args=(ES, download_queue, es_ingest_queue, self.config)))

        process_list[0].start()  # Start Download process

        if self.config['provide_deleted']:
            process_list.append(Process(
                name='S3 Deleted Records Scraper Process',
                target=deleted_transactions,
                args=(ES, self.config)))
            process_list[-1].start()  # start S3 CSV fetch process
            while process_list[-1].is_alive():
                printf({'msg': 'Waiting to start ES ingest until S3 deletes are complete'})
                sleep(7)

        process_list[1].start()  # start ES ingest process

        while True:
            sleep(10)
            if process_guarddog(process_list):
                raise SystemExit(1)
            elif all([not x.is_alive() for x in process_list]):
                printf({'msg': 'All ETL processes completed execution with no error codes'})
                break

        if self.config['swap']:
            printf({'msg': 'Closing old indices and adding aliases'})
            swap_aliases(ES, self.config['index_name'])

        if self.config['snapshot']:
            printf({'msg': 'Taking snapshot'})
            take_snapshot(ES, self.config['index_name'], settings.ES_REPOSITORY)
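
All of the controller variants wrap each unit of work in a DataJob and later read job.name and job.count, but the class itself is never shown. The sketch below is a hedged guess matching the five-argument form used by the later controller examples; the field names are inferred from usage and are not the project's actual definition.

class DataJob:
    """Hypothetical container for one download-and-ingest unit of work."""

    def __init__(self, job_id, index, fy, category, csv_path):
        self.name = str(job_id)     # printed as the "job" field in printf messages
        self.index = index          # target Elasticsearch index name
        self.fy = fy                # fiscal year covered by this job
        self.category = category    # award description category key
        self.csv = csv_path         # CSV file produced by the download step
        self.count = None           # row count, populated once the CSV exists
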