Example #1
    def run_load_steps(self) -> None:
        download_queue = Queue()  # Queue for jobs which need a csv downloaded
        es_ingest_queue = Queue(20)  # Queue for jobs which have a csv and are ready for ES ingest

        job_number = 0
        for fiscal_year in self.config["fiscal_years"]:
            job_number += 1
            index = self.config["index_name"]
            filename = str(
                self.config["directory"] / "{fy}_{type}.csv".format(fy=fiscal_year, type=self.config["load_type"])
            )

            new_job = DataJob(job_number, index, fiscal_year, filename)

            if Path(filename).exists():
                Path(filename).unlink()
            download_queue.put(new_job)

        printf({"msg": "There are {} jobs to process".format(job_number)})

        process_list = [
            Process(
                name="Download Process",
                target=download_db_records,
                args=(download_queue, es_ingest_queue, self.config),
            ),
            Process(
                name="ES Index Process",
                target=es_data_loader,
                args=(self.elasticsearch_client, download_queue, es_ingest_queue, self.config),
            ),
        ]

        process_list[0].start()  # Start Download process

        if self.config["process_deletes"]:
            process_list.append(
                Process(
                    name="S3 Deleted Records Scrapper Process",
                    target=deleted_transactions if self.config["load_type"] == "transactions" else deleted_awards,
                    args=(self.elasticsearch_client, self.config),
                )
            )
            process_list[-1].start()  # start S3 csv fetch process
            while process_list[-1].is_alive():
                printf({"msg": "Waiting to start ES ingest until S3 deletes are complete"})
                sleep(7)

        process_list[1].start()  # start ES ingest process

        while True:
            sleep(10)
            if process_guarddog(process_list):
                raise SystemExit("Fatal error: review logs to determine why process died.")
            elif all([not x.is_alive() for x in process_list]):
                printf({"msg": "All ETL processes completed execution with no error codes"})
                break
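
A note on the monitoring loop above: it polls process_guarddog(process_list) every ten seconds and aborts the run if the watchdog reports a failure. The watchdog itself is not part of this example; purely as an illustrative sketch (assuming it does nothing more than inspect exit codes, which may not match the real helper), such a check could look like this:

from multiprocessing import Process
from typing import List


def process_guarddog_sketch(process_list: List[Process]) -> bool:
    # Hypothetical stand-in for process_guarddog: flag a failure when any worker
    # has terminated with a non-zero exit code (negative values indicate a signal).
    # exitcode stays None while a process is still running or was never started.
    return any(proc.exitcode not in (None, 0) for proc in process_list)
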
Example #2
    def run_load_steps(self) -> None:
        download_queue = Queue()  # Queue for jobs which need a csv downloaded
        es_ingest_queue = Queue(20)  # Queue for jobs which have a csv and are ready for ES ingest

        updated_record_count = get_updated_record_count(self.config)
        printf({"msg": f"Found {updated_record_count:,} {self.config['load_type']} records to index"})

        if updated_record_count == 0:
            jobs = 0
        else:
            download_queue, jobs = self.create_download_jobs()

        printf({"msg": f"There are {jobs} jobs to process"})

        process_list = [
            Process(
                name="Download Process",
                target=download_db_records,
                args=(download_queue, es_ingest_queue, self.config),
            ),
            Process(
                name="ES Index Process",
                target=es_data_loader,
                args=(self.elasticsearch_client, download_queue, es_ingest_queue, self.config),
            ),
        ]

        if updated_record_count != 0:  # only run if there are data to process
            process_list[0].start()  # Start Download process

        if self.config["process_deletes"]:
            process_list.append(
                Process(
                    name="S3 Deleted Records Scrapper Process",
                    target=deleted_transactions if self.config["load_type"] == "transactions" else deleted_awards,
                    args=(self.elasticsearch_client, self.config),
                )
            )
            process_list[-1].start()  # start S3 csv fetch process
            while process_list[-1].is_alive():
                printf({"msg": "Waiting to start ES ingest until S3 deletes are complete"})
                sleep(7)  # add a brief pause to make sure the deletes are processed in ES

        if updated_record_count != 0:
            process_list[1].start()  # start ES ingest process

        while True:
            sleep(10)
            if process_guarddog(process_list):
                raise SystemExit("Fatal error: review logs to determine why process died.")
            elif all([not x.is_alive() for x in process_list]):
                printf({"msg": "All ETL processes completed execution with no error codes"})
                break
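
Example #2 differs from Example #1 mainly by delegating queue construction to self.create_download_jobs() and by skipping the worker start-up entirely when get_updated_record_count() finds nothing to index. The helper's body is not shown on this page; the sketch below is a hypothetical reconstruction based on the loop in Example #1, with a minimal DataJob stand-in (the real class also carries an award category in the later examples):

from dataclasses import dataclass
from multiprocessing import Queue
from pathlib import Path
from typing import Optional


@dataclass
class DataJobStandIn:
    # Minimal stand-in for DataJob; the real class exposes at least .name and
    # .count and, in Examples #4 and #5, an award category field as well.
    name: int
    index: str
    fiscal_year: int
    csv: str
    count: Optional[int] = None


def create_download_jobs_sketch(config):
    # Hypothetical reconstruction of create_download_jobs(): queue one download
    # job per fiscal year, removing any stale CSV first, and return the queue
    # along with the number of jobs created.
    download_queue = Queue()
    job_number = 0
    for fiscal_year in config["fiscal_years"]:
        job_number += 1
        filename = str(Path(config["directory"]) / f"{fiscal_year}_{config['load_type']}.csv")
        if Path(filename).exists():
            Path(filename).unlink()
        download_queue.put(DataJobStandIn(job_number, config["index_name"], fiscal_year, filename))
    return download_queue, job_number
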
Example #3
    def controller(self):

        download_queue = Queue()  # Queue for jobs which need a csv downloaded
        es_ingest_queue = Queue(20)  # Queue for jobs which have a csv and are ready for ES ingest

        job_number = 0
        for fy in self.config["fiscal_years"]:
            job_number += 1
            index = self.config["index_name"]
            filename = "{dir}{fy}_transactions.csv".format(
                dir=self.config["directory"], fy=fy)

            new_job = DataJob(job_number, index, fy, filename)

            if os.path.exists(filename):
                os.remove(filename)
            download_queue.put(new_job)

        printf({"msg": "There are {} jobs to process".format(job_number)})

        process_list = []
        process_list.append(
            Process(
                name="Download Proccess",
                target=download_db_records,
                args=(download_queue, es_ingest_queue, self.config),
            ))
        process_list.append(
            Process(name="ES Index Process",
                    target=es_data_loader,
                    args=(ES, download_queue, es_ingest_queue, self.config)))

        process_list[0].start()  # Start Download process

        if self.config["provide_deleted"]:
            process_list.append(
                Process(name="S3 Deleted Records Scrapper Process",
                        target=deleted_transactions,
                        args=(ES, self.config)))
            process_list[-1].start()  # start S3 csv fetch process
            while process_list[-1].is_alive():
                printf({"msg": "Waiting to start ES ingest until S3 deletes are complete"})
                sleep(7)

        process_list[1].start()  # start ES ingest process

        while True:
            sleep(10)
            if process_guarddog(process_list):
                raise SystemExit(1)
            elif all([not x.is_alive() for x in process_list]):
                printf({"msg": "All ETL processes completed execution with no error codes"})
                break

        if self.config["reload_all"]:
            printf({"msg": "Closing old indices and adding aliases"})
            swap_aliases(ES, self.config["index_name"])

        if self.config["snapshot"]:
            printf({"msg": "Taking snapshot"})
            take_snapshot(ES, self.config["index_name"], settings.ES_REPOSITORY)
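
Every variant on this page is built around the same two-queue handoff: the download worker drains download_queue, writes a CSV per job, and pushes the finished job onto es_ingest_queue (bounded at 20 so downloads cannot run arbitrarily far ahead of indexing), while the ES loader consumes from the other end. Stripped of the download and Elasticsearch details, this is a plain multiprocessing producer/consumer pipeline; the toy version below uses made-up stage functions, not the real download_db_records or es_data_loader:

from multiprocessing import Process, Queue

STOP = None  # sentinel telling the consumer the producer is finished


def produce(jobs, handoff_queue):
    # Stand-in for the download stage: pretend each job's CSV is ready, then hand it off.
    for job in jobs:
        handoff_queue.put(job)
    handoff_queue.put(STOP)


def consume(handoff_queue):
    # Stand-in for the ES ingest stage: process jobs until the sentinel arrives.
    while True:
        job = handoff_queue.get()
        if job is STOP:
            break
        print(f"ingesting job {job}")


if __name__ == "__main__":
    handoff = Queue(20)  # bounded, like es_ingest_queue
    workers = [
        Process(name="Download Process", target=produce, args=([1, 2, 3], handoff)),
        Process(name="ES Index Process", target=consume, args=(handoff,)),
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
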
Example #4
    def controller(self):

        download_queue = Queue()  # Queue for jobs which need a csv downloaded
        es_ingest_queue = Queue(20)  # Queue for jobs which have a csv and are ready for ES ingest

        job_id = 0
        for fy in self.config['fiscal_years']:
            for awd_cat_idx in AWARD_DESC_CATEGORIES.keys():
                job_id += 1
                index = self.config['index_name']
                filename = '{dir}{fy}_transactions_{type}.csv'.format(
                    dir=self.config['directory'],
                    fy=fy,
                    type=awd_cat_idx.replace(' ', ''))

                new_job = DataJob(job_id, index, fy, awd_cat_idx, filename)

                if os.path.exists(filename):
                    # This is mostly for testing. If previous CSVs still exist skip the download for that file
                    if self.config['stale']:
                        new_job.count = count_rows_in_csv_file(filename, has_header=True, safe=False)
                        printf({
                            'msg': 'Using existing file: {} | count {}'.format(filename, new_job.count),
                            'job': new_job.name,
                            'f': 'Download'})
                        # Add job directly to the Elasticsearch ingest queue since the CSV exists
                        es_ingest_queue.put(new_job)
                        continue
                    else:
                        os.remove(filename)
                download_queue.put(new_job)

        printf({'msg': 'There are {} jobs to process'.format(job_id)})

        process_list = []
        process_list.append(
            Process(name='Download Process',
                    target=download_db_records,
                    args=(download_queue, es_ingest_queue, self.config)))
        process_list.append(
            Process(name='ES Index Process',
                    target=es_data_loader,
                    args=(ES, download_queue, es_ingest_queue, self.config)))

        process_list[0].start()  # Start Download process

        if self.config['provide_deleted']:
            process_list.append(
                Process(name='S3 Deleted Records Scraper Process',
                        target=deleted_transactions,
                        args=(ES, self.config)))
            process_list[-1].start()  # start S3 csv fetch process
            while process_list[-1].is_alive():
                printf({'msg': 'Waiting to start ES ingest until S3 deletes are complete'})
                sleep(7)

        process_list[1].start()  # start ES ingest process

        while True:
            sleep(10)
            if process_guarddog(process_list):
                raise SystemExit(1)
            elif all([not x.is_alive() for x in process_list]):
                printf({'msg': 'All ETL processes completed execution with no error codes'})
                break

        if self.config['swap']:
            printf({'msg': 'Closing old indices and adding aliases'})
            swap_aliases(ES, self.config['index_name'])

        if self.config['snapshot']:
            printf({'msg': 'Taking snapshot'})
            take_snapshot(ES, self.config['index_name'], settings.ES_REPOSITORY)
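
Logging in all of these variants goes through a printf helper that accepts a dict with a 'msg' key and, as in the 'Using existing file' call above, optional 'job' and 'f' keys. Its implementation is not part of these examples; a minimal sketch of a compatible helper (the timestamping and formatting here are assumptions, and the real helper may route through the project's logging configuration) might be:

from datetime import datetime, timezone


def printf_sketch(items):
    # Hypothetical stand-in for printf: emit one structured log line per call.
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
    job = items.get("job") or "---"
    func = items.get("f") or "main"
    print(f"{timestamp} [job {job}] {func}: {items.get('msg', '')}")


printf_sketch({"msg": "There are 4 jobs to process"})
printf_sketch({"msg": "Using existing file: example.csv | count 123", "job": 7, "f": "Download"})
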
Example #5
    def controller(self):

        download_queue = Queue()  # Queue for jobs which need a csv downloaded
        es_ingest_queue = Queue(20)  # Queue for jobs which have a csv and are ready for ES ingest

        job_id = 0
        for fy in self.config['fiscal_years']:
            for awd_cat_idx in AWARD_DESC_CATEGORIES.keys():
                job_id += 1
                index = self.config['index_name']
                filename = '{dir}{fy}_transactions_{type}.csv'.format(
                    dir=self.config['directory'],
                    fy=fy,
                    type=awd_cat_idx.replace(' ', ''))

                new_job = DataJob(job_id, index, fy, awd_cat_idx, filename)

                if os.path.exists(filename):
                    # This is mostly for testing. If previous CSVs still exist skip the download for that file
                    if self.config['stale']:
                        new_job.count = count_rows_in_csv_file(filename, has_header=True, safe=False)
                        printf({
                            'msg': 'Using existing file: {} | count {}'.format(filename, new_job.count),
                            'job': new_job.name,
                            'f': 'Download'})
                        # Add job directly to the Elasticsearch ingest queue since the CSV exists
                        es_ingest_queue.put(new_job)
                        continue
                    else:
                        os.remove(filename)
                download_queue.put(new_job)

        printf({'msg': 'There are {} jobs to process'.format(job_id)})

        process_list = []
        process_list.append(Process(
            name='Download Process',
            target=download_db_records,
            args=(download_queue, es_ingest_queue, self.config)))
        process_list.append(Process(
            name='ES Index Process',
            target=es_data_loader,
            args=(ES, download_queue, es_ingest_queue, self.config)))

        process_list[0].start()  # Start Download process

        if self.config['provide_deleted']:
            process_list.append(Process(
                name='S3 Deleted Records Scraper Process',
                target=deleted_transactions,
                args=(ES, self.config)))
            process_list[-1].start()  # start S3 csv fetch process
            while process_list[-1].is_alive():
                printf({'msg': 'Waiting to start ES ingest until S3 deletes are complete'})
                sleep(7)

        process_list[1].start()  # start ES ingest process

        while True:
            sleep(10)
            if process_guarddog(process_list):
                raise SystemExit(1)
            elif all([not x.is_alive() for x in process_list]):
                printf({'msg': 'All ETL processes completed execution with no error codes'})
                break

        if self.config['swap']:
            printf({'msg': 'Closing old indices and adding aliases'})
            swap_aliases(ES, self.config['index_name'])

        if self.config['snapshot']:
            printf({'msg': 'Taking snapshot'})
            take_snapshot(ES, self.config['index_name'], settings.ES_REPOSITORY)
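
One design choice shared by every variant: before starting ES ingest, the main process busy-waits on the delete scraper with while process_list[-1].is_alive(): sleep(7), which guarantees deletes reach Elasticsearch ahead of the new documents but re-logs the same message every seven seconds. Since nothing else has to happen during that wait, a blocking join() gives the same ordering; a small sketch of that alternative (not the project's actual code):

from multiprocessing import Process
from time import sleep


def scrape_deleted_records():
    # Stand-in for the S3 deleted-records step.
    sleep(2)


if __name__ == "__main__":
    delete_worker = Process(name="S3 Deleted Records Scraper Process", target=scrape_deleted_records)
    delete_worker.start()
    delete_worker.join()  # block until deletes finish instead of polling is_alive() every 7 seconds
    print("Deletes complete; safe to start the ES ingest process")
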