Example #1
    def handle(self, *args, **options):
        elasticsearch_client = instantiate_elasticsearch_client()
        config = process_cli_parameters(options, elasticsearch_client)

        start = perf_counter()
        printf({"msg": f"Starting script\n{'=' * 56}"})
        start_msg = "target index: {index_name} | FY(s): {fiscal_years} | Starting from: {starting_date}"
        printf({"msg": start_msg.format(**config)})

        if config["load_type"] == "transactions":
            ensure_view_exists(settings.ES_TRANSACTIONS_ETL_VIEW_NAME)
        elif config["load_type"] == "awards":
            ensure_view_exists(settings.ES_AWARDS_ETL_VIEW_NAME)

        loader = Rapidloader(config, elasticsearch_client)
        loader.run_load_steps()
        loader.complete_process()

        printf({
            "msg":
            "---------------------------------------------------------------"
        })
        printf({"msg": f"Script completed in {perf_counter() - start:.2f}s"})
        printf({
            "msg":
            "---------------------------------------------------------------"
        })
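
All of these snippets log through a small printf helper that takes a dict. A minimal sketch of what such a helper could look like (an assumption added here for readability, not the project's actual implementation; later examples also pass optional "job" and "f" context keys):

from datetime import datetime, timezone

def printf(record: dict) -> None:
    """Hypothetical stand-in for the printf logging helper used in these examples."""
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
    # "job" and "f" are optional context fields; "msg" carries the message text
    parts = [timestamp, str(record.get("job") or ""), record.get("f") or "", record.get("msg", "")]
    print(" | ".join(part for part in parts if part), flush=True)
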
    def handle(self, *args, **options):
        self.elasticsearch_client = instantiate_elasticsearch_client()
        self.config = process_cli_parameters(options,
                                             self.elasticsearch_client)

        start = perf_counter()
        printf({"msg": "Starting script\n{}".format("=" * 56)})
        start_msg = "target index: {index_name} | FY(s): {fiscal_years} | Starting from: {starting_date}"
        printf({"msg": start_msg.format(**self.config)})
        ensure_transaction_etl_view_exists()

        self.run_load_steps()
        self.complete_process()

        printf({
            "msg":
            "---------------------------------------------------------------"
        })
        printf({
            "msg":
            "Script completed in {} seconds".format(perf_counter() - start)
        })
        printf({
            "msg":
            "---------------------------------------------------------------"
        })
    def handle(self, *args, **options):
        ''' Script execution of custom code starts in this method'''
        start = perf_counter()
        printf({'msg': 'Starting script\n{}'.format('=' * 56)})

        self.config = set_config()
        self.config['verbose'] = True if options['verbosity'] > 1 else False
        self.config['fiscal_years'] = options['fiscal_years']
        self.config['directory'] = options['dir'] + os.sep
        self.config['provide_deleted'] = options['deleted']
        self.config['recreate'] = options['recreate']
        self.config['stale'] = options['stale']
        self.config['keep'] = options['keep']

        if not options['since']:
            # Due to the queries used for fetching postgres data, `starting_date` needs to be present and a date
            #   before the earliest records in S3 and when Postgres records were updated.
            #   Choose the beginning of FY2008, and make it timezone-aware for S3
            self.config['starting_date'] = datetime.strptime(
                '2007-10-01+0000', '%Y-%m-%d%z')
        else:
            if self.config['recreate']:
                print(
                    'Bad mix of parameters! An index should not be dropped if only a subset of data will be loaded'
                )
                raise SystemExit
            self.config['starting_date'] = datetime.strptime(
                options['since'] + '+0000', '%Y-%m-%d%z')

        if not os.path.isdir(self.config['directory']):
            printf({'msg': 'Provided directory does not exist'})
            raise SystemExit

        self.controller()
        printf({
            'msg':
            '---------------------------------------------------------------'
        })
        printf({
            'msg':
            'Script completed in {} seconds'.format(perf_counter() - start)
        })
        printf({
            'msg':
            '---------------------------------------------------------------'
        })
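
The "%Y-%m-%d%z" format string above matters: it produces a timezone-aware datetime, which is what comparisons against timezone-aware S3 object timestamps require (mixing naive and aware datetimes raises TypeError). A quick illustration:

from datetime import datetime, timezone

start = datetime.strptime("2007-10-01+0000", "%Y-%m-%d%z")
assert start.tzinfo is not None                      # aware, offset +00:00
assert start.utcoffset().total_seconds() == 0
assert start < datetime.now(timezone.utc)            # aware-vs-aware comparison works
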
Example #4
    def run_load_steps(self) -> None:
        download_queue = Queue()  # Queue for jobs which need a csv downloaded
        es_ingest_queue = Queue(20)  # Queue for jobs which have a csv and are ready for ES ingest

        updated_record_count = get_updated_record_count(self.config)
        printf({"msg": f"Found {updated_record_count:,} {self.config['load_type']} records to index"})

        if updated_record_count == 0:
            jobs = 0
        else:
            download_queue, jobs = self.create_download_jobs()

        printf({"msg": f"There are {jobs} jobs to process"})

        process_list = [
            Process(
                name="Download Process",
                target=download_db_records,
                args=(download_queue, es_ingest_queue, self.config),
            ),
            Process(
                name="ES Index Process",
                target=es_data_loader,
                args=(self.elasticsearch_client, download_queue, es_ingest_queue, self.config),
            ),
        ]

        if updated_record_count != 0:  # only run if there are data to process
            process_list[0].start()  # Start Download process

        if self.config["process_deletes"]:
            process_list.append(
                Process(
                    name="S3 Deleted Records Scrapper Process",
                    target=deleted_transactions if self.config["load_type"] == "transactions" else deleted_awards,
                    args=(self.elasticsearch_client, self.config),
                )
            )
            process_list[-1].start()  # start S3 csv fetch process
            while process_list[-1].is_alive():
                printf({"msg": "Waiting to start ES ingest until S3 deletes are complete"})
                sleep(7)  # add a brief pause to make sure the deletes are processed in ES

        if updated_record_count != 0:
            process_list[1].start()  # start ES ingest process

        while True:
            sleep(10)
            if process_guarddog(process_list):
                raise SystemExit("Fatal error: review logs to determine why process died.")
            elif all([not x.is_alive() for x in process_list]):
                printf({"msg": "All ETL processes completed execution with no error codes"})
                break
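
process_guarddog is referenced but not shown in these examples. A plausible minimal version (an assumption, not the project's code) reports failure as soon as any child process has exited with a non-zero code, so the loop above can abort the whole ETL run:

from multiprocessing import Process
from typing import List

def process_guarddog(process_list: List[Process]) -> bool:
    """Hypothetical watchdog: True if any ETL process died with an error code."""
    for proc in process_list:
        # exitcode is None while a process is still running and 0 after a clean exit
        if proc.exitcode is not None and proc.exitcode != 0:
            return True
    return False
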
    def transform_cli_arguments(self, options):
        simple_args = ("provide_deleted", "reload_all", "snapshot",
                       "index_name", "directory", "fast")
        self.config = set_config(simple_args, options)

        self.config["fiscal_years"] = fiscal_years_for_processing(options)
        self.config["directory"] = self.config["directory"] + os.sep
        self.config["index_name"] = self.config["index_name"].lower()

        if self.config["reload_all"]:
            self.config["starting_date"] = DEFAULT_DATETIME
        elif options["start_datetime"]:
            self.config["starting_date"] = options["start_datetime"]
        else:
            # Due to the queries used for fetching postgres data,
            #  `starting_date` needs to be present and a date before:
            #      - The earliest records in S3.
            #      - When all transaction records in the USAspending SQL database were updated.
            #   And keep it timezone-aware for S3
            self.config["starting_date"] = get_last_load_date(
                "es_transactions", default=DEFAULT_DATETIME)

        self.config["mapping"], self.config["doc_type"], self.config[
            "max_query_size"] = mapping_data_for_processing()

        does_index_exist = ES.indices.exists(self.config["index_name"])
        self.config["is_incremental_load"] = self.config[
            "starting_date"] != DEFAULT_DATETIME

        if not os.path.isdir(self.config["directory"]):
            printf({"msg": "Provided directory does not exist"})
            raise SystemExit(1)
        elif self.config["starting_date"] < DEFAULT_DATETIME:
            printf({
                "msg":
                "`start-datetime` is too early. Set to after {}".format(
                    DEFAULT_DATETIME)
            })
            raise SystemExit(1)
        elif does_index_exist and not self.config["is_incremental_load"]:
            printf({
                "msg":
                "Full data load into existing index! Change destination index or load a subset of data"
            })
            raise SystemExit(1)
        elif not does_index_exist or self.config["reload_all"]:
            printf({
                "msg":
                "Skipping deletions for ths load, provide_deleted overwritten to False"
            })
            self.config["provide_deleted"] = False
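
set_config(simple_args, options) is assumed to copy the named CLI options into a plain dict and stamp the run's start time (the processing_start_datetime key is used later for incremental-load bookkeeping). A hedged sketch; the real helper may do more:

from datetime import datetime, timezone

def set_config(copy_args, options):
    """Hypothetical helper: seed the config dict from selected CLI options."""
    config = {key: options[key] for key in copy_args}
    # Recorded up front so an incremental load can store it as the next starting point
    config["processing_start_datetime"] = datetime.now(timezone.utc)
    return config
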
Example #6
    def run_load_steps(self) -> None:
        download_queue = Queue()  # Queue for jobs which need a csv downloaded
        es_ingest_queue = Queue(20)  # Queue for jobs which have a csv and are ready for ES ingest

        job_number = 0
        for fiscal_year in self.config["fiscal_years"]:
            job_number += 1
            index = self.config["index_name"]
            filename = str(
                self.config["directory"] / "{fy}_{type}.csv".format(fy=fiscal_year, type=self.config["load_type"])
            )

            new_job = DataJob(job_number, index, fiscal_year, filename)

            if Path(filename).exists():
                Path(filename).unlink()
            download_queue.put(new_job)

        printf({"msg": "There are {} jobs to process".format(job_number)})

        process_list = [
            Process(
                name="Download Process",
                target=download_db_records,
                args=(download_queue, es_ingest_queue, self.config),
            ),
            Process(
                name="ES Index Process",
                target=es_data_loader,
                args=(self.elasticsearch_client, download_queue, es_ingest_queue, self.config),
            ),
        ]

        process_list[0].start()  # Start Download process

        if self.config["process_deletes"]:
            process_list.append(
                Process(
                    name="S3 Deleted Records Scrapper Process",
                    target=deleted_transactions if self.config["load_type"] == "transactions" else deleted_awards,
                    args=(self.elasticsearch_client, self.config),
                )
            )
            process_list[-1].start()  # start S3 csv fetch process
            while process_list[-1].is_alive():
                printf({"msg": "Waiting to start ES ingest until S3 deletes are complete"})
                sleep(7)

        process_list[1].start()  # start ES ingest process

        while True:
            sleep(10)
            if process_guarddog(process_list):
                raise SystemExit("Fatal error: review logs to determine why process died.")
            elif all([not x.is_alive() for x in process_list]):
                printf({"msg": "All ETL processes completed execution with no error codes"})
                break
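
DataJob is just a record handed from the download process to the ES ingest process. A minimal sketch matching the four-argument form used above (field names are guesses; the older snippets further down pass an extra award-category argument and also set a count attribute):

from dataclasses import dataclass
from typing import Optional

@dataclass
class DataJob:
    """Hypothetical job record passed through the download and ES ingest queues."""
    name: int                     # job number, used in log messages
    index: str                    # destination Elasticsearch index
    fy: int                       # fiscal year covered by the CSV
    csv: str                      # path of the CSV to download and then ingest
    count: Optional[int] = None   # row count, filled in once the CSV exists
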
Example #7
    def complete_process(self) -> None:
        if self.config["create_new_index"]:
            set_final_index_config(self.elasticsearch_client, self.config["index_name"])
            if self.config["skip_delete_index"]:
                printf({"msg": "Skipping deletion of old indices"})
            else:
                printf({"msg": "Closing old indices and adding aliases"})
                swap_aliases(self.elasticsearch_client, self.config["index_name"], self.config["load_type"])

        if self.config["snapshot"]:
            printf({"msg": "Taking snapshot"})
            take_snapshot(self.elasticsearch_client, self.config["index_name"], settings.ES_REPOSITORY)

        if self.config["is_incremental_load"]:
            toggle_refresh_on(self.elasticsearch_client, self.config["index_name"])
            printf({"msg": f"Storing datetime {self.config['processing_start_datetime']} for next incremental load"})
            update_last_load_date(f"es_{self.config['load_type']}", self.config["processing_start_datetime"])
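
get_last_load_date and update_last_load_date behave like a small keyed datetime store: the end of one incremental load records processing_start_datetime, and the next load reads it back as its starting_date. A purely illustrative file-backed sketch (the real project persists this elsewhere, likely in the database):

import json
from datetime import datetime
from pathlib import Path

_STORE = Path("last_load_dates.json")  # hypothetical location

def update_last_load_date(key: str, value: datetime) -> None:
    data = json.loads(_STORE.read_text()) if _STORE.exists() else {}
    data[key] = value.isoformat()
    _STORE.write_text(json.dumps(data))

def get_last_load_date(key: str, default: datetime = None) -> datetime:
    data = json.loads(_STORE.read_text()) if _STORE.exists() else {}
    return datetime.fromisoformat(data[key]) if key in data else default
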
Example #8
    def complete_process(self) -> None:
        if self.config["create_new_index"]:
            printf({"msg": "Closing old indices and adding aliases"})
            set_final_index_config(self.elasticsearch_client, self.config["index_name"])
            swap_aliases(self.elasticsearch_client, self.config["index_name"], self.config["load_type"])

        if self.config["snapshot"]:
            printf({"msg": "Taking snapshot"})
            take_snapshot(self.elasticsearch_client, self.config["index_name"], settings.ES_REPOSITORY)

        if self.config["is_incremental_load"]:
            msg = "Storing datetime {} for next incremental load"
            printf({"msg": msg.format(self.config["processing_start_datetime"])})
            update_last_load_date("es_transactions", self.config["processing_start_datetime"])
    def controller(self):

        download_queue = Queue()  # Queue for jobs which need a csv downloaded
        es_ingest_queue = Queue(
            20)  # Queue for jobs which have a csv and are ready for ES ingest

        job_number = 0
        for fy in self.config["fiscal_years"]:
            job_number += 1
            index = self.config["index_name"]
            filename = "{dir}{fy}_transactions.csv".format(
                dir=self.config["directory"], fy=fy)

            new_job = DataJob(job_number, index, fy, filename)

            if os.path.exists(filename):
                os.remove(filename)
            download_queue.put(new_job)

        printf({"msg": "There are {} jobs to process".format(job_number)})

        process_list = []
        process_list.append(
            Process(
                name="Download Proccess",
                target=download_db_records,
                args=(download_queue, es_ingest_queue, self.config),
            ))
        process_list.append(
            Process(name="ES Index Process",
                    target=es_data_loader,
                    args=(ES, download_queue, es_ingest_queue, self.config)))

        process_list[0].start()  # Start Download process

        if self.config["provide_deleted"]:
            process_list.append(
                Process(name="S3 Deleted Records Scrapper Process",
                        target=deleted_transactions,
                        args=(ES, self.config)))
            process_list[-1].start()  # start S3 csv fetch process
            while process_list[-1].is_alive():
                printf({
                    "msg":
                    "Waiting to start ES ingest until S3 deletes are complete"
                })
                sleep(7)

        process_list[1].start()  # start ES ingest process

        while True:
            sleep(10)
            if process_guarddog(process_list):
                raise SystemExit(1)
            elif all([not x.is_alive() for x in process_list]):
                printf({
                    "msg":
                    "All ETL processes completed execution with no error codes"
                })
                break

        if self.config["reload_all"]:
            printf({"msg": "Closing old indices and adding aliases"})
            swap_aliases(ES, self.config["index_name"])

        if self.config["snapshot"]:
            printf({"msg": "Taking snapshot"})
            take_snapshot(ES, self.config["index_name"],
                          settings.ES_REPOSITORY)
    def handle(self, *args, **options):
        """ Script execution of custom code starts in this method"""
        start = perf_counter()
        printf({"msg": "Starting script\n{}".format("=" * 56)})

        self.transform_cli_arguments(options)

        start_msg = "target index: {index_name} | FY(s): {fiscal_years} | Starting from: {starting_date}"
        printf({"msg": start_msg.format(**self.config)})

        self.controller()

        if self.config["is_incremental_load"]:
            printf({
                "msg":
                "Updating Last Load record with {}".format(
                    self.config["processing_start_datetime"])
            })
            update_last_load_date("es_transactions",
                                  self.config["processing_start_datetime"])
        printf({
            "msg":
            "---------------------------------------------------------------"
        })
        printf({
            "msg":
            "Script completed in {} seconds".format(perf_counter() - start)
        })
        printf({
            "msg":
            "---------------------------------------------------------------"
        })
Example #11
    def handle(self, *args, **options):
        ''' Script execution of custom code starts in this method'''
        start = perf_counter()
        printf({'msg': 'Starting script\n{}'.format('=' * 56)})

        self.config = set_config()
        self.config['verbose'] = True if options['verbosity'] > 1 else False
        self.config['fiscal_years'] = options['fiscal_years']
        self.config['directory'] = options['dir'] + os.sep
        self.config['provide_deleted'] = options['deleted']
        self.config['stale'] = options['stale']
        self.config['swap'] = options['swap']
        self.config['keep'] = options['keep']
        self.config['snapshot'] = options['snapshot']
        self.config['index_name'] = options['index_name']

        mappingfile = os.path.join(
            settings.BASE_DIR,
            'usaspending_api/etl/es_transaction_mapping.json')
        with open(mappingfile) as f:
            mapping_dict = json.load(f)
            self.config['mapping'] = json.dumps(mapping_dict)
        self.config['doc_type'] = str(list(mapping_dict['mappings'].keys())[0])
        self.config['max_query_size'] = mapping_dict['settings'][
            'index.max_result_window']

        does_index_exist = ES.indices.exists(self.config['index_name'])

        if not does_index_exist:
            printf({
                'msg':
                '"{}" does not exist, skipping deletions for ths load,\
                             provide_deleted overwritten to False'.format(
                    self.config['index_name'])
            })
            self.config['provide_deleted'] = False

        if not options['since']:
            if not options['days']:
                # Due to the queries used for fetching postgres data, `starting_date` needs to be present and a date
                #   before the earliest records in S3 and when Postgres records were updated.
                #   Choose the beginning of FY2008, and make it timezone-aware for S3
                self.config['starting_date'] = datetime.strptime(
                    '2007-10-01+0000', '%Y-%m-%d%z')
            else:
                # If --days is provided, go back X days into the past
                self.config['starting_date'] = datetime.now(
                    timezone.utc) - timedelta(days=options['days'])
        else:
            self.config['starting_date'] = datetime.strptime(
                options['since'] + '+0000', '%Y-%m-%d%z')

        if not os.path.isdir(self.config['directory']):
            printf({'msg': 'Provided directory does not exist'})
            raise SystemExit

        if does_index_exist and (not options['since'] and not options['days']):
            print('''
                  Bad mix of parameters! Index exists and
                  full data load implied. Choose a different
                  index_name or load a subset of data using --since
                  ''')
            raise SystemExit

        self.controller()
        printf({
            'msg':
            '---------------------------------------------------------------'
        })
        printf({
            'msg':
            'Script completed in {} seconds'.format(perf_counter() - start)
        })
        printf({
            'msg':
            '---------------------------------------------------------------'
        })
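
The es_transaction_mapping.json file read above is only assumed here to have roughly the following shape (field names and the 50000 value are illustrative): the first key under "mappings" becomes doc_type, and index.max_result_window becomes max_query_size.

mapping_dict = {
    "settings": {"index.max_result_window": 50000},  # -> config['max_query_size']
    "mappings": {
        # the first (and only) mapping name -> config['doc_type']
        "transaction_mapping": {
            "properties": {"transaction_id": {"type": "long"}},
        },
    },
}
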
Example #12
    def controller(self):

        download_queue = Queue()  # Queue for jobs which need a csv downloaded
        es_ingest_queue = Queue(
            20)  # Queue for jobs which have a csv and are ready for ES ingest

        job_id = 0
        for fy in self.config['fiscal_years']:
            for awd_cat_idx in AWARD_DESC_CATEGORIES.keys():
                job_id += 1
                index = self.config['index_name']
                filename = '{dir}{fy}_transactions_{type}.csv'.format(
                    dir=self.config['directory'],
                    fy=fy,
                    type=awd_cat_idx.replace(' ', ''))

                new_job = DataJob(job_id, index, fy, awd_cat_idx, filename)

                if os.path.exists(filename):
                    # This is mostly for testing. If previous CSVs still exist, skip the download for that file
                    if self.config['stale']:
                        new_job.count = count_rows_in_csv_file(filename,
                                                               has_header=True,
                                                               safe=False)
                        printf({
                            'msg':
                            'Using existing file: {} | count {}'.format(
                                filename, new_job.count),
                            'job':
                            new_job.name,
                            'f':
                            'Download'
                        })
                        # Add job directly to the Elasticsearch ingest queue since the CSV exists
                        es_ingest_queue.put(new_job)
                        continue
                    else:
                        os.remove(filename)
                download_queue.put(new_job)

        printf({'msg': 'There are {} jobs to process'.format(job_id)})

        process_list = []
        process_list.append(
            Process(name='Download Process',
                    target=download_db_records,
                    args=(download_queue, es_ingest_queue, self.config)))
        process_list.append(
            Process(name='ES Index Process',
                    target=es_data_loader,
                    args=(ES, download_queue, es_ingest_queue, self.config)))

        process_list[0].start()  # Start Download process

        if self.config['provide_deleted']:
            process_list.append(
                Process(name='S3 Deleted Records Scraper Process',
                        target=deleted_transactions,
                        args=(ES, self.config)))
            process_list[-1].start()  # start S3 csv fetch process
            while process_list[-1].is_alive():
                printf({
                    'msg':
                    'Waiting to start ES ingest until S3 deletes are complete'
                })
                sleep(7)

        process_list[1].start()  # start ES ingest process

        while True:
            sleep(10)
            if process_guarddog(process_list):
                raise SystemExit(1)
            elif all([not x.is_alive() for x in process_list]):
                printf({
                    'msg':
                    'All ETL processes completed execution with no error codes'
                })
                break

        if self.config['swap']:
            printf({'msg': 'Closing old indices and adding aliases'})
            swap_aliases(ES, self.config['index_name'])

        if self.config['snapshot']:
            printf({'msg': 'Taking snapshot'})
            take_snapshot(ES, self.config['index_name'],
                          settings.ES_REPOSITORY)
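
count_rows_in_csv_file is assumed to return the number of data rows in an existing CSV, excluding the header; a minimal sketch that ignores the safe flag (which presumably switches to a slower, quote-aware count):

def count_rows_in_csv_file(filename, has_header=True, safe=False):
    """Hypothetical row counter; the real helper likely handles quoting and encoding edge cases."""
    with open(filename, newline="") as handle:
        rows = sum(1 for _ in handle)
    return rows - 1 if has_header and rows else rows
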
Example #13
def process_cli_parameters(options: dict, es_client) -> dict:
    default_datetime = datetime.strptime(
        f"{settings.API_SEARCH_MIN_DATE}+0000", "%Y-%m-%d%z")
    simple_args = (
        "skip_delete_index",
        "process_deletes",
        "create_new_index",
        "snapshot",
        "index_name",
        "directory",
        "skip_counts",
        "load_type",
    )
    config = set_config(simple_args, options)

    config["fiscal_years"] = fiscal_years_for_processing(options)
    config["directory"] = Path(config["directory"]).resolve()

    if config["create_new_index"] and not config["index_name"]:
        raise SystemExit(
            "Fatal error: --create-new-index requires --index-name.")
    elif config["create_new_index"]:
        config["index_name"] = config["index_name"].lower()
        config["starting_date"] = default_datetime
        check_new_index_name_is_ok(
            config["index_name"],
            settings.ES_AWARDS_NAME_SUFFIX if config["load_type"] == "awards"
            else settings.ES_TRANSACTIONS_NAME_SUFFIX,
        )
    elif options["start_datetime"]:
        config["starting_date"] = options["start_datetime"]
    else:
        # Due to the queries used for fetching postgres data,
        #  `starting_date` needs to be present and a date before:
        #      - The earliest records in S3.
        #      - When all transaction records in the USAspending SQL database were updated.
        #   And keep it timezone-aware for S3
        config["starting_date"] = get_last_load_date(
            f"es_{options['load_type']}", default=default_datetime)

    config["max_query_size"] = settings.ES_TRANSACTIONS_MAX_RESULT_WINDOW
    if options["load_type"] == "awards":
        config["max_query_size"] = settings.ES_AWARDS_MAX_RESULT_WINDOW

    config["is_incremental_load"] = not bool(config["create_new_index"]) and (
        config["starting_date"] != default_datetime)

    if config["is_incremental_load"]:
        write_alias = settings.ES_TRANSACTIONS_WRITE_ALIAS
        if config["load_type"] == "awards":
            write_alias = settings.ES_AWARDS_WRITE_ALIAS
        if config["index_name"]:
            printf({
                "msg":
                f"Ignoring provided index name, using alias '{write_alias}' for incremental load"
            })
        config["index_name"] = write_alias
        if not es_client.cat.aliases(name=write_alias):
            printf({
                "msg":
                f"Fatal error: write alias '{write_alias}' is missing"
            })
            raise SystemExit(1)
        # Force manual refresh for atomic transaction-like delete/re-add consistency during incremental load.
        # Turned back on at end.
        toggle_refresh_off(es_client, config["index_name"])
    else:
        if es_client.indices.exists(config["index_name"]):
            printf({
                "msg":
                "Fatal error: data load into existing index. Change index name or run an incremental load"
            })
            raise SystemExit(1)

    if not config["directory"].is_dir():
        printf({"msg": "Fatal error: provided directory does not exist"})
        raise SystemExit(1)
    elif config["starting_date"] < default_datetime:
        printf({
            "msg":
            f"Fatal error: --start-datetime is too early. Set no earlier than {default_datetime}"
        })
        raise SystemExit(1)
    elif not config["is_incremental_load"] and config["process_deletes"]:
        printf({
            "msg":
            "Skipping deletions for ths load, --deleted overwritten to False"
        })
        config["process_deletes"] = False

    config["ingest_wait"] = options["idle_wait_time"]

    return config
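
toggle_refresh_off / toggle_refresh_on are assumed to flip the index's refresh_interval, the standard Elasticsearch knob for pausing searchable refreshes while deletes and re-adds are in flight. A sketch using the low-level elasticsearch-py client (the restored interval value is a guess):

from elasticsearch import Elasticsearch

def toggle_refresh_off(client: Elasticsearch, index_name: str) -> None:
    # -1 disables periodic refreshes until a real interval is restored
    client.indices.put_settings(index=index_name, body={"index": {"refresh_interval": "-1"}})

def toggle_refresh_on(client: Elasticsearch, index_name: str) -> None:
    # restore a typical refresh cadence once the incremental load has finished
    client.indices.put_settings(index=index_name, body={"index": {"refresh_interval": "30s"}})
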
def process_cli_parameters(options: dict, es_client) -> dict:
    default_datetime = datetime.strptime(
        "{}+0000".format(settings.API_SEARCH_MIN_DATE), "%Y-%m-%d%z")
    simple_args = (
        "process_deletes",
        "create_new_index",
        "snapshot",
        "index_name",
        "directory",
        "skip_counts",
        "skip_delete_index",
    )
    config = set_config(simple_args, options)

    config["fiscal_years"] = fiscal_years_for_processing(options)
    config["directory"] = Path(config["directory"]).resolve()

    if config["create_new_index"] and not config["index_name"]:
        raise SystemExit(
            "Fatal error: --create-new-index requires --index-name.")
    elif config["create_new_index"]:
        config["index_name"] = config["index_name"].lower()
        config["starting_date"] = default_datetime
        check_new_index_name_is_ok(config["index_name"])
    elif options["start_datetime"]:
        config["starting_date"] = options["start_datetime"]
    else:
        # Due to the queries used for fetching postgres data,
        #  `starting_date` needs to be present and a date before:
        #      - The earliest records in S3.
        #      - When all transaction records in the USAspending SQL database were updated.
        #   And keep it timezone-award for S3
        config["starting_date"] = get_last_load_date("es_transactions",
                                                     default=default_datetime)

    config["max_query_size"] = settings.ES_TRANSACTIONS_MAX_RESULT_WINDOW

    config["is_incremental_load"] = not bool(config["create_new_index"]) and (
        config["starting_date"] != default_datetime)

    if config["is_incremental_load"]:
        if config["index_name"]:
            msg = "Ignoring provided index name, using alias '{}' for incremental load"
            printf({"msg": msg.format(settings.ES_TRANSACTIONS_WRITE_ALIAS)})
        config["index_name"] = settings.ES_TRANSACTIONS_WRITE_ALIAS
        if not es_client.cat.aliases(
                name=settings.ES_TRANSACTIONS_WRITE_ALIAS):
            printf({
                "msg":
                "Fatal error: write alias '{}' is missing".format(
                    settings.ES_TRANSACTIONS_WRITE_ALIAS)
            })
            raise SystemExit(1)
    else:
        if es_client.indices.exists(config["index_name"]):
            printf({
                "msg":
                "Fatal error: data load into existing index. Change index name or run an incremental load"
            })
            raise SystemExit(1)

    if not config["directory"].is_dir():
        printf({"msg": "Fatal error: provided directory does not exist"})
        raise SystemExit(1)
    elif config["starting_date"] < default_datetime:
        printf({
            "msg":
            "Fatal error: --start-datetime is too early. Set no earlier than {}"
            .format(default_datetime)
        })
        raise SystemExit(1)
    elif not config["is_incremental_load"] and config["process_deletes"]:
        printf({
            "msg":
            "Skipping deletions for ths load, --deleted overwritten to False"
        })
        config["process_deletes"] = False

    return config
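
fiscal_years_for_processing is assumed to expand the fiscal-years CLI option into a concrete list, for example turning an "all" value into every fiscal year from 2008 through the current one. A hedged sketch:

from datetime import datetime

def fiscal_years_for_processing(options):
    """Hypothetical expansion of the fiscal-years CLI option."""
    now = datetime.now()
    current_fy = now.year + (1 if now.month >= 10 else 0)  # US federal FY starts in October
    requested = options.get("fiscal_years") or ["all"]
    if "all" in requested:
        return list(range(2008, current_fy + 1))
    return [int(fy) for fy in requested]
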
Example #15
    def controller(self):

        download_queue = Queue()  # Queue for jobs which need a csv downloaded
        es_ingest_queue = Queue(
            10)  # Queue for jobs which have a csv and are ready for ES ingest

        job_id = 0
        for fy in self.config['fiscal_years']:
            for awd_cat_idx in AWARD_DESC_CATEGORIES.keys():
                job_id += 1
                award_category = AWARD_DESC_CATEGORIES[awd_cat_idx]
                index = '{}-{}-{}'.format(settings.TRANSACTIONS_INDEX_ROOT,
                                          award_category, fy)
                filename = '{dir}{fy}_transactions_{type}.csv'.format(
                    dir=self.config['directory'],
                    fy=fy,
                    type=awd_cat_idx.replace(' ', ''))

                new_job = DataJob(job_id, index, fy, awd_cat_idx, filename)

                if os.path.exists(filename):
                    # This is mostly for testing. If previous CSVs still exist, skip the download for that file
                    if self.config['stale']:
                        new_job.count = csv_row_count(filename)
                        printf({
                            'msg':
                            'Using existing file: {} | count {}'.format(
                                filename, new_job.count),
                            'job':
                            new_job.name,
                            'f':
                            'Download'
                        })
                        # Add job directly to the Elasticsearch ingest queue since the CSV exists
                        es_ingest_queue.put(new_job)
                        continue
                    else:
                        os.remove(filename)
                download_queue.put(new_job)

        printf({'msg': 'There are {} jobs to process'.format(job_id)})

        if self.config['provide_deleted']:
            s3_delete_process = Process(target=deleted_transactions,
                                        args=(ES, self.config))
        download_process = Process(target=download_db_records,
                                   args=(download_queue, es_ingest_queue,
                                         self.config))
        es_index_process = Process(target=es_data_loader,
                                   args=(ES, download_queue, es_ingest_queue,
                                         self.config))

        download_process.start()

        if self.config['provide_deleted']:
            s3_delete_process.start()
            while s3_delete_process.is_alive():
                printf({
                    'msg':
                    'Waiting to start ES ingest until S3 deletes are complete'
                })
                sleep(7)

        es_index_process.start()

        if self.config['provide_deleted']:
            s3_delete_process.join()
        download_process.join()
        es_index_process.join()
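
The bounded es_ingest_queue (Queue(10) here, Queue(20) in the other variants) provides back-pressure: once that many downloaded CSV jobs are waiting for ES ingest, the download process blocks on put() until the ingest side drains one, which keeps the number of staged CSVs bounded. A tiny standard-library illustration:

import queue
from multiprocessing import Queue

q = Queue(2)                     # same idea as es_ingest_queue = Queue(20)
q.put("job-1")
q.put("job-2")
try:
    q.put("job-3", timeout=0.1)  # a third put blocks; with a timeout it raises queue.Full
except queue.Full:
    print("queue full, the producer would wait here")
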
    def handle(self, *args, **options):
        ''' Script execution of custom code starts in this method'''
        start = perf_counter()
        printf({'msg': 'Starting script\n{}'.format('=' * 56)})

        self.config = set_config()
        self.config['verbose'] = True if options['verbosity'] > 1 else False
        self.config['fiscal_years'] = options['fiscal_years']
        self.config['directory'] = options['dir'] + os.sep
        self.config['provide_deleted'] = options['deleted']
        self.config['stale'] = options['stale']
        self.config['swap'] = options['swap']
        self.config['keep'] = options['keep']
        self.config['snapshot'] = options['snapshot']
        self.config['index_name'] = options['index_name']

        mappingfile = os.path.join(settings.BASE_DIR, 'usaspending_api/etl/es_transaction_mapping.json')
        with open(mappingfile) as f:
            mapping_dict = json.load(f)
            self.config['mapping'] = json.dumps(mapping_dict)
        self.config['doc_type'] = str(list(mapping_dict['mappings'].keys())[0])
        self.config['max_query_size'] = mapping_dict['settings']['index.max_result_window']

        does_index_exist = ES.indices.exists(self.config['index_name'])

        if not does_index_exist:
            printf({'msg': '"{}" does not exist, skipping deletions for ths load,\
                             provide_deleted overwritten to False'.format(self.config['index_name'])})
            self.config['provide_deleted'] = False

        if not options['since']:
            if not options['days']:
                # Due to the queries used for fetching postgres data, `starting_date` needs to be present and a date
                #   before the earliest records in S3 and when Postgres records were updated.
                #   Choose the beginning of FY2008, and make it timezone-aware for S3
                self.config['starting_date'] = datetime.strptime('2007-10-01+0000', '%Y-%m-%d%z')
            else:
                # If --days is provided, go back X days into the past
                self.config['starting_date'] = datetime.now(timezone.utc) - timedelta(days=options['days'])
        else:
            self.config['starting_date'] = datetime.strptime(options['since'] + '+0000', '%Y-%m-%d%z')

        if not os.path.isdir(self.config['directory']):
            printf({'msg': 'Provided directory does not exist'})
            raise SystemExit

        if does_index_exist and (not options['since'] and not options['days']):
            print('''
                  Bad mix of parameters! Index exists and
                  full data load implied. Choose a different
                  index_name or load a subset of data using --since
                  ''')
            raise SystemExit

        self.controller()
        printf({'msg': '---------------------------------------------------------------'})
        printf({'msg': 'Script completed in {} seconds'.format(perf_counter() - start)})
        printf({'msg': '---------------------------------------------------------------'})
    def controller(self):

        download_queue = Queue()  # Queue for jobs which need a csv downloaded
        es_ingest_queue = Queue(20)  # Queue for jobs which have a csv and are ready for ES ingest

        job_id = 0
        for fy in self.config['fiscal_years']:
            for awd_cat_idx in AWARD_DESC_CATEGORIES.keys():
                job_id += 1
                index = self.config['index_name']
                filename = '{dir}{fy}_transactions_{type}.csv'.format(
                    dir=self.config['directory'],
                    fy=fy,
                    type=awd_cat_idx.replace(' ', ''))

                new_job = DataJob(job_id, index, fy, awd_cat_idx, filename)

                if os.path.exists(filename):
                    # This is mostly for testing. If previous CSVs still exist, skip the download for that file
                    if self.config['stale']:
                        new_job.count = count_rows_in_csv_file(filename, has_header=True, safe=False)
                        printf({
                            'msg': 'Using existing file: {} | count {}'.format(filename, new_job.count),
                            'job': new_job.name,
                            'f': 'Download'})
                        # Add job directly to the Elasticsearch ingest queue since the CSV exists
                        es_ingest_queue.put(new_job)
                        continue
                    else:
                        os.remove(filename)
                download_queue.put(new_job)

        printf({'msg': 'There are {} jobs to process'.format(job_id)})

        process_list = []
        process_list.append(Process(
            name='Download Process',
            target=download_db_records,
            args=(download_queue, es_ingest_queue, self.config)))
        process_list.append(Process(
            name='ES Index Process',
            target=es_data_loader,
            args=(ES, download_queue, es_ingest_queue, self.config)))

        process_list[0].start()  # Start Download process

        if self.config['provide_deleted']:
            process_list.append(Process(
                name='S3 Deleted Records Scraper Process',
                target=deleted_transactions,
                args=(ES, self.config)))
            process_list[-1].start()  # start S3 csv fetch process
            while process_list[-1].is_alive():
                printf({'msg': 'Waiting to start ES ingest until S3 deletes are complete'})
                sleep(7)

        process_list[1].start()  # start ES ingest process

        while True:
            sleep(10)
            if process_guarddog(process_list):
                raise SystemExit(1)
            elif all([not x.is_alive() for x in process_list]):
                printf({'msg': 'All ETL processes completed execution with no error codes'})
                break

        if self.config['swap']:
            printf({'msg': 'Closing old indices and adding aliases'})
            swap_aliases(ES, self.config['index_name'])

        if self.config['snapshot']:
            printf({'msg': 'Taking snapshot'})
            take_snapshot(ES, self.config['index_name'], settings.ES_REPOSITORY)
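
swap_aliases and take_snapshot wrap standard Elasticsearch operations. A hedged sketch of what they might do with the low-level elasticsearch-py client (the alias name, snapshot naming, and omitted error handling are assumptions, not the project's code):

from elasticsearch import Elasticsearch

def swap_aliases(client: Elasticsearch, index_name: str, alias: str = "transactions") -> None:
    # Atomically repoint the query alias at the freshly loaded index
    client.indices.update_aliases(
        body={
            "actions": [
                {"remove": {"index": "*", "alias": alias}},
                {"add": {"index": index_name, "alias": alias}},
            ]
        }
    )

def take_snapshot(client: Elasticsearch, index_name: str, repository: str) -> None:
    # Snapshot only the given index into the configured repository
    client.snapshot.create(
        repository=repository,
        snapshot="snapshot-{}".format(index_name),
        body={"indices": index_name, "include_global_state": False},
    )
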