Example #1
def download_csv(count_sql, copy_sql, filename, job_id, fast, verbose):
    if fast:
        count = None
        printf({
            "msg": "Skipping count checks. Writing file: {}".format(filename),
            "job": job_id,
            "f": "Download"
        })
    else:
        count = execute_sql_statement(count_sql, True, verbose)[0]["count"]
        printf({
            "msg": "Writing {} to this file: {}".format(count, filename),
            "job": job_id,
            "f": "Download"
        })
    # It is preferable to not use shell=True, but this command works. Limited user-input so risk is low
    subprocess.Popen('psql "${{DATABASE_URL}}" -c {}'.format(copy_sql),
                     shell=True).wait()

    if not fast:
        download_count = count_rows_in_csv_file(filename,
                                                has_header=True,
                                                safe=False)
        if count != download_count:
            msg = "Mismatch between CSV and DB rows! Expected: {} | Actual {} in: {}"
            printf({
                "msg": msg.format(count, download_count, filename),
                "job": job_id,
                "f": "Download"
            })
            raise SystemExit(1)
    return count
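
Every example on this page depends on a count_rows_in_csv_file helper that is not shown. Below is a minimal sketch of what such a helper might look like; the csv-module row counting and the behavior of the safe flag (return -1 instead of raising) are illustrative assumptions, not the project's actual implementation.

import csv


def count_rows_in_csv_file(filename, has_header=True, safe=True):
    """Hypothetical sketch: count the data rows in a CSV file."""
    try:
        with open(filename, newline='') as csv_file:
            row_count = sum(1 for _ in csv.reader(csv_file))
        if has_header and row_count > 0:
            row_count -= 1
        return row_count
    except Exception:
        if safe:
            return -1  # assumption: "safe" mode returns a sentinel instead of raising
        raise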
Example #2
def download_csv(count_sql, copy_sql, filename, job_id, verbose):
    count = execute_sql_statement(count_sql, True, verbose)[0]['count']
    printf({
        'msg': 'Writing {} transactions to this file: {}'.format(count, filename),
        'job': job_id,
        'f': 'Download'
    })
    # It is preferable to not use shell=True, but this command works. Limited user-input so risk is low
    subprocess.Popen('psql "${{DATABASE_URL}}" -c {}'.format(copy_sql),
                     shell=True).wait()

    download_count = count_rows_in_csv_file(filename,
                                            has_header=True,
                                            safe=False)
    if count != download_count:
        msg = 'Mismatch between CSV and DB rows! Expected: {} | Actual {} in: {}'
        printf({
            'msg': msg.format(count, download_count, filename),
            'job': job_id,
            'f': 'Download'
        })
        raise SystemExit(1)
    return count
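
The printf calls in these examples are not the C-style function; they take a dict with 'msg', 'job', and 'f' keys. A plausible minimal sketch, assuming it simply prefixes the message with a timestamp and those labels, is:

import time


def printf(fields):
    """Hypothetical sketch of the structured print helper used in these examples."""
    prefix = time.strftime('%Y-%m-%d %H:%M:%S')
    job = fields.get('job') or '-'
    func = fields.get('f') or '-'
    print('{} [job {}] [{}] {}'.format(prefix, job, func, fields.get('msg', '')))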
Example #3
    def create_local_file(self, award_type, source, agency_code, generate_since):
        """ Generate complete file from SQL query and S3 bucket deletion files, then zip it locally """
        logger.info('Generating CSV file with creations and modifications')

        # Create file paths and working directory
        timestamp = datetime.strftime(datetime.now(), '%Y%m%d%H%M%S%f')
        working_dir = '{}_{}_delta_gen_{}/'.format(settings.CSV_LOCAL_PATH,
                                                   agency_code, timestamp)
        if not os.path.exists(working_dir):
            os.mkdir(working_dir)
        source_name = '{}_{}_Delta_{}'.format(
            agency_code, award_type, datetime.strftime(date.today(), '%Y%m%d'))
        source_path = os.path.join(working_dir, '{}.csv'.format(source_name))

        # Create a unique temporary file with the raw query
        raw_quoted_query = generate_raw_quoted_query(
            source.row_emitter(None))  # None requests all headers
        csv_query_annotated = self.apply_annotations_to_sql(
            raw_quoted_query, source.human_names)
        (temp_sql_file,
         temp_sql_file_path) = tempfile.mkstemp(prefix='bd_sql_', dir='/tmp')
        with open(temp_sql_file_path, 'w') as file:
            file.write('\\copy ({}) To STDOUT with CSV HEADER'.format(
                csv_query_annotated))

        logger.info('Generated temp SQL file {}'.format(temp_sql_file_path))
        # Generate the csv with \copy
        cat_command = subprocess.Popen(['cat', temp_sql_file_path],
                                       stdout=subprocess.PIPE)
        try:
            subprocess.check_output([
                'psql', '-o', source_path, os.environ['DOWNLOAD_DATABASE_URL'],
                '-v', 'ON_ERROR_STOP=1'
            ],
                                    stdin=cat_command.stdout,
                                    stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError as e:
            logger.exception(e.output)
            raise e

        # Append deleted rows to the end of the file
        self.add_deletion_records(source_path, working_dir, award_type,
                                  agency_code, source, generate_since)
        if count_rows_in_csv_file(source_path, has_header=True, safe=True) > 0:
            # Split the CSV into multiple files and zip it up
            zipfile_path = '{}{}.zip'.format(settings.CSV_LOCAL_PATH,
                                             source_name)

            logger.info('Creating compressed file: {}'.format(
                os.path.basename(zipfile_path)))
            split_and_zip_csvs(zipfile_path, source_path, source_name)
        else:
            zipfile_path = None

        os.close(temp_sql_file)
        os.remove(temp_sql_file_path)
        shutil.rmtree(working_dir)

        return zipfile_path
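
The create_local_file and parse_source examples hand the finished CSV to split_and_zip_csvs, which is also not shown here. A rough sketch of the idea, assuming a fixed rows-per-chunk limit and ignoring the optional download_job argument, might be:

import csv
import os
import zipfile


def split_and_zip_csvs(zipfile_path, source_path, source_name, download_job=None, rows_per_file=500000):
    """Hypothetical sketch: split a large CSV into chunks and zip them into one archive."""
    with open(source_path, newline='') as source_file, \
            zipfile.ZipFile(zipfile_path, 'w', zipfile.ZIP_DEFLATED) as archive:
        reader = csv.reader(source_file)
        header = next(reader, None)
        chunk, part = [], 1
        for row in reader:
            chunk.append(row)
            if len(chunk) >= rows_per_file:
                _write_chunk(archive, source_name, part, header, chunk)
                chunk, part = [], part + 1
        if chunk or part == 1:
            _write_chunk(archive, source_name, part, header, chunk)


def _write_chunk(archive, source_name, part, header, rows):
    """Write one chunk to a temporary CSV and add it to the archive."""
    chunk_name = '{}_{}.csv'.format(source_name, part)
    with open(chunk_name, 'w', newline='') as out_file:
        writer = csv.writer(out_file)
        if header is not None:
            writer.writerow(header)
        writer.writerows(rows)
    archive.write(chunk_name, arcname=chunk_name)
    os.remove(chunk_name)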
Example #4
def parse_source(source, columns, download_job, working_dir, piid, assistance_id, zip_file_path, limit):
    """Write to csv and zip files using the source data"""
    d_map = {
        "d1": "contracts",
        "d2": "assistance",
        "treasury_account": "treasury_account",
        "federal_account": "federal_account",
    }
    if download_job and download_job.monthly_download:
        # Use existing detailed filename from parent file for monthly files
        # e.g. `019_Assistance_Delta_20180917_%s.csv`
        source_name = strip_file_extension(download_job.file_name)
    elif source.is_for_idv or source.is_for_contract:
        file_name_pattern = VALUE_MAPPINGS[source.source_type]["download_name"]
        source_name = file_name_pattern.format(piid=slugify_text_for_file_names(piid, "UNKNOWN", 50))
    elif source.is_for_assistance:
        file_name_pattern = VALUE_MAPPINGS[source.source_type]["download_name"]
        source_name = file_name_pattern.format(assistance_id=slugify_text_for_file_names(assistance_id, "UNKNOWN", 50))
    else:
        source_name = "{}_{}_{}".format(
            source.agency_code, d_map[source.file_type], VALUE_MAPPINGS[source.source_type]["download_name"]
        )
    source_query = source.row_emitter(columns)
    source.file_name = "{}.csv".format(source_name)
    source_path = os.path.join(working_dir, source.file_name)

    write_to_log(message="Preparing to download data as {}".format(source_name), download_job=download_job)

    # Generate the query file; values, limits, dates fixed
    temp_file, temp_file_path = generate_temp_query_file(source_query, limit, source, download_job, columns)

    start_time = time.perf_counter()
    try:
        # Create a separate process to run the PSQL command; wait
        psql_process = multiprocessing.Process(target=execute_psql, args=(temp_file_path, source_path, download_job))
        psql_process.start()
        wait_for_process(psql_process, start_time, download_job)

        # Log how many rows we have
        write_to_log(message="Counting rows in CSV", download_job=download_job)
        try:
            download_job.number_of_rows += count_rows_in_csv_file(filename=source_path, has_header=True)
        except Exception:
            write_to_log(message="Unable to obtain CSV line count", is_error=True, download_job=download_job)
        download_job.save()

        # Create a separate process to split the large csv into smaller csvs and write to zip; wait
        zip_process = multiprocessing.Process(
            target=split_and_zip_csvs, args=(zip_file_path, source_path, source_name, download_job)
        )
        zip_process.start()
        wait_for_process(zip_process, start_time, download_job)
        download_job.save()
    except Exception as e:
        raise e
    finally:
        # Remove temporary files
        os.close(temp_file)
        os.remove(temp_file_path)
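
parse_source runs the actual export in a child process via execute_psql, which this page does not show. A minimal sketch, modeled on the psql invocation in create_local_file above and assuming the same DOWNLOAD_DATABASE_URL environment variable (the download_job argument is accepted but unused here), could be:

import os
import subprocess


def execute_psql(temp_sql_file_path, source_path, download_job):
    """Hypothetical sketch: feed the generated SQL file to psql and write its output to source_path."""
    with open(temp_sql_file_path) as sql_file:
        subprocess.check_output(
            ['psql', '-o', source_path, os.environ['DOWNLOAD_DATABASE_URL'], '-v', 'ON_ERROR_STOP=1'],
            stdin=sql_file,
            stderr=subprocess.STDOUT)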
Example #5
def download_csv(count_sql, copy_sql, filename, job_id, verbose):
    count = execute_sql_statement(count_sql, True, verbose)[0]['count']
    printf({'msg': 'Writing {} transactions to this file: {}'.format(count, filename), 'job': job_id, 'f': 'Download'})
    # It is preferable to not use shell=True, but this command works. Limited user-input so risk is low
    subprocess.Popen('psql "${{DATABASE_URL}}" -c {}'.format(copy_sql), shell=True).wait()

    download_count = count_rows_in_csv_file(filename, has_header=True, safe=False)
    if count != download_count:
        msg = 'Mismatch between CSV and DB rows! Expected: {} | Actual {} in: {}'
        printf({
            'msg': msg.format(count, download_count, filename),
            'job': job_id,
            'f': 'Download'})
        raise SystemExit(1)
    return count
Example #6
    def create_local_file(self, award_type, source, agency_code, generate_since):
        """ Generate complete file from SQL query and S3 bucket deletion files, then zip it locally """
        logger.info('Generating CSV file with creations and modifications')

        # Create file paths and working directory
        timestamp = datetime.strftime(datetime.now(), '%Y%m%d%H%M%S%f')
        working_dir = '{}_{}_delta_gen_{}/'.format(settings.CSV_LOCAL_PATH, agency_code, timestamp)
        if not os.path.exists(working_dir):
            os.mkdir(working_dir)
        source_name = '{}_{}_Delta_{}'.format(agency_code, award_type, datetime.strftime(date.today(), '%Y%m%d'))
        source_path = os.path.join(working_dir, '{}.csv'.format(source_name))

        # Create a unique temporary file with the raw query
        raw_quoted_query = generate_raw_quoted_query(source.row_emitter(None))  # None requests all headers
        csv_query_annotated = self.apply_annotations_to_sql(raw_quoted_query, source.human_names)
        (temp_sql_file, temp_sql_file_path) = tempfile.mkstemp(prefix='bd_sql_', dir='/tmp')
        with open(temp_sql_file_path, 'w') as file:
            file.write('\\copy ({}) To STDOUT with CSV HEADER'.format(csv_query_annotated))

        # Generate the csv with \copy
        cat_command = subprocess.Popen(['cat', temp_sql_file_path], stdout=subprocess.PIPE)
        subprocess.check_output(['psql', '-o', source_path, os.environ['DOWNLOAD_DATABASE_URL'], '-v',
                                 'ON_ERROR_STOP=1'], stdin=cat_command.stdout, stderr=subprocess.STDOUT)

        # Append deleted rows to the end of the file
        self.add_deletion_records(source_path, working_dir, award_type, agency_code, source, generate_since)
        if count_rows_in_csv_file(source_path, has_header=True, safe=True) > 0:
            # Split the CSV into multiple files and zip it up
            zipfile_path = '{}{}.zip'.format(settings.CSV_LOCAL_PATH, source_name)

            logger.info('Creating compressed file: {}'.format(os.path.basename(zipfile_path)))
            split_and_zip_csvs(zipfile_path, source_path, source_name)
        else:
            zipfile_path = None

        os.close(temp_sql_file)
        os.remove(temp_sql_file_path)
        shutil.rmtree(working_dir)

        return zipfile_path
Example #7
    def controller(self):

        download_queue = Queue()  # Queue for jobs which need a csv downloaded
        es_ingest_queue = Queue(20)  # Queue for jobs which have a csv and are ready for ES ingest

        job_id = 0
        for fy in self.config['fiscal_years']:
            for awd_cat_idx in AWARD_DESC_CATEGORIES.keys():
                job_id += 1
                index = self.config['index_name']
                filename = '{dir}{fy}_transactions_{type}.csv'.format(
                    dir=self.config['directory'],
                    fy=fy,
                    type=awd_cat_idx.replace(' ', ''))

                new_job = DataJob(job_id, index, fy, awd_cat_idx, filename)

                if os.path.exists(filename):
                    # This is mostly for testing. If previous CSVs still exist skip the download for that file
                    if self.config['stale']:
                        new_job.count = count_rows_in_csv_file(filename,
                                                               has_header=True,
                                                               safe=False)
                        printf({
                            'msg': 'Using existing file: {} | count {}'.format(filename, new_job.count),
                            'job': new_job.name,
                            'f': 'Download'
                        })
                        # Add job directly to the Elasticsearch ingest queue since the CSV exists
                        es_ingest_queue.put(new_job)
                        continue
                    else:
                        os.remove(filename)
                download_queue.put(new_job)

        printf({'msg': 'There are {} jobs to process'.format(job_id)})

        process_list = []
        process_list.append(
            Process(name='Download Process',
                    target=download_db_records,
                    args=(download_queue, es_ingest_queue, self.config)))
        process_list.append(
            Process(name='ES Index Process',
                    target=es_data_loader,
                    args=(ES, download_queue, es_ingest_queue, self.config)))

        process_list[0].start()  # Start Download process

        if self.config['provide_deleted']:
            process_list.append(
                Process(name='S3 Deleted Records Scraper Process',
                        target=deleted_transactions,
                        args=(ES, self.config)))
            process_list[-1].start()  # start S3 csv fetch process
            while process_list[-1].is_alive():
                printf({'msg': 'Waiting to start ES ingest until S3 deletes are complete'})
                sleep(7)

        process_list[1].start()  # start ES ingest process

        while True:
            sleep(10)
            if process_guarddog(process_list):
                raise SystemExit(1)
            elif all([not x.is_alive() for x in process_list]):
                printf({'msg': 'All ETL processes completed execution with no error codes'})
                break

        if self.config['swap']:
            printf({'msg': 'Closing old indices and adding aliases'})
            swap_aliases(ES, self.config['index_name'])

        if self.config['snapshot']:
            printf({'msg': 'Taking snapshot'})
            take_snapshot(ES, self.config['index_name'],
                          settings.ES_REPOSITORY)
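
The controller's watchdog loop calls process_guarddog, which is not defined on this page. A minimal sketch, assuming it only reports whether any worker process has died with a non-zero exit code, is:

def process_guarddog(process_list):
    """Hypothetical sketch: return True if any ETL process exited with a non-zero code."""
    for proc in process_list:
        if not proc.is_alive() and proc.exitcode not in (None, 0):
            return True
    return False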
Example #8
    def controller(self):

        download_queue = Queue()  # Queue for jobs which need a csv downloaded
        es_ingest_queue = Queue(20)  # Queue for jobs which have a csv and are ready for ES ingest

        job_id = 0
        for fy in self.config['fiscal_years']:
            for awd_cat_idx in AWARD_DESC_CATEGORIES.keys():
                job_id += 1
                index = self.config['index_name']
                filename = '{dir}{fy}_transactions_{type}.csv'.format(
                    dir=self.config['directory'],
                    fy=fy,
                    type=awd_cat_idx.replace(' ', ''))

                new_job = DataJob(job_id, index, fy, awd_cat_idx, filename)

                if os.path.exists(filename):
                    # This is mostly for testing. If previous CSVs still exist skip the download for that file
                    if self.config['stale']:
                        new_job.count = count_rows_in_csv_file(filename, has_header=True, safe=False)
                        printf({
                            'msg': 'Using existing file: {} | count {}'.format(filename, new_job.count),
                            'job': new_job.name,
                            'f': 'Download'})
                        # Add job directly to the Elasticsearch ingest queue since the CSV exists
                        es_ingest_queue.put(new_job)
                        continue
                    else:
                        os.remove(filename)
                download_queue.put(new_job)

        printf({'msg': 'There are {} jobs to process'.format(job_id)})

        process_list = []
        process_list.append(Process(
            name='Download Process',
            target=download_db_records,
            args=(download_queue, es_ingest_queue, self.config)))
        process_list.append(Process(
            name='ES Index Process',
            target=es_data_loader,
            args=(ES, download_queue, es_ingest_queue, self.config)))

        process_list[0].start()  # Start Download process

        if self.config['provide_deleted']:
            process_list.append(Process(
                name='S3 Deleted Records Scraper Process',
                target=deleted_transactions,
                args=(ES, self.config)))
            process_list[-1].start()  # start S3 csv fetch process
            while process_list[-1].is_alive():
                printf({'msg': 'Waiting to start ES ingest until S3 deletes are complete'})
                sleep(7)

        process_list[1].start()  # start ES ingest process

        while True:
            sleep(10)
            if process_guarddog(process_list):
                raise SystemExit(1)
            elif all([not x.is_alive() for x in process_list]):
                printf({'msg': 'All ETL processes completed execution with no error codes'})
                break

        if self.config['swap']:
            printf({'msg': 'Closing old indices and adding aliases'})
            swap_aliases(ES, self.config['index_name'])

        if self.config['snapshot']:
            printf({'msg': 'Taking snapshot'})
            take_snapshot(ES, self.config['index_name'], settings.ES_REPOSITORY)
Example #9
def parse_source(source, columns, download_job, working_dir, start_time, message, zipfile_path, limit):
    """Write to csv and zip files using the source data"""
    d_map = {
        'd1': 'contracts',
        'd2': 'assistance',
        'treasury_account': 'treasury_account',
        'federal_account': 'federal_account'
    }
    if download_job and download_job.monthly_download:
        # Use existing detailed filename from parent file for monthly files
        # e.g. `019_Assistance_Delta_20180917_%s.csv`
        source_name = strip_file_extension(download_job.file_name)
    else:
        source_name = '{}_{}_{}'.format(
            source.agency_code, d_map[source.file_type],
            VALUE_MAPPINGS[source.source_type]['download_name'])
    source_query = source.row_emitter(columns)
    source_path = os.path.join(working_dir, '{}.csv'.format(source_name))

    # Generate the query file; values, limits, dates fixed
    temp_file, temp_file_path = generate_temp_query_file(
        source_query, limit, source, download_job, columns)

    start_time = time.perf_counter()
    try:
        # Create a separate process to run the PSQL command; wait
        psql_process = multiprocessing.Process(
            target=execute_psql, args=(temp_file_path, source_path, download_job))
        psql_process.start()
        wait_for_process(psql_process, start_time, download_job, message)

        # The step below modifies the download job and thus cannot run in a separate process.
        # Extend the queue message's visibility so it does not time out while the rows are
        # counted, assuming the count takes less than DOWNLOAD_VISIBILITY_TIMEOUT.
        if message:
            message.change_visibility(VisibilityTimeout=DOWNLOAD_VISIBILITY_TIMEOUT)

        # Log how many rows we have
        try:
            download_job.number_of_rows += count_rows_in_csv_file(
                filename=source_path, has_header=True)
        except Exception:
            write_to_log(message="Unable to obtain CSV line count",
                         is_error=True,
                         download_job=download_job)
        download_job.save()

        # Create a separate process to split the large csv into smaller csvs and write to zip; wait
        zip_process = multiprocessing.Process(target=split_and_zip_csvs,
                                              args=(zipfile_path, source_path,
                                                    source_name, download_job))
        zip_process.start()
        wait_for_process(zip_process, start_time, download_job, message)
        download_job.save()
    except Exception as e:
        raise e
    finally:
        # Remove temporary files
        os.close(temp_file)
        os.remove(temp_file_path)
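
Both parse_source variants lean on wait_for_process to supervise the child processes. A hedged sketch, assuming a simple poll-until-exit loop that raises on a non-zero exit code (the real helper presumably also refreshes the SQS message visibility while waiting, as the change_visibility call above suggests), is:

import time


def wait_for_process(process, start_time, download_job, message=None):
    """Hypothetical sketch: block until a worker process exits and surface non-zero exit codes."""
    while process.is_alive():
        time.sleep(5)
    if process.exitcode != 0:
        raise Exception('{} failed after {:.2f}s with exit code {}'.format(
            process.name, time.perf_counter() - start_time, process.exitcode))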