Example #1
def test_create_emr_args(input_date, dev, cores, pipeline_yaml):
    print "just starting"
    load_package_config('config.yaml')
    YamlConfiguration(pipeline_yaml)

    input_prefix = read_list('pipeline.et_step.s3_prefixes')[0]
    input_file = input_prefix + input_date + '/part-*.gz'

    expected_args = EXPECTED_DEV_ARGS if dev else EXPECTED_AWS_ARGS
    expected_out_file = read_string('pipeline.s3_output_prefix')
    delimiter = read_string('redshift_column_delimiter')
    with mock.patch.dict(os.environ, {'LOGNAME': 'testuser', 'YELPCODE': '.'}):
        logname = os.environ['LOGNAME']
        expected_out_file = os.path.join(
            expected_out_file.format(logname=logname),
            input_date
        )
        extractions = pipeline_yaml_schema_file_path()
        formatted_args = expected_args.format(input_file,
                                              expected_out_file,
                                              cores,
                                              extractions,
                                              delimiter)
        output_under_test = create_emr_args(input_date, 10,
                                            input_prefix, dev)
        assert output_under_test == formatted_args
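The pattern worth noting in this test is mock.patch.dict(os.environ, ...): it patches the environment only inside the with block and restores it on exit, so the {logname} placeholder in the configured output prefix expands deterministically. A minimal, self-contained sketch of that pattern (the build_output_path helper and the template string are invented for illustration):

import os
from unittest import mock

def build_output_path(template, input_date):
    # Expand the {logname} placeholder from the (patched) environment,
    # then append the date partition, mirroring the test above.
    return os.path.join(template.format(logname=os.environ['LOGNAME']), input_date)

with mock.patch.dict(os.environ, {'LOGNAME': 'testuser'}):
    path = build_output_path('s3://bucket/{logname}/output', '2014-01-01')

assert path == 's3://bucket/testuser/output/2014-01-01'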
Example #2
    elif args.date is not None:
        raise IOError("{0} data is either already loaded \
or has not yet completed ET step".format(args.date))

    logs_to_copy = []
    for input_date in data_candidates:
        LOG_STREAM = PipelineStreamLogger(
            stream_name,
            args.run_local,
            's3_to_redshift',
            job_name='load',
            input_date=input_date
        )
        logs_to_copy = [
            (join(s3_log_prefix, input_date, table), table)
            for (table, _) in create_tuples
        ]
        copy_tables(loader_psql, status_table, db, input_date, logs_to_copy,
                    args.ttl_days, LOG_STREAM)

if __name__ == '__main__':

    args_namespace = parse_command_line(sys.argv)

    load_package_config(args_namespace.config)
    YamlConfiguration(args_namespace.io_yaml, optional=False)
    if args_namespace.config_override:
        YamlConfiguration(args_namespace.config_override, optional=False)

    s3_to_redshift_main(args_namespace)
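The __main__ block above shows the layering convention used throughout these examples: load_package_config establishes the base configuration, a required I/O YAML is overlaid on top, and an optional override file is applied last. Assuming staticconf-style merge semantics, where later loads overwrite matching keys, the load order matters. A sketch of the same layering with hypothetical file names:

load_package_config('config.yaml')                        # base package config
YamlConfiguration('io_pipeline.yaml', optional=False)     # required I/O settings
YamlConfiguration('local_overrides.yaml', optional=True)  # optional, wins last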
Example #3
def merge_configs(fname_list):
    # The first file is the required base config; the remaining files are
    # overlaid as optional YAML configurations. Iterate over a slice so the
    # caller's list is not mutated.
    config = load_package_config(fname_list[0])
    for fname in fname_list[1:]:
        YamlConfiguration(fname, optional=True)
    return config
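A usage sketch for merge_configs (the file names are invented): the first entry must exist and becomes the returned base config, while any of the remaining overlay files may be absent, because they are loaded with optional=True.

config = merge_configs([
    'config.yaml',            # required base config
    'config-env-dev.yaml',    # optional environment overlay
    'config-local.yaml',      # optional local overrides
])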