def test_read_csv_data_split_object():
    """Splitting out the target column must be consistent with an unsplit read (object dtype)."""
    features, target = read_csv_data(LARGE_DATA_4MB, target_column_index=0, output_dtype="O")
    combined = read_csv_data(LARGE_DATA_4MB, output_dtype="O")

    assert features.shape == (38223, 20)
    assert target.shape == (38223,)
    # Re-attaching the target as the first column must reproduce the unsplit array.
    rejoined = np.hstack((target.reshape(-1, 1), features))
    assert np.array_equal(rejoined, combined)
    assert features.dtype.kind == "O"
    assert target.dtype.kind == "O"
# Example #2
def test_excel_dialect(csv_data_dir):
    """Test that read_csv_data function properly reads files in the excel dialect."""
    generated_contents = read_csv_data(source=csv_data_dir + "/file_1.csv")

    assert generated_contents.shape == (len(csv1), len(csv1[0]))
    # np.str was a deprecated alias for the builtin str and was removed in
    # NumPy 1.24; use the builtin directly so the test runs on modern NumPy.
    assert np.all(generated_contents == np.array(
        [[str(v) for v in row] for row in csv1], dtype=str))
# Example #3
def test_directory_content(csv_data_dir):
    """Test that read_csv_data function reads content correctly from a directory"""
    generated_contents = read_csv_data(source=csv_data_dir)
    correct_array = csv1 + csv2
    assert generated_contents.shape == (len(correct_array),
                                        len(correct_array[0]))
    # np.str was a deprecated alias for the builtin str and was removed in
    # NumPy 1.24; use the builtin directly so the test runs on modern NumPy.
    assert np.all(generated_contents == np.array(
        [[str(v) for v in row] for row in correct_array], dtype=str))
# Example #4
def test_read_csv_data(data_file, shape):
    """Reading a single csv file yields an array of the expected shape and string dtype."""
    contents = read_csv_data(
        source=data_file,
        batch_size=1,
        fit_memory_percent=100.0,
        output_dtype="U",
    )
    assert contents.shape == shape
    # Either unicode ("U") or byte-string ("S") dtype is acceptable here.
    assert contents.dtype.kind in {"U", "S"}
def test_read_csv_data_inmemory_mode():
    """Test to make sure 'InMemory' mode reads in content correctly"""
    generated_contents = read_csv_data(source=BUFFER_DATA.encode())
    # Expected grid: 8 rows of four consecutive integers starting at 1.
    correct_array = [[i * 4 + j for j in range(1, 5)] for i in range(8)]
    assert generated_contents.shape == (len(correct_array), len(correct_array[0]))
    # np.str was a deprecated alias for the builtin str and was removed in
    # NumPy 1.24; use the builtin directly so the test runs on modern NumPy.
    assert np.all(generated_contents == np.array(
        [[str(v) for v in row] for row in correct_array], dtype=str))
def test_read_csv_data_split_limited():
    """A split read capped at ~2MB must stay within that budget (unicode dtype)."""
    two_mb = _convert_megabytes_to_bytes(2)
    # Express the 2MB cap as a percentage of total system memory.
    memory_fraction = two_mb / psutil.virtual_memory().total
    X, y = read_csv_data(
        LARGE_DATA_4MB,
        target_column_index=0,
        fit_memory_percent=memory_fraction * 100,
        output_dtype="U",
    )
    bytes_used = X.nbytes + y.nbytes
    assert _convert_megabytes_to_bytes(1.9) < bytes_used <= two_mb
    assert X.dtype.kind == "U"
    assert y.dtype.kind == "U"
def test_read_csv_data_samples():
    """Test for sample case where the entire dataset doesn't fit into the available memory"""
    two_mb = _convert_megabytes_to_bytes(2)
    # Express the 2MB cap as a percentage of total system memory.
    memory_fraction = two_mb / psutil.virtual_memory().total
    sampled = read_csv_data(
        source=LARGE_DATA_4MB,
        fit_memory_percent=memory_fraction * 100,
        output_dtype="U",
    )
    assert sampled.dtype.kind == "U"
    # The sample should nearly fill, but never exceed, the 2MB budget.
    assert _convert_megabytes_to_bytes(1.9) < sampled.nbytes <= two_mb
def test_read_csv_data_split_limited_object():
    """A split read capped at ~2MB must stay within that budget (object dtype)."""
    two_mb = _convert_megabytes_to_bytes(2)
    # Express the 2MB cap as a percentage of total system memory.
    memory_fraction = two_mb / psutil.virtual_memory().total
    X, y = read_csv_data(
        LARGE_DATA_4MB,
        target_column_index=0,
        fit_memory_percent=memory_fraction * 100,
        output_dtype="O",
    )
    # Object arrays hold references, so measure deep size rather than nbytes.
    bytes_used = _get_size_total(X) + _get_size_total(y)
    assert _convert_megabytes_to_bytes(1.9) < bytes_used <= two_mb
    assert X.dtype.kind == "O"
    assert y.dtype.kind == "O"
# Example #9
def test_read_csv_data_sample_append():
    """Chunked reading of the 4MB fixture must assemble the full array."""
    contents = read_csv_data(source=LARGE_DATA_4MB, fit_memory_percent=100.0)
    assert contents.shape == (38223, 21)
# Example #10
def test_read_csv_data_directory():
    """Reading a whole directory of csv files yields the combined array."""
    contents = read_csv_data(
        source="test/data/csv/mock_datasplitter_output",
        fit_memory_percent=100.0,
    )
    assert contents.shape == (22, 4)
# Example #11
def test_read_csv_data_invalid_csv():
    """A malformed csv file must raise InvalidInstanceError."""
    with pytest.raises(InvalidInstanceError):
        read_csv_data(source="test/data/csv/invalid.csv")
# Example #12
def test_read_empty_buffer():
    """An empty byte buffer must produce an empty array."""
    result = read_csv_data(source=b"")
    assert result.size == 0
# Example #13
    # load feature processor from processor module
    feature_transformer = processor.build_feature_transform()

    # customize global feature transform step
    feature_transformer = update_feature_transformer(header,
                                                     feature_transformer)

    # load label processor from processor module
    # absence of label processor implies that the labels are not processed
    try:
        label_transformer = processor.build_label_transform()
    except AttributeError:
        label_transformer = None

    X, y = read_csv_data(source=args.data_dir,
                         target_column_index=header.target_column_index,
                         output_dtype='O')
    logging.info('Feature data shape: {}'.format(X.shape))

    model = train(X,
                  y,
                  header=header,
                  feature_transformer=feature_transformer,
                  label_transformer=label_transformer)

    # serialize the model to model_dir
    dump(model, filename=os.path.join(args.model_dir, 'model.joblib'))

    # serialize the inference code to the model/code.
    serialize_code(os.path.join(args.model_dir, 'code'), processor.__file__)