def test_read_csv_data_split_object():
    """Splitting out the target column must be consistent with an unsplit read (object dtype)."""
    features, target = read_csv_data(LARGE_DATA_4MB, target_column_index=0, output_dtype="O")
    combined = read_csv_data(LARGE_DATA_4MB, output_dtype="O")

    assert features.shape == (38223, 20)
    assert target.shape == (38223,)
    # Re-attaching the target as the first column must reproduce the unsplit array.
    rejoined = np.hstack((target.reshape(-1, 1), features))
    assert np.array_equal(rejoined, combined)
    assert features.dtype.kind == "O"
    assert target.dtype.kind == "O"
# Example #2
def test_excel_dialect(csv_data_dir):
    """Test that read_csv_data function properly reads files in the excel dialect."""
    generated_contents = read_csv_data(source=csv_data_dir + "/file_1.csv")

    assert generated_contents.shape == (len(csv1), len(csv1[0]))
    # np.str was a deprecated alias for the builtin str and was removed in
    # NumPy 1.24; use the builtin directly so the test runs on modern NumPy.
    assert np.all(generated_contents == np.array(
        [[str(v) for v in row] for row in csv1], dtype=str))
# Example #3
def test_directory_content(csv_data_dir):
    """Test that read_csv_data function reads content correctly from a directory"""
    generated_contents = read_csv_data(source=csv_data_dir)
    correct_array = csv1 + csv2
    assert generated_contents.shape == (len(correct_array),
                                        len(correct_array[0]))
    # np.str was a deprecated alias for the builtin str and was removed in
    # NumPy 1.24; use the builtin directly so the test runs on modern NumPy.
    assert np.all(generated_contents == np.array(
        [[str(v) for v in row] for row in correct_array], dtype=str))
# Example #4
def test_read_csv_data(data_file, shape):
    """Reading a single csv file yields an array of the expected shape and string dtype."""
    contents = read_csv_data(
        source=data_file,
        batch_size=1,
        fit_memory_percent=100.0,
        output_dtype="U",
    )
    assert contents.shape == shape
    # Either unicode ("U") or byte-string ("S") dtype is acceptable here.
    assert contents.dtype.kind in {"U", "S"}
def test_read_csv_data_inmemory_mode():
    """Test to make sure 'InMemory' mode reads in content correctly"""
    generated_contents = read_csv_data(source=BUFFER_DATA.encode())
    # Expected grid: 8 rows of four consecutive integers starting at 1.
    correct_array = [[i * 4 + j for j in range(1, 5)] for i in range(8)]
    assert generated_contents.shape == (len(correct_array), len(correct_array[0]))
    # np.str was a deprecated alias for the builtin str and was removed in
    # NumPy 1.24; use the builtin directly so the test runs on modern NumPy.
    assert np.all(generated_contents == np.array(
        [[str(v) for v in row] for row in correct_array], dtype=str))
def test_read_csv_data_split_limited():
    """A split read capped at ~2MB must stay within that budget (unicode dtype)."""
    two_mb = _convert_megabytes_to_bytes(2)
    # Express the 2MB cap as a percentage of total system memory.
    memory_fraction = two_mb / psutil.virtual_memory().total
    X, y = read_csv_data(
        LARGE_DATA_4MB,
        target_column_index=0,
        fit_memory_percent=memory_fraction * 100,
        output_dtype="U",
    )
    bytes_used = X.nbytes + y.nbytes
    assert _convert_megabytes_to_bytes(1.9) < bytes_used <= two_mb
    assert X.dtype.kind == "U"
    assert y.dtype.kind == "U"
def test_read_csv_data_samples():
    """Test for sample case where the entire dataset doesn't fit into the available memory"""
    two_mb = _convert_megabytes_to_bytes(2)
    # Express the 2MB cap as a percentage of total system memory.
    memory_fraction = two_mb / psutil.virtual_memory().total
    sampled = read_csv_data(
        source=LARGE_DATA_4MB,
        fit_memory_percent=memory_fraction * 100,
        output_dtype="U",
    )
    assert sampled.dtype.kind == "U"
    # The sample should nearly fill, but never exceed, the 2MB budget.
    assert _convert_megabytes_to_bytes(1.9) < sampled.nbytes <= two_mb
def test_read_csv_data_split_limited_object():
    """A split read capped at ~2MB must stay within that budget (object dtype)."""
    two_mb = _convert_megabytes_to_bytes(2)
    # Express the 2MB cap as a percentage of total system memory.
    memory_fraction = two_mb / psutil.virtual_memory().total
    X, y = read_csv_data(
        LARGE_DATA_4MB,
        target_column_index=0,
        fit_memory_percent=memory_fraction * 100,
        output_dtype="O",
    )
    # Object arrays hold references, so measure deep size rather than nbytes.
    bytes_used = _get_size_total(X) + _get_size_total(y)
    assert _convert_megabytes_to_bytes(1.9) < bytes_used <= two_mb
    assert X.dtype.kind == "O"
    assert y.dtype.kind == "O"
# Example #9
def test_read_csv_data_sample_append():
    """Chunked reading of the 4MB fixture must assemble the full array."""
    contents = read_csv_data(source=LARGE_DATA_4MB, fit_memory_percent=100.0)
    assert contents.shape == (38223, 21)
# Example #10
def test_read_csv_data_directory():
    """Reading a whole directory of csv files yields the combined array."""
    contents = read_csv_data(
        source="test/data/csv/mock_datasplitter_output",
        fit_memory_percent=100.0,
    )
    assert contents.shape == (22, 4)
# Example #11
def test_read_csv_data_invalid_csv():
    """A malformed csv file must raise InvalidInstanceError."""
    with pytest.raises(InvalidInstanceError):
        read_csv_data(source="test/data/csv/invalid.csv")
# Example #12
def test_read_empty_buffer():
    """An empty byte buffer must produce an empty array."""
    result = read_csv_data(source=b"")
    assert result.size == 0
# Example #13
    # load feature processor from processor module
    feature_transformer = processor.build_feature_transform()

    # customize global feature transform step
    feature_transformer = update_feature_transformer(header,
                                                     feature_transformer)

    # load label processor from processor module
    # absence of label processor implies that the labels are not processed
    try:
        label_transformer = processor.build_label_transform()
    except AttributeError:
        label_transformer = None

    X, y = read_csv_data(source=args.data_dir,
                         target_column_index=header.target_column_index,
                         output_dtype='O')
    logging.info('Feature data shape: {}'.format(X.shape))

    model = train(X,
                  y,
                  header=header,
                  feature_transformer=feature_transformer,
                  label_transformer=label_transformer)

    # serialize the model to model_dir
    dump(model, filename=os.path.join(args.model_dir, 'model.joblib'))

    # serialize the inference code to the model/code.
    serialize_code(os.path.join(args.model_dir, 'code'), processor.__file__)