Example #1
0
def test_distance_func_with_kwarg_works(tmpdir, sample_config):
    """Check that extra keyword arguments are used by the distance function.

    A random 5x2 snippet-by-symbol array is written to the 'snippets' zarr
    store. Distances to dye snippet '2' are then computed with the 'cosine'
    distance function and a ``w`` keyword argument, and every stored value is
    compared against a direct ``cosine`` call using the same weights.
    """
    sample_config['DYESCORE_DATA_DIR'] = tmpdir.strpath
    ds = DyeScore(write_config_file(tmpdir, sample_config))

    random_array = np.random.rand(5, 2)
    snippet_ids = ['0', '1', '2', '3', '4']  # 0 index for sanity :D
    data = xr.DataArray(random_array,
                        coords={
                            'snippet': snippet_ids,
                            'symbol': ['window.navigator', 'canvas.context'],
                        },
                        dims=('snippet', 'symbol'))
    f = ds.dye_score_data_file('snippets')
    data.to_dataset(name='data').to_zarr(store=ds.get_zarr_store(f))

    # Run Test
    dye_snippets = ['2']
    weights = [0.4, 0.6]
    result_file = ds.compute_distances_for_dye_snippets(
        dye_snippets, distance_function='cosine', override=True, w=weights)

    # Check Results
    results = xr.open_zarr(store=ds.get_zarr_store(result_file))['data']
    assert results.shape == (5, 1)
    for s in snippet_ids:
        actual_result = results.sel(snippet=s, dye_snippet='2').values
        expected_result = cosine(random_array[2],
                                 random_array[int(s)],
                                 w=weights)
        # Compare with a tolerance: exact equality on floats is brittle when
        # the two values may be produced through different code paths.
        assert np.isclose(actual_result, expected_result)
Example #2
0
def test_data_validation_with_invalid_file(tmpdir, sample_config):
    """Input validation raises ArrowIOError when pointed at a CSV file."""
    # Arrange: write a small CSV and register it as the parquet input location
    csv_path = os.path.join(tmpdir, 'data.csv')
    pd.DataFrame({'a': [1, 2, 3]}).to_csv(csv_path)
    sample_config['INPUT_PARQUET_LOCATION'] = csv_path
    dye_score = DyeScore(write_config_file(tmpdir, sample_config))

    # Act / assert
    with pytest.raises(ArrowIOError):
        dye_score.validate_input_data()
Example #3
0
def test_data_validation_with_valid_file(tmpdir, sample_config):
    """Input validation returns True for a parquet file with these columns."""
    # Arrange: build a two-row frame and store it as parquet
    parquet_path = os.path.join(tmpdir, 'data.parquet')
    columns = {
        'top_level_url': ['a', 'b'],
        'document_url': ['a', 'b'],
        'script_url': ['c', 'd'],
        'symbol': ['e', 'f'],
        'func_name': ['g', 'h'],
    }
    frame = pd.DataFrame(columns)
    daskify(frame).to_parquet(parquet_path)

    # Point the config at the parquet file and build the object under test
    sample_config['INPUT_PARQUET_LOCATION'] = parquet_path
    dye_score = DyeScore(write_config_file(tmpdir, sample_config))

    # Act / assert
    assert dye_score.validate_input_data() is True
Example #4
0
def test_passing_unavailable_string_fails(tmpdir, sample_config):
    """Requesting the 'euclidean' distance function raises KeyError."""
    # Arrange: config rooted in the temporary directory
    sample_config['DYESCORE_DATA_DIR'] = tmpdir.strpath
    config_path = write_config_file(tmpdir, sample_config)
    dye_score = DyeScore(config_path)

    # Store a random 5x2 snippet-by-symbol array in the 'snippets' zarr store
    values = np.random.rand(5, 2)
    ids = [str(i) for i in range(5)]
    symbols = ['window.navigator', 'canvas.context']
    snippets = xr.DataArray(
        values,
        coords={'snippet': ids, 'symbol': symbols},
        dims=('snippet', 'symbol'),
    )
    snippets_file = dye_score.dye_score_data_file('snippets')
    zarr_store = dye_score.get_zarr_store(snippets_file)
    snippets.to_dataset(name='data').to_zarr(store=zarr_store)

    # Act / assert
    call_options = dict(override=True, distance_function='euclidean')
    with pytest.raises(KeyError):
        dye_score.compute_distances_for_dye_snippets(['2'], **call_options)