Beispiel #1
0
 def find_neighbors_df(
     self,
     df: pd.DataFrame,
     unique_id_column: AnyStr,
     feature_columns: List[AnyStr],
     index_array_ids: np.array,
     num_neighbors: int = 5,
     **kwargs,
 ) -> pd.DataFrame:
     """Search the neighbors of every row of a DataFrame and return them in long format.

     Args:
         df: Input data holding the id column and the feature column(s).
         unique_id_column: Column of `df` copied to the output as the input id.
         feature_columns: Columns handed to `DataLoader` to build the query arrays.
         index_array_ids: Lookup used to translate the neighbor positions returned by
             the search into the ids of the indexed arrays.
         num_neighbors: Number of neighbors forwarded to `find_neighbors_array`.
         **kwargs: Accepted and ignored.

     Returns:
         A DataFrame with one row per (input, neighbor) pair and the columns
         `INPUT_COLUMN_NAME`, `NEIGHBOR_COLUMN_NAME` and `DISTANCE_COLUMN_NAME`.

     Raises:
         ValueError: If the arrays built from `df` do not have `self.num_dimensions`
             columns.
     """
     pairs_column = "index_distance_pairs"

     result_df = pd.DataFrame()
     result_df[self.INPUT_COLUMN_NAME] = df[unique_id_column]

     loader = DataLoader(unique_id_column, feature_columns)
     _, query_arrays = loader.convert_df_to_arrays(df, verbose=False)
     query_dimensions = query_arrays.shape[1]
     if query_dimensions != self.num_dimensions:
         raise ValueError(
             f"Incompatible number of dimensions: {self.num_dimensions} in index, "
             f"{query_dimensions} in feature column(s)"
         )

     # One list of (position, distance) pairs per input row, then one row per pair
     result_df[pairs_column] = self.find_neighbors_array(query_arrays, num_neighbors)
     result_df = result_df.explode(pairs_column)

     # Popping the helper column first leaves the output ordered: input, neighbor, distance
     pairs = result_df.pop(pairs_column)
     neighbor_positions = pairs.apply(lambda pair: int(pair[0]))
     distances = pairs.apply(lambda pair: float(pair[1]))
     # Translate positions in the index back to the original array ids
     neighbor_ids = neighbor_positions.astype(int).apply(
         lambda position: index_array_ids[position])

     result_df[self.NEIGHBOR_COLUMN_NAME] = neighbor_ids
     result_df[self.DISTANCE_COLUMN_NAME] = distances
     return result_df
def test_find_neighbors_df():
    """Build an annoy index, reload it from disk and check the neighbors of one image."""
    indexing_params = {
        'unique_id_column': 'images',
        'feature_columns': ['prediction'],
        'algorithm': 'annoy',
        'expert': True,
        'annoy_metric': 'angular',
        'annoy_num_trees': 10
    }
    # Configuration used to re-create the search object before loading the saved index
    index_config = {
        'algorithm': 'annoy',
        'num_dimensions': 2048,
        'annoy_metric': 'angular',
        'annoy_num_trees': 10,
        'feature_columns': ['prediction'],
        'expert': True
    }
    search_params = {
        'unique_id_column': 'images',
        'feature_columns': ['prediction'],
        'num_neighbors': 5
    }

    # Load data into array format for indexing
    selected_columns = [
        indexing_params["unique_id_column"], *indexing_params["feature_columns"]
    ]
    input_df = pd.read_csv(
        './tests/resources/caltech_embeddings.csv')[selected_columns]
    loader = DataLoader(indexing_params["unique_id_column"],
                        indexing_params["feature_columns"])
    array_ids, arrays = loader.convert_df_to_arrays(input_df)

    searcher = NearestNeighborSearch(num_dimensions=arrays.shape[1],
                                     **indexing_params)
    with NamedTemporaryFile() as tmp:
        searcher.build_save_index(arrays=arrays, index_path=tmp.name)
        # Start again from a fresh object that only knows the index file and its config
        searcher = NearestNeighborSearch(**index_config)
        searcher.load_index(tmp.name)
        # Find nearest neighbors in input dataset
        neighbors_df = searcher.find_neighbors_df(input_df,
                                                  **search_params,
                                                  index_array_ids=array_ids)

    ostrich_rows = neighbors_df[neighbors_df['input_id'] == '34719_ostrich.jpg']
    actual = sorted(ostrich_rows['neighbor_id'])
    expected = [
        '107505_ostrich.jpg', '185189_ostrich.jpg', '213657_ostrich.jpg',
        '229350_ostrich.jpg', '34719_ostrich.jpg'
    ]
    assert actual == expected
def test_build_save_index():
    """Check that building an index writes a non-empty index file at the given path."""
    params = {
        'unique_id_column': 'images',
        'feature_columns': ['prediction'],
        'algorithm': 'annoy',
        'expert': True,
        'annoy_metric': 'angular',
        'annoy_num_trees': 10
    }

    # Load data into array format for indexing
    columns = [params["unique_id_column"]] + params["feature_columns"]
    input_df = pd.read_csv('./tests/resources/caltech_embeddings.csv')
    # Restrict to selected columns
    input_df = input_df[columns]
    data_loader = DataLoader(params["unique_id_column"],
                             params["feature_columns"])
    # Only the arrays are needed to build the index; the ids are not used here
    (_, arrays) = data_loader.convert_df_to_arrays(input_df)
    nearest_neighbor = NearestNeighborSearch(num_dimensions=arrays.shape[1],
                                             **params)
    with NamedTemporaryFile() as tmp:
        nearest_neighbor.build_save_index(arrays=arrays, index_path=tmp.name)
        assert os.path.isfile(tmp.name)
        # NamedTemporaryFile creates the (empty) file on entry, so existence alone
        # cannot fail; a non-zero size shows the index was actually written to it.
        assert os.path.getsize(tmp.name) > 0
import os
from tempfile import NamedTemporaryFile

from dku_param_loading import load_indexing_recipe_params
from data_loader import DataLoader
from nearest_neighbor.base import NearestNeighborSearch
from dku_io_utils import save_array_to_folder

# Load parameters
# Keys read from `params` below: "unique_id_column", "feature_columns", "input_dataset",
# "index_folder", "folder_partition_root" (and "expert" when building the config).
params = load_indexing_recipe_params()

# Load data into array format for indexing
# Only the id column and the feature column(s) are requested from the input dataset.
columns = [params["unique_id_column"]] + params["feature_columns"]
input_df = params["input_dataset"].get_dataframe(columns=columns, infer_with_pandas=False)
data_loader = DataLoader(params["unique_id_column"], params["feature_columns"])
(array_ids, arrays) = data_loader.convert_df_to_arrays(input_df)

# Build index and save index file to output folder
# The index dimension is taken from the second axis of the converted arrays; all
# recipe parameters are forwarded to the constructor as keyword arguments.
nearest_neighbor = NearestNeighborSearch(num_dimensions=arrays.shape[1], **params)
with NamedTemporaryFile() as tmp:
    # The index is written to the temporary file's path, then the still-open temporary
    # file object is uploaded before the `with` block closes it.
    nearest_neighbor.build_save_index(arrays=arrays, index_path=tmp.name)
    index_file_path = os.path.join(params["folder_partition_root"], nearest_neighbor.INDEX_FILE_NAME)
    params["index_folder"].upload_stream(index_file_path, tmp)

# Save arrays and indexing config to guarantee reproducibility
# All three paths live under the same partition root as the index file.
array_ids_file_path = os.path.join(params["folder_partition_root"], nearest_neighbor.ARRAY_IDS_FILE_NAME)
arrays_file_path = os.path.join(params["folder_partition_root"], nearest_neighbor.ARRAYS_FILE_NAME)
config_file_path = os.path.join(params["folder_partition_root"], nearest_neighbor.CONFIG_FILE_NAME)
save_array_to_folder(array=array_ids, path=array_ids_file_path, folder=params["index_folder"])
save_array_to_folder(array=arrays, path=arrays_file_path, folder=params["index_folder"])
# Index configuration merged with the "feature_columns" and "expert" recipe parameters
# (recipe values win on key collisions because they are unpacked last).
# NOTE(review): `config` and `config_file_path` are not used in the lines visible here —
# presumably the config is written to the folder further down; confirm.
config = {**nearest_neighbor.get_config(), **{k: v for k, v in params.items() if k in {"feature_columns", "expert"}}}