Example #1
def test_orchestrator(nodes_list, cores, n, d, m_out, L_out, m_in, L_in, k,
                      alpha):
    """
        Test the functioning of the Orchestrator communicating with a node.
        It assumes distributed_test.py is already running and waiting locally.
        This function tests the system in prediction mode.
    """

    synchronous = True

    # The dataset is the d x d identity matrix.
    X = np.eye(d)
    labels = np.ones(d, dtype=int)  # One label per dataset point.
    labels[21] = 0

    # Create query and expected result.
    x = X[21]
    queries = [
        Query(x * 2),
        Query(x * 1.5),
        Query(X[15] * 1.1),
        Query(X[12] * 1.12)
    ]
    query_labels = np.array(
        [0, 0, 1, 0],
        dtype=int)  # Fourth query is intentionally a false positive.

    # Execute parallel code.
    table_log, queries = execute_middleware(
        nodes_list,
        cores,
        n,
        d,
        k,
        queries=queries,
        X=X,
        labels=labels,
        synchronous=synchronous,
        prediction=True)

    accuracy = compute_accuracy(queries, query_labels)
    print("The prediction accuracy is: {}".format(accuracy))

    recall = compute_recall(queries, query_labels)
    print("The recall is: {}".format(recall))

    mcc = compute_mcc(queries, query_labels)
    print("The mcc is: {}".format(mcc))

    execute_middleware_logging(
        ("testlog", len(nodes_list), cores),
        queries,
        table_log,
        accuracy=accuracy,
        recall=recall,
        mcc=mcc,
        accparameters=(m_out, L_out, m_in, L_in, alpha, n, k))
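
A minimal invocation sketch for the function above, assuming a distributed_test.py worker is already listening locally. The address, port, and parameter values are illustrative (borrowed from Examples #4 and #5), not tuned:

nodes_list = [("127.0.0.1", 1025)]
test_orchestrator(nodes_list, cores=2, n=80, d=80,
                  m_out=50, L_out=24, m_in=30, L_in=10,
                  k=1, alpha=0.1)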
Example #2
    def test_NN(self):
        # Test a query's NN corresponds to what it should be.
        D = 50
        H_out = L1LSH([5] * D)
        H_in = COSLSH(D)

        X_matrix = np.eye(D)  # The dataset is a unit matrix of size D.
        X = np.reshape(np.transpose(X_matrix), D * D).tolist()
        X_shape = (D, D)

        m_out = 10
        L_out = 10
        m_in = 10
        L_in = 5
        k = 1
        alpha = 0.2

        (T_out, g_out, T_in, g_in) = slsh_indexing(m_out, L_out, m_in, L_in, X,
                                                   X_shape, H_out, H_in, alpha)

        x = np.array(X[20 * X_shape[0]:21 * X_shape[0]])
        query = Query(x * 2)

        selector = NearestPoints(X=X, X_shape=X_shape)
        nn = slsh_querying(query, T_out, g_out, T_in, g_in, k, selector)

        self.assertTrue(np.array_equal(X_matrix[nn[0]], x))
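
The index built by slsh_indexing can be reused for further queries without re-hashing the dataset. A hedged sketch reusing the structures from the test above (the chosen point and perturbation are illustrative):

x2 = np.array(X[5 * X_shape[0]:6 * X_shape[0]])
nn2 = slsh_querying(Query(x2 * 1.1), T_out, g_out, T_in, g_in, k, selector)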
Example #3
def middleware_distributed_gaussian_test(d, n, nodes_list, cores, synchro):

    # The dataset is drawn from an isotropic Gaussian.
    filename = "./datasets/gaussian_{}x{}".format(d, n)

    # Queries generation.
    k = 1
    n_queries = 100
    mean = 0
    std = 20
    queries = [
        Query(np.random.normal(mean, std, d)) for i in range(n_queries)
    ]  # Generate random Gaussian queries distributed like the dataset points.

    # Execute parallel code.
    table_log, queries = execute_middleware(nodes_list,
                                            cores,
                                            n,
                                            d,
                                            k,
                                            queries=queries,
                                            filename=filename,
                                            synchronous=synchro)

    execute_middleware_logging(
        ("distributed-gaussian{}x{}_scaling_test".format(
            d, n), len(nodes_list), cores), queries, table_log)

    return
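
middleware_distributed_gaussian_test expects the dataset file to already exist under ./datasets/. A minimal generator sketch, assuming the one-point-per-line, space-separated layout documented in Example #9 (the sizes are illustrative):

import numpy as np

d, n = 10, 1000
with open("./datasets/gaussian_{}x{}".format(d, n), "w") as file:
    for _ in range(n):
        point = np.random.normal(0, 20, d)
        file.write(" ".join(str(value) for value in point) + "\n")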
Example #4
    def test_prediction_system(self):
        """
            Test the functioning of the system communicating with a node.
            It assumes networking_tester.py is already running and waiting locally.
            This function tests the system in prediction mode.
        """

        # For networking tester:
        # nodes_list = [("127.0.0.1", 1025), ("127.0.0.1", 1035)]
        # For worker_node/distributed_test:
        nodes_list = [("127.0.0.1", 1025)]
        cores = 2
        synchronous = True

        # The dataset is a unit matrix of size D.
        D = 80
        X = np.eye(D)
        labels = np.ones(D, dtype=int)
        labels[21] = 0

        # Create query and expected result.
        x = X[21]
        query1 = Query(x * 2)
        query2 = Query(x * 1.5)
        k = 1

        # Execute parallel code.
        temp1, queries = execute_middleware(nodes_list,
                                            cores,
                                            D,
                                            D,
                                            k,
                                            queries=[query1, query2],
                                            X=X,
                                            labels=labels,
                                            synchronous=synchronous,
                                            prediction=True)

        self.assertTrue(np.array_equal(queries[0].neighbors[0], x))
        self.assertEqual(queries[0].neighbors_labels[0], 0)
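
With a worker node running, the test case above can be driven from the shell through the standard unittest runner; the module path here is hypothetical:

python -m unittest path.to.test_module -k test_prediction_system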
Example #5
def parallel_gaussian_test(d, n, cores):
    '''
    Execute a speed test on a single node with the given number of cores.
    d is the dimensionality of the dataset, n the number of points to generate.
    The generated dataset is an isotropic Gaussian.

    :param d: dataset dimensionality
    :param n: number of datapoints
    :param cores: number of cores to use

    :return: nothing
    '''

    # Data generation.
    mean = 0
    std = 20
    X_shape = (d, n)
    X = np.random.normal(mean, std, X_shape)  # Generate dataset.

    # SLSH parameters.
    m_out = 50 * 2
    L_out = 24 * 4  # It has to be a multiple of 24 for it to scale decently.
    m_in = 30
    L_in = 10
    k = 1
    alpha = 0.1
    H_out = L1LSH([(-100, 100)] * d)
    H_in = COSLSH(d)

    # Queries generation.
    n_queries = 100
    queries = [
        Query(np.random.normal(mean, std, d)) for i in range(n_queries)
    ]  # Generate random Gaussian queries distributed like the dataset points.

    # Execute algorithm.
    table_log, queries = execute_node(cores,
                                      k,
                                      m_out,
                                      L_out,
                                      m_in,
                                      L_in,
                                      H_out,
                                      H_in,
                                      alpha,
                                      X=X,
                                      queries=queries)

    # Log output.
    execute_node_logging(("gaussian{}x{}_scaling_test".format(d, n), cores),
                         queries, table_log)

    return
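
A sketch of a scaling sweep built on the function above; the core counts and dataset size are illustrative, and the L_out comment about multiples of 24 should be kept in mind when picking parameters:

for cores in (1, 2, 4, 8):
    parallel_gaussian_test(d=20, n=10000, cores=cores)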
Example #6
def get_dataset_and_queries_from_pickles(filename):
    """
    Given the filename of the original dataset, returns (as a tuple of four elements):
    - the dataset in numpy matrix form
    - the dataset labels as numpy array
    - the list of queries as numpy arrays
    - the query labels as numpy array

    :param filename: name of the original dataset
    :return: (dataset, dataset labels, queries, queries labels)
    """

    with open("datasets/" + filename[:len(filename) - 5] + "-dataset.pickle",
              'rb') as file:
        dataset = pickle.load(file)

    with open("datasets/" + filename[:len(filename) - 5] + "-queries.pickle",
              'rb') as file:
        querylist = pickle.load(file)

    n = len(dataset)
    d = len(dataset[0][0])

    # Convert to numpy.
    X = np.empty((d, n))
    labels = np.empty(n)
    for i, (point, label) in enumerate(dataset):
        labels[i] = label
        X[:, i] = np.array(point)

    n_queries = len(querylist)

    queries = []
    query_labels = np.empty(n_queries, dtype=int)
    for i, (point, label) in enumerate(querylist):
        queries.append(Query(point))
        query_labels[i] = label

    return X, labels, queries, query_labels
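
A hedged usage sketch; the file name is hypothetical. Since the function strips the last five characters of filename before appending the suffixes, a name such as "mnist.data" resolves to datasets/mnist-dataset.pickle and datasets/mnist-queries.pickle:

X, labels, queries, query_labels = get_dataset_and_queries_from_pickles("mnist.data")
print(X.shape)  # (d, n): points are stored column-wise.
print(len(queries), query_labels.shape)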
Example #7
    def test_L1_NN(self):
        # Test the output is the correct NN.
        D = 50
        H = L1LSH([5] * D)
        X1_matrix = np.eye(D)  # The dataset is a unit matrix of size D.
        X1 = np.reshape(np.transpose(X1_matrix), D * D).tolist()
        X1_shape = (D, D)

        m = 10
        L = 10
        k = 1

        points = range(D)  # Hash all the points.
        (g, T1) = lsh_indexing(m, L, X1, X1_shape, points, H)

        x = np.array(X1[10 * X1_shape[0]:11 * X1_shape[0]])
        query = Query(x * 1.0001)

        selector = NearestPoints(X=X1, X_shape=X1_shape)
        nn = lsh_querying(query, T1, g, k, selector)

        self.assertTrue(np.array_equal(X1_matrix[nn[0]], x))
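
The points argument restricts which dataset points get indexed. A sketch that hashes only the first half of the dataset, reusing the objects from the test above (the subset is illustrative):

(g_half, T_half) = lsh_indexing(m, L, X1, X1_shape, range(D // 2), H)
nn_half = lsh_querying(query, T_half, g_half, k, selector)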
Example #8
    def test_parallel_NN(self):
        # Test a query's NN corresponds to what it should be, on 3 cores.
        cores = 3
        D = 80
        H_out = L1LSH([(-1, 1)] * D)
        H_in = COSLSH(D)

        X = np.eye(D)  # The dataset is a unit matrix of size D.

        m_out = 50
        L_out = 50
        m_in = 20
        L_in = 10
        k = 1
        alpha = 0.01

        # Create query and expected result.
        x = X[21]
        query = Query(x * 2)

        # Execute parallel code.
        temp1, queries = execute_node(cores,
                                      k,
                                      m_out,
                                      L_out,
                                      m_in,
                                      L_in,
                                      H_out,
                                      H_in,
                                      alpha,
                                      X=X,
                                      queries=[query])

        print("In: {}".format(x * 2))
        print("Out: {}".format(queries[0].neighbors[0]))

        self.assertTrue(np.array_equal(queries[0].neighbors[0], x))
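
execute_node accepts a whole batch of queries in one call. A sketch submitting several perturbed identity rows at once, reusing the parameters from the test above (the indices and scale factors are illustrative):

batch = [Query(X[i] * 1.5) for i in (3, 21, 40)]
_, answered = execute_node(cores, k, m_out, L_out, m_in, L_in,
                           H_out, H_in, alpha, X=X, queries=batch)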
Example #9
def send_by_contiguous_slices(n, d, nodes, X=[], labels=[], filename=""):
    """
    Send the dataset by slices from file. Slicing is done by separating contiguous portions of the file.
    Only 1/len(nodes) of the file file fit into memory.
    In the file, each point occupies one line and the elements of the point are space-separated.

    :param filename: the name of the file to read the dataset from (optional).
    :param n: the number of points in the dataset.
    :param d: the dimensionality of the dataset.
    :param X: the dataset as numpy matrix (if not provided, data is read from the filename).
    :param labels: the labels of the dataset points (if not provided, data is read from the filename).

    :return: nothing.
    """

    queries = []

    # Read dataset from file.
    if filename != "":
        # Queries retrieval.
        n_queries = 2000
        n = n - n_queries
        query_indices = np.sort(
            np.random.choice(n, size=n_queries, replace=False))
        query_labels = []

        p = len(nodes)  # Number of nodes.
        slice_size = int(ceil(float(n) / p))
        # The last node receives whatever remains after the first p - 1 slices.
        remainder_size = n - (p - 1) * slice_size

        with open(filename, "r") as file:
            counter = 0
            slice_index = 0
            query_index = 0
            line_number = 0
            # Allocate the numpy array holding the slice; each point is a column.
            c_slice = np.empty((d, slice_size))

            for line in file:

                point_string = line.split(" ")
                point = np.array([float(x) for x in point_string])

                if query_index < n_queries:
                    if line_number == query_indices[query_index]:
                        queries.append(Query(point))
                        query_index += 1
                        line_number += 1
                        continue

                c_slice[:, counter] = point
                counter += 1

                # If the slice is full, send it to the nodes.
                if counter == slice_size:
                    send_slice(c_slice, nodes[slice_index], labels_slice=[])
                    slice_index += 1
                    counter = 0
                    if slice_index == p - 1:
                        slice_size = remainder_size
                    c_slice = np.empty((d, slice_size))

                line_number += 1

    # Use provided matrix, permute it and send it to the nodes.
    else:
        prediction = len(labels) > 0  # Avoid ambiguous numpy comparison with [].

        n = np.shape(X)[1]
        p = len(nodes)  # Number of nodes.
        slice_size = int(ceil(float(n) / p))

        # Permute matrix.
        permuted_indices = np.random.permutation(n)
        X = X[:, permuted_indices]
        if prediction:
            labels = labels[permuted_indices]

        for i in range(len(nodes)):
            c_labels = []
            if i != p - 1:
                c_slice = X[:, i * slice_size:(i + 1) * slice_size]
                if prediction:
                    c_labels = labels[i * slice_size:(i + 1) * slice_size]
            else:
                c_slice = X[:, i * slice_size:]
                if prediction:
                    c_labels = labels[i * slice_size:]

            send_slice(c_slice, nodes[i], labels_slice=c_labels)

    return queries
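
A minimal invocation sketch for the file branch, assuming a Gaussian file laid out as in the generator sketch after Example #3 and workers already listening on the given addresses (addresses and sizes are illustrative):

nodes = [("127.0.0.1", 1025), ("127.0.0.1", 1035)]
queries = send_by_contiguous_slices(10000, 10, nodes,
                                    filename="./datasets/gaussian_10x10000")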