Example #1
def find_edges(input, test, K):
    print(f"\tbuilding kNN classifier ... ", end=" ")
    st_time = time.time()

    if kNN_type in [1, 2]:
        # use toarray(): .todense() returns np.matrix, which recent
        # scikit-learn versions reject
        input, test = input.toarray(), test.toarray()

    if kNN_type == 1:
        from sklearn.neighbors import NearestNeighbors
        tree = NearestNeighbors(n_neighbors=K + 1, algorithm='ball_tree').fit(input)
    elif kNN_type == 2:
        from scipy import spatial
        tree = spatial.KDTree(input)
    elif kNN_type == 3:
        from n2 import HnswIndex
        tree = HnswIndex(input.shape[1], distance_type) # distance_type in ['angular', 'L2']
        for index in tqdm(range(input.shape[0])):
            tree.add_data(input[index, :])
        tree.build(n_threads=20)
    elif kNN_type == 4:
        import pysparnn.cluster_index as ci
        input_num = input.shape[0]
        tree = ci.MultiClusterIndex(input, range(input_num))
    elif kNN_type == 5:
        import nmslib
        M, efC = 30, 100
        index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post': 0}
        
        space_names = ['l2_sparse', 'cosinesimil_sparse'] # https://github.com/nmslib/nmslib/blob/master/manual/spaces.md
        space_name = space_names[0]
        data_type = nmslib.DataType.SPARSE_VECTOR
        tree = nmslib.init(method='hnsw', space=space_name, data_type=data_type)
        
        '''
        def calc_zero_rows(i):
            if input[i, :].getnnz() == 0:
                return 1
            else:
                return 0
        pool = Pool(num_threads)
        zero_row_num = sum(pool.map(calc_zero_rows, range(input.shape[0])))
        print(f"# zero rows in input = {zero_row_num}", end=" ")
        '''
        tree.addDataPointBatch(input)

        tree.createIndex(index_time_params, print_progress=True)
        # Setting query-time parameters
        efS = 100
        query_time_params = {'efSearch': efS}
        print('Setting query-time parameters', query_time_params, end=" ")
        tree.setQueryTimeParams(query_time_params)
    else:
        raise NotImplementedError
    print(f"time={time.time()-st_time:.3f}s")


    print("\tfinding indices ... ", end=" ")
    if kNN_type == 1:
        _, indices = tree.kneighbors(test)
    elif kNN_type == 2:
        _, indices = tree.query(test, k=K + 1)
    elif kNN_type == 3:
        indices = []
        for i in tqdm(range(test.shape[0])):
            indices.append(tree.search_by_vector(test[i, :], k=K + 1))
    elif kNN_type == 4:
        indices = tree.search(test, k=K+1, k_clusters=100, return_distance=False)
    elif kNN_type == 5:
        '''
        def calc_zero_rows2(i):
            if test[i, :].getnnz() == 0:
                return 1
            else:
                return 0
        pool = Pool(num_threads)
        zero_row_num = sum(pool.map(calc_zero_rows2, range(test.shape[0])))
        print(f"# zero rows in test = {zero_row_num}")
        '''

        indices_ = tree.knnQueryBatch(test, k=K+1, num_threads=num_threads)
        indices = [i[0] for i in indices_]
        del indices_
    else:
        raise NotImplementedError

    print(f"time={time.time()-st_time:.3f}s")


    edge_list = []
    for index1, per in enumerate(indices):
        assert len(per) == K + 1, f"index1={index1} len(per)={len(per)} != K+1={K + 1}"
        for index2 in per:
            index2 = int(index2)
            if index1 != index2:
                edge_list.append((index1, index2))
    print(f"\tget edges done! .... time={time.time()-st_time:.3f}s")
    return edge_list
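
# Usage sketch (not part of the original snippet): find_edges() reads the module-level
# globals kNN_type, distance_type and num_threads; the value of kNN_type and the random
# sparse matrices below are illustrative assumptions that exercise only the
# scikit-learn backend.
if __name__ == "__main__":
    import time
    import numpy as np
    from scipy.sparse import csr_matrix

    kNN_type = 1                                   # assumed: sklearn ball_tree backend
    X_train = csr_matrix(np.random.rand(100, 32))  # reference points
    X_query = csr_matrix(np.random.rand(10, 32))   # query points

    edges = find_edges(X_train, X_query, K=5)      # list of (query_index, train_index) pairs
    print(f"built {len(edges)} edges")
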
# Note: this class relies on module-level imports not shown in the snippet
# (math, numpy as np, nmslib, datetime.datetime, sklearn's NearestNeighbors,
# scipy.sparse issparse/lil_matrix/vstack, joblib's Parallel/delayed) and on the
# helpers ComputeLabelScoreInner, ComputePrecisionInner and DownSampleData
# defined elsewhere in the original module.
class AKNNPredictor:
    def __init__(self, params):
        self.logFile = params['logFile']
        self.seed = params['seed']

    def Train(self, X, Y, numThreads=1):
        assert (X.shape[0] == Y.shape[0])

        if issparse(X):
            # The Python interface of the nmslib library most probably does not support
            # sparse input here, so fall back to sklearn's NearestNeighbors
            print(
                str(datetime.now()) + " : " +
                "Creating Approximate KNN graph over train examples using sklearn functions"
            )
            self.graph = NearestNeighbors(n_neighbors=10,
                                          radius=5,
                                          algorithm='auto',
                                          metric='l2',
                                          n_jobs=numThreads)
            self.graph.fit(X)
        else:
            print(
                str(datetime.now()) + " : " +
                "Creating Approximate KNN graph over train examples using HNSW (nmslib)"
            )
            self.graph = nmslib.init(method='hnsw', space='l2')
            self.graph.addDataPointBatch(X)
            self.graph.createIndex({'post': 2, 'M': 10, 'maxM0': 20},
                                   print_progress=False)

        self.Y = Y

    def Predict(self, Xt, nnTest, numThreads=1):
        # Compute K nearest neighbors for input data
        print(str(datetime.now()) + " : " + "Computing Approximate KNN")
        knn = self.ComputeAKNN(Xt, nnTest, numThreads)

        # Predict labels for input data
        print(str(datetime.now()) + " : " + "Performing prediction")
        predYt = self.ComputeLabelScore(knn, nnTest, numThreads)

        return predYt

    def ComputeLabelScore(self, KNN, nnTest, numThreads=1):
        if (KNN.shape[0] == 0):
            return lil_matrix((0, self.Y.shape[1]), dtype=float)

        Y = self.Y
        nt = KNN.shape[0]
        L = Y.shape[1]
        batchSize = int(math.ceil(float(nt) / numThreads))
        numBatches = int(math.ceil(float(nt) / batchSize))
        startIdx = [i * batchSize for i in range(numBatches)]
        endIdx = [min((i + 1) * batchSize, nt) for i in range(numBatches)]

        numCores = numThreads
        resultList = Parallel(n_jobs=numCores)(
            delayed(ComputeLabelScoreInner)(Y, KNN[s:e, :], nnTest)
            for s, e in zip(startIdx, endIdx))
        predYt = vstack(resultList, format='lil')

        assert (predYt.shape[0] == nt)
        return predYt

    def ComputePrecision(self, predYt, Yt, K, numThreads):
        assert (predYt.shape == Yt.shape)
        if (predYt.shape[0] == 0):
            return np.zeros((K), dtype=float)

        nt, L = Yt.shape
        batchSize = int(math.ceil(float(nt) / numThreads))
        numBatches = int(math.ceil(float(nt) / batchSize))
        startIdx = [i * batchSize for i in range(numBatches)]
        endIdx = [min((i + 1) * batchSize, nt) for i in range(numBatches)]

        resultList = Parallel(n_jobs=numThreads)(
            delayed(ComputePrecisionInner)(predYt[s:e, :], Yt[s:e, :], K)
            for s, e in zip(startIdx, endIdx))
        precision = np.zeros((K, 1))
        for i, res in enumerate(resultList):
            precision += res * (endIdx[i] - startIdx[i])
        precision /= float(nt)
        return precision

    def ComputeAKNN(self, Xt, nnTest, numThreads=1):
        if (Xt.shape[0] == 0):
            return np.zeros((0, nnTest), dtype=np.int64)
        if (issparse(Xt)):
            KNN = self.graph.kneighbors(Xt,
                                        min(nnTest, Xt.shape[0]),
                                        return_distance=False)
            if (KNN.shape[1] < nnTest):
                rf = int(nnTest / KNN.shape[1])
                KNN = np.hstack(tuple([KNN] * rf))
                KNN = np.hstack((KNN, KNN[:, :(nnTest - KNN.shape[1])]))
        else:
            neighbors = self.graph.knnQueryBatch(Xt,
                                                 min(nnTest, Xt.shape[0]),
                                                 num_threads=numThreads)
            # Create the KNN matrix
            KNN = np.zeros((Xt.shape[0], nnTest), dtype=np.int64)
            for i, nei in enumerate(neighbors):
                # nei is a (ids, distances) pair; ids is a 1-D array of neighbour indices
                nn = nei[0].shape[0]
                KNN[i, :nn] = nei[0]
                if (nn < nnTest):
                    # pad by cycling the returned ids when fewer than nnTest were found
                    for j in range(nn, nnTest):
                        KNN[i, j] = nei[0][j % nn]
        return KNN

    def PredictAndComputePrecision(self, Xt, Yt, nnTestList, maxTestSamples,
                                   numThreads):
        assert (Xt.shape[0] == Yt.shape[0])

        # Perform down sampling of input data
        if (maxTestSamples > 0):
            Xt, Yt, testSample = DownSampleData(Xt, Yt, maxTestSamples)

        maxNNTest = max(nnTestList)
        # Compute K nearest neighbors for input data
        print(str(datetime.now()) + " : " + "Computing KNN")
        knn = self.ComputeAKNN(Xt, maxNNTest, numThreads)

        resList = []
        for nnTest in nnTestList:
            # Predict labels for input data
            print(
                str(datetime.now()) + " : " +
                "Performing prediction for nnTest = " + str(nnTest))
            predYt = self.ComputeLabelScore(knn, nnTest, numThreads)

            # Compute precisions for input data
            print(
                str(datetime.now()) + " : " +
                "Computing precisions for nnTest = " + str(nnTest))
            precision = self.ComputePrecision(predYt, Yt, 5, numThreads)
            #resList.append({'Y': Yt, 'predY': predYt, 'scoreY': scoreYt, 'precision': precision, 'testSample': testSample})
            resList.append({'precision': precision})

        return resList

    def UpdateLogFile(self, logFile):
        self.logFile = logFile

    def UpdateSeed(self, seed):
        self.seed = seed
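
# Usage sketch (not part of the original snippet): a minimal round trip through
# AKNNPredictor.Train() and ComputeAKNN() with sparse inputs, so only the sklearn
# NearestNeighbors path is exercised. Predict() and ComputePrecision() additionally
# require the ComputeLabelScoreInner / ComputePrecisionInner helpers defined elsewhere
# in the original module. All data below is randomly generated for illustration.
if __name__ == "__main__":
    import numpy as np
    from datetime import datetime
    from scipy.sparse import csr_matrix, issparse
    from sklearn.neighbors import NearestNeighbors

    X = csr_matrix(np.random.rand(200, 50))        # train features (sparse)
    Y = csr_matrix(np.random.rand(200, 30) > 0.9)  # train labels (multi-label)
    Xt = csr_matrix(np.random.rand(20, 50))        # test features (sparse)

    predictor = AKNNPredictor({'logFile': 'aknn.log', 'seed': 0})
    predictor.Train(X, Y, numThreads=2)            # sparse X -> sklearn kNN graph
    knn = predictor.ComputeAKNN(Xt, nnTest=10, numThreads=2)
    print("neighbour index matrix:", knn.shape)    # (20, 10) row indices into X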
Example #3
def find_edges(input, test, K, cluster_ids, query_ids):
    print(f"\tbuilding kNN classifier ... ", end=" ")
    st_time = time.time()

    if kNN_type in [1, 2]:
        # use toarray(): .todense() returns np.matrix, which recent
        # scikit-learn versions reject
        input, test = input.toarray(), test.toarray()

    if kNN_type == 1:
        from sklearn.neighbors import NearestNeighbors
        tree = NearestNeighbors(n_neighbors=K + 1, algorithm='ball_tree').fit(input)
    elif kNN_type == 2:
        from scipy import spatial
        tree = spatial.KDTree(input)
    elif kNN_type == 3:
        from n2 import HnswIndex
        tree = HnswIndex(input.shape[1], distance_type) # distance_type in ['angular', 'L2']
        for index in tqdm(range(input.shape[0])):
            tree.add_data(input[index, :])
        tree.build(n_threads=20)
    elif kNN_type == 4:
        import pysparnn.cluster_index as ci
        input_num = input.shape[0]
        tree = ci.MultiClusterIndex(input, range(input_num))
    elif kNN_type == 5:
        import nmslib
        M, efC, num_threads = 30, 100, 10
        index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post': 0}
        space_name = 'cosinesimil_sparse'
        data_type = nmslib.DataType.SPARSE_VECTOR
        tree = nmslib.init(method='hnsw', space=space_name, data_type=data_type)
        
        print(f"type(input) = {type(input)} type(test)={type(test)}", end=" ")
        
        tree.addDataPointBatch(input)

        tree.createIndex(index_time_params)
        # Setting query-time parameters
        efS = 100
        query_time_params = {'efSearch': efS}
        print('Setting query-time parameters', query_time_params)
        tree.setQueryTimeParams(query_time_params)

    else:
        raise NotImplementedError
    print(f"time={time.time()-st_time:.3f}s")


    print("\tfinding indices ... ", end=" ")
    if kNN_type == 1:
        _, indices = tree.kneighbors(test)
    elif kNN_type == 2:
        _, indices = tree.query(test, k=K + 1)
    elif kNN_type == 3:
        indices = []
        for i in tqdm(range(test.shape[0])):
            indices.append(tree.search_by_vector(test[i, :], k=K + 1))
    elif kNN_type == 4:
        indices = tree.search(test, k=K+1, k_clusters=100, return_distance=False)
    elif kNN_type == 5:
        indices_ = tree.knnQueryBatch(test, k=K, num_threads=num_threads)
        indices = [i[0] for i in indices_]
        del indices_
    else:
        raise NotImplementedError

    print(f"time={time.time()-st_time:.3f}s")


    edge_list = []
    for index1, per in enumerate(indices):
        for index2 in per:
            index2 = int(index2)
            if index1 != index2:
                edge_list.append((query_ids[index1], cluster_ids[index2]))
    print(f"\tdone! .... time={time.time()-st_time:.3f}s")
    return edge_list
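
# Usage sketch (not part of the original snippet): this variant maps the positional kNN
# results back to external ids, e.g. when `input` holds cluster centres and `test` holds
# queries. The choice of kNN_type, the random data and the id ranges are assumptions.
if __name__ == "__main__":
    import time
    import numpy as np
    from scipy.sparse import csr_matrix

    kNN_type = 1                                   # assumed: sklearn ball_tree backend
    centres = csr_matrix(np.random.rand(50, 16))   # cluster centres
    queries = csr_matrix(np.random.rand(8, 16))    # query points
    cluster_ids = list(range(1000, 1050))          # external ids of the 50 centres
    query_ids = list(range(8))                     # external ids of the 8 queries

    edges = find_edges(centres, queries, K=3,
                       cluster_ids=cluster_ids, query_ids=query_ids)
    print(edges[:5])                               # (query_id, cluster_id) pairs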