Example #1
def text_hist():
    """
    Compute bag-of-words histograms for the image description files and combine them with the SIFT histograms.
    """
    with open('data/sift_names.pkl', 'r') as f:
        names = cPickle.load(f)
    with open('data/sift_hist.pkl', 'r') as f:
        sift_hists = cPickle.load(f)
    filenames = []
    for name in names:
        name = name.replace('img', 'descr')
        name = name.replace('.jpg', '.txt')
        filenames.append('shopping/images/' + name)
    vectorizer = CountVectorizer(input='filename', token_pattern=r"(?u)\w+", ngram_range=(1, 1), min_df=2)
    xall_transformed = vectorizer.fit_transform(filenames).tocsr()
    preprocessing.normalize(xall_transformed, copy=False)

    lamb = .5
    hists = scipy.sparse.hstack([xall_transformed * lamb, sift_hists * (1-lamb)]).toarray()
    preprocessing.normalize(hists, copy=False)
    model = LSHForest()
    model.fit(hists)
    with open('data/text_hist.pkl', 'w') as f:
        cPickle.dump(xall_transformed, f)
    with open('data/vectorizer.pkl', 'w') as f:
        cPickle.dump(vectorizer, f)
    with open('data/lshforest_combine.pkl', 'w') as f:
        cPickle.dump(model, f)
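
The function above only builds and pickles the index; a hypothetical query helper might look like the sketch below. It assumes the pickles written by text_hist(), the same module-level imports (cPickle, scipy.sparse, sklearn's preprocessing), the same lamb weighting, and a scikit-learn version old enough (< 0.21) to still ship LSHForest; query_similar and its arguments are illustrative names, not part of the original project.

def query_similar(descr_filename, sift_hist, lamb=.5, k=5):
    # Hypothetical helper: load the fitted vectorizer and LSH forest saved by text_hist().
    # sift_hist is the 1 x m SIFT histogram row for the query image.
    with open('data/vectorizer.pkl') as f:
        vectorizer = cPickle.load(f)
    with open('data/lshforest_combine.pkl') as f:
        model = cPickle.load(f)
    # Rebuild the same blended text + SIFT representation used at fit time.
    text_hist = vectorizer.transform([descr_filename])
    preprocessing.normalize(text_hist, copy=False)
    query = scipy.sparse.hstack([text_hist * lamb, sift_hist * (1 - lamb)]).toarray()
    preprocessing.normalize(query, copy=False)
    return model.kneighbors(query, n_neighbors=k)
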
Example #2
 def fit_lsh(self):
     self.lsh = LSHForest(random_state=12345)
     train_data = [
         self.encode_sentence(self.indexed_background[i], True)
         for i in range(len(self.indexed_background))
     ]
     self.lsh.fit(train_data)
Example #3
def knn_indices_func_approx(
        rep_pts: FloatTensor,  # (N, pts, dim)
        pts: FloatTensor,  # (N, x, dim)
        K: int,
        D: int) -> LongTensor:  # (N, pts, K)
    """
    Approximate CPU-based Indexing function based on K-Nearest Neighbors search.
    :param rep_pts: Representative points.
    :param pts: Point cloud to get indices from.
    :param K: Number of nearest neighbors to collect.
    :param D: "Spread" of neighboring points.
    :return: Array of indices, P_idx, into pts such that pts[n][P_idx[n],:]
    is the set of K nearest neighbors for the representative points rep_pts[n].
    """
    if rep_pts.is_cuda:
        rep_pts = rep_pts.cpu()
    if pts.is_cuda:
        pts = pts.cpu()
    rep_pts = rep_pts.data.numpy()
    pts = pts.data.numpy()

    region_idx = []

    for n, p in enumerate(rep_pts):
        P_particular = pts[n]
        lshf = LSHForest(n_estimators=20,
                         n_candidates=100,
                         n_neighbors=D * K + 1)
        lshf.fit(P_particular)
        indices = lshf.kneighbors(p, return_distance=False)
        region_idx.append(indices[:, 1::D])

    # Stack the per-cloud index arrays and return them as a LongTensor of shape (N, pts, K).
    return torch.from_numpy(np.stack(region_idx, axis=0))
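
The indices[:, 1::D] slice above drops column 0 (each representative point's own index) and keeps every D-th remaining neighbor, which is what produces the "spread". A minimal standalone sketch of that dilation, assuming an older scikit-learn (< 0.21) that still provides LSHForest; the data here is random and purely illustrative.

import numpy as np
from sklearn.neighbors import LSHForest

K, D = 3, 2
pts = np.random.rand(50, 3).astype(np.float32)     # one point cloud
rep = pts[:10]                                      # representative points
lshf = LSHForest(n_estimators=20, n_candidates=100, n_neighbors=D * K + 1)
lshf.fit(pts)
idx = lshf.kneighbors(rep, return_distance=False)   # shape (10, D*K + 1)
dilated = idx[:, 1::D]                               # shape (10, K): skip self, take every D-th neighbor
print(dilated.shape)
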
Example #4
class EmbeddingNetworkBuilder:
    """ Basically a wrapper around sklearns LSH forest """
    def __init__(self, lsh_init=None):
        if lsh_init is None:
            self._lsh_forest = LSHForest(n_estimators=25, n_candidates=1000)
        else:
            self._lsh_forest = lsh_init
        self.iw = None
        self.m = None

    def fit_lsh_forest(self, embedding):
        self._lsh_forest.fit(embedding.m)
        self._embedding = embedding

    def extract_nn_network(self, nn=20):
        dir_graph_mat = self._lsh_forest.kneighbors_graph(X=self._embedding.m,
                                                          n_neighbors=nn + 1)
        return dir_graph_mat

    def make_undirected(self, dir_graph_mat):
        nodes = set(range(dir_graph_mat.shape[0]))
        edges = set([])
        for node_i in range(dir_graph_mat.shape[0]):
            for node_j in dir_graph_mat[node_i].nonzero()[1]:
                edges.add((node_i, node_j))
        return nodes, edges

    def get_forest(self):
        return self._lsh_forest

    def get_node_to_word(self):
        return self.iw
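
EmbeddingNetworkBuilder only requires an object exposing an .m matrix, so it can be exercised with a stand-in embedding. A small sketch, assuming numpy and an older scikit-learn (< 0.21) with LSHForest; _Embedding is a hypothetical placeholder for whatever embedding class the project uses.

import numpy as np

class _Embedding(object):                      # hypothetical stand-in for a real embedding
    def __init__(self, m):
        self.m = m

builder = EmbeddingNetworkBuilder()
builder.fit_lsh_forest(_Embedding(np.random.rand(200, 16)))
graph = builder.extract_nn_network(nn=10)      # sparse kNN graph over the 200 rows
nodes, edges = builder.make_undirected(graph)
print(len(nodes), len(edges))
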
Example #5
def lof(X, k, outlier_threshold=1.5, verbose=False):
    """Knn with KD trees"""
    start = time.time()
    lshf = LSHForest(random_state=42)
    lshf.fit(X)

    distance, index = lshf.kneighbors(X, n_neighbors=k)
    distance, index = distance[:, 1:], index[:, 1:]
    radius = distance[:, -1]

    """Calculate LRD."""
    LRD = np.mean(np.maximum(distance, radius[index]), axis=1)
    r = 1. / np.array(LRD)

    """Calculate outlier score."""
    outlier_score = np.sum(r[index], axis=1) / np.array(r, dtype=np.float16)
    outlier_score *= 1. / k

    # print ('Compute time: %g seconds.' % ((time.time() - start)))

    if verbose: print("Recording all outliers with outlier score greater than %s." \
                      % (outlier_threshold))

    outliers = []
    """ Could parallelize this for loop, but really not worth the overhead...
        Would get insignificant performance gain."""
    for i, score in enumerate(outlier_score):
        if score > outlier_threshold:
            outliers.append([i,X[i], score])

    if verbose:
        print("Detected outliers:")
        print(outliers)

    return outliers
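
A quick way to exercise lof() on synthetic data; this assumes the function above together with its module-level imports (numpy as np, time, and LSHForest from a pre-0.21 scikit-learn) is in scope, and the planted offsets are purely illustrative.

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(200, 5)
X[:3] += 5.0                                   # plant three obvious outliers
found = lof(X, k=10, outlier_threshold=1.5)
print([idx for idx, point, score in found])    # indices flagged as outliers
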
Example #6
class EmbeddingNetworkBuilder:
    """ Basically a wrapper around sklearns LSH forest """

    def __init__(self, lsh_init=None):
        if lsh_init is None:
            self._lsh_forest = LSHForest(n_estimators=25, n_candidates=1000)
        else:
            self._lsh_forest = lsh_init 
        self.iw = None
        self.m = None

    def fit_lsh_forest(self, embedding):
        self._lsh_forest.fit(embedding.m)
        self._embedding = embedding

    def extract_nn_network(self, nn=20):
        dir_graph_mat = self._lsh_forest.kneighbors_graph(X=self._embedding.m, n_neighbors=nn+1)
        return dir_graph_mat

    def make_undirected(self, dir_graph_mat):
        nodes = set(range(dir_graph_mat.shape[0]))
        edges = set([])
        for node_i in range(dir_graph_mat.shape[0]):
            for node_j in dir_graph_mat[node_i].nonzero()[1]:
                edges.add((node_i, node_j))
        return nodes, edges

    def get_forest(self):
        return self._lsh_forest
    
    def get_node_to_word(self):
        return self.iw
    def single_batch(self, tweets):
        """Performs an approximate nearest neighbors search on tweets in the database
        passed to it. The database must be a list of tweets (text of the tweets only).
        
        Returns the indices of tweets with nearby neighbors (i.e. spam tweets).
        These indices correspond to indices within the batch of tweets fed to
        this function."""

        # Vectorize and fit tree:
        vect2 = CountVectorizer(stop_words = self.common_twitter_handles)
        X2 = vect2.fit_transform(tweets)
        tree2 = LSHForest()
        tree2.fit(X2)

        # Build tree:
        n_neighbors = []
        neighbors_indices = []
        for x in vect2.transform(tweets):
            if len(n_neighbors) % 100 == 0: print "%r tweets analyzed out of %r for this batch" % (len(n_neighbors), self.batch_size)
            neighbors = tree2.radius_neighbors(x, radius = .4)[1]
            n_neighbors.append(len(neighbors[0]))
            neighbors_indices.append(neighbors)

        neighbors_indices = [x for x in range(len(neighbors_indices)) if len(neighbors_indices[x][0]) > 2]

        return neighbors_indices
    def single_batch(self, tweets):
        """Performs an approximate nearest neighbors search on tweets in the database
        passed to it. The database must be a list of tweets (text of the tweets only).
        
        Returns the indices of tweets with nearby neighbors (i.e. spam tweets).
        These indices correspond to indices within the batch of tweets fed to
        this function."""

        # Vectorize and fit tree:
        vect2 = CountVectorizer(stop_words = self.custom_stop_words)
        X2 = vect2.fit_transform(tweets)
        tree2 = LSHForest()
        tree2.fit(X2)

        # Build tree:
        n_neighbors = []
        neighbors_indices = []
        working_batch_size = len(tweets)
        for x in vect2.transform(tweets):
            if len(n_neighbors) % 100 == 0: print "%r tweets analyzed out of %r for this batch" % (len(n_neighbors), working_batch_size)
            # Only deal with tweets that are longer than 3 words.
            neighbors = tree2.radius_neighbors(x, radius = self.sensitivity)[1]
            if x.getnnz() > 2:
                n_neighbors.append(len(neighbors[0]))
                neighbors_indices.append(neighbors)
            else:
                n_neighbors.append(1)
                neighbors_indices.append(np.array([np.array([0])]))

        neighbors_indices = [x for x in range(len(neighbors_indices)) if len(neighbors_indices[x][0]) > 2]

        return neighbors_indices
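
Both single_batch variants above reduce to the same idea: vectorize the tweets, fit an LSHForest, and flag any tweet whose radius query returns more than two neighbors (itself plus at least two near-duplicates). A standalone sketch of that core loop, with made-up tweets and an older scikit-learn (< 0.21) assumed:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import LSHForest

tweets = ["win a free phone now", "win a free phone now!!!",
          "totally different text", "win a free phone now plz"]
vect = CountVectorizer()
X = vect.fit_transform(tweets)
tree = LSHForest().fit(X)

spam_like = []
for i, x in enumerate(vect.transform(tweets)):
    neighbors = tree.radius_neighbors(x, radius=0.4)[1]
    if len(neighbors[0]) > 2:                  # itself plus at least two near-duplicates
        spam_like.append(i)
print(spam_like)
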
Example #9
 def __init__(self, lsh_init=None):
     if lsh_init is None:
         self._lsh_forest = LSHForest(n_estimators=25, n_candidates=1000)
     else:
         self._lsh_forest = lsh_init
     self.iw = None
     self.m = None
Example #10
def train():
    # Build the matching corpus: 398,872 samples
    sku_names_texts = get_train_datas()
    sku_names_jieba = get_text_jieba(sku_names_texts)
    sku_names_with_spaces = []
    for sku_names in sku_names_jieba:
        sku_names_with_spaces.append(' '.join(sku_names))

    # Test data: 1,000 samples
    keywords_texts = get_test_datas()
    keywords_jieba = get_text_jieba(keywords_texts)
    keywords_with_spaces = []
    for keywords in keywords_jieba:
        keywords_with_spaces.append(' '.join(keywords))

    tfidf_vec = TfidfVectorizer(min_df=3, max_features=None, ngram_range=(1, 2), use_idf=1, smooth_idf=1, sublinear_tf=1)
    x_train = tfidf_vec.fit_transform(sku_names_with_spaces)

    lshf = LSHForest(random_state=42)
    #lshf.fit(np.array(x_train))
    lshf.fit(x_train)

    for i, kw in enumerate(keywords_with_spaces):
        x_test = tfidf_vec.transform([kw])
        distances, indices = lshf.kneighbors(x_test.toarray(), n_neighbors=1)
        idx = indices[0][0]
        print(i, "||", keywords_texts[i], "||", sku_names_texts[idx])

        with open("result/lsh_v1_results.txt", 'a', encoding='utf8') as wf:
            wf.write(str(i) + "||" + keywords_texts[i] + "||" + sku_names_texts[idx] + "\n")
Example #11
class Index(BaseIndex):
    """ LSH Forest Index
    """

    name = 'lsh_forest'

    def _fit(self, xs):
        """ Fit index
        :param samples: list of Samples
        :return:
        """
        self.index = LSHForest(
            n_estimators=self.parameters.get('n_estimators', 20))
        self.index.fit(xs)

    def _query(self, sample, k=5, **kwargs):
        """ Query index
        :param sample: Sample
        :param k:
        :param kwargs:
        :return:
        """
        x, _ = self.transform([sample])
        distances, idxs = self.index.kneighbors(x, n_neighbors=k + 1)
        neighbors = []
        for idx, d in zip(idxs[0], distances[0]):
            hashval = self.ys[idx]
            neighbors.append({
                'hashval': hashval,
                'similarity': min(1 - float(d), 1.0)
            })
        return neighbors
class ScikitLearnLsh(NearestNeighborAlgorithm):
    """
    This ``NearestNeighborAlgorithm`` uses scikit-learn's implementation of a locality sensitive
    hash to find approximate nearest neighbors.

    Parameters
    ----------
    random_state: int, optional (default=12345)
        Used to initialize the LSHForest, so that runs are consistent.
    """
    def __init__(self, params: Dict[str, Any]):
        random_state = params.pop('random_state', 12345)
        self.lsh = LSHForest(random_state=random_state)

    def fit(self, vectors: List[numpy.array]):
        logger.info("Fitting LSH with %d vectors", len(vectors))
        self.lsh.fit(vectors)

    def get_neighbors(self, query_vector: numpy.array,
                      num_neighbors: int) -> List[Tuple[int, float]]:
        if len(query_vector.shape) == 1:
            query_vector = [query_vector]
        logger.info("Getting neighbors for %d vectors", len(query_vector))
        scores, neighbor_indices = self.lsh.kneighbors(
            query_vector, n_neighbors=num_neighbors)
        logger.info("Neighbors retrieved")
        result = [
            zip(neighbor_indices[i], scores[i])
            for i in range(len(neighbor_indices))
        ]
        if len(result) == 1:
            result = result[0]
        return result
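
A minimal way to drive ScikitLearnLsh, assuming the module-level logger it references is configured, numpy is available, and scikit-learn is old enough (< 0.21) to provide LSHForest; the random vectors are illustrative only.

import numpy

vectors = [numpy.random.rand(16) for _ in range(100)]
nn_search = ScikitLearnLsh({'random_state': 12345})
nn_search.fit(vectors)
neighbors = nn_search.get_neighbors(vectors[0], num_neighbors=5)
print(list(neighbors))    # [(index, distance), ...] for the single query vector
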
def CreateAndconfigureLSHForest(categories): # categories - dict: {name: vector}
    print("Creating LSHForest...")
    catArray = numpy.array(list(categories.values()))
    lshf = LSHForest(n_candidates=70, n_estimators=30, n_neighbors=TOP_N_COUNT)
    lshf.fit(catArray)
    print("LSHForest was created")
    return lshf
def search_neighbors(request):
	designs = Design.objects.all()

	image_list = []
	for design in designs:
		image_list.append(str(design.uid) + ".png")

	d_geometry = settings.D_GEOMETRY
	designed_images = np.empty((len(image_list), d_geometry[0]*d_geometry[1]*3), dtype="float32")
	for i in range(len(image_list)):
		designed_images[i] = img2numpy_arr(settings.DESIGN_PATH + image_list[i]).reshape(d_geometry[0]*d_geometry[1]*3)
	designed_images /= 255
	
	lshf = LSHForest(random_state=42)
	lshf.fit(designed_images) 

	num = int(request.GET['num'])
	input_fname = str(request.GET['input'])
	input_image = img2numpy_arr(settings.DESIGN_PATH + input_fname)
	input_image = input_image.reshape(1, -1)/255
	_, indices = lshf.kneighbors(input_image, n_neighbors=num)

	similar_images = []
	for i in list(indices.reshape(-1)):
		similar_images.append({ 
			"image": str(designs[i].uid) + ".png", 
			"text": str(designs[i].history_text), 
			"like": int(designs[i].like),
			"filtered": str(designs[i].filtered)
		})

	return JsonResponse({
		"results": similar_images
	})
def Classify(nlp, keywords,
             categories):  # keywords - list; categories - dict: {name: vector}
    counterDict = Counter(keywords)  # optimization for duplicate keywords
    sumVector = numpy.zeros(nlp.vocab.vectors_length)

    #temp
    text = ' '.join(keywords)

    for word, repCount in counterDict.items():  # summing up the word vectors
        curVect = nlp(word).vector
        sumVector += (curVect * repCount)

    vec = nlp(text).vector
    sim = cosine_similarity(vec, sumVector)
    print("Sim: " + str(sim))

    catArray = numpy.array(list(categories.values()))
    catKeys = list(categories.keys())
    #tree = KDTree(catArray, metric='pyfunc', func=cosine_similarity)
    #dist, ind = tree.query(sumVector, k=TOP_N_COUNT) #.reshape(-1, 1)

    print("Creating LSHForest...")

    lshf = LSHForest(n_candidates=70, n_estimators=30, n_neighbors=TOP_N_COUNT)
    lshf.fit(catArray)
    print("LSHForest was created")

    print("Getting neighbors...")
    distances, indices = lshf.kneighbors(sumVector.reshape((1, -1)))
    print("Got neighbors.")

    for curIndex in numpy.nditer(indices):
        print("Found category: " + str(catKeys[curIndex]))
        print("with distance: " + str(distances))
Example #16
def persist_attraction_similarities_to_db():
    # build LSHForest model for reduced dimension dataset
    svd = TruncatedSVD(n_components=10, n_iter=7)
    red_dim_itemuserdf = svd.fit_transform(itemuserdf)
    item_user_model = LSHForest()
    item_user_model.fit(red_dim_itemuserdf)

    # persist attractions similarities to db
    K=20        # query for K neighbors
    k=10        # return k neighbors
    for i in range(itemuserdf.shape[0]):
        distance, indices = item_user_model.kneighbors(
            red_dim_itemuserdf[i].reshape(1, -1), n_neighbors=K
        )
        weights = 1 - distance
        for j in range(k):
            if i != indices[0][j]:
                e = SimilarAttractions(
                    attraction_id=Attraction.objects.filter(
                        app_id=int(i)).values('attraction_id')[0]['attraction_id'],
                    similar_attraction_id=Attraction.objects.filter(
                        app_id=int(indices[0][j])).values('attraction_id')[0]['attraction_id'],
                    similarity=weights[0][j],
                    ts=timezone.now()
                )
                e.save()
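
The pattern above (reduce the item-user matrix with TruncatedSVD, then index the reduced vectors with LSHForest) can be tried in isolation. A sketch on random data, assuming scikit-learn < 0.21; the matrix shape and neighbor counts are arbitrary.

import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import LSHForest

itemuser = np.random.rand(100, 50)                       # hypothetical item x user matrix
reduced = TruncatedSVD(n_components=10, n_iter=7).fit_transform(itemuser)
model = LSHForest().fit(reduced)
distance, indices = model.kneighbors(reduced[0].reshape(1, -1), n_neighbors=20)
weights = 1 - distance                                   # same similarity weighting as above
print(indices[0][:10], weights[0][:10])
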
Example #17
def test_fit():
    """Checks whether `fit` method sets all attribute values correctly."""
    n_samples = 12
    n_features = 2
    n_estimators = 5
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest(n_estimators=n_estimators)
    lshf.fit(X)

    # _input_array = X
    assert_array_equal(X, lshf._fit_X)
    # A hash function g(p) for each tree
    assert_equal(n_estimators, len(lshf.hash_functions_))
    # Hash length = 32
    assert_equal(32, lshf.hash_functions_[0].components_.shape[0])
    # Number of trees_ in the forest
    assert_equal(n_estimators, len(lshf.trees_))
    # Each tree has entries for every data point
    assert_equal(n_samples, len(lshf.trees_[0]))
    # Original indices after sorting the hashes
    assert_equal(n_estimators, len(lshf.original_indices_))
    # Each set of original indices in a tree has entries for every data point
    assert_equal(n_samples, len(lshf.original_indices_[0]))
    def get_nearest_neighbor_iterable(self,
                                      graphlist,
                                      start_graphs,
                                      start_is_subset=True):

        # vectorize all
        graphlist = list(graphlist)
        graphlist_ = copy.deepcopy(graphlist)
        X = self.vectorizer.transform_single(graphlist_)

        start_graphs = list(start_graphs)
        graphlist_ = copy.deepcopy(start_graphs)
        Y = self.vectorizer.transform_single(graphlist_)

        forest = LSHForest()
        forest.fit(X)
        #http://scikit-learn.org/stable/modules/neighbors.html
        distances, indices = forest.kneighbors(Y, n_neighbors=2)

        # we just assume that this is short...
        index = 0
        if start_is_subset:
            index += 1

        #matches= ( X_index ,Y_index, distance  )
        matches = [(indices[i, index], i, distances[i, index])
                   for i in range(len(indices))]
        matches.sort()

        # this looks super confusing....
        #for index, graph in enumerate(selection_iterator(graphlist, [a[0] for a in matches])):
        #    yield ((graph, start_graphs[matches[index][1]], X[matches[index][0]]))
        # so I wrote this; you could even get rid of the matches variable and use indices directly
        for Xi, Yi, dist in matches:
            yield ((start_graphs[Yi], graphlist[Xi], X[Xi]))
def test_distances():
    """Checks whether returned neighbors are from closest to farthest."""
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest()
    lshf.fit(X)

    for i in range(n_iter):
        n_neighbors = rng.randint(0, n_samples)
        query = X[rng.randint(0, n_samples)]
        distances, neighbors = lshf.kneighbors(query,
                                               n_neighbors=n_neighbors,
                                               return_distance=True)
        # Returned neighbors should be from closest to farthest.
        assert_true(np.all(np.diff(distances[0]) >= 0))

        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_true(np.all(np.diff(distances[0]) >= 0))
Example #20
def test_hash_functions():
    """Checks randomness of hash functions.

    Variance and mean of each hash function (projection vector)
    should be different from flattened array of hash functions.
    If hash functions are not randomly built (seeded with
    same value), variances and means of all functions are equal.
    """
    n_samples = 12
    n_features = 2
    n_estimators = 5
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest(n_estimators=n_estimators,
                     random_state=rng.randint(0,
                                              np.iinfo(np.int32).max))
    lshf.fit(X)

    hash_functions = []
    for i in range(n_estimators):
        hash_functions.append(lshf.hash_functions_[i].components_)

    for i in range(n_estimators):
        assert_not_equal(np.var(hash_functions),
                         np.var(lshf.hash_functions_[i].components_))

    for i in range(n_estimators):
        assert_not_equal(np.mean(hash_functions),
                         np.mean(lshf.hash_functions_[i].components_))
Example #21
def score(factors):
    verifyCount = 3
    X, y = Sets.trainingSet
    test_set, databases = Sets.testSet
    X = FactorizeVectors(X, factors)
    test_set = FactorizeVectors(test_set, factors)
    correctionAverage = 0
    for i in range(verifyCount):
        best_predictions = 0
        clf = LSHForest(n_estimators = 10, n_candidates = 10)
        clf.fit(X)

        correct = 0
        total = 0

        for j in range(len(test_set)):
            total += 1
            actual = databases[j]
            distances, indices = clf.kneighbors(test_set[j], n_neighbors=5)
            predicted = GetPrediction(y, distances[0], indices[0])
            if (actual == predicted):
                correct += 1

        if (correct > best_predictions):
            best_predictions = correct
        correctionAverage += best_predictions
    correctionAverage = float(correctionAverage)/verifyCount
    return correctionAverage
def runForestLSHSizeAnalysis(argsdict, data, inlbl, fPath, fName, fileN, i):
    start = time.time()
    tree = LSHForest(random_state=42)
    tree.fit(data)
    end = time.time()

    return sys.getsizeof(tree), (end - start)
def test_neighbors_accuracy_with_n_estimators():
    # Checks whether accuracy increases as `n_estimators` increases.
    n_estimators = np.array([1, 10, 100])
    n_samples = 100
    n_features = 10
    n_iter = 10
    n_points = 5
    rng = np.random.RandomState(42)
    accuracies = np.zeros(n_estimators.shape[0], dtype=float)
    X = rng.rand(n_samples, n_features)

    for i, t in enumerate(n_estimators):
        lshf = LSHForest(n_candidates=500, n_estimators=t)
        ignore_warnings(lshf.fit)(X)
        for j in range(n_iter):
            query = X[rng.randint(0, n_samples)].reshape(1, -1)
            neighbors = lshf.kneighbors(query, n_neighbors=n_points,
                                        return_distance=False)
            distances = pairwise_distances(query, X, metric='cosine')
            ranks = np.argsort(distances)[0, :n_points]

            intersection = np.intersect1d(ranks, neighbors).shape[0]
            ratio = intersection / float(n_points)
            accuracies[i] = accuracies[i] + ratio

        accuracies[i] = accuracies[i] / float(n_iter)
    # Sorted accuracies should be equal to original accuracies
    assert_true(np.all(np.diff(accuracies) >= 0),
                msg="Accuracies are not non-decreasing.")
    # Highest accuracy should be strictly greater than the lowest
    assert_true(np.ptp(accuracies) > 0,
                msg="Highest accuracy is not strictly greater than lowest.")
    def get_nearest_neighbor_iterable(self, graphlist, start_graphs, start_is_subset=True):

        # vectorize all
        graphlist= list(graphlist)
        graphlist_ = copy.deepcopy(graphlist)
        X = self.vectorizer.transform_single(graphlist_)


        start_graphs= list(start_graphs)
        graphlist_= copy.deepcopy(start_graphs)
        Y = self.vectorizer.transform_single(graphlist_)
        
        
        forest = LSHForest()
        forest.fit(X)
        #http://scikit-learn.org/stable/modules/neighbors.html
        distances, indices = forest.kneighbors(Y, n_neighbors=2)

        # we just assume that this is short...
        index = 0
        if start_is_subset:
            index += 1
        
        #matches= ( X_index ,Y_index, distance  )
        matches = [(indices[i, index], i, distances[i, index]) for i in range(len(indices))]
        matches.sort()

        # this looks super confusing....
        #for index, graph in enumerate(selection_iterator(graphlist, [a[0] for a in matches])):
        #    yield ((graph, start_graphs[matches[index][1]], X[matches[index][0]]))
        # so I wrote this; you could even get rid of the matches variable and use indices directly
        for Xi,Yi,dist in matches:
            yield ((start_graphs[Yi],graphlist[Xi],X[Xi]))
Example #25
    def get_heap_and_forest(self, griter, k):
        '''
        so we create the heap and the forest...
        heap is (dist to hyperplane, count, graph)
        and the forest is just a nearest-neighbor index from sklearn
        '''

        graphs = list(griter)
        graphs2 = copy.deepcopy(graphs)
        # transform does mess up the graph objects
        X = self.vectorizer.transform(graphs)

        forest = LSHForest()
        forest.fit(X)
        print 'got forest'

        heap = []
        for vector, graph in zip(X, graphs2):
            graph2 = nx.Graph(graph)
            heapq.heappush(heap, (
                self.sampler.estimator.predict_proba(self.sampler.vectorizer.transform_single(graph2))[0][1],
                # score ~ dist from hyperplane
                k + 1,  # making sure that the counter is high so we don't output the start graphs at the end
                graph))  # at last the actual graph

        print 'got heap'
        distances, unused = forest.kneighbors(X, n_neighbors=2)
        distances = [a[1] for a in distances]  # the second element should be the dist we want
        avg_dist = distances[len(distances) / 2]  # sum(distances)/len(distances)
        print 'got dist'

        return heap, forest, avg_dist
Example #26
    def optimise(self, num_train_points, num_val_points, parameters):

        max_accuracy = -1
        optimal_estimators = -1
        optimal_n_neighbours = -1

        for item in self.get_generator(parameters):

            LSHf = LSHForest(random_state=42,
                             n_estimators=item['n_est'],
                             n_neighbors=item['n_neigh'])
            LSHf.fit(self.train.images[:num_train_points])
            distances, indices = LSHf.kneighbors(
                self.validation.images[:num_val_points], n_neighbors=5)

            accuracy, positions = self.model_accuracy(indices,
                                                      is_optimising=True)

            if accuracy > max_accuracy:
                max_accuracy = accuracy
                optimal_estimators = item['n_est']
                optimal_n_neighbours = item['n_neigh']

#         print(optimal_n_neighbours_predict)
        return max_accuracy, optimal_estimators, optimal_n_neighbours
Example #27
    def single_batch(self, tweets):
        """Performs an approximate nearest neighbors search on tweets in the database
        passed to it. The database must be a list of tweets (text of the tweets only).
        
        Returns the indices of tweets with nearby neighbors (i.e. spam tweets).
        These indices correspond to indices within the batch of tweets fed to
        this function."""

        # Vectorize and fit tree:
        vect2 = CountVectorizer(stop_words=self.common_twitter_handles)
        X2 = vect2.fit_transform(tweets)
        tree2 = LSHForest()
        tree2.fit(X2)

        # Build tree:
        n_neighbors = []
        neighbors_indices = []
        for x in vect2.transform(tweets):
            if len(n_neighbors) % 100 == 0:
                print "%r tweets analyzed out of %r for this batch" % (
                    len(n_neighbors), self.batch_size)
            neighbors = tree2.radius_neighbors(x, radius=.4)[1]
            n_neighbors.append(len(neighbors[0]))
            neighbors_indices.append(neighbors)

        neighbors_indices = [
            x for x in range(len(neighbors_indices))
            if len(neighbors_indices[x][0]) > 2
        ]

        return neighbors_indices
    def __init__(self, params: Dict[str, Any]):
        # Location of corpus to use for background knowledge search. This corpus is assumed to be
        # gzipped, one sentence per line.
        self.corpus_path = params.pop('corpus_path', None)

        # Number of background sentences to collect for each input.
        self.num_background = params.pop('num_background', 10)
        # Wait this many epochs before running differentiable search. This lets you train with the
        # base memory network code using external background knowledge for a time, then, once the
        # encoder is trained sufficiently, you can turn on the differentiable search.
        self.num_epochs_delay = params.pop('num_epochs_delay', 10)

        # Number of epochs we wait in between re-encoding the corpus.
        # TODO(matt): consider only re-encoding at early stopping, instead of a
        # number-of-epoch-based parameter.
        self.num_epochs_per_encoding = params.pop('num_epochs_per_encoding', 2)

        # Only meaningful if you are loading a model.  When loading, should we load a pickled LSH,
        # or should we re-initialize the LSH from the input corpus?  Note that if you give a corpus
        # path, and you load a saved LSH that was constructed from a _different_ corpus, you could
        # end up with really weird behavior.
        self.load_saved_lsh = params.pop('load_saved_lsh', False)

        # Now that we've popped our parameters, we can call the superclass constructor.
        super(DifferentiableSearchMemoryNetwork, self).__init__(params)

        # And then set some member variables.
        self._sentence_encoder_model = self.__build_sentence_encoder_model()
        self.lsh = LSHForest(random_state=12345)
        self.instance_index = {}  # type: Dict[int, str]
def test_hash_functions():
    """Checks randomness of hash functions.

    Variance and mean of each hash function (projection vector)
    should be different from flattened array of hash functions.
    If hash functions are not randomly built (seeded with
    same value), variances and means of all functions are equal.
    """
    n_samples = 12
    n_features = 2
    n_estimators = 5
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest(n_estimators=n_estimators,
                     random_state=rng.randint(0, np.iinfo(np.int32).max))
    lshf.fit(X)

    hash_functions = []
    for i in range(n_estimators):
        hash_functions.append(lshf.hash_functions_[i].components_)

    for i in range(n_estimators):
        assert_not_equal(np.var(hash_functions),
                         np.var(lshf.hash_functions_[i].components_))

    for i in range(n_estimators):
        assert_not_equal(np.mean(hash_functions),
                         np.mean(lshf.hash_functions_[i].components_))
def test_fit():
    """Checks whether `fit` method sets all attribute values correctly."""
    n_samples = 12
    n_features = 2
    n_estimators = 5
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest(n_estimators=n_estimators)
    lshf.fit(X)

    # _input_array = X
    assert_array_equal(X, lshf._fit_X)
    # A hash function g(p) for each tree
    assert_equal(n_estimators, len(lshf.hash_functions_))
    # Hash length = 32
    assert_equal(32, lshf.hash_functions_[0].components_.shape[0])
    # Number of trees_ in the forest
    assert_equal(n_estimators, len(lshf.trees_))
    # Each tree has entries for every data point
    assert_equal(n_samples, len(lshf.trees_[0]))
    # Original indices after sorting the hashes
    assert_equal(n_estimators, len(lshf.original_indices_))
    # Each set of original indices in a tree has entries for every data point
    assert_equal(n_samples, len(lshf.original_indices_[0]))
Example #31
def test_neighbors_accuracy_with_n_estimators():
    """Checks whether accuracy increases as `n_estimators` increases."""
    n_estimators = np.array([1, 10, 100])
    n_samples = 100
    n_features = 10
    n_iter = 10
    n_points = 5
    rng = np.random.RandomState(42)
    accuracies = np.zeros(n_estimators.shape[0], dtype=float)
    X = rng.rand(n_samples, n_features)

    for i, t in enumerate(n_estimators):
        lshf = LSHForest(n_candidates=500, n_estimators=t)
        lshf.fit(X)
        for j in range(n_iter):
            query = X[rng.randint(0, n_samples)]
            neighbors = lshf.kneighbors(query,
                                        n_neighbors=n_points,
                                        return_distance=False)
            distances = pairwise_distances(query, X, metric='cosine')
            ranks = np.argsort(distances)[0, :n_points]

            intersection = np.intersect1d(ranks, neighbors).shape[0]
            ratio = intersection / float(n_points)
            accuracies[i] = accuracies[i] + ratio

        accuracies[i] = accuracies[i] / float(n_iter)
    # Sorted accuracies should be equal to original accuracies
    assert_true(np.all(np.diff(accuracies) >= 0),
                msg="Accuracies are not non-decreasing.")
    # Highest accuracy should be strictly greater than the lowest
    assert_true(np.ptp(accuracies) > 0,
                msg="Highest accuracy is not strictly greater than lowest.")
Example #32
def build_index(data, n_estimators=20, n_candidates=100, n_neighbors=10, seed=0):
    lshf = LSHForest(n_estimators=n_estimators, n_candidates=n_candidates,
                     n_neighbors=n_neighbors, random_state=seed)
    t0 = time()
    lshf.fit(data)
    duration = time() - t0
    return lshf, duration
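
For instance, build_index() could be timed on random data as below; this assumes its module has `from time import time` and an older scikit-learn (< 0.21) with LSHForest, and the sizes are arbitrary.

import numpy as np

data = np.random.rand(2000, 64)
lshf, seconds = build_index(data, n_estimators=20, n_candidates=100, n_neighbors=10, seed=0)
print("built index in %.2f s" % seconds)
dist, idx = lshf.kneighbors(data[:5], n_neighbors=10)    # query the first five rows
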
Example #33
    def fit_model(self, data, n_estimators, n_neighbours):

        LSHf = LSHForest(random_state=42,
                         n_estimators=n_estimators,
                         n_neighbors=n_neighbours)
        LSHf.fit(data)
        return LSHf
Example #34
 def create_tree(self,listNames,variableName):
     #LSHForest. only once for the main database
     lshf = LSHForest(n_estimators=50,n_candidates=500)
     TF, tfidfs = self.create_TDIDF(self.tokenize(listNames))
     lshf.fit(tfidfs)        
     pickle.dump(lshf,open("{0}/{1}_lshf.dump".format(self.folderSaveData,variableName),"wb+"))
     pickle.dump(listNames,open("{0}/{1}_listNames.dump".format(self.folderSaveData,variableName),"wb+"))
     pickle.dump(TF,open("{0}/{1}_TF.dump".format(self.folderSaveData,variableName),"wb+"))
Example #35
 def _fit(self, xs):
     """ Fit index
     :param samples: list of Samples
     :return:
     """
     self.index = LSHForest(
         n_estimators=self.parameters.get('n_estimators', 20))
     self.index.fit(xs)
def hash_movie_similarity(um, num_neighbors=6):
    lsh = LSHForest(random_state=470957)
    lsh.fit(um.T)

    # Don't compare to self, remove first column, call 7 neighbors
    dist, ind = lsh.kneighbors(um.T, n_neighbors=num_neighbors+1, return_distance=True)
    sim = 1 - dist
    return sim[:,1:], ind[:,1:]
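
hash_movie_similarity() expects a user x movie matrix um and hashes its transpose, so each movie becomes one point. A small sketch with random ratings, assuming numpy and that LSHForest (scikit-learn < 0.21) is imported in the module defining the function:

import numpy as np

um = np.random.rand(50, 20)               # hypothetical ratings: 50 users x 20 movies
sim, ind = hash_movie_similarity(um, num_neighbors=6)
print(sim.shape, ind.shape)               # (20, 6) each: similar movies per movie
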
def test_radius_neighbors_boundary_handling():
    X = [[0.999, 0.001], [0.5, 0.5], [0, 1.], [-1., 0.001]]
    n_points = len(X)

    # Build an exact nearest neighbors model as reference model to ensure
    # consistency between exact and approximate methods
    nnbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)

    # Build a LSHForest model with hyperparameter values that always guarantee
    # exact results on this toy dataset.
    lsfh = LSHForest(min_hash_match=0, n_candidates=n_points,
                     random_state=42).fit(X)

    # define a query aligned with the first axis
    query = [[1., 0.]]

    # Compute the exact cosine distances of the query to the four points of
    # the dataset
    dists = pairwise_distances(query, X, metric='cosine').ravel()

    # The first point is almost aligned with the query (very small angle),
    # the cosine distance should therefore be almost null:
    assert_almost_equal(dists[0], 0, decimal=5)

    # The second point form an angle of 45 degrees to the query vector
    assert_almost_equal(dists[1], 1 - np.cos(np.pi / 4))

    # The third point is orthogonal from the query vector hence at a distance
    # exactly one:
    assert_almost_equal(dists[2], 1)

    # The last point is almost colinear but with opposite sign to the query
    # therefore it has a cosine 'distance' very close to the maximum possible
    # value of 2.
    assert_almost_equal(dists[3], 2, decimal=5)

    # If we query with a radius of one, all the samples except the last sample
    # should be included in the results. This means that the third sample
    # is lying on the boundary of the radius query:
    exact_dists, exact_idx = nnbrs.radius_neighbors(query, radius=1)
    approx_dists, approx_idx = lsfh.radius_neighbors(query, radius=1)

    assert_array_equal(np.sort(exact_idx[0]), [0, 1, 2])
    assert_array_equal(np.sort(approx_idx[0]), [0, 1, 2])
    assert_array_almost_equal(np.sort(exact_dists[0]), dists[:-1])
    assert_array_almost_equal(np.sort(approx_dists[0]), dists[:-1])

    # If we perform the same query with a slightly lower radius, the third
    # point of the dataset that lay on the boundary of the previous query
    # is now rejected:
    eps = np.finfo(np.float64).eps
    exact_dists, exact_idx = nnbrs.radius_neighbors(query, radius=1 - eps)
    approx_dists, approx_idx = lsfh.radius_neighbors(query, radius=1 - eps)

    assert_array_equal(np.sort(exact_idx[0]), [0, 1])
    assert_array_equal(np.sort(approx_idx[0]), [0, 1])
    assert_array_almost_equal(np.sort(exact_dists[0]), dists[:-2])
    assert_array_almost_equal(np.sort(approx_dists[0]), dists[:-2])
Example #38
def test_radius_neighbors_boundary_handling():
    X = [[0.999, 0.001], [0.5, 0.5], [0, 1.], [-1., 0.001]]
    n_points = len(X)

    # Build an exact nearest neighbors model as reference model to ensure
    # consistency between exact and approximate methods
    nnbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)

    # Build a LSHForest model with hyperparameter values that always guarantee
    # exact results on this toy dataset.
    lsfh = LSHForest(min_hash_match=0, n_candidates=n_points,
                     random_state=42).fit(X)

    # define a query aligned with the first axis
    query = [[1., 0.]]

    # Compute the exact cosine distances of the query to the four points of
    # the dataset
    dists = pairwise_distances(query, X, metric='cosine').ravel()

    # The first point is almost aligned with the query (very small angle),
    # the cosine distance should therefore be almost null:
    assert_almost_equal(dists[0], 0, decimal=5)

    # The second point form an angle of 45 degrees to the query vector
    assert_almost_equal(dists[1], 1 - np.cos(np.pi / 4))

    # The third point is orthogonal from the query vector hence at a distance
    # exactly one:
    assert_almost_equal(dists[2], 1)

    # The last point is almost colinear but with opposite sign to the query
    # therefore it has a cosine 'distance' very close to the maximum possible
    # value of 2.
    assert_almost_equal(dists[3], 2, decimal=5)

    # If we query with a radius of one, all the samples except the last sample
    # should be included in the results. This means that the third sample
    # is lying on the boundary of the radius query:
    exact_dists, exact_idx = nnbrs.radius_neighbors(query, radius=1)
    approx_dists, approx_idx = lsfh.radius_neighbors(query, radius=1)

    assert_array_equal(np.sort(exact_idx[0]), [0, 1, 2])
    assert_array_equal(np.sort(approx_idx[0]), [0, 1, 2])
    assert_array_almost_equal(np.sort(exact_dists[0]), dists[:-1])
    assert_array_almost_equal(np.sort(approx_dists[0]), dists[:-1])

    # If we perform the same query with a slightly lower radius, the third
    # point of the dataset that lay on the boundary of the previous query
    # is now rejected:
    eps = np.finfo(np.float64).eps
    exact_dists, exact_idx = nnbrs.radius_neighbors(query, radius=1 - eps)
    approx_dists, approx_idx = lsfh.radius_neighbors(query, radius=1 - eps)

    assert_array_equal(np.sort(exact_idx[0]), [0, 1])
    assert_array_equal(np.sort(approx_idx[0]), [0, 1])
    assert_array_almost_equal(np.sort(exact_dists[0]), dists[:-2])
    assert_array_almost_equal(np.sort(approx_dists[0]), dists[:-2])
Example #39
class LSH_KNN:
    def __init__(self, weights='uniform', **kwargs):
        self.n_neighbors = kwargs['n_neighbors']
        self.lsh = LSHForest(**kwargs)
        self.weights = weights

    def fit(self, X, y):
        self.y = y
        self.X = X
        self.lsh.fit(X)

    def predict_top_n(self, test_X, n):
        _, indices = self.lsh.kneighbors(test_X, n_neighbors=self.n_neighbors)
        votes = np.zeros((len(test_X), n))
        for i in range(len(indices)):
            votes[i] = np.bincount([self.y[j] for j in indices[i]]).argsort()[-n:][::-1]
        return votes.astype(int)

    def predict_proba(self, test_X, return_dists=False):
        # SMOOTHING PARAMETER TO PREVENT 0 PROBA; https://stats.stackexchange.com/questions/83600/how-to-obtain-the-class-conditional-probability-when-using-knn-classifier
        s = 0.1
        _, neighbor_indices = self.lsh.kneighbors(test_X, n_neighbors=self.n_neighbors)
        dists = []
        proba = np.zeros((len(test_X), np.amax(self.y) + 1))
        for test_point in range(len(neighbor_indices)):
            if self.weights == 'uniform':
                weights = np.ones(len(neighbor_indices[test_point]))
            elif self.weights == 'distance':
                weights = [1 / self.dist(test_X[test_point], self.X[j]) for j in neighbor_indices[test_point]]
            weighted_class_counts = np.bincount([self.y[j] for j in neighbor_indices[test_point]], weights=weights, minlength=np.amax(self.y) + 1)
            proba[test_point] = np.true_divide(weighted_class_counts + s, np.sum(weighted_class_counts) + len(weighted_class_counts)*s)
            if return_dists:
                test_point_dists = {}
                for neighbor_index in neighbor_indices[test_point]:
                    if self.y[neighbor_index] not in test_point_dists:
                        test_point_dists[self.y[neighbor_index]] = []
                    test_point_dists[self.y[neighbor_index]].append(self.dist(test_X[test_point], self.X[neighbor_index]))
                dists.append(test_point_dists)
        if return_dists:
            return proba, dists
        return proba

    def predict(self, test_X):
        _, neighbor_indices = self.lsh.kneighbors(test_X, n_neighbors=self.n_neighbors)
        result = np.zeros(len(test_X))
        for test_point in range(len(neighbor_indices)):
            if self.weights == 'uniform':
                weights = np.ones(len(neighbor_indices[test_point]))
            elif self.weights == 'distance':
                weights = [1 / self.dist(test_X[test_point], self.X[j]) for j in neighbor_indices[test_point]]
            weighted_class_counts = np.bincount([self.y[j] for j in neighbor_indices[test_point]], weights=weights)
            result[test_point] = np.argmax(weighted_class_counts)
        return result.astype(int)

    def dist(self, a, b):
        return np.linalg.norm(a - b)
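
A toy classification run for LSH_KNN, assuming integer labels, numpy as np, and the old scikit-learn LSHForest behind it; the two separated blobs are illustrative only.

import numpy as np

rng = np.random.RandomState(0)
X = np.vstack([rng.rand(30, 4), rng.rand(30, 4) + 2.0])   # two separated blobs
y = np.array([0] * 30 + [1] * 30)
clf = LSH_KNN(weights='uniform', n_neighbors=5)
clf.fit(X, y)
print(clf.predict(X[:5]))                 # predicted labels for the first five points
print(clf.predict_proba(X[:5]).shape)     # (5, 2): smoothed class probabilities
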
Example #40
def trainLSH(train, test, val):
    n_feat = train[0].size
    train_data = train[:, :-1]
    train_labels = train[:, n_feat - 1]
    val_data = val[:, :-1]
    val_labels = val[:, n_feat - 1]
    lshf = LSHForest(random_state=42)
    lshf.fit(train_data)
    countarrLSH = lshFunct(test, val, n_feat, lshf, train_labels)
    return countarrLSH
 def __vectorize_corpus(self):
     self.lsh = LSHForest(n_estimators=200, 
                          n_neighbors=self.num_topics)
     self.vectorized_docs = []
     for text in self.texts:
         bow = self.dictionary.doc2bow(text)
         vectorized_doc = [x[1] for x in self.model.get_document_topics(bow, 
                                                                       minimum_probability=0.0)]
         self.vectorized_docs.append(vectorized_doc)
     self.lsh.fit(self.vectorized_docs)
Example #42
 def BuildModel(self, data, labels):
   # Create and train the classifier.
   lshf = LSHForest(n_estimators = self.n_estimators,
                    min_hash_match = self.min_hash_match,
                    n_candidates = self.n_candidates,
                    radius_cutoff_ratio = self.radius_cutoff_ratio,
                    radius = self.radius,
                    n_neighbors = self.n_neighbors)
   lshf.fit(data)
   return lshf
Example #43
class LSHForestSearch:
    def __init__(self, features, k):
        self.lshf = LSHForest(n_estimators=1, n_candidates=1,
                     n_neighbors=k)
        self.k = k
        
        self.lshf.fit(features)
        
    def search(self, features):
        
        return self.lshf.kneighbors(features, return_distance=False, n_neighbors=self.k)    
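
LSHForestSearch above is a thin wrapper; a usage sketch on random features (old scikit-learn with LSHForest assumed, sizes arbitrary):

import numpy as np

features = np.random.rand(1000, 32)
searcher = LSHForestSearch(features, k=5)
print(searcher.search(features[:3]))      # (3, 5) array of approximate neighbor indices
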
 def __init__(self, docs):
     self.lshf = LSHForest(n_estimators=1, n_candidates=1, n_neighbors=1)
     self.dv = DictVectorizer()
     dicts = []
     for d in docs:
         dicts.append(dict([(w, 1) for w in d]))
     self.dv.fit(dicts)
     features = self.dv.transform(dicts)
     # floats are faster
     # features = csr_matrix(features, dtype=int)
     self.lshf.fit(features)
Example #45
class LHSForestEngine:

    def __init__(self):
        self.engine = LSHForest(random_state=42)
        self.name = "LHS"

    def fit(self, data):
        self.engine.fit(data)

    def dist(self, data):
        distances, indices = self.engine.kneighbors(data, n_neighbors=1)
        return distances.ravel()
Example #46
 def calculate_duplication_number(self,text_list):
     print "length is ", len(text_list)
     tf_vectorizer = CountVectorizer(stop_words=None,analyzer='word',ngram_range=(5,5))
     #print text_list
     tf = tf_vectorizer.fit_transform(text_list)
     #print tf_vectorizer.get_feature_names()
     print tf[0]
     #print tf[123]
     lshf = LSHForest()
     #print tf
     lshf.fit(tf)
     distance,index = lshf.kneighbors(tf,n_neighbors=1)
     print distance, index
Example #47
def startQuery():
    while True:

      try:
          ipt = raw_input('Directory of query:')
      except ImportError:
          print 'invalid type'
      else:
          query = ipt
      if query == 'exit()':
          break

      

    

      print 'loading query...'
      try:
          token = get_tokens_by_dir(query)
      except IOError:
          print 'invalid file name'
      else:
##########################################query preprocessing
           print 'query pre-processing...'
           stopped_tokens = [i for i in token if not i in en_stop]
           p_stemmer = PorterStemmer()
           stemed_tokens = []
           for i in stopped_tokens:
               try:
                   temp_token = str(p_stemmer.stem(i))
                   stemed_tokens.append(temp_token)
               except IndexError:
                   pass
           tokens = [stemed_tokens]
######################################################################################
           dictionary_new = corpora.Dictionary(tokens)
           corpus_new = [dictionary_new.doc2bow(text) for text in tokens]
           QUERY_TOPIC = np.zeros([1,num_topic]) ## topic vector for query

           new_topics = LDA[corpus_new]


           for i in new_topics[0]:
               print(i)
               QUERY_TOPIC[0,i[0]] = i[1] ##assign new topics to query doc-topic matrix

           print 'fetching results for you...'
           lshf = LSHForest(random_state=42)
           lshf.fit(DOC_TOPICS) ##fit the local sensitive hash forest with training data POINT_SET
           dist,indices=lshf.kneighbors(QUERY_TOPIC,n_neighbors=20)
           print indices
Example #48
def fit_lshf(data):
    logger.info('Fitting  LSHForest...')
    from sklearn.neighbors import LSHForest
    lshf = LSHForest(
        n_estimators=20,
        min_hash_match=4,
        n_candidates=200,
        n_neighbors=2,
        radius=1.0,
        radius_cutoff_ratio=0.9,
        random_state=None,
    )
    lshf.fit(data)
    return lshf
    def fit_model(self, model_type='brute', params=None):
        '''
        fits the chosen nearest-neighbor model, assuming the vector space has already been built
        '''

        if model_type == 'brute':
            self.model = NearestNeighbors(algorithm='brute', **params)
        elif model_type == 'lsh':
            self.model = LSHForest( **params)
        # elif model_type == 'annoy':
        #     self.model = Annoy(**params)

        self.model.fit(self.vector_space)
        print self.model        
Example #50
 def __init__(self, lsh_init=None):
     if lsh_init is None:
         self._lsh_forest = LSHForest(n_estimators=25, n_candidates=1000)
     else:
         self._lsh_forest = lsh_init 
     self.iw = None
     self.m = None
 def __init__(self):
     self.unknown = ''
     self.same_person_num = 1
     self.has_cal_dist = []
     self.NeighbourNum = 10
     # If the administrator uploads pictures, put them under the person's directory in all_pic_data_folder (the image file and feature file share the same filename)
     self.all_pic_feature_data_folder = '/data/liubo/face/research_feature_self'     # the research institute's model stores features directly
     # Keeping the pictures makes it easy to inspect results later, display them in the frontend, and let the administrator label them
     self.all_pic_data_folder = '/data/liubo/face/research_self'
     if not os.path.exists(self.all_pic_data_folder):
         os.makedirs(self.all_pic_data_folder)
     if not os.path.exists(self.all_pic_feature_data_folder):
         os.makedirs(self.all_pic_feature_data_folder)
     self.n_neighbors = 10
     self.lshf = LSHForest(n_estimators=20, n_candidates=200, n_neighbors=self.n_neighbors)
     self.all_labels = []
     self.all_pic_feature = []
     self.same_pic_id = 2
     self.must_be_same_id = 1
     self.must_be_not_same_id = 0
     self.maybe_same_id = 3
     self.new_person_str = 'new_person_'
     self.current_new_person_id = self.find_current_new_person_id()
     self.must_same_str = '_Must_Same'
     self.maybe_same_str = '_Maybe_same'
     self.load_time = time.time()
     self.user_count = {}
     self.upper_threshold = upper_verif_threshold
     self.lower_threshold = lower_verif_threshold
     self.same_pic_threshold = same_pic_threshold
     self.trans_dic = {self.same_pic_id: 'same_pic', self.must_be_same_id: 'must_same_id',
                       self.must_be_not_same_id: 'must_not_same_id', self.maybe_same_id: 'maybe_same_id'}
     self.nearest = deque(maxlen=nearest_num)
     self.verification_same_person = 0
 def __init__(self):
     self.unknown = ''
     self.same_person_num = 1
     self.has_save_pic_feature = []
     self.has_cal_dist = []
     self.NeighbourNum = 10
     self.all_pic_data_folder = '/data/liubo/face/self'
     self.other_dataset_para_add = 1
     self.n_neighbors = 5
     self.lshf = LSHForest(n_estimators=20, n_candidates=200, n_neighbors=self.n_neighbors)
     self.all_labels = []
     self.all_pic_feature = []
     self.same_pic_id = 2
     self.must_be_same_id = 1
     self.must_be_not_same_id = 0
     self.maybe_same_id = 3
     self.new_person_str = 'new_person_'
     self.current_new_person_id = self.find_current_new_person_id()
     self.must_same_str = '_Must_Same'
     self.maybe_same_str = '_Maybe_same'
     self.load_time = time.time()
     self.user_count = {}
     # different models use different thresholds
     self.upper_threshold = upper_verif_threshold
     self.lower_threshold = lower_verif_threshold
     self.same_pic_threshold = same_pic_threshold
     self.pitch_threshold = 20
     self.yaw_threshold = 20
     self.roll_threshold = 20
     #  [(time, feature), ..., (time, feature)]: compute the similarity between the current picture and the previous 5 pictures based on time (skipped if the time gap is large)
     self.nearest = deque(maxlen=nearest_num)
     self.trans_dic = {self.same_pic_id: 'same_pic', self.must_be_same_id: 'must_same_id',
                       self.must_be_not_same_id: 'must_not_same_id', self.maybe_same_id: 'maybe_same_id'}
     self.verification_same_person = 0
Example #53
 def vectorized(self, num_topics=DefaultSetting.NUMBER_TOPICS):
     self.lsh = LSHForest(n_estimators=DefaultSetting.HASH_SIZE, n_neighbors=10)
     docs_bow = [self.doc_corpus.dictionary.doc2bow(content.split(u' '))
                 for content in self.doc_corpus.documents]
     for doc_bow in docs_bow:
         vectorized_doc = [x[1] for x in self.model.get_document_topics(doc_bow, minimum_probability=0.0)]
         self.vectorized_docs.append(vectorized_doc)
     self.lsh.fit(self.vectorized_docs)
def cal_acc(pack_file, stat_file, feature_dim):
    f = open(stat_file, 'w')
    f.write('train_pic_num'+'\t'+'person_name'+'\t'+'acc'+'\n')
    pic_num = range(1, max_person_num)
    for num in pic_num:
        all_train_data, all_train_label, all_valid_data, all_valid_label = split_train_valid(pack_file, train_pic_num=num, feature_dim=feature_dim)
        lshf = LSHForest(n_estimators=20, n_candidates=200, n_neighbors=5)

        for index in range(len(all_train_data)):
            try:
                if all_train_data[index] is None:
                    continue
                lshf.partial_fit(all_train_data[index], all_train_label[index])
            except:
                traceback.print_exc()
                continue
        # For each person, compute the accuracy separately
        person_acc_dic = {}     # number of correct matches
        person_all_dic = {}     # total number of samples
        filter_num = 0
        all_num = 0
        for index in range(len(all_valid_data)):
            try:
                if all_valid_data[index] is None:
                    continue
                all_find_distance, all_find_index = lshf.kneighbors(all_valid_data[index], n_neighbors=5, return_distance=True)
                cos_sim = cosine_similarity(all_valid_data[index], all_train_data[all_find_index[0, 0]])
                label = all_train_label[all_find_index[0, 0]]
                # if cos_sim > sim_threshold:
                if True:
                    if label == all_valid_label[index]:
                        person_acc_dic[label] = person_acc_dic.get(label, 0) + 1
                        person_all_dic[label] = person_all_dic.get(label, 0) + 1
                    else:
                        person_all_dic[label] = person_all_dic.get(label, 0) + 1
                else:
                    filter_num += 1
                all_num += 1
            except:
                print all_valid_label[index]
                continue
        print 'train_num :', num, 'filter_rate: ', (filter_num * 1.0 / all_num)
        for person in person_all_dic:
            all_num = person_all_dic[person]
            right_num = person_acc_dic.get(person, 0)
            f.write('\t'.join(map(str, [num, person, (right_num * 1.0 /  all_num)]))+'\n')
def test_partial_fit():
    """Checks whether inserting array is consitent with fitted data.

    `partial_fit` method should set all attribute values correctly.
    """
    n_samples = 12
    n_samples_partial_fit = 3
    n_features = 2
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)
    X_partial_fit = rng.rand(n_samples_partial_fit, n_features)

    lshf = LSHForest()

    # Test unfitted estimator
    lshf.partial_fit(X)
    assert_array_equal(X, lshf._fit_X)

    lshf.fit(X)

    # Insert wrong dimension
    assert_raises(ValueError, lshf.partial_fit,
                  np.random.randn(n_samples_partial_fit, n_features - 1))

    lshf.partial_fit(X_partial_fit)

    # size of _input_array = samples + 1 after insertion
    assert_equal(lshf._fit_X.shape[0],
                 n_samples + n_samples_partial_fit)
    # size of original_indices_[1] = samples + 1
    assert_equal(len(lshf.original_indices_[0]),
                 n_samples + n_samples_partial_fit)
    # size of trees_[1] = samples + 1
    assert_equal(len(lshf.trees_[1]),
                 n_samples + n_samples_partial_fit)
def test_graphs():
    # Smoke tests for graph methods.
    n_samples_sizes = [5, 10, 20]
    n_features = 3
    rng = np.random.RandomState(42)

    for n_samples in n_samples_sizes:
        X = rng.rand(n_samples, n_features)
        lshf = LSHForest(min_hash_match=0)
        ignore_warnings(lshf.fit)(X)

        kneighbors_graph = lshf.kneighbors_graph(X)
        radius_neighbors_graph = lshf.radius_neighbors_graph(X)

        assert_equal(kneighbors_graph.shape[0], n_samples)
        assert_equal(kneighbors_graph.shape[1], n_samples)
        assert_equal(radius_neighbors_graph.shape[0], n_samples)
        assert_equal(radius_neighbors_graph.shape[1], n_samples)
def test_kneighbors():
    """Checks whether desired number of neighbors are returned.

    It is guaranteed to return the requested number of neighbors
    if `min_hash_match` is set to 0. Returned distances should be
    in ascending order.
    """
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest(min_hash_match=0)
    # Test unfitted estimator
    assert_raises(ValueError, lshf.kneighbors, X[0])

    lshf.fit(X)

    for i in range(n_iter):
        n_neighbors = rng.randint(0, n_samples)
        query = X[rng.randint(0, n_samples)]
        neighbors = lshf.kneighbors(query, n_neighbors=n_neighbors,
                                    return_distance=False)
        # Desired number of neighbors should be returned.
        assert_equal(neighbors.shape[1], n_neighbors)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.kneighbors(queries,
                                           n_neighbors=1,
                                           return_distance=True)
    assert_equal(neighbors.shape[0], n_queries)
    assert_equal(distances.shape[0], n_queries)
    # Test only neighbors
    neighbors = lshf.kneighbors(queries, n_neighbors=1,
                                return_distance=False)
    assert_equal(neighbors.shape[0], n_queries)
    # Test random point(not in the data set)
    query = rng.randn(n_features)
    lshf.kneighbors(query, n_neighbors=1,
                    return_distance=False)
    # Test n_neighbors at initialization
    neighbors = lshf.kneighbors(query, return_distance=False)
    assert_equal(neighbors.shape[1], 5)
    # Test `neighbors` has an integer dtype
    assert_true(neighbors.dtype.kind == 'i',
                msg="neighbors are not in integer dtype.")
def lshf_scikit(data, n_neighbors=4,
               n_estimators=10,
               min_hash_match=4,
               n_candidates=10,
               random_state=None):
   n_neighbors += 1

   # initialize nearest neighbor model
   nbrs = LSHForest(n_neighbors=n_neighbors,
                    n_estimators=n_estimators,
                    min_hash_match=min_hash_match,
                    n_candidates=n_candidates,
                    random_state=random_state)

   # fit nearest neighbor model to the data
   nbrs.fit(data)

   # return the distances and indices
   return nbrs.kneighbors(data)
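
lshf_scikit() returns the (distances, indices) pair for every point in data queried against the fitted forest, including the point itself as the first neighbor. A quick check on random data, assuming scikit-learn < 0.21:

import numpy as np

data = np.random.rand(100, 8)
dists, inds = lshf_scikit(data, n_neighbors=4)
print(dists.shape, inds.shape)            # (100, 5) each: 4 neighbors plus the point itself
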
def cal_recall(pack_file, stat_file, feature_dim):
    # f_model = open('verf.txt', 'w')
    f = open(stat_file, 'w')
    f.write('train_pic_num'+'\t'+'person_name'+'\t'+'recall'+'\n')
    pic_num = range(1, max_person_num)
    for num in pic_num:
        all_train_data, all_train_label, all_valid_data, all_valid_label = split_train_valid(pack_file, train_pic_num=num, feature_dim=feature_dim)
        lshf = LSHForest(n_estimators=20, n_candidates=200, n_neighbors=5)
        for index in range(len(all_train_data)):
            try:
                if all_train_data[index] is None:
                    continue
                lshf.partial_fit(all_train_data[index], all_train_label[index])
            except:
                continue
        # For each person, compute the accuracy separately
        person_find_dic = {}     # number of correct matches
        person_all_dic = {}     # total number of samples
        for index in range(len(all_valid_data)):
            try:
                if all_valid_data[index] is None:
                    continue
                all_find_distance, all_find_index = lshf.kneighbors(all_valid_data[index], n_neighbors=5, return_distance=True)
                cos_sim = cosine_similarity(all_valid_data[index], all_train_data[all_find_index[0, 0]])
                label = all_train_label[all_find_index[0, 0]]
                real_label = all_valid_label[index]
                # if cos_sim > sim_threshold:
                if True:
                    if label == real_label:
                        # f_model.write('0'+'\t'+str(cos_sim)+'\n')
                        person_find_dic[real_label] = person_find_dic.get(real_label, 0) + 1
                        person_all_dic[real_label] = person_all_dic.get(real_label, 0) + 1
                    else:
                        # f_model.write('1' + '\t' + str(cos_sim) + '\n')
                        person_all_dic[real_label] = person_all_dic.get(real_label, 0) + 1
            except:
                print all_valid_label[index]
                continue
        print 'train_num :', num
        for person in person_all_dic:
            all_num = person_all_dic[person]
            right_num = person_find_dic.get(person, 0)
            f.write('\t'.join(map(str, [num, person, (right_num * 1.0 /  all_num)]))+'\n')
def test_radius_neighbors():
    """Checks whether Returned distances are less than `radius`

    At least one point should be returned when the `radius` is set
    to mean distance from the considering point to other points in
    the database.
    Moreover, this test compares the radius neighbors of LSHForest
    with the `sklearn.neighbors.NearestNeighbors`.
    """
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest()
    # Test unfitted estimator
    assert_raises(ValueError, lshf.radius_neighbors, X[0])

    lshf.fit(X)

    for i in range(n_iter):
        query = X[rng.randint(0, n_samples)]
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        neighbors = lshf.radius_neighbors(query, radius=mean_dist,
                                          return_distance=False)
        # At least one neighbor should be returned.
        assert_greater(neighbors.shape[0], 0)
        # All distances should be less than mean_dist
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_array_less(distances[0], mean_dist)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.radius_neighbors(queries,
                                                 return_distance=True)
    assert_equal(neighbors.shape[0], n_queries)
    assert_equal(distances.shape[0], n_queries)
    # dists and inds should not be 2D arrays
    assert_equal(distances.ndim, 1)
    assert_equal(neighbors.ndim, 1)

    # Compare with exact neighbor search
    query = X[rng.randint(0, n_samples)]
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine')
    nbrs.fit(X)

    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)
    distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)
    # Distances of exact neighbors is less than or equal to approximate
    assert_true(np.all(np.less_equal(np.sort(distances_exact[0]),
                                     np.sort(distances_approx[0]))))