Example #1
def text_hist():
    """
    Calculate histogram of text of images
    """
    with open('data/sift_names.pkl', 'r') as f:
        names = cPickle.load(f)
    with open('data/sift_hist.pkl', 'r') as f:
        sift_hists = cPickle.load(f)
    filenames = []
    for name in names:
        name = name.replace('img', 'descr')
        name = name.replace('.jpg', '.txt')
        filenames.append('shopping/images/' + name)
    vectorizer = CountVectorizer(input='filename', token_pattern=r'(?u)\w+', ngram_range=(1, 1), min_df=2)
    xall_transformed = vectorizer.fit_transform(filenames).tocsr()
    preprocessing.normalize(xall_transformed, copy=False)

    lamb = .5
    hists = scipy.sparse.hstack([xall_transformed * lamb, sift_hists * (1-lamb)]).toarray()
    preprocessing.normalize(hists, copy=False)
    model = LSHForest()
    model.fit(hists)
    with open('data/text_hist.pkl', 'w') as f:
        cPickle.dump(xall_transformed, f)
    with open('data/vectorizer.pkl', 'w') as f:
        cPickle.dump(vectorizer, f)
    with open('data/lshforest_combine.pkl', 'w') as f:
        cPickle.dump(model, f)
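The indexing step above reduces to: L2-normalize two feature blocks, mix them with a weight lamb, and fit an LSHForest on the concatenation. Below is a minimal sketch of that pattern on synthetic sparse matrices (the shapes and the 0.5 weight are illustrative only); note that LSHForest lived in sklearn.neighbors up to scikit-learn 0.20 and was removed in 0.21, so it only runs on older versions.

import numpy as np
import scipy.sparse
from sklearn import preprocessing
from sklearn.neighbors import LSHForest  # scikit-learn < 0.21 only

rng = np.random.RandomState(0)
text_hists = scipy.sparse.random(100, 50, density=0.1, format='csr', random_state=rng)
sift_hists = scipy.sparse.random(100, 200, density=0.1, format='csr', random_state=rng)

# L2-normalize each block, mix with weight lamb, then normalize the combination again
preprocessing.normalize(text_hists, copy=False)
preprocessing.normalize(sift_hists, copy=False)
lamb = 0.5
combined = scipy.sparse.hstack([text_hists * lamb, sift_hists * (1 - lamb)]).toarray()
preprocessing.normalize(combined, copy=False)

model = LSHForest()
model.fit(combined)
print(model.kneighbors(combined[:1], n_neighbors=3, return_distance=False))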
Example #2
 def fit_lsh(self):
     self.lsh = LSHForest(random_state=12345)
     train_data = [
         self.encode_sentence(self.indexed_background[i], True)
         for i in range(len(self.indexed_background))
     ]
     self.lsh.fit(train_data)
Example #3
def knn_indices_func_approx(
        rep_pts: FloatTensor,  # (N, pts, dim)
        pts: FloatTensor,  # (N, x, dim)
        K: int,
        D: int) -> LongTensor:  # (N, pts, K)
    """
    Approximate CPU-based Indexing function based on K-Nearest Neighbors search.
    :param rep_pts: Representative points.
    :param pts: Point cloud to get indices from.
    :param K: Number of nearest neighbors to collect.
    :param D: "Spread" of neighboring points.
    :return: Array of indices, P_idx, into pts such that pts[n][P_idx[n], :]
    is the set of K nearest neighbors for the representative points in rep_pts[n].
    """
    if rep_pts.is_cuda:
        rep_pts = rep_pts.cpu()
    if pts.is_cuda:
        pts = pts.cpu()
    rep_pts = rep_pts.data.numpy()
    pts = pts.data.numpy()

    region_idx = []

    for n, p in enumerate(rep_pts):
        P_particular = pts[n]
        lshf = LSHForest(n_estimators=20,
                         n_candidates=100,
                         n_neighbors=D * K + 1)
        lshf.fit(P_particular)
        indices = lshf.kneighbors(p, return_distance=False)
        region_idx.append(indices[:, 1::D])

    # Stack the per-cloud index arrays into the (N, pts, K) LongTensor described
    # in the docstring (assumes numpy and torch are imported as np / torch).
    return torch.from_numpy(np.stack(region_idx, axis=0))
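The slicing is the point of interest: the forest is asked for D * K + 1 neighbors, column 0 (the query point itself) is dropped, and every D-th remaining column is kept, which yields a "dilated" neighborhood of K indices. A self-contained sketch of that indexing on random points (all sizes are made up):

import numpy as np
from sklearn.neighbors import LSHForest  # scikit-learn < 0.21 only

K, D = 4, 2
pts = np.random.rand(128, 3)   # one point cloud: 128 points in 3-D
rep = pts[:8]                  # 8 representative points

lshf = LSHForest(n_estimators=20, n_candidates=100, n_neighbors=D * K + 1)
lshf.fit(pts)
idx = lshf.kneighbors(rep, return_distance=False)  # shape (8, D*K + 1)
dilated = idx[:, 1::D]                             # drop self, keep every D-th -> (8, K)
print(dilated.shape)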
Example #4
class EmbeddingNetworkBuilder:
    """ Basically a wrapper around sklearns LSH forest """
    def __init__(self, lsh_init=None):
        if lsh_init == None:
            self._lsh_forest = LSHForest(n_estimators=25, n_candidates=1000)
        else:
            self._lsh_forest = lsh_init
        self.iw = None
        self.m = None

    def fit_lsh_forest(self, embedding):
        self._lsh_forest.fit(embedding.m)
        self._embedding = embedding

    def extract_nn_network(self, nn=20):
        dir_graph_mat = self._lsh_forest.kneighbors_graph(X=self._embedding.m,
                                                          n_neighbors=nn + 1)
        return dir_graph_mat

    def make_undirected(self, dir_graph_mat):
        nodes = set(range(dir_graph_mat.shape[0]))
        edges = set([])
        for node_i in range(dir_graph_mat.shape[0]):
            for node_j in dir_graph_mat[node_i].nonzero()[1]:
                edges.add((node_i, node_j))
        return nodes, edges

    def get_forest(self):
        return self._lsh_forest

    def get_node_to_word(self):
        return self.iw
Example #5
def lof(X, k, outlier_threshold=1.5, verbose=False):
    """Knn with KD trees"""
    start = time.time()
    lshf = LSHForest(random_state=42)
    lshf.fit(X)

    distance, index = lshf.kneighbors(X, n_neighbors=k)
    distance, index = distance[:, 1:], index[:, 1:]
    radius = distance[:, -1]

    """Calculate LRD."""
    LRD = np.mean(np.maximum(distance, radius[index]), axis=1)
    r = 1. / np.array(LRD)

    """Calculate outlier score."""
    outlier_score = np.sum(r[index], axis=1) / np.array(r, dtype=np.float16)
    outlier_score *= 1. / k

    # print ('Compute time: %g seconds.' % ((time.time() - start)))

    if verbose: print("Recording all outliers with outlier score greater than %s." \
                      % (outlier_threshold))

    outliers = []
    """ Could parallelize this for loop, but really not worth the overhead...
        Would get insignificant performance gain."""
    for i, score in enumerate(outlier_score):
        if score > outlier_threshold:
            outliers.append([i,X[i], score])

    if verbose:
        print("Detected outliers:")
        print(outliers)

    return outliers
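A hypothetical call to the lof() helper above on synthetic 2-D data with one planted outlier; the k value and threshold are arbitrary, and the module is assumed to already import time, numpy as np, and LSHForest as the function requires.

import numpy as np

rng = np.random.RandomState(0)
X = np.vstack([rng.rand(200, 2), [[10.0, 10.0]]])   # one obvious outlier at the end
for i, point, score in lof(X, k=10, outlier_threshold=1.5, verbose=True):
    print(i, point, score)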
Example #6
class EmbeddingNetworkBuilder:
    """ Basically a wrapper around sklearns LSH forest """

    def __init__(self, lsh_init=None):
        if lsh_init == None:
            self._lsh_forest = LSHForest(n_estimators=25, n_candidates=1000)
        else:
            self._lsh_forest = lsh_init 
        self.iw = None
        self.m = None

    def fit_lsh_forest(self, embedding):
        self._lsh_forest.fit(embedding.m)
        self._embedding = embedding

    def extract_nn_network(self, nn=20):
        dir_graph_mat = self._lsh_forest.kneighbors_graph(X=self._embedding.m, n_neighbors=nn+1)
        return dir_graph_mat

    def make_undirected(self, dir_graph_mat):
        nodes = set(range(dir_graph_mat.shape[0]))
        edges = set([])
        for node_i in range(dir_graph_mat.shape[0]):
            for node_j in dir_graph_mat[node_i].nonzero()[1]:
                edges.add((node_i, node_j))
        return nodes, edges

    def get_forest(self):
        return self._lsh_forest
    
    def get_node_to_word(self):
        return self.iw
Example #7
    def single_batch(self, tweets):
        """Performs an approximate nearest neighbors search on tweets in the database
        passed to it. The database must be a list of tweets (text of the tweets only).
        
        Returns the indices of tweets with nearby neighbors (i.e. spam tweets).
        These indices correspond to indices within the batch of tweets fed to
        this function."""

        # Vectorize and fit tree:
        vect2 = CountVectorizer(stop_words = self.common_twitter_handles)
        X2 = vect2.fit_transform(tweets)
        tree2 = LSHForest()
        tree2.fit(X2)

        # Build tree:
        n_neighbors = []
        neighbors_indices = []
        for x in vect2.transform(tweets):
            if len(n_neighbors) % 100 == 0: print "%r tweets analyzed out of %r for this batch" % (len(n_neighbors), self.batch_size)
            neighbors = tree2.radius_neighbors(x, radius = .4)[1]
            n_neighbors.append(len(neighbors[0]))
            neighbors_indices.append(neighbors)

        neighbors_indices = [x for x in range(len(neighbors_indices)) if len(neighbors_indices[x][0]) > 2]

        return neighbors_indices
Example #8
    def single_batch(self, tweets):
        """Performs an approximate nearest neighbors search on tweets in the database
        passed to it. The database must be a list of tweets (text of the tweets only).
        
        Returns the indices of tweets with nearby neighbors (i.e. spam tweets).
        These indices correspond to indices within the batch of tweets fed to
        this function."""

        # Vectorize and fit tree:
        vect2 = CountVectorizer(stop_words = self.custom_stop_words)
        X2 = vect2.fit_transform(tweets)
        tree2 = LSHForest()
        tree2.fit(X2)

        # Build tree:
        n_neighbors = []
        neighbors_indices = []
        working_batch_size = len(tweets)
        for x in vect2.transform(tweets):
            if len(n_neighbors) % 100 == 0: print "%r tweets analyzed out of %r for this batch" % (len(n_neighbors), working_batch_size)
            # Only deal with tweets that are longer than 3 words.
            neighbors = tree2.radius_neighbors(x, radius = self.sensitivity)[1]
            if x.getnnz() > 2:
                n_neighbors.append(len(neighbors[0]))
                neighbors_indices.append(neighbors)
            else:
                n_neighbors.append(1)
                neighbors_indices.append(np.array([np.array([0])]))

        neighbors_indices = [x for x in range(len(neighbors_indices)) if len(neighbors_indices[x][0]) > 2]

        return neighbors_indices
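Outside the class, the same idea is: vectorize the tweets with CountVectorizer, fit an LSHForest on the counts, and flag any tweet whose radius query returns more than two near-duplicates. A standalone sketch (the radius, the threshold, and min_hash_match=0 for the tiny toy corpus are illustrative choices; results can vary because the index is approximate):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import LSHForest  # scikit-learn < 0.21 only

tweets = ["win a free phone now", "win a free phone now!!!",
          "totally different message", "win a free phone now please"]

vect = CountVectorizer()
X = vect.fit_transform(tweets)
tree = LSHForest(min_hash_match=0).fit(X)

spam_like = []
for i, x in enumerate(vect.transform(tweets)):
    neighbors = tree.radius_neighbors(x, radius=0.4)[1]
    if len(neighbors[0]) > 2:          # the tweet itself plus at least two near-duplicates
        spam_like.append(i)
print(spam_like)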
Example #9
 def __init__(self, lsh_init=None):
     if lsh_init == None:
         self._lsh_forest = LSHForest(n_estimators=25, n_candidates=1000)
     else:
         self._lsh_forest = lsh_init
     self.iw = None
     self.m = None
Example #10
def train():
    # Build the matching corpus: 398,872 samples
    sku_names_texts = get_train_datas()
    sku_names_jieba = get_text_jieba(sku_names_texts)
    sku_names_with_spaces = []
    for sku_names in sku_names_jieba:
        sku_names_with_spaces.append(' '.join(sku_names))

    # Test data: 1,000 samples
    keywords_texts = get_test_datas()
    keywords_jieba = get_text_jieba(keywords_texts)
    keywords_with_spaces = []
    for keywords in keywords_jieba:
        keywords_with_spaces.append(' '.join(keywords))

    tfidf_vec = TfidfVectorizer(min_df=3, max_features=None, ngram_range=(1, 2), use_idf=1, smooth_idf=1, sublinear_tf=1)
    x_train = tfidf_vec.fit_transform(sku_names_with_spaces)

    lshf = LSHForest(random_state=42)
    #lshf.fit(np.array(x_train))
    lshf.fit(x_train)

    for i, kw in enumerate(keywords_with_spaces):
        x_test = tfidf_vec.transform([kw])
        distances, indices = lshf.kneighbors(x_test.toarray(), n_neighbors=1)
        idx = indices[0][0]
        print(i, "||", keywords_texts[i], "||", sku_names_texts[idx])

        with open("result/lsh_v1_results.txt", 'a', encoding='utf8') as wf:
            wf.write(str(i) + "||" + keywords_texts[i] + "||" + sku_names_texts[idx] + "\n")
Example #11
class Index(BaseIndex):
    """ LSH Forest Index
    """

    name = 'lsh_forest'

    def _fit(self, xs):
        """ Fit index
        :param samples: list of Samples
        :return:
        """
        self.index = LSHForest(
            n_estimators=self.parameters.get('n_estimators', 20))
        self.index.fit(xs)

    def _query(self, sample, k=5, **kwargs):
        """ Query index
        :param sample: Sample
        :param k:
        :param kwargs:
        :return:
        """
        x, _, = self.transform([sample])
        distances, idxs = self.index.kneighbors(x, n_neighbors=k + 1)
        neighbors = []
        for idx, d in zip(idxs[0], distances[0]):
            hashval = self.ys[idx]
            neighbors.append({
                'hashval': hashval,
                'similarity': min(1 - float(d), 1.0)
            })
        return neighbors
Example #12
class ScikitLearnLsh(NearestNeighborAlgorithm):
    """
    This ``NearestNeighborAlgorithm`` uses scikit-learn's implementation of a locality sensitive
    hash to find approximate nearest neighbors.

    Parameters
    ----------
    random_state: int, optional (default=12345)
        Used to initialize the LSHForest, so that runs are consistent.
    """
    def __init__(self, params: Dict[str, Any]):
        random_state = params.pop('random_state', 12345)
        self.lsh = LSHForest(random_state=random_state)

    def fit(self, vectors: List[numpy.array]):
        logger.info("Fitting LSH with %d vectors", len(vectors))
        self.lsh.fit(vectors)

    def get_neighbors(self, query_vector: numpy.array,
                      num_neighbors: int) -> List[Tuple[int, float]]:
        if len(query_vector.shape) == 1:
            query_vector = [query_vector]
        logger.info("Getting neighbors for %d vectors", len(query_vector))
        scores, neighbor_indices = self.lsh.kneighbors(
            query_vector, n_neighbors=num_neighbors)
        logger.info("Neighbors retrieved")
        result = [
            zip(neighbor_indices[i], scores[i])
            for i in range(len(neighbor_indices))
        ]
        if len(result) == 1:
            result = result[0]
        return result
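A hypothetical round trip with the wrapper above on random vectors (it assumes the module-level logger, the typing imports, and the NearestNeighborAlgorithm base class are available as in the original file); get_neighbors returns (index, distance) pairs for a single query.

import numpy

rng = numpy.random.RandomState(0)
vectors = [rng.rand(16) for _ in range(50)]

lsh_backend = ScikitLearnLsh({'random_state': 12345})
lsh_backend.fit(vectors)
for index, score in lsh_backend.get_neighbors(vectors[0], num_neighbors=3):
    print(index, score)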
Example #13
def CreateAndconfigureLSHForest(categories): # categories - dict: {name; vector}
    print("Creating LSHForest...")
    catArray = numpy.array(list(categories.values()))
    lshf = LSHForest(n_candidates=70, n_estimators=30, n_neighbors=TOP_N_COUNT)
    lshf.fit(catArray)
    print("LSHForest was created")
    return lshf
Example #14
def search_neighbors(request):
	designs = Design.objects.all()

	image_list = []
	for design in designs:
		image_list.append(str(design.uid) + ".png")

	d_geometry = settings.D_GEOMETRY
	designed_images = np.empty((len(image_list), d_geometry[0]*d_geometry[1]*3), dtype="float32")
	for i in range(len(image_list)):
		designed_images[i] = img2numpy_arr(settings.DESIGN_PATH + image_list[i]).reshape(d_geometry[0]*d_geometry[1]*3)
	designed_images /= 255
	
	lshf = LSHForest(random_state=42)
	lshf.fit(designed_images) 

	num = int(request.GET['num'])
	input_fname = str(request.GET['input'])
	input_image = img2numpy_arr(settings.DESIGN_PATH + input_fname)
	input_image = input_image.reshape(1, -1)/255
	_, indices = lshf.kneighbors(input_image, n_neighbors=num)

	similar_images = []
	for i in list(indices.reshape(-1)):
		similar_images.append({ 
			"image": str(designs[i].uid) + ".png", 
			"text": str(designs[i].history_text), 
			"like": int(designs[i].like),
			"filtered": str(designs[i].filtered)
		})

	return JsonResponse({
		"results": similar_images
	})
Example #15
def Classify(nlp, keywords,
             categories):  #keywords  - list; categories - dict: {name; vector}
    counterDict = Counter(keywords)  #optimization for keywords duplicates
    sumVector = numpy.zeros(nlp.vocab.vectors_length)

    #temp
    text = ' '.join(keywords)

    for word, repCount in counterDict.items():  # summarizing word vectors
        curVect = nlp(word).vector
        sumVector += (curVect * repCount)

    vec = nlp(text).vector
    sim = cosine_similarity(vec, sumVector)
    print("Sim: " + str(sim))

    catArray = numpy.array(list(categories.values()))
    catKeys = list(categories.keys())
    #tree = KDTree(catArray, metric='pyfunc', func=cosine_similarity)
    #dist, ind = tree.query(sumVector, k=TOP_N_COUNT) #.reshape(-1, 1)

    print("Creating LSHForest...")

    lshf = LSHForest(n_candidates=70, n_estimators=30, n_neighbors=TOP_N_COUNT)
    lshf.fit(catArray)
    print("LSHForest was created")

    print("Getting neighbors...")
    distances, indices = lshf.kneighbors(sumVector.reshape((1, -1)))
    print("Got neighbors.")

    for curIndex in numpy.nditer(indices):
        print("Found category: " + str(catKeys[curIndex]))
        print("with distance: " + str(distances))
Example #16
def persist_attraction_similarities_to_db():
    # build LSHForest model for reduced dimension dataset
    svd = TruncatedSVD(n_components=10, n_iter=7)
    red_dim_itemuserdf = svd.fit_transform(itemuserdf)
    item_user_model = LSHForest()
    item_user_model.fit(red_dim_itemuserdf)

    # persist attractions similarities to db
    K=20        # query for K neighbors
    k=10        # return k neighbors
    for i in range(itemuserdf.shape[0]):
        distance, indices = item_user_model.kneighbors(
            red_dim_itemuserdf[i].reshape(1, -1), n_neighbors=K
        )
        weights = 1 - distance
        for j in range(k):
            if i != indices[0][j]:
                e = SimilarAttractions(
                    attraction_id=Attraction.objects.filter(
                        app_id=int(i)).values('attraction_id')[0]['attraction_id'],
                    similar_attraction_id=Attraction.objects.filter(
                        app_id=int(indices[0][j])).values('attraction_id')[0]['attraction_id'],
                    similarity=weights[0][j],
                    ts=timezone.now()
                )
                e.save()
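Stripped of the Django models, the indexing part above is: reduce the item-user matrix with TruncatedSVD, fit an LSHForest on the reduced vectors, and convert the returned cosine distances to similarity weights as 1 - distance. A sketch on random data (matrix sizes, K, and the component count are illustrative):

import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import LSHForest  # scikit-learn < 0.21 only

rng = np.random.RandomState(0)
itemuser = rng.rand(300, 40)                    # items x users

svd = TruncatedSVD(n_components=10, n_iter=7)
reduced = svd.fit_transform(itemuser)

model = LSHForest().fit(reduced)
distance, indices = model.kneighbors(reduced[0].reshape(1, -1), n_neighbors=20)
weights = 1 - distance                          # cosine distance -> similarity
print(indices[0][:10])
print(weights[0][:10])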
Example #17
def test_fit():
    """Checks whether `fit` method sets all attribute values correctly."""
    n_samples = 12
    n_features = 2
    n_estimators = 5
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest(n_estimators=n_estimators)
    lshf.fit(X)

    # _input_array = X
    assert_array_equal(X, lshf._fit_X)
    # A hash function g(p) for each tree
    assert_equal(n_estimators, len(lshf.hash_functions_))
    # Hash length = 32
    assert_equal(32, lshf.hash_functions_[0].components_.shape[0])
    # Number of trees_ in the forest
    assert_equal(n_estimators, len(lshf.trees_))
    # Each tree has entries for every data point
    assert_equal(n_samples, len(lshf.trees_[0]))
    # Original indices after sorting the hashes
    assert_equal(n_estimators, len(lshf.original_indices_))
    # Each set of original indices in a tree has entries for every data point
    assert_equal(n_samples, len(lshf.original_indices_[0]))
Example #18
    def get_nearest_neighbor_iterable(self,
                                      graphlist,
                                      start_graphs,
                                      start_is_subset=True):

        # vectorize all
        graphlist = list(graphlist)
        graphlist_ = copy.deepcopy(graphlist)
        X = self.vectorizer.transform_single(graphlist_)

        start_graphs = list(start_graphs)
        graphlist_ = copy.deepcopy(start_graphs)
        Y = self.vectorizer.transform_single(graphlist_)

        forest = LSHForest()
        forest.fit(X)
        #http://scikit-learn.org/stable/modules/neighbors.html
        distances, indices = forest.kneighbors(Y, n_neighbors=2)

        # we just assume that this is short...
        index = 0
        if start_is_subset:
            index += 1

        #matches= ( X_index ,Y_index, distance  )
        matches = [(indices[i, index], i, distances[i, index])
                   for i in range(len(indices))]
        matches.sort()

        # this looks super confusing....
        #for index, graph in enumerate(selection_iterator(graphlist, [a[0] for a in matches])):
        #    yield ((graph, start_graphs[matches[index][1]], X[matches[index][0]]))
        # so i wrote this:,,, you may even get rid of the matches variable i think.. and use indices directly
        for Xi, Yi, dist in matches:
            yield ((start_graphs[Yi], graphlist[Xi], X[Xi]))
Example #19
def test_distances():
    """Checks whether returned neighbors are from closest to farthest."""
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest()
    lshf.fit(X)

    for i in range(n_iter):
        n_neighbors = rng.randint(0, n_samples)
        query = X[rng.randint(0, n_samples)]
        distances, neighbors = lshf.kneighbors(query,
                                               n_neighbors=n_neighbors,
                                               return_distance=True)
        # Returned neighbors should be from closest to farthest.
        assert_true(np.all(np.diff(distances[0]) >= 0))

        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_true(np.all(np.diff(distances[0]) >= 0))
Example #20
def test_hash_functions():
    """Checks randomness of hash functions.

    Variance and mean of each hash function (projection vector)
    should be different from flattened array of hash functions.
    If hash functions are not randomly built (seeded with
    same value), variances and means of all functions are equal.
    """
    n_samples = 12
    n_features = 2
    n_estimators = 5
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest(n_estimators=n_estimators,
                     random_state=rng.randint(0,
                                              np.iinfo(np.int32).max))
    lshf.fit(X)

    hash_functions = []
    for i in range(n_estimators):
        hash_functions.append(lshf.hash_functions_[i].components_)

    for i in range(n_estimators):
        assert_not_equal(np.var(hash_functions),
                         np.var(lshf.hash_functions_[i].components_))

    for i in range(n_estimators):
        assert_not_equal(np.mean(hash_functions),
                         np.mean(lshf.hash_functions_[i].components_))
Example #21
def score(factors):
    verifyCount = 3
    X, y = Sets.trainingSet
    test_set, databases = Sets.testSet
    X = FactorizeVectors(X, factors)
    test_set = FactorizeVectors(test_set, factors)
    correctionAverage = 0
    for i in range(verifyCount):
        best_predictions = 0
        clf = LSHForest(n_estimators = 10, n_candidates = 10)
        clf.fit(X)

        correct = 0
        total = 0

        for j in range(len(test_set)):
            total += 1
            actual = databases[j]
            distances, indices = clf.kneighbors(test_set[j], n_neighbors=5)
            predicted = GetPrediction(y, distances[0], indices[0])
            if (actual == predicted):
                correct += 1

        if (correct > best_predictions):
            best_predictions = correct
        correctionAverage += best_predictions
    correctionAverage = float(correctionAverage)/verifyCount
    return correctionAverage
Example #22
def runForestLSHSizeAnalysis(argsdict, data, inlbl, fPath, fName, fileN, i):
    start = time.time()
    tree = LSHForest(random_state=42)
    tree.fit(data)
    end = time.time()

    return sys.getsizeof(tree), (end - start)
Example #23
def test_neighbors_accuracy_with_n_estimators():
    # Checks whether accuracy increases as `n_estimators` increases.
    n_estimators = np.array([1, 10, 100])
    n_samples = 100
    n_features = 10
    n_iter = 10
    n_points = 5
    rng = np.random.RandomState(42)
    accuracies = np.zeros(n_estimators.shape[0], dtype=float)
    X = rng.rand(n_samples, n_features)

    for i, t in enumerate(n_estimators):
        lshf = LSHForest(n_candidates=500, n_estimators=t)
        ignore_warnings(lshf.fit)(X)
        for j in range(n_iter):
            query = X[rng.randint(0, n_samples)].reshape(1, -1)
            neighbors = lshf.kneighbors(query, n_neighbors=n_points,
                                        return_distance=False)
            distances = pairwise_distances(query, X, metric='cosine')
            ranks = np.argsort(distances)[0, :n_points]

            intersection = np.intersect1d(ranks, neighbors).shape[0]
            ratio = intersection / float(n_points)
            accuracies[i] = accuracies[i] + ratio

        accuracies[i] = accuracies[i] / float(n_iter)
    # Sorted accuracies should be equal to original accuracies
    assert_true(np.all(np.diff(accuracies) >= 0),
                msg="Accuracies are not non-decreasing.")
    # Highest accuracy should be strictly greater than the lowest
    assert_true(np.ptp(accuracies) > 0,
                msg="Highest accuracy is not strictly greater than lowest.")
Example #24
    def get_nearest_neighbor_iterable(self, graphlist, start_graphs, start_is_subset=True):

        # vectorize all
        graphlist= list(graphlist)
        graphlist_ = copy.deepcopy(graphlist)
        X = self.vectorizer.transform_single(graphlist_)


        start_graphs= list(start_graphs)
        graphlist_= copy.deepcopy(start_graphs)
        Y = self.vectorizer.transform_single(graphlist_)
        
        
        forest = LSHForest()
        forest.fit(X)
        #http://scikit-learn.org/stable/modules/neighbors.html
        distances, indices = forest.kneighbors(Y, n_neighbors=2)

        # we just assume that this is short...
        index = 0
        if start_is_subset:
            index += 1
        
        #matches= ( X_index ,Y_index, distance  )
        matches = [(indices[i, index], i, distances[i, index]) for i in range(len(indices))]
        matches.sort()

        # this looks super confusing....
        #for index, graph in enumerate(selection_iterator(graphlist, [a[0] for a in matches])):
        #    yield ((graph, start_graphs[matches[index][1]], X[matches[index][0]]))
        # so i wrote this:,,, you may even get rid of the matches variable i think.. and use indices directly
        for Xi,Yi,dist in matches:
            yield ((start_graphs[Yi],graphlist[Xi],X[Xi]))
Example #25
    def get_heap_and_forest(self, griter, k):
        '''
        so we create the heap and the forest...
        heap is (dist to hyperplane, count, graph)
        and the forest is just a nearest neighbor from sklearn
        '''

        graphs = list(griter)
        graphs2 = copy.deepcopy(graphs)
        # transform does mess up the graph objects
        X = self.vectorizer.transform(graphs)

        forest = LSHForest()
        forest.fit(X)
        print 'got forest'

        heap = []
        for vector, graph in zip(X, graphs2):
            graph2 = nx.Graph(graph)
            heapq.heappush(heap, (
                self.sampler.estimator.predict_proba(self.sampler.vectorizer.transform_single(graph2))[0][1],
                # score ~ dist from hyperplane
                k + 1,  # making sure that the counter is high so we don't output the start graphs at the end
                graph))  # at last the actual graph

        print 'got heap'
        distances, unused = forest.kneighbors(X, n_neighbors=2)
        distances = [a[1] for a in distances]  # the second element should be the dist we want
        avg_dist = distances[len(distances) / 2]  # sum(distances)/len(distances)
        print 'got dist'

        return heap, forest, avg_dist
Example #26
    def optimise(self, num_train_points, num_val_points, parameters):

        max_accuracy = -1
        optimal_estimators = -1
        optimal_n_neighbours = -1

        for item in self.get_generator(parameters):

            LSHf = LSHForest(random_state=42,
                             n_estimators=item['n_est'],
                             n_neighbors=item['n_neigh'])
            LSHf.fit(self.train.images[:num_train_points])
            distances, indices = LSHf.kneighbors(
                self.validation.images[:num_val_points], n_neighbors=5)

            accuracy, positions = self.model_accuracy(indices,
                                                      is_optimising=True)

            if accuracy > max_accuracy:
                max_accuracy = accuracy
                optimal_estimators = item['n_est']
                optimal_n_neighbours = item['n_neigh']

#         print(optimal_n_neighbours_predict)
        return max_accuracy, optimal_estimators, optimal_n_neighbours
Example #27
    def single_batch(self, tweets):
        """Performs an approximate nearest neighbors search on tweets in the database
        passed to it. The database must be a list of tweets (text of the tweets only).
        
        Returns the indices of tweets with nearby neighbors (i.e. spam tweets).
        These indices correspond to indices within the batch of tweets fed to
        this function."""

        # Vectorize and fit tree:
        vect2 = CountVectorizer(stop_words=self.common_twitter_handles)
        X2 = vect2.fit_transform(tweets)
        tree2 = LSHForest()
        tree2.fit(X2)

        # Build tree:
        n_neighbors = []
        neighbors_indices = []
        for x in vect2.transform(tweets):
            if len(n_neighbors) % 100 == 0:
                print "%r tweets analyzed out of %r for this batch" % (
                    len(n_neighbors), self.batch_size)
            neighbors = tree2.radius_neighbors(x, radius=.4)[1]
            n_neighbors.append(len(neighbors[0]))
            neighbors_indices.append(neighbors)

        neighbors_indices = [
            x for x in range(len(neighbors_indices))
            if len(neighbors_indices[x][0]) > 2
        ]

        return neighbors_indices
Example #28
    def __init__(self, params: Dict[str, Any]):
        # Location of corpus to use for background knowledge search. This corpus is assumed to be
        # gzipped, one sentence per line.
        self.corpus_path = params.pop('corpus_path', None)

        # Number of background sentences to collect for each input.
        self.num_background = params.pop('num_background', 10)
        # Wait this many epochs before running differentiable search. This lets you train with the
        # base memory network code using external background knowledge for a time, then, once the
        # encoder is trained sufficiently, you can turn on the differentiable search.
        self.num_epochs_delay = params.pop('num_epochs_delay', 10)

        # Number of epochs we wait in between re-encoding the corpus.
        # TODO(matt): consider only re-encoding at early stopping, instead of a
        # number-of-epoch-based parameter.
        self.num_epochs_per_encoding = params.pop('num_epochs_per_encoding', 2)

        # Only meaningful if you are loading a model.  When loading, should we load a pickled LSH,
        # or should we re-initialize the LSH from the input corpus?  Note that if you give a corpus
        # path, and you load a saved LSH that was constructed from a _different_ corpus, you could
        # end up with really weird behavior.
        self.load_saved_lsh = params.pop('load_saved_lsh', False)

        # Now that we've popped our parameters, we can call the superclass constructor.
        super(DifferentiableSearchMemoryNetwork, self).__init__(params)

        # And then set some member variables.
        self._sentence_encoder_model = self.__build_sentence_encoder_model()
        self.lsh = LSHForest(random_state=12345)
        self.instance_index = {}  # type: Dict[int, str]
Example #29
def test_hash_functions():
    """Checks randomness of hash functions.

    Variance and mean of each hash function (projection vector)
    should be different from flattened array of hash functions.
    If hash functions are not randomly built (seeded with
    same value), variances and means of all functions are equal.
    """
    n_samples = 12
    n_features = 2
    n_estimators = 5
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest(n_estimators=n_estimators,
                     random_state=rng.randint(0, np.iinfo(np.int32).max))
    lshf.fit(X)

    hash_functions = []
    for i in range(n_estimators):
        hash_functions.append(lshf.hash_functions_[i].components_)

    for i in range(n_estimators):
        assert_not_equal(np.var(hash_functions),
                         np.var(lshf.hash_functions_[i].components_))

    for i in range(n_estimators):
        assert_not_equal(np.mean(hash_functions),
                         np.mean(lshf.hash_functions_[i].components_))
Example #30
def test_fit():
    """Checks whether `fit` method sets all attribute values correctly."""
    n_samples = 12
    n_features = 2
    n_estimators = 5
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest(n_estimators=n_estimators)
    lshf.fit(X)

    # _input_array = X
    assert_array_equal(X, lshf._fit_X)
    # A hash function g(p) for each tree
    assert_equal(n_estimators, len(lshf.hash_functions_))
    # Hash length = 32
    assert_equal(32, lshf.hash_functions_[0].components_.shape[0])
    # Number of trees_ in the forest
    assert_equal(n_estimators, len(lshf.trees_))
    # Each tree has entries for every data point
    assert_equal(n_samples, len(lshf.trees_[0]))
    # Original indices after sorting the hashes
    assert_equal(n_estimators, len(lshf.original_indices_))
    # Each set of original indices in a tree has entries for every data point
    assert_equal(n_samples, len(lshf.original_indices_[0]))
Example #31
def test_neighbors_accuracy_with_n_estimators():
    """Checks whether accuracy increases as `n_estimators` increases."""
    n_estimators = np.array([1, 10, 100])
    n_samples = 100
    n_features = 10
    n_iter = 10
    n_points = 5
    rng = np.random.RandomState(42)
    accuracies = np.zeros(n_estimators.shape[0], dtype=float)
    X = rng.rand(n_samples, n_features)

    for i, t in enumerate(n_estimators):
        lshf = LSHForest(n_candidates=500, n_estimators=t)
        lshf.fit(X)
        for j in range(n_iter):
            query = X[rng.randint(0, n_samples)]
            neighbors = lshf.kneighbors(query,
                                        n_neighbors=n_points,
                                        return_distance=False)
            distances = pairwise_distances(query, X, metric='cosine')
            ranks = np.argsort(distances)[0, :n_points]

            intersection = np.intersect1d(ranks, neighbors).shape[0]
            ratio = intersection / float(n_points)
            accuracies[i] = accuracies[i] + ratio

        accuracies[i] = accuracies[i] / float(n_iter)
    # Sorted accuracies should be equal to original accuracies
    assert_true(np.all(np.diff(accuracies) >= 0),
                msg="Accuracies are not non-decreasing.")
    # Highest accuracy should be strictly greater than the lowest
    assert_true(np.ptp(accuracies) > 0,
                msg="Highest accuracy is not strictly greater than lowest.")
Example #32
def build_index(data, n_estimators=20, n_candidates=100, n_neighbors=10, seed=0):
    lshf = LSHForest(n_estimators=n_estimators, n_candidates=n_candidates,
                     n_neighbors=n_neighbors, random_state=seed)
    t0 = time()
    lshf.fit(data)
    duration = time() - t0
    return lshf, duration
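Hypothetical use of build_index() above on random data; it assumes the module does `from time import time` (the function calls time() directly) and simply reports the fit duration that it returns.

import numpy as np

rng = np.random.RandomState(0)
data = rng.rand(2000, 64)
lshf, fit_seconds = build_index(data, n_estimators=10, n_candidates=50, n_neighbors=5)
print("indexed %d vectors in %.3f s" % (data.shape[0], fit_seconds))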
Example #33
    def fit_model(self, data, n_estimators, n_neighbours):

        LSHf = LSHForest(random_state=42,
                         n_estimators=n_estimators,
                         n_neighbors=n_neighbours)
        LSHf.fit(data)
        return LSHf
Example #34
 def create_tree(self,listNames,variableName):
     #LSHForest. only once for the main database
     lshf = LSHForest(n_estimators=50,n_candidates=500)
     TF, tfidfs = self.create_TDIDF(self.tokenize(listNames))
     lshf.fit(tfidfs)        
     pickle.dump(lshf,open("{0}/{1}_lshf.dump".format(self.folderSaveData,variableName),"wb+"))
     pickle.dump(listNames,open("{0}/{1}_listNames.dump".format(self.folderSaveData,variableName),"wb+"))
     pickle.dump(TF,open("{0}/{1}_TF.dump".format(self.folderSaveData,variableName),"wb+"))
Example #35
 def _fit(self, xs):
     """ Fit index
     :param samples: list of Samples
     :return:
     """
     self.index = LSHForest(
         n_estimators=self.parameters.get('n_estimators', 20))
     self.index.fit(xs)
Example #36
def hash_movie_similarity(um, num_neighbors=6):
    lsh = LSHForest(random_state=470957)
    lsh.fit(um.T)

    # Don't compare to self, remove first column, call 7 neighbors
    dist, ind = lsh.kneighbors(um.T, n_neighbors=num_neighbors+1, return_distance=True)
    sim = 1 - dist
    return sim[:,1:], ind[:,1:]
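A toy call to hash_movie_similarity() above with a small user-by-movie matrix (sizes are made up and LSHForest is assumed to be imported in the module); um.T makes rows correspond to movies, and the extra self-neighbor column is dropped before returning.

import numpy as np

rng = np.random.RandomState(0)
um = rng.randint(0, 6, size=(50, 12)).astype(float)   # 50 users x 12 movies
sim, ind = hash_movie_similarity(um, num_neighbors=3)
print(sim.shape, ind.shape)                            # both (12, 3)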
Example #37
def test_radius_neighbors_boundary_handling():
    X = [[0.999, 0.001], [0.5, 0.5], [0, 1.], [-1., 0.001]]
    n_points = len(X)

    # Build an exact nearest neighbors model as reference model to ensure
    # consistency between exact and approximate methods
    nnbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)

    # Build a LSHForest model with hyperparameter values that always guarantee
    # exact results on this toy dataset.
    lsfh = LSHForest(min_hash_match=0, n_candidates=n_points,
                     random_state=42).fit(X)

    # define a query aligned with the first axis
    query = [[1., 0.]]

    # Compute the exact cosine distances of the query to the four points of
    # the dataset
    dists = pairwise_distances(query, X, metric='cosine').ravel()

    # The first point is almost aligned with the query (very small angle),
    # the cosine distance should therefore be almost null:
    assert_almost_equal(dists[0], 0, decimal=5)

    # The second point forms an angle of 45 degrees with the query vector
    assert_almost_equal(dists[1], 1 - np.cos(np.pi / 4))

    # The third point is orthogonal to the query vector, hence at a distance of
    # exactly one:
    assert_almost_equal(dists[2], 1)

    # The last point is almost collinear with, but of opposite sign to, the query,
    # therefore it has a cosine 'distance' very close to the maximum possible
    # value of 2.
    assert_almost_equal(dists[3], 2, decimal=5)

    # If we query with a radius of one, all the samples except the last sample
    # should be included in the results. This means that the third sample
    # is lying on the boundary of the radius query:
    exact_dists, exact_idx = nnbrs.radius_neighbors(query, radius=1)
    approx_dists, approx_idx = lsfh.radius_neighbors(query, radius=1)

    assert_array_equal(np.sort(exact_idx[0]), [0, 1, 2])
    assert_array_equal(np.sort(approx_idx[0]), [0, 1, 2])
    assert_array_almost_equal(np.sort(exact_dists[0]), dists[:-1])
    assert_array_almost_equal(np.sort(approx_dists[0]), dists[:-1])

    # If we perform the same query with a slightly lower radius, the third
    # point of the dataset that lay on the boundary of the previous query
    # is now rejected:
    eps = np.finfo(np.float64).eps
    exact_dists, exact_idx = nnbrs.radius_neighbors(query, radius=1 - eps)
    approx_dists, approx_idx = lsfh.radius_neighbors(query, radius=1 - eps)

    assert_array_equal(np.sort(exact_idx[0]), [0, 1])
    assert_array_equal(np.sort(approx_idx[0]), [0, 1])
    assert_array_almost_equal(np.sort(exact_dists[0]), dists[:-2])
    assert_array_almost_equal(np.sort(approx_dists[0]), dists[:-2])
Example #38
def test_radius_neighbors_boundary_handling():
    X = [[0.999, 0.001], [0.5, 0.5], [0, 1.], [-1., 0.001]]
    n_points = len(X)

    # Build an exact nearest neighbors model as reference model to ensure
    # consistency between exact and approximate methods
    nnbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)

    # Build a LSHForest model with hyperparameter values that always guarantee
    # exact results on this toy dataset.
    lsfh = LSHForest(min_hash_match=0, n_candidates=n_points,
                     random_state=42).fit(X)

    # define a query aligned with the first axis
    query = [[1., 0.]]

    # Compute the exact cosine distances of the query to the four points of
    # the dataset
    dists = pairwise_distances(query, X, metric='cosine').ravel()

    # The first point is almost aligned with the query (very small angle),
    # the cosine distance should therefore be almost null:
    assert_almost_equal(dists[0], 0, decimal=5)

    # The second point forms an angle of 45 degrees with the query vector
    assert_almost_equal(dists[1], 1 - np.cos(np.pi / 4))

    # The third point is orthogonal to the query vector, hence at a distance of
    # exactly one:
    assert_almost_equal(dists[2], 1)

    # The last point is almost collinear with, but of opposite sign to, the query,
    # therefore it has a cosine 'distance' very close to the maximum possible
    # value of 2.
    assert_almost_equal(dists[3], 2, decimal=5)

    # If we query with a radius of one, all the samples except the last sample
    # should be included in the results. This means that the third sample
    # is lying on the boundary of the radius query:
    exact_dists, exact_idx = nnbrs.radius_neighbors(query, radius=1)
    approx_dists, approx_idx = lsfh.radius_neighbors(query, radius=1)

    assert_array_equal(np.sort(exact_idx[0]), [0, 1, 2])
    assert_array_equal(np.sort(approx_idx[0]), [0, 1, 2])
    assert_array_almost_equal(np.sort(exact_dists[0]), dists[:-1])
    assert_array_almost_equal(np.sort(approx_dists[0]), dists[:-1])

    # If we perform the same query with a slightly lower radius, the third
    # point of the dataset that lay on the boundary of the previous query
    # is now rejected:
    eps = np.finfo(np.float64).eps
    exact_dists, exact_idx = nnbrs.radius_neighbors(query, radius=1 - eps)
    approx_dists, approx_idx = lsfh.radius_neighbors(query, radius=1 - eps)

    assert_array_equal(np.sort(exact_idx[0]), [0, 1])
    assert_array_equal(np.sort(approx_idx[0]), [0, 1])
    assert_array_almost_equal(np.sort(exact_dists[0]), dists[:-2])
    assert_array_almost_equal(np.sort(approx_dists[0]), dists[:-2])
Example #39
class LSH_KNN:
    def __init__(self, weights='uniform', **kwargs):
        self.n_neighbors = kwargs['n_neighbors']
        self.lsh = LSHForest(**kwargs)
        self.weights = weights

    def fit(self, X, y):
        self.y = y
        self.X = X
        self.lsh.fit(X)

    def predict_top_n(self, test_X, n):
        _, indices = self.lsh.kneighbors(test_X, n_neighbors=self.n_neighbors)
        votes = np.zeros((len(test_X), n))
        for i in range(len(indices)):
            votes[i] = np.bincount([self.y[j] for j in indices[i]]).argsort()[-n:][::-1]
        return votes.astype(int)

    def predict_proba(self, test_X, return_dists=False):
        # SMOOTHING PARAMETER TO PREVENT 0 PROBA; https://stats.stackexchange.com/questions/83600/how-to-obtain-the-class-conditional-probability-when-using-knn-classifier
        s = 0.1
        _, neighbor_indices = self.lsh.kneighbors(test_X, n_neighbors=self.n_neighbors)
        dists = []
        proba = np.zeros((len(test_X), np.amax(self.y) + 1))
        for test_point in range(len(neighbor_indices)):
            if self.weights == 'uniform':
                weights = np.ones(len(neighbor_indices[test_point]))
            elif self.weights == 'distance':
                weights = [1 / self.dist(test_X[test_point], self.y[j]) for j in neighbor_indices[test_point]]
            weighted_class_counts = np.bincount([self.y[j] for j in neighbor_indices[test_point]], weights=weights, minlength=np.amax(self.y)+1)
            proba[test_point] = np.true_divide(weighted_class_counts + s, np.sum(weighted_class_counts) + len(weighted_class_counts)*s)
            if return_dists:
                test_point_dists = {}
                for neighbor_index in neighbor_indices[test_point]:
                    if self.y[neighbor_index] not in test_point_dists:
                        test_point_dists[self.y[neighbor_index]] = []
                    test_point_dists[self.y[neighbor_index]].append(self.dist(test_X[test_point], self.X[neighbor_index]))
                dists.append(test_point_dists)
                dists.append(test_point_dists)
        if return_dists:
            return proba, dists
        return proba

    def predict(self, test_X):
        _, neighbor_indices = self.lsh.kneighbors(test_X, n_neighbors=self.n_neighbors)
        result = np.zeros(len(test_X))
        for test_point in range(len(neighbor_indices)):
            if self.weights == 'uniform':
                weights = np.ones(len(neighbor_indices[test_point]))
            elif self.weights == 'distance':
                weights = [1 / self.dist(test_X[test_point], self.y[j]) for j in neighbor_indices[test_point]]
            weighted_class_counts = np.bincount([self.y[j] for j in neighbor_indices[test_point]], weights=weights)
            result[test_point] = np.argmax(weighted_class_counts)
        return result.astype(int)

    def dist(self, a, b):
        return np.linalg.norm(a - b)
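A hypothetical fit/predict round trip with the LSH_KNN wrapper above on random data (labels must be non-negative integers because of the np.bincount calls, and numpy is assumed to be imported as np in the defining module):

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(100, 8)
y = rng.randint(0, 3, size=100)          # three integer classes

clf = LSH_KNN(weights='uniform', n_neighbors=5)
clf.fit(X, y)
print(clf.predict(X[:5]))
print(clf.predict_proba(X[:5]).shape)    # (5, 3)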
Example #40
def trainLSH(train, test, val):
    n_feat = train[0].size
    train_data = train[:, :-1]
    train_labels = train[:, n_feat - 1]
    val_data = val[:, :-1]
    val_labels = val[:, n_feat - 1]
    lshf = LSHForest(random_state=42)
    lshf.fit(train_data)
    countarrLSH = lshFunct(test, val, n_feat, lshf, train_labels)
    return countarrLSH
Example #41
 def __vectorize_corpus(self):
     self.lsh = LSHForest(n_estimators=200, 
                          n_neighbors=self.num_topics)
     self.vectorized_docs = []
     for text in self.texts:
         bow = self.dictionary.doc2bow(text)
         vectorized_doc = [x[1] for x in self.model.get_document_topics(bow, 
                                                                       minimum_probability=0.0)]
         self.vectorized_docs.append(vectorized_doc)
     self.lsh.fit(self.vectorized_docs)
Example #42
 def BuildModel(self, data, labels):
   # Create and train the classifier.
   lshf = LSHForest(n_estimators = self.n_estimators,
                    min_hash_match = self.min_hash_match,
                    n_candidates = self.n_candidates,
                    radius_cutoff_ratio = self.radius_cutoff_ratio,
                    radius = self.radius,
                    n_neighbors = self.n_neighbors)
   lshf.fit(data)
   return lshf
Example #43
class LSHForestSearch:
    def __init__(self, features, k):
        self.lshf = LSHForest(n_estimators=1, n_candidates=1,
                     n_neighbors=k)
        self.k = k
        
        self.lshf.fit(features)
        
    def search(self, features):
        
        return self.lshf.kneighbors(features, return_distance=False, n_neighbors=self.k)    
Example #44
 def __init__(self, docs):
     self.lshf = LSHForest(n_estimators=1, n_candidates=1, n_neighbors=1)
     self.dv = DictVectorizer()
     dicts = []
     for d in docs:
         dicts.append(dict([(w, 1) for w in d]))
     self.dv.fit(dicts)
     features = self.dv.transform(dicts)
     # floats are faster
     # features = csr_matrix(features, dtype=int)
     self.lshf.fit(features)
Example #45
class LHSForestEngine:

    def __init__(self):
        self.engine = LSHForest(random_state=42)
        self.name = "LHS"

    def fit(self, data):
        self.engine.fit(data)

    def dist(self, data):
        distances, indices = self.engine.kneighbors(data, n_neighbors=1)
        return distances.ravel()
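Minimal use of the LHSForestEngine wrapper above (random data; LSHForest is assumed to be imported in the module): fit on reference vectors, then dist() returns each query row's distance to its single nearest neighbor.

import numpy as np

rng = np.random.RandomState(0)
engine = LHSForestEngine()
engine.fit(rng.rand(500, 32))
print(engine.dist(rng.rand(3, 32)))      # three distances, one per query row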
Example #46
 def calculate_duplication_number(self,text_list):
     print "length is ", len(text_list)
     tf_vectorizer = CountVectorizer(stop_words=None,analyzer='word',ngram_range=(5,5))
     #print text_list
     tf = tf_vectorizer.fit_transform(text_list)
     #print tf_vectorizer.get_feature_names()
     print tf[0]
     #print tf[123]
     lshf = LSHForest()
     #print tf
     lshf.fit(tf)
     distance,index = lshf.kneighbors(tf,n_neighbors=1)
     print distance, index
Example #47
def startQuery():
    while True:

      try:
          ipt = raw_input('Directory of query:')
      except ImportError:
          print 'invalid type'
      else:
          query = ipt
      if query == 'exit()':
          break

      print 'loading query...'
      try:
          token = get_tokens_by_dir(query)
      except IOError:
          print 'invalid file name'
      else:
##########################################query preprocessing
           print 'query pre-processing...'
           stopped_tokens = [i for i in token if not i in en_stop]
           p_stemmer = PorterStemmer()
           stemed_tokens = []
           for i in stopped_tokens:
               try:
                   temp_token = str(p_stemmer.stem(i))
                   stemed_tokens.append(temp_token)
               except IndexError:
                   pass
           tokens = [stemed_tokens]
######################################################################################
           dictionary_new = corpora.Dictionary(tokens)
           corpus_new = [dictionary_new.doc2bow(text) for text in tokens]
           QUERY_TOPIC = np.zeros([1,num_topic]) ## topic vector for query

           new_topics = LDA[corpus_new]


           for i in new_topics[0]:
               print(i)
               QUERY_TOPIC[0,i[0]] = i[1] ##assign new topics to query doc-topic matrix

           print 'fetching results for you...'
           lshf = LSHForest(random_state=42)
           lshf.fit(DOC_TOPICS) ##fit the local sensitive hash forest with training data POINT_SET
           dist,indices=lshf.kneighbors(QUERY_TOPIC,n_neighbors=20)
           print indices
Example #48
def fit_lshf(data):
    logger.info('Fitting  LSHForest...')
    from sklearn.neighbors import LSHForest
    lshf = LSHForest(
        n_estimators=20,
        min_hash_match=4,
        n_candidates=200,
        n_neighbors=2,
        radius=1.0,
        radius_cutoff_ratio=0.9,
        random_state=None,
    )
    lshf.fit(data)
    return lshf
Example #49
    def fit_model(self, model_type='brute', params=None):
        '''
        fits model operating under the assumption that there's a model already built
        '''

        if model_type == 'brute':
            self.model = NearestNeighbors(algorithm='brute', **params)
        elif model_type == 'lsh':
            self.model = LSHForest( **params)
        # elif model_type == 'annoy':
        #     self.model = Annoy(**params)

        self.model.fit(self.vector_space)
        print self.model        
Example #50
 def __init__(self, lsh_init=None):
     if lsh_init == None:
         self._lsh_forest = LSHForest(n_estimators=25, n_candidates=1000)
     else:
         self._lsh_forest = lsh_init 
     self.iw = None
     self.m = None
Example #51
 def __init__(self):
     self.unknown = ''
     self.same_person_num = 1
     self.has_cal_dist = []
     self.NeighbourNum = 10
     # If an administrator uploads pictures, put them in that person's directory under all_pic_data_folder (the image file and the feature file share the same filename)
     self.all_pic_feature_data_folder = '/data/liubo/face/research_feature_self'     # the research institute's model stores features directly
     # Saving the pictures makes it easy to review results later, to display them on the front end, and for administrators to label them
     self.all_pic_data_folder = '/data/liubo/face/research_self'
     if not os.path.exists(self.all_pic_data_folder):
         os.makedirs(self.all_pic_data_folder)
     if not os.path.exists(self.all_pic_feature_data_folder):
         os.makedirs(self.all_pic_feature_data_folder)
     self.n_neighbors = 10
     self.lshf = LSHForest(n_estimators=20, n_candidates=200, n_neighbors=self.n_neighbors)
     self.all_labels = []
     self.all_pic_feature = []
     self.same_pic_id = 2
     self.must_be_same_id = 1
     self.must_be_not_same_id = 0
     self.maybe_same_id = 3
     self.new_person_str = 'new_person_'
     self.current_new_person_id = self.find_current_new_person_id()
     self.must_same_str = '_Must_Same'
     self.maybe_same_str = '_Maybe_same'
     self.load_time = time.time()
     self.user_count = {}
     self.upper_threshold = upper_verif_threshold
     self.lower_threshold = lower_verif_threshold
     self.same_pic_threshold = same_pic_threshold
     self.trans_dic = {self.same_pic_id: 'same_pic', self.must_be_same_id: 'must_same_id',
                       self.must_be_not_same_id: 'must_not_same_id', self.maybe_same_id: 'maybe_same_id'}
     self.nearest = deque(maxlen=nearest_num)
     self.verification_same_person = 0
Example #52
 def __init__(self):
     self.unknown = ''
     self.same_person_num = 1
     self.has_save_pic_feature = []
     self.has_cal_dist = []
     self.NeighbourNum = 10
     self.all_pic_data_folder = '/data/liubo/face/self'
     self.other_dataset_para_add = 1
     self.n_neighbors = 5
     self.lshf = LSHForest(n_estimators=20, n_candidates=200, n_neighbors=self.n_neighbors)
     self.all_labels = []
     self.all_pic_feature = []
     self.same_pic_id = 2
     self.must_be_same_id = 1
     self.must_be_not_same_id = 0
     self.maybe_same_id = 3
     self.new_person_str = 'new_person_'
     self.current_new_person_id = self.find_current_new_person_id()
     self.must_same_str = '_Must_Same'
     self.maybe_same_str = '_Maybe_same'
     self.load_time = time.time()
     self.user_count = {}
     # Different models use different thresholds
     self.upper_threshold = upper_verif_threshold
     self.lower_threshold = lower_verif_threshold
     self.same_pic_threshold = same_pic_threshold
     self.pitch_threshold = 20
     self.yaw_threshold = 20
     self.roll_threshold = 20
     #  [(time, feature), ..., (time, feature)] : compare the current picture with the previous 5 pictures by timestamp (skip the comparison if the time gap is too large)
     self.nearest = deque(maxlen=nearest_num)
     self.trans_dic = {self.same_pic_id: 'same_pic', self.must_be_same_id: 'must_same_id',
                       self.must_be_not_same_id: 'must_not_same_id', self.maybe_same_id: 'maybe_same_id'}
     self.verification_same_person = 0
Example #53
 def vectorized(self, num_topics=DefaultSetting.NUMBER_TOPICS):
     self.lsh = LSHForest(n_estimators=DefaultSetting.HASH_SIZE, n_neighbors=10)
     docs_bow = [self.doc_corpus.dictionary.doc2bow(content.split(u' '))
                 for content in self.doc_corpus.documents]
     for doc_bow in docs_bow:
         vectorized_doc = [x[1] for x in self.model.get_document_topics(doc_bow, minimum_probability=0.0)]
         self.vectorized_docs.append(vectorized_doc)
     self.lsh.fit(self.vectorized_docs)
Example #54
def cal_acc(pack_file, stat_file, feature_dim):
    f = open(stat_file, 'w')
    f.write('train_pic_num'+'\t'+'person_name'+'\t'+'acc'+'\n')
    pic_num = range(1, max_person_num)
    for num in pic_num:
        all_train_data, all_train_label, all_valid_data, all_valid_label = split_train_valid(pack_file, train_pic_num=num, feature_dim=feature_dim)
        lshf = LSHForest(n_estimators=20, n_candidates=200, n_neighbors=5)

        for index in range(len(all_train_data)):
            try:
                if all_train_data[index] is None:
                    continue
                lshf.partial_fit(all_train_data[index], all_train_label[index])
            except:
                traceback.print_exc()
                continue
        # Track accuracy separately for each person
        person_acc_dic = {}     # number of correct predictions
        person_all_dic = {}     # total number of samples
        filter_num = 0
        all_num = 0
        for index in range(len(all_valid_data)):
            try:
                if all_valid_data[index] is None:
                    continue
                all_find_distance, all_find_index = lshf.kneighbors(all_valid_data[index], n_neighbors=5, return_distance=True)
                cos_sim = cosine_similarity(all_valid_data[index], all_train_data[all_find_index[0, 0]])
                label = all_train_label[all_find_index[0, 0]]
                # if cos_sim > sim_threshold:
                if True:
                    if label == all_valid_label[index]:
                        person_acc_dic[label] = person_acc_dic.get(label, 0) + 1
                        person_all_dic[label] = person_all_dic.get(label, 0) + 1
                    else:
                        person_all_dic[label] = person_all_dic.get(label, 0) + 1
                else:
                    filter_num += 1
                all_num += 1
            except:
                print all_valid_label[index]
                continue
        print 'train_num :', num, 'filter_rate: ', (filter_num * 1.0 / all_num)
        for person in person_all_dic:
            all_num = person_all_dic[person]
            right_num = person_acc_dic.get(person, 0)
            f.write('\t'.join(map(str, [num, person, (right_num * 1.0 /  all_num)]))+'\n')
Example #55
def test_partial_fit():
    """Checks whether inserting array is consitent with fitted data.

    `partial_fit` method should set all attribute values correctly.
    """
    n_samples = 12
    n_samples_partial_fit = 3
    n_features = 2
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)
    X_partial_fit = rng.rand(n_samples_partial_fit, n_features)

    lshf = LSHForest()

    # Test unfitted estimator
    lshf.partial_fit(X)
    assert_array_equal(X, lshf._fit_X)

    lshf.fit(X)

    # Insert wrong dimension
    assert_raises(ValueError, lshf.partial_fit,
                  np.random.randn(n_samples_partial_fit, n_features - 1))

    lshf.partial_fit(X_partial_fit)

    # size of _input_array = samples + 1 after insertion
    assert_equal(lshf._fit_X.shape[0],
                 n_samples + n_samples_partial_fit)
    # size of original_indices_[1] = samples + 1
    assert_equal(len(lshf.original_indices_[0]),
                 n_samples + n_samples_partial_fit)
    # size of trees_[1] = samples + 1
    assert_equal(len(lshf.trees_[1]),
                 n_samples + n_samples_partial_fit)
Example #56
def test_graphs():
    # Smoke tests for graph methods.
    n_samples_sizes = [5, 10, 20]
    n_features = 3
    rng = np.random.RandomState(42)

    for n_samples in n_samples_sizes:
        X = rng.rand(n_samples, n_features)
        lshf = LSHForest(min_hash_match=0)
        ignore_warnings(lshf.fit)(X)

        kneighbors_graph = lshf.kneighbors_graph(X)
        radius_neighbors_graph = lshf.radius_neighbors_graph(X)

        assert_equal(kneighbors_graph.shape[0], n_samples)
        assert_equal(kneighbors_graph.shape[1], n_samples)
        assert_equal(radius_neighbors_graph.shape[0], n_samples)
        assert_equal(radius_neighbors_graph.shape[1], n_samples)
Example #57
def test_kneighbors():
    """Checks whether desired number of neighbors are returned.

    It is guaranteed to return the requested number of neighbors
    if `min_hash_match` is set to 0. Returned distances should be
    in ascending order.
    """
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest(min_hash_match=0)
    # Test unfitted estimator
    assert_raises(ValueError, lshf.kneighbors, X[0])

    lshf.fit(X)

    for i in range(n_iter):
        n_neighbors = rng.randint(0, n_samples)
        query = X[rng.randint(0, n_samples)]
        neighbors = lshf.kneighbors(query, n_neighbors=n_neighbors,
                                    return_distance=False)
        # Desired number of neighbors should be returned.
        assert_equal(neighbors.shape[1], n_neighbors)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.kneighbors(queries,
                                           n_neighbors=1,
                                           return_distance=True)
    assert_equal(neighbors.shape[0], n_queries)
    assert_equal(distances.shape[0], n_queries)
    # Test only neighbors
    neighbors = lshf.kneighbors(queries, n_neighbors=1,
                                return_distance=False)
    assert_equal(neighbors.shape[0], n_queries)
    # Test random point(not in the data set)
    query = rng.randn(n_features)
    lshf.kneighbors(query, n_neighbors=1,
                    return_distance=False)
    # Test n_neighbors at initialization
    neighbors = lshf.kneighbors(query, return_distance=False)
    assert_equal(neighbors.shape[1], 5)
    # Test `neighbors` has an integer dtype
    assert_true(neighbors.dtype.kind == 'i',
                msg="neighbors are not in integer dtype.")
Example #58
def lshf_scikit(data, n_neighbors=4,
               n_estimators=10,
               min_hash_match=4,
               n_candidates=10,
               random_state=None):
   n_neighbors += 1

   # initialize nearest neighbor model
   nbrs = LSHForest(n_neighbors=n_neighbors,
                    n_estimators=n_estimators,
                    min_hash_match=min_hash_match,
                    n_candidates=n_candidates,
                    random_state=random_state)

   # fit nearest neighbor model to the data
   nbrs.fit(data)

   # return the distances and indices
   return nbrs.kneighbors(data)
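Hypothetical call to lshf_scikit() above; the function bumps n_neighbors by one so that the self-match returned for every training point can be discarded by the caller.

import numpy as np

rng = np.random.RandomState(0)
data = rng.rand(200, 16)
distances, indices = lshf_scikit(data, n_neighbors=4)
# column 0 of each row is the point itself; columns 1..4 are its 4 approximate neighbors
print(distances.shape, indices.shape)    # both (200, 5)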
Example #59
def cal_recall(pack_file, stat_file, feature_dim):
    # f_model = open('verf.txt', 'w')
    f = open(stat_file, 'w')
    f.write('train_pic_num'+'\t'+'person_name'+'\t'+'recall'+'\n')
    pic_num = range(1, max_person_num)
    for num in pic_num:
        all_train_data, all_train_label, all_valid_data, all_valid_label = split_train_valid(pack_file, train_pic_num=num, feature_dim=feature_dim)
        lshf = LSHForest(n_estimators=20, n_candidates=200, n_neighbors=5)
        for index in range(len(all_train_data)):
            try:
                if all_train_data[index] is None:
                    continue
                lshf.partial_fit(all_train_data[index], all_train_label[index])
            except:
                continue
        # Track recall separately for each person
        person_find_dic = {}     # number of correct matches
        person_all_dic = {}     # total number of samples
        for index in range(len(all_valid_data)):
            try:
                if all_valid_data[index] is None:
                    continue
                all_find_distance, all_find_index = lshf.kneighbors(all_valid_data[index], n_neighbors=5, return_distance=True)
                cos_sim = cosine_similarity(all_valid_data[index], all_train_data[all_find_index[0, 0]])
                label = all_train_label[all_find_index[0, 0]]
                real_label = all_valid_label[index]
                # if cos_sim > sim_threshold:
                if True:
                    if label == real_label:
                        # f_model.write('0'+'\t'+str(cos_sim)+'\n')
                        person_find_dic[real_label] = person_find_dic.get(real_label, 0) + 1
                        person_all_dic[real_label] = person_all_dic.get(real_label, 0) + 1
                    else:
                        # f_model.write('1' + '\t' + str(cos_sim) + '\n')
                        person_all_dic[real_label] = person_all_dic.get(real_label, 0) + 1
            except:
                print all_valid_label[index]
                continue
        print 'train_num :', num
        for person in person_all_dic:
            all_num = person_all_dic[person]
            right_num = person_find_dic.get(person, 0)
            f.write('\t'.join(map(str, [num, person, (right_num * 1.0 /  all_num)]))+'\n')
Example #60
def test_radius_neighbors():
    """Checks whether Returned distances are less than `radius`

    At least one point should be returned when the `radius` is set
    to mean distance from the considering point to other points in
    the database.
    Moreover, this test compares the radius neighbors of LSHForest
    with the `sklearn.neighbors.NearestNeighbors`.
    """
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest()
    # Test unfitted estimator
    assert_raises(ValueError, lshf.radius_neighbors, X[0])

    lshf.fit(X)

    for i in range(n_iter):
        query = X[rng.randint(0, n_samples)]
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        neighbors = lshf.radius_neighbors(query, radius=mean_dist,
                                          return_distance=False)
        # At least one neighbor should be returned.
        assert_greater(neighbors.shape[0], 0)
        # All distances should be less than mean_dist
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_array_less(distances[0], mean_dist)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.radius_neighbors(queries,
                                                 return_distance=True)
    assert_equal(neighbors.shape[0], n_queries)
    assert_equal(distances.shape[0], n_queries)
    # dists and inds should not be 2D arrays
    assert_equal(distances.ndim, 1)
    assert_equal(neighbors.ndim, 1)

    # Compare with exact neighbor search
    query = X[rng.randint(0, n_samples)]
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine')
    nbrs.fit(X)

    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)
    distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)
    # Distances of exact neighbors is less than or equal to approximate
    assert_true(np.all(np.less_equal(np.sort(distances_exact[0]),
                                     np.sort(distances_approx[0]))))