Example #1
def nearestNeighbor():
    patientsArr, patients, geneTags, patientTags, meanGeneVals = makeArray()
    
    neigh = NN(n_neighbors=5, radius=1.0)
    
    neigh.fit(patientsArr) 
#    ct = 0
    print '# patients', len(patientTags)
    print '# genes', len(geneTags)
    for i in range(len(patientTags)):
        for j in range(len(geneTags)):
            if (patients[i][j]=='NaN') or (patients[i][j]=='NaN\n') :
#                ct+=1
#                if ct>2:
#                    sys.exit()
                    
#                print 'patient ', i, ' has a missing value ' 
                nbrs = neigh.kneighbors(patientsArr[i])
                knbrs = nbrs[1]
#                print 'nearest neighbors are ', nbrs[0],'  ||||  ' ,nbrs[1]
#                print knbrs 
#                print 'new value for patients [', i+1, '][', j+1,'] = '
                patients[i][j]= retMissingVal(knbrs, patientsArr, j)
                #calculate the mean of the genevalues of these patients
#                pass
                # """fill in missing values"""
#                patients[i][j]=meanGeneVals[j]
            else:
                patients[i][j] = float(patients[i][j])
    fh_pickle = open('imputed_data', 'w')
    pickle.dump(patients, fh_pickle)
    fh_pickle.close()
    return patients
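The helpers makeArray and retMissingVal above are project-specific. A minimal, self-contained sketch of the same kNN-imputation idea, assuming a small NumPy matrix with NaNs and an illustrative k (not the project's actual data):

import numpy as np
from sklearn.neighbors import NearestNeighbors

def impute_knn(X, k=2):
    # Fit on a copy where NaNs are replaced by column means, then
    # overwrite each NaN with the mean of that column across the k nearest rows.
    X = np.asarray(X, dtype=float).copy()
    col_means = np.nanmean(X, axis=0)
    X_filled = np.where(np.isnan(X), col_means, X)
    nn = NearestNeighbors(n_neighbors=k + 1).fit(X_filled)
    _, idx = nn.kneighbors(X_filled)
    for i, j in zip(*np.nonzero(np.isnan(X))):
        neighbours = idx[i, 1:]          # drop the row itself
        X[i, j] = X_filled[neighbours, j].mean()
    return X

print(impute_knn([[1.0, 2.0], [np.nan, 2.1], [0.9, 1.9], [5.0, 6.0]]))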
Example #2
def embedding_refinement(data_matrix_highdim,
                         data_matrix_lowdim,
                         n_neighbors=8,
                         emb_quality_th=1,
                         n_iter=20):
    # extract neighbors list for high dimensional case
    neigh_high = NearestNeighbors(n_neighbors=n_neighbors)
    neigh_high.fit(data_matrix_highdim)
    neighbors_list_highdim = neigh_high.kneighbors(data_matrix_highdim, return_distance=0)
    n_instances = data_matrix_lowdim.shape[0]
    logger.debug('refinements max num iters: %d  k in neqs: %d num insts: %d' %
                 (n_iter, n_neighbors, n_instances))
    for it in range(n_iter):
        average_embedding_quality_score, scores = knn_quality_score(data_matrix_lowdim,
                                                                    neighbors_list_highdim,
                                                                    n_neighbors)
        # select low quality embedded instances
        ids = [i for i, s in enumerate(scores)
               if relative_quality(i, scores, neighbors_list_highdim) <= emb_quality_th]
        # find average position of true knns and move point there
        new_data_matrix_lowdim = compute_average(ids, data_matrix_lowdim, neighbors_list_highdim)
        new_average_embedding_quality_score, new_scores = knn_quality_score(new_data_matrix_lowdim,
                                                                            neighbors_list_highdim,
                                                                            n_neighbors)
        if new_average_embedding_quality_score > average_embedding_quality_score:
            data_matrix_lowdim = new_data_matrix_lowdim
            n_refinements = len(ids)
            frac_refinements = float(n_refinements) / n_instances
            logger.debug('r %.2d neqs: %.3f \t %.2f (%d insts)' %
                         (it + 1, new_average_embedding_quality_score,
                          frac_refinements, n_refinements))
        else:
            break
    return data_matrix_lowdim
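knn_quality_score and relative_quality are project helpers that are not shown here. A plausible sketch of a neighborhood-preservation score (the fraction of each point's high-dimensional neighbors recovered among its low-dimensional neighbors) might look as follows; this is an assumption about their intent, not the project's actual implementation:

import numpy as np
from sklearn.neighbors import NearestNeighbors

def knn_quality_score_sketch(data_matrix_lowdim, neighbors_list_highdim, n_neighbors):
    # Fraction of high-dimensional neighbors preserved in the low-dimensional embedding.
    neigh_low = NearestNeighbors(n_neighbors=n_neighbors)
    neigh_low.fit(data_matrix_lowdim)
    neighbors_list_lowdim = neigh_low.kneighbors(data_matrix_lowdim, return_distance=False)
    scores = np.array([len(set(high) & set(low)) / float(n_neighbors)
                       for high, low in zip(neighbors_list_highdim, neighbors_list_lowdim)])
    return scores.mean(), scores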
    def resample(self):
        # Start with the minority class
        minx = self.x[self.y == self.minc]
        miny = self.y[self.y == self.minc]

        # Finding nns
        from sklearn.neighbors import NearestNeighbors

        print("Finding the %i nearest neighbours..." % self.k, end = "")

        NN = NearestNeighbors(n_neighbors = self.k + 1)
        NN.fit(minx)
        nns = NN.kneighbors(minx, return_distance=False)[:, 1:]

        print("done!")

        # Creating synthetic samples
        print("Creating synthetic samples...", end="")
        sx, sy = make_samples(minx, minx, self.minc, nns, int(self.ratio * len(miny)), random_state=self.rs)
        print("done!")

        # Concatenate the newly generated samples to the original data set
        ret_x = concatenate((self.x, sx), axis = 0)
        ret_y = concatenate((self.y, sy), axis = 0)

        return ret_x, ret_y
Example #4
def get_knn_score(data, targetdata, filenames, num=20):
    vectorizer = CountVectorizer()
    tfidfvectorizer = TfidfTransformer()

    counts = vectorizer.fit_transform(data)
    tfidf_data = tfidfvectorizer.fit_transform(counts)

    knn = NearestNeighbors(n_neighbors=num)
    knn.fit(tfidf_data)

    counts = vectorizer.transform(targetdata)
    tfidf_target_data = tfidfvectorizer.transform(counts)

    result = knn.kneighbors(tfidf_target_data)
    score = result[0][0]
    index = result[1][0]

    """
    for i in index.tolist():
        print files[i]
    for i in index.tolist():
    print map(float, score)
    print index.tolist()
    """
    #return index.tolist(), score.tolist()
    for i in index.tolist():
        fname = basename(filenames[i])
        copy(ORI_DIR + fname, TARGET_DIR + fname)
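get_knn_score relies on the module-level ORI_DIR/TARGET_DIR globals and copies files as a side effect; the underlying tf-idf + nearest-neighbors lookup can be exercised on its own with a few toy strings (a hedged sketch, not the original project's data):

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.neighbors import NearestNeighbors

docs = ["the cat sat on the mat", "dogs chase cats", "the stock market fell", "cats and dogs"]
vectorizer = CountVectorizer()
tfidf = TfidfTransformer()
tfidf_docs = tfidf.fit_transform(vectorizer.fit_transform(docs))

knn = NearestNeighbors(n_neighbors=2)
knn.fit(tfidf_docs)

query = tfidf.transform(vectorizer.transform(["cats and dogs playing"]))
distances, indices = knn.kneighbors(query)
print(indices[0], distances[0])   # indices and distances of the two closest documents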
    def resample(self):
        from sklearn.neighbors import NearestNeighbors

        # Start with the minority class
        minx = self.x[self.y == self.minc]
        miny = self.y[self.y == self.minc]

        # Find the NNs for all samples in the data set.
        print("Finding the %i nearest neighbours..." % self.m, end = "")
        NN = NearestNeighbors(n_neighbors = self.m + 1)
        NN.fit(self.x)

        print("done!")

        # Boolean array with True for minority samples in danger
        index = asarray([in_danger(x, self.y, self.m, miny[0], NN) for x in minx])

        # If all minority samples are safe, return the original data set.
        if not any(index):
            print('There are no samples in danger. No borderline synthetic samples created.')
            return self.x, self.y

        # Find the NNs among the minority class
        NN.set_params(**{'n_neighbors' : self.k + 1})
        NN.fit(minx)
        nns = NN.kneighbors(minx[index], return_distance=False)[:, 1:]

        # Create synthetic samples for borderline points.
        sx, sy = make_samples(minx[index], minx, miny[0], nns, int(self.ratio * len(miny)), random_state=self.rs)

        # Concatenate the newly generated samples to the original data set
        ret_x = concatenate((self.x, sx), axis = 0)
        ret_y = concatenate((self.y, sy), axis = 0)

        return ret_x, ret_y
def random_forest_single_predict(test_filename, name, feature_file, train_file, k):
    name_list, data = readfile_real_name(test_filename)
    print 'reading file...'
    test_data = data[name_list.index(name)]
    with open(train_file, 'rb') as f:
        clf = cPickle.load(f)
    print 'done'
    result_rate = (clf.predict_proba(test_data))[0]
    class_name = clf.classes_
    print name
    num = map(get_num, result_rate)
    name_list, feature_list = readfile_real_name_group(feature_file, class_name, num)
    neigh = NearestNeighbors()
    neigh.fit(feature_list)
    kneighbors_result_list = neigh.kneighbors(test_data, k, False)[0]
    print kneighbors_result_list
    for x in kneighbors_result_list:
        print name_list[x]
    classification_result = []
    average_list = []
    real_name = (name.split('_'))[0]
    counter = Counter(kneighbors_result_list)
    if real_name == name_list[counter.most_common(1)[0][0]].split('_')[0]:
        classification_result.append(1)
    else:
        classification_result.append(0)
    num = 0
    for i in kneighbors_result_list:
        if (name_list[i].split('_'))[0] == real_name:
            num += 1
    average_list.append((float)(num) / (float)(k))
    print classification_result, average_list
    return classification_result, average_list
Example #7
    def sample(s):
        if s.data is None:
            raise ValueError('data not loaded.')
        mdl = NearestNeighbors(n_neighbors=s.k, n_jobs=-1)
        minoX = s.X[s.y == s.minolab]
        majX = s.X[s.y == s.majlab]
        mdl.fit(minoX)
        _, nei_table = mdl.kneighbors()

        generated = None
        for cnt, nei_idx in enumerate(nei_table):
            x = minoX[cnt]
            if s.rate >= 0.5 * s.k:
                nei = minoX[np.random.choice(nei_idx, int(s.rate))]
                new = x + np.random.rand(int(s.rate), 1) * (nei - x)

            else:
                nei = minoX[nei_idx]
                new = x + np.random.rand(s.k, 1) * (nei - x)
                # each of the synthesized k points has N/k * 100 % probability of being chosen
                new = new[np.random.rand(s.k) > s.rate * 1.0 / s.k]
            if generated is None:
                generated = new
            else:
                generated = np.vstack((generated, new))
        # number of generated instances
        N = len(generated)
        ret = np.hstack((np.vstack((minoX, generated, majX)),
                         np.array([s.minolab] * (minoX.shape[0] + N) + [s.majlab] * majX.shape[0])[:, None]))
        np.random.shuffle(ret)
        return ret
    def resample(self):
        """
        :return:
            Return the data with majority samples that form a Tomek link
            removed.
        """

        from sklearn.neighbors import NearestNeighbors

        # Find the nearest neighbour of every point
        nn = NearestNeighbors(n_neighbors=2)
        nn.fit(self.x)
        nns = nn.kneighbors(self.x, return_distance=False)[:, 1]

        # Send the information to is_tomek function to get boolean vector back
        if self.verbose:
            print("Looking for majority Tomek links...")
        links = self.is_tomek(self.y, nns, self.minc, self.verbose)

        if self.verbose:
            print("Under-sampling "
                  "performed: " + str(Counter(self.y[logical_not(links)])))

        # Return data set without majority Tomek links.
        return self.x[logical_not(links)], self.y[logical_not(links)]
Example #9
def findKNN(frequencyVector, newVector):
    samples = np.array(frequencyVector)
    neigh = NearestNeighbors(n_neighbors = 5, metric = "euclidean")
    neigh.fit(samples)
    indexList = neigh.kneighbors(newVector, return_distance = False).tolist()

    return indexList
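A hypothetical call to findKNN, assuming numpy (as np) and NearestNeighbors are imported at module level as the function expects; the frequency vectors below are made up:

import numpy as np
from sklearn.neighbors import NearestNeighbors

freq_vectors = [[1, 0, 2], [0, 1, 1], [2, 0, 1], [1, 1, 0], [0, 2, 2], [1, 2, 0]]
print(findKNN(freq_vectors, [[1, 0, 1]]))   # indices of the 5 closest frequency vectors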
Example #10
def predict_appliance(home, appliance, feature):
    if home in all_homes[appliance]:
        home_to_pick=home
    else:
        home_to_pick=all_homes[appliance][0]
    print home_to_pick

    feature_dict = json.load(open("../data/output/sensitivity-numfeatures-allhomes/%s_%s_%d.json" %(appliance,feature, home_to_pick),"r"))
    f = feature_dict['f']
    k = feature_dict['k']
    clf = KNeighborsRegressor(n_neighbors=k)
    nn = NearestNeighbors(n_neighbors=k)
    df_new =df.copy()
    df_new = df_new.ix[all_homes[appliance]]
    df_new = df_new.ix[~df_new.index.isin([home])]
    #df_new = df_new.drop(home, axis=1)
    nn.fit(df_new[f].dropna())
    distances, indices = nn.kneighbors(df.ix[home][f])
    out = []
    nghbrs_list = df_new.index[indices].values[0]

    for month in range(1, 13):
        if len(nghbrs_list) > 1:
            out.append(df_new[["%s_%d" %(appliance, month) ]].ix[nghbrs_list].sum().values[0]/k)
        else:
            out.append(df_new[["%s_%d" %(appliance, month) ]].ix[nghbrs_list].values[0]/k)
    return out
    def resample(self):
        """
        """

        # Start with the minority class
        underx = self.x[self.y == self.minc]
        undery = self.y[self.y == self.minc]

        # Import the k-NN classifier
        from sklearn.neighbors import NearestNeighbors

        # Create a k-NN to fit the whole data
        nn_obj = NearestNeighbors(n_neighbors=self.size_ngh)

        # Fit the whole dataset
        nn_obj.fit(self.x)

        idx_to_exclude = []
        # Loop over the other classes under picking at random
        for key in self.ucd.keys():

            # Get the sample of the current class
            sub_samples_x = self.x[self.y == key]

            # Get the samples associated
            idx_sub_sample = np.nonzero(self.y == key)[0]

            # Find the NN for the current class
            nnhood_idx = nn_obj.kneighbors(sub_samples_x, return_distance=False)

            # Get the labels corresponding to the indices
            nnhood_label = (self.y[nnhood_idx] == key)

            # Check which one are the same label than the current class
            # Make an AND operation through the three neighbours
            nnhood_bool = np.logical_not(np.all(nnhood_label, axis=1))

            # If this is the minority class, remove the majority samples (as in politics!!!! ;))
            if key == self.minc:
                # Get the index to exclude
                idx_to_exclude += nnhood_idx[np.nonzero(nnhood_label[np.nonzero(nnhood_bool)])].tolist()
            else:
                # Get the index to exclude
                idx_to_exclude += idx_sub_sample[np.nonzero(nnhood_bool)].tolist()

        # Create a vector with the sample to select
        sel_idx = np.ones(self.y.shape)
        sel_idx[idx_to_exclude] = 0

        # Get the samples from the majority classes
        sel_x = np.squeeze(self.x[np.nonzero(sel_idx), :])
        sel_y = self.y[np.nonzero(sel_idx)]

        underx = concatenate((underx, sel_x), axis=0)
        undery = concatenate((undery, sel_y), axis=0)

        if self.verbose:
            print("Under-sampling performed: " + str(Counter(undery)))

        return underx, undery
Example #12
def main():
    vectorizer = CountVectorizer(ngram_range=(1,2),max_df=1.0, min_df=0.0)

    nei = NearestNeighbors(algorithm='brute', metric='jaccard')
    matrix = vectorizer.fit_transform(training_set).todense()
    new_matrix = vectorizer.transform(new_comments).todense()
    nei.fit(matrix)
    path =  '{0}/'.format(pathsplit(abspath(__file__))[0])
    jsonfile = open(path + '{0}-nn.json'.format(n_neighbors), 'w')

    nodes = [{'name': (training_set+new_comments)[i],
              'group':(groups + new_groups)[i]}
             for i in range(len(training_set+new_comments))]
    links = []

    for i in range(len(matrix)):
        dist, idnei = nei.kneighbors(matrix[i], n_neighbors=n_neighbors + 1)
        dist, idnei = dist[0], idnei[0]

        for j in range(len(idnei[1:])):
            links.append({"source":i,"target":idnei[j+1],"value":10*(1 - dist[j+1])})

    for i in range(len(new_comments)):
        dist, idnei = nei.kneighbors(new_matrix[i], n_neighbors=n_neighbors + 1)
        dist, idnei = dist[0], idnei[0]
        for j in range(len(idnei[1:])):
            links.append({"source":len(matrix) + i,"target":idnei[j],"value":10*(1 - dist[j+1])})

    jsondumped = json.dumps({'nodes':nodes, 'links':links}, indent=2)

    jsonfile.write(jsondumped)
Example #13
	def removeRedundantFrames(self):
		h, w, d = self.keyframes[0].shape
		n = len(self.keyframes)
		frames = np.zeros((n, 256))
		for i, kf in enumerate(self.keyframes):
			frames[i] = tools.getColorHist(kf).ravel()
		
		k = int(np.sqrt(n))
		kmeans = KMeans(k)
		print("Clustering frames into {0} code vectors.".format(k))
		kmeans.fit(self.frameHistFeats)

		bestFrameIndices = []
		bestFrames = []
		NN = NearestNeighbors(1)
		NN.fit(frames)
		centers = kmeans.cluster_centers_
		for center in centers:
			nearest = NN.kneighbors(center, return_distance=False)
			bestFrameIndices.append(nearest[0])
		bestFrameIndices.sort()
		for i in bestFrameIndices:
			bestFrames.append(self.keyframes[i])
		return bestFrames
Example #14
    def move(self, event):
        # add the knn scheme to decide selected region when moving mouse

        if SKLEARN_INSTALLED:
            if event.button == 1 and event.is_dragging:

                # TODO: support multiple datasets here
                data = get_map_data_scatter(self.active_layer_artist.layer,
                                            self.active_layer_artist.visual,
                                            self._vispy_widget)

                # calculate the threshold and call draw visual
                width = event.pos[0] - self.selection_origin[0]
                height = event.pos[1] - self.selection_origin[1]
                drag_distance = math.sqrt(width**2 + height**2)
                canvas_diag = math.sqrt(self._vispy_widget.canvas.size[0]**2 +
                                        self._vispy_widget.canvas.size[1]**2)

                mask = np.zeros(self.active_layer_artist.layer.shape)

                # neighbor num proportional to mouse moving distance
                n_neighbors = drag_distance / canvas_diag * self.active_layer_artist.layer.shape[0]
                if n_neighbors >= 1:
                    neigh = NearestNeighbors(n_neighbors=n_neighbors)
                    neigh.fit(data)
                    select_index = neigh.kneighbors([self.selection_origin])[1]
                    mask[select_index] = 1
                self.mark_selected(mask, self.active_layer_artist.layer)
Example #15
def adasyn_sample(X,Y,minclass,K=5,n=200):
    indices = np.nonzero(Y==minclass)
    Ymin = Y[indices]
    Xmin = X[indices]
    Cmin = len(indices[0])
    Xs = []
    if n > Cmin:
        Xs.append(Xmin)
        n -= len(Ymin)
    else:
        # simple random without replacement undersampling
        return Xmin[random.sample(range(Cmin),n)]
    neigh = NearestNeighbors(n_neighbors=30)
    neigh.fit(X)
    nindices = neigh.kneighbors(Xmin,K,False)
    gamma = [float(sum(Y[i]==minclass))/K for i in nindices]
    gamma = gamma / np.linalg.norm(gamma,ord = 1)
    neigh = NearestNeighbors(n_neighbors=30)
    neigh.fit(Xmin)
    N = np.round(gamma*n).astype(int)
    assert len(N) == Cmin
    for (i,nn) in enumerate(N):
        nindices = neigh.kneighbors(Xmin[i],K,False)[0]
        for j in range(nn):
            alpha = random.random()
            Xnn = X[random.choice(nindices)]
            Xs.append((1.-alpha)*Xmin[i]+alpha*Xnn)
    Xadasyn = sparse.vstack(Xs)  
    return Xadasyn
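A hedged usage sketch for adasyn_sample, assuming the module-level names it relies on (numpy as np, random, scipy.sparse as sparse, and sklearn's NearestNeighbors) and a small sparse toy dataset:

import random
import numpy as np
from scipy import sparse
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(0)
X = sparse.csr_matrix(rng.rand(100, 5))
Y = np.array([1] * 20 + [0] * 80)          # class 1 is the minority class

X_syn = adasyn_sample(X, Y, minclass=1, K=5, n=50)
print(X_syn.shape)                          # original minority rows plus ~30 synthetic rows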
Example #16
def createSyntheticSamples(X,Y,nearestneigh,numNeighbors,majoritylabel,minoritylabel): 
    (Xminority,Xmajority) = partitionSamples(X,Y)
    numFeatures = Xminority.shape[1]
    Xreduced = pca(Xminority)
    numOrigMinority=len(Xminority)
    #reducedMinoritykmeans = KMeans(init='k-means++', max_iter=500,verbose=False,tol=1e-4,k=numCentroids, n_init=5, n_neighbors=3).fit(Xreduced)
    reducedNN = NearestNeighbors(nearestneigh, algorithm='auto')
    reducedNN.fit(Xreduced)
    #Xsyn=np.array([numOrigMinority,numNeighbors*numFeatures])
    trylist=[]
    #LOOPHERE  for EACH (minority) point...
    for i,row in enumerate(Xreduced):
        neighbor_index = reducedNN.kneighbors(row, return_distance=False) 
        closestPoints = Xminority[neighbor_index]
        #randomly choose one of the k nearest neighbors
        chosenNeighborsIndex = chooseNeighbor(neighbor_index,numNeighbors,i)
        chosenNeighbor = Xminority[chosenNeighborsIndex]
        #Calculate linear combination:        
        #Take the difference between the orig minority sample and its selected neighbor, where X[1,] is the orig point
        diff = Xminority[i,]-chosenNeighbor
        #Multiply this difference by a number between 0 and 1
        r = random.uniform(0,1)
        #Add it back to the orig minority vector and voilà, this is the synthetic sample
        syth_sample =Xminority[i,:]+r*diff
        syth_sample2 = syth_sample.tolist()
        trylist.append(syth_sample2)
    Xsyn=np.asarray(trylist).reshape(numNeighbors*numOrigMinority,numFeatures)
    maj_col=majoritylabel*np.ones([Xmajority.shape[0],1])
    min_col=minoritylabel*np.ones([Xsyn.shape[0],1])
    syth_Y = np.concatenate((maj_col,min_col),axis=0)
    syth_X = np.concatenate((Xmajority,Xsyn),axis=0)
    if(syth_X.shape[0]!=syth_Y.shape[0]):
        raise Exception("dim mismatch between features matrix and response matrix")
    return (syth_X, syth_Y)
class ContentBased(object):
    """
    Recommendation model for articles, based on the most relevant tags of each one.
    The model vectorizes every article so that the similarity between articles can be computed.
    """
    def __init__(self, stop_words=None, token_pattern=None, metric='cosine', n_neighbors=5):
        if stop_words is None:
            stop_words =  stopwords.words("english")
            
        if token_pattern is None:
            token_pattern = '(?u)\\b[a-zA-Z]\\w\\w+\\b'
            
        self.tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words, token_pattern=token_pattern)
        self.nearest_neigbors = NearestNeighbors(metric=metric, n_neighbors=n_neighbors, algorithm='brute')
        
    def fit(self, datos, columna_descripcion):
        """
        Train the model:
        1/ Vectorize every article (feature extraction and weighting)
        2/ Compute the closest articles
        """
        self.datos = datos
        datos_por_tags = self.tfidf_vectorizer.fit_transform(datos[columna_descripcion])        
        self.nearest_neigbors.fit(datos_por_tags)
        
    def predict(self, descripcion):
        """
        Return the articles most similar to the proposed description
        """
        descripcion_tags = self.tfidf_vectorizer.transform(descripcion)        
        if descripcion_tags.sum() == 0:
            return pd.DataFrame(columns=self.datos.columns)
        else:
            _, indices = self.nearest_neigbors.kneighbors(descripcion_tags)
            return self.datos.iloc[indices[0], :]
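A hypothetical usage of ContentBased on a tiny articles DataFrame, passing stop_words='english' so the NLTK stopword list is not required; the data and column name are made up, and the imports mirror what the class expects at module level:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

articles = pd.DataFrame({
    'title': ['Intro to kNN', 'Cooking pasta', 'kNN for text'],
    'tags': ['machine learning neighbors knn', 'food pasta recipe cooking',
             'knn text classification neighbors'],
})

model = ContentBased(stop_words='english', n_neighbors=2)
model.fit(articles, 'tags')
print(model.predict(['nearest neighbors for text classification']))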
Example #18
class NNScope:

    def get_minolab(self):
        tmp = pd.Series(self.y)
        tmp = tmp.value_counts()
        return min(tmp.keys(), key=lambda o: tmp[o])

    def normalization(self):
        self.X -= np.mean(self.X, axis=0)
        self.X /= np.sqrt(np.var(self.X, axis=0))

    def __init__(self, X, y, k):
        self.X = np.array(X, dtype='float64')
        self.normalization()
        self.y = y
        self.minolab = self.get_minolab()
        self.nn = NearestNeighbors(n_neighbors=k, n_jobs=-1)
        self.nn.fit(self.X)
        self.nn_maj = NearestNeighbors(n_neighbors=k, n_jobs=-1)
        self.nn_maj.fit(self.X[y != self.minolab])
        self.distr = None

    # ratio of distances to the k nearest points overall vs. the k nearest majority points, per minority sample
    def calc_ratio(self):
        dis_all, _ = self.nn.kneighbors()
        dis_all = dis_all[self.y == self.minolab]
        dis_maj, _ = self.nn_maj.kneighbors(self.X[self.y == self.minolab])
        self.WBNR = np.sqrt(np.mean(dis_all ** 2, axis=1) /
                            np.mean(dis_maj ** 2, axis=1))

    def show_ratio_distr(self):
        plt.hist(self.WBNR, bins=20)
Example #19
class KDTrees:

    def __init__(self, nb_neighbours, leaf_size):
        self.nbrs = NearestNeighbors(n_neighbors=nb_neighbours, algorithm='ball_tree', metric = 'haversine', leaf_size=leaf_size)
    # Compute distance in time between two points on the map
    def mapDistance(self, x, y):
        if (len(x) > 2):
            return np.sum((x - y) ** 2)
        else:
            if(x[0] < y[0]):
                tmp = y
                y = x
                x = tmp
            pos1 = str(x[0]) + ", " + str(x[1])
            pos2 = str(y[0]) + ", " + str(y[1])
            timestamp = datetime.now()
            sec_to_add = 32 * 3600 + (timestamp - datetime(1970, 1, 1)).total_seconds() - 2*3600 - timestamp.hour * 3600 - timestamp.minute * 60 - timestamp.second
            traject = gmaps.directions(pos1, pos2, mode="transit", departure_time=timestamp.fromtimestamp(sec_to_add))
            try:
                print 'ok'
                return (traject[0]["legs"][0]["arrival_time"]["value"] - traject[0]["legs"][0]["departure_time"]["value"])
            except:
                print 'bug'
                return 1000000000


    def addPoints(self, points):
        self.nbrs.fit(points)

    def getNeighbours(self, points):
        return self.nbrs.kneighbors(points)
Example #20
	def fit(self, x, y):
		# calculate sample ratio
		class_numsamples_dict = {}
		class_samples_counts = y.value_counts()
		max_samples = class_samples_counts.iloc[0]
		class_list = y.unique()
		for c in class_list:
			num_samples = class_samples_counts[c]
			class_numsamples_dict[c] = float(max_samples-num_samples)/max_samples
		sample_ratio = []
		for index, value in y.iteritems():
			c = value
			sample_ratio.append(class_numsamples_dict[c])

		# calculate knn ratio
		x = x.reset_index()
		y = y.reset_index()
		nn = NN(n_neighbors=int(self.k))
		nn.fit(x)
		knn_ratio = []
		for index, value in x.iterrows():
			if index % 100 == 0:
				print index
			num_sameclass = 0
			distance, indices = nn.kneighbors(value)
			relevant_indices = indices[0][1:]
			for ri in relevant_indices:
				if y.loc[ri].values[1] == y.loc[index].values[1]:
					num_sameclass += 1
			knn_ratio.append(num_sameclass / float(self.k))
		plt.hist(knn_ratio)
		plt.show()
Example #21
def nearestN():
    X = [[125,1], [200,0], [70,0], [240,1], [114,0], [120,0], [264,1], [85,0], [150,0], [90,0]]
#    y = [ 0, 0, 0, 0, 1, 0, 0, 1, 0,1 ]
    model = NN(n_neighbors=1, radius=1)
    model.fit(X)
    y = [98.,0.]
    print model.kneighbors(y)
Example #22
def generateSyntheticRecords(T, inds, k, similarity, neigh=None):
    n_minority_samples, n_features = T.shape
    k = min(k, round(n_minority_samples - 1))
    k = max(k, 2)

    n_synthetic_samples = len(inds)
    S = np.zeros(shape=(n_synthetic_samples, n_features))
    
    # Learn nearest neighbours
    if (neigh is None): 
        neigh = NearestNeighbors(n_neighbors=k)
        neigh.fit(T)

    # Calculate synthetic samples
    for synInd in range(n_synthetic_samples):
        i = choice(range(n_minority_samples))
        dists, nn = neigh.kneighbors([T[i]], return_distance=True)
        dists, nn = dists[0], list(nn[0])
        # if some records are duplicated, i sometimes won't be in nn
        if (i in nn): 
            nn.remove(i)   
            dists = dists[1:]
        if (len(nn) > 0): 
            nn_index = choice(nn)
            if (nn_index == i): raise Exception('error in smote!')
        else:
            nn_index = i  # duplicate i                
        dif = T[nn_index] - T[i]
        gap = np.random.random() * similarity
        S[synInd, :] = T[i, :] + gap * dif[:]
    return S
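A hypothetical call to generateSyntheticRecords, assuming numpy (as np), random.choice and NearestNeighbors are imported at module level as the function expects; the matrix and arguments are illustrative:

import numpy as np
from random import choice
from sklearn.neighbors import NearestNeighbors

T = np.random.RandomState(0).rand(10, 3)          # 10 minority samples, 3 features
S = generateSyntheticRecords(T, inds=range(5), k=3, similarity=1.0)
print(S.shape)                                     # (5, 3) synthetic records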
	def select(self, x_test, metric='minkowski', p=2):
		"""
		Dynamically select classifiers in the pool relatively to a test
		pattern x_test

		Parameters
		----------
		x_test : test pattern 

		Returns
		----------
		best classifier : best classifier according to the dynamic selection scheme.
		"""
		pool = self.ensemble_clf.estimators_
		if not pool:
			raise ValueError("Fit the ensemble methiod before throwing it to \
							  the dynamic selection algorithm")

		predicted_labels = [clf.predict(x_test.reshape(1, -1)) for clf in pool]

		if len(np.unique(predicted_labels)) == 1:
			# All the classifiers agree on the predicted class
			return pool[0]
		else:
			knn = NearestNeighbors(n_neighbors=self.knn, metric=metric, p=p)
			knn.fit(self.X_val)
			iknn = knn.kneighbors(x_test.reshape(1, -1), return_distance=False)[0]
			X_knn, y_knn = self.X_val[iknn], self.y_val[iknn]

			accuracies = [accuracy_score(clf.predict(X_knn), y_knn) \
						  for clf in pool]
			i_best = np.argmax(accuracies)

			return pool[i_best]
Example #24
def draw_voronoi(ax,rrt):
    xr = ax.get_xlim()
    yr = ax.get_ylim()
    
    xres = 500
    yres = 500

    xs= np.linspace(xr[0],xr[1],xres)
    ys= np.linspace(yr[0],yr[1],yres)
    
    grid = np.array(np.meshgrid(xs,ys))
    grid = grid.reshape((2,-1))
    grid = grid.T
    #grid is now an (xres*yres)-by-2 array; we want the nearest node for each of those (xres*yres) points

    from sklearn.neighbors import NearestNeighbors
    nn = NearestNeighbors(algorithm='kd_tree',n_neighbors=1)
    nodes = np.array(rrt.tree.nodes())
    states = np.array([rrt.tree.node[i]['state'] for i in nodes])
    nn.fit(states,nodes)
    nn_res = nn.kneighbors(grid,return_distance=False)
    nn_res = nn_res.reshape((xres,yres))
    nn_res = np.array(nn_res,dtype=np.float)
    
    #nn_res is an xres-by-yres array and contains the node-id of the nearest neighbor at each [i,j]
    print 'regions' , np.unique(nn_res)
    if np.max(nn_res)> 0: nn_res /= float(np.max(nn_res))   #normalize for color map. this is a stupid way to assign colors to regions
    ax.imshow(nn_res,origin='lower',extent=[xr[0],xr[1],yr[0],yr[1]],alpha=.5,zorder=2,cmap=mpl.cm.get_cmap(name='prism'))    
    ax.figure.canvas.draw()
class SimilaritySearch():

    def euclidean(self, x, y):
        return np.sum((x-y)**2)
    
    #no normalization
    def intersection(self, x, y):
        return np.sum(x) - np.sum(np.minimum(x,y))

    def __init__(self, k=C.KNN_DEFAULT, data=None, labels=None):
        self.knn = NearestNeighbors(n_neighbors=k, algorithm='ball_tree', metric='pyfunc', func=self.intersection)
        #self.knn = NearestNeighbors(n_neighbors=k, algorithm='ball_tree', metric='minkowski')
        self.k = k
        if not data is None:
            self.data = data
            self.labels = labels
            self.train()
        else:
            self.data = []
            self.labels = []

    def addHistogram(self, hist, label):
        self.data.append(hist)
        self.labels.append(label)

    def train(self):
        self.knn.fit(np.array(self.data), np.array(self.labels))
    
    def findk(self, hist):
        dist, neigh = self.knn.kneighbors(np.array([hist]))
        return neigh, dist
Example #26
    def on_mouse_move(self, event):
        # add the knn scheme to decide selected region when moving mouse
        super(ScatterSelectionToolbar, self).on_mouse_move(event=event)

        if SKLEARN_INSTALLED:
            if event.button == 1 and event.is_dragging and self.mode == 'point':
                visible_data, visual = self.get_visible_data()
                data = self.get_map_data()

                visible_data = np.nan_to_num(visible_data)

                # calculate the threshold and call draw visual
                width = event.pos[0] - self.selection_origin[0]
                height = event.pos[1] - self.selection_origin[1]
                drag_distance = math.sqrt(width**2+height**2)
                canvas_diag = math.sqrt(self._vispy_widget.canvas.size[0]**2
                                        + self._vispy_widget.canvas.size[1]**2)

                # neighbor num proportional to mouse moving distance
                n_neighbors = drag_distance / canvas_diag * visible_data[0].data.shape[0]
                neigh = NearestNeighbors(n_neighbors=n_neighbors)
                neigh.fit(data)
                select_index = neigh.kneighbors([self.selection_origin])[1]

                mask = np.zeros(visible_data[0].data.shape)
                mask[select_index] = 1
                self.mark_selected(mask, visible_data)
 def rrt(self):
     """
     Basic RRT Algorithm
     """
     probot = np.array([self._robot_pose.pose.position.x,self._robot_pose.pose.position.y])
     V = [probot]
     E = {}
     nbrs = NearestNeighbors(n_neighbors=1)
     nbrs.fit(V)
     t1 = time.time()
     rrt_iter = 0
     while rrt_iter < self._max_rrt_iterations:
         prand = self.sample_free_uniform()
         (dist, idx) = nbrs.kneighbors(prand)
         idx = idx.flatten(1)[0]
         if dist < self._rrt_eta:
             pnew = prand
         else:
             pnew = self.steer(V[idx], prand)
         if self.segment_safe(V[idx],pnew) is True:
             if E.has_key(idx):
                 E[idx].append(len(V))
             else:
                 E[idx] = [len(V)]
             V.append(pnew)
             nbrs.fit(V)
         rrt_iter += 1
     print 'total time: ', time.time()-t1
     self.publish_rrt(V,E) 
def load_data_with_SMOTE():
    rawdata = read_file()
    size = 150
    small = rawdata[rawdata['class'] == 'B']
    n_sample = small.shape[0]
    idx = np.random.randint(0, n_sample, size)
    X = small.iloc[idx, range(1, 5)].values
    y = small.iloc[idx, 0].values
    knn = NearestNeighbors(n_neighbors=2)
    knn.fit(X)
    _d, i = knn.kneighbors(X)
    idx2 = i[:, 1]
    diff = X - X[idx2]
    X = X + np.random.random(4) * diff
    B = np.concatenate([np.transpose(y[np.newaxis]), X], axis=1)
    B = pd.DataFrame(B)

    n_sample = rawdata[rawdata['class'] == 'L'].shape[0]
    idx = np.random.randint(0, n_sample, size)
    L = rawdata[rawdata['class'] == 'L'].iloc[idx]

    n_sample = rawdata[rawdata['class'] == 'R'].shape[0]
    idx = np.random.randint(0, n_sample, size)
    R = rawdata[rawdata['class'] == 'R'].iloc[idx]

    d = np.concatenate([B.values, L.values, R.values])

    le = LabelEncoder()
    X = d[:, 1:5]
    y = le.fit_transform(d[:, 0])
    return X, y
Example #29
def pts_to_surface(skel, im_depth, thresh=10):
	'''
	Ensures that the joint positions lie within the silhouette of the person
	---Parameters---
	skel : should be in image coordinates
	im_depth : should be masked
	thresh : if distance is too large, don't move!
	---Return---
	The same skeleton where all joints are within the mask
	'''

	height, width = im_depth.shape
	skel = np.array([[max(min(p[0], width-1), 0), max(min(p[1], height-1), 0), p[2]] for p in skel] )
	out_of_bounds = np.where(np.array([im_depth[p[1],p[0]] for p in skel]) == 0)[0]

	# embed()
	# If a pixel is outside of the mask, find the closest 'in' neighbor
	if len(out_of_bounds) > 0:
		from sklearn.neighbors import NearestNeighbors
		NN = NearestNeighbors(n_neighbors=1)
		inds = np.array(np.nonzero(im_depth)).T
		NN.fit(inds)

		for i in out_of_bounds:
			pos = skel[i]
			closest_ind = NN.kneighbors([pos[1],pos[0]], 1, return_distance=False)[0]
			closest_pos = inds[closest_ind][0]
			if np.linalg.norm(closest_pos[:2]-pos[:2], 2) < thresh:
				skel[i][0] = closest_pos[1]
				skel[i][1] = closest_pos[0]
			else:
				skel[i][0] = 0
				skel[i][1] = 0

	return skel
Example #30
def knn(x, i, k, exIndices):
    k += len(exIndices)
    neigh = NearestNeighbors(n_neighbors=k)
    neigh.fit(x)
    nn = list(neigh.kneighbors([x[i]], return_distance=False)[0])
    nn = list(set(nn) - set(exIndices))
    return nn
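A hypothetical call to knn(), with numpy (as np) and NearestNeighbors assumed to be imported at module level; rows 1 and 2 are excluded from the returned neighbour set:

import numpy as np
from sklearn.neighbors import NearestNeighbors

x = np.array([[0.0, 0.0], [0.1, 0.0], [0.0, 0.1], [1.0, 1.0], [1.1, 1.0], [2.0, 2.0]])
print(knn(x, i=0, k=2, exIndices=[1, 2]))   # e.g. [0, 3]: the point itself and its nearest non-excluded neighbours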
def calculate_p_mass(embedding,
                     smooth=0.5,
                     steps=(40, 40),
                     n_neighbors=100,
                     n_jobs=4,
                     xylim=((None, None), (None, None))):
    """Calculate the velocity using a points on a regular grid and a gaussian kernel

    Note: the function should work also for n-dimensional grid

    Arguments
    ---------
    embedding:

    smooth: float, default=0.5
        Higher values correspond to taking further points into consideration;
        the standard deviation of the gaussian kernel is smooth * stepsize
    steps: tuple, default=(40, 40)
        the number of steps in the grid for each axis
    n_neighbors:
        number of neighbors to use in the calculation; a bigger number should not
        change the results much, as long as smooth is small.
        Higher values correspond to slower execution time
    n_jobs:
        number of processes for parallel computing
    xylim:
        ((xmin, xmax), (ymin, ymax))

    Returns
    -------
    total_p_mass: np.ndarray
        density at each point of the grid

    """

    # Prepare the grid
    grs = []
    for dim_i in range(embedding.shape[1]):
        m, M = np.min(embedding[:, dim_i]), np.max(embedding[:, dim_i])

        if xylim[dim_i][0] is not None:
            m = xylim[dim_i][0]
        if xylim[dim_i][1] is not None:
            M = xylim[dim_i][1]

        m = m - 0.025 * np.abs(M - m)
        M = M + 0.025 * np.abs(M - m)
        gr = np.linspace(m, M, steps[dim_i])
        grs.append(gr)

    meshes_tuple = np.meshgrid(*grs)
    gridpoints_coordinates = np.vstack([i.flat for i in meshes_tuple]).T

    nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=n_jobs)
    nn.fit(embedding)
    dists, neighs = nn.kneighbors(gridpoints_coordinates)

    std = np.mean([(g[1] - g[0]) for g in grs])
    # isotropic gaussian kernel
    gaussian_w = normal.pdf(loc=0, scale=smooth * std, x=dists)
    total_p_mass = gaussian_w.sum(1)

    return total_p_mass, gridpoints_coordinates
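A hedged usage sketch, assuming the module-level names the function relies on (numpy as np, sklearn's NearestNeighbors, and scipy.stats.norm bound to the name normal) and a random 2-D embedding:

import numpy as np
from scipy.stats import norm as normal
from sklearn.neighbors import NearestNeighbors

embedding = np.random.RandomState(0).randn(500, 2)
p_mass, grid_points = calculate_p_mass(embedding, smooth=0.5, steps=(20, 20),
                                        n_neighbors=30, n_jobs=1)
print(p_mass.shape, grid_points.shape)   # (400,) (400, 2)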
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_digits
from sklearn.neighbors import NearestNeighbors

# Set random seed for reproducibility
np.random.seed(1000)

if __name__ == '__main__':
    # Load the dataset
    digits = load_digits()
    X_train = digits['data'] / np.max(digits['data'])

    # Perform kNN
    knn = NearestNeighbors(n_neighbors=50, algorithm='ball_tree')
    knn.fit(X_train)

    # Query the model
    distances, neighbors = knn.kneighbors(X_train[100].reshape(1, -1),
                                          return_distance=True)

    print('Distances: {}'.format(distances[0]))

    # Plot the neighbors
    fig, ax = plt.subplots(5, 10, figsize=(8, 8))

    for y in range(5):
        for x in range(10):
            idx = neighbors[0][(x + (y * 10))]
            ax[y, x].matshow(digits['images'][idx], cmap='gray')
            ax[y, x].set_xticks([])
Example #33
distance_to_each_cluster = KM.transform(X[film_index])
clusters_by_distance = distance_to_each_cluster.argsort()[0]
nb_results_found = 0
distances = []
neighbors_indexes = []
for cluster in clusters_by_distance:
    if nb_results_found < nb_results + 1:
        print('--> Looking in cluster ' + str(cluster) + ' (' +
              str(nb_results_found) + ' results found yet)')
        indexes_of_cluster = np.where(labels == cluster)[0]
        indexes = np.intersect1d(indexes_of_cluster, indexes_fitting_filters)
        samples = X[list(indexes), :]
        neigh = NearestNeighbors(n_neighbors=(nb_results - nb_results_found) +
                                 1,
                                 p=p_norm)
        neigh.fit(samples)
        (loc_distances,
         loc_neighbors_indexes) = neigh.kneighbors(X[film_index])
        local_nb_results_found = loc_distances[0].shape[0]
        print('--> Found ' + str(local_nb_results_found) +
              ' results in this cluster')
        nb_results_found = nb_results_found + local_nb_results_found
        distances.append(loc_distances[0])
        neighbors_indexes.append(indexes[loc_neighbors_indexes[0]])
neighbors_indexes = np.concatenate(neighbors_indexes)
distances = np.concatenate(distances)

distances = distances[
    1:]  # because the film being studied is the closest neighbor...
neighbors_indexes = neighbors_indexes[1:]
Example #34
def main():
    st.image('logo.png', width=400)
    st.title('AceleraDev Data Science - Projeto Final')
    st.subheader('Ronaldo Regis Posser - Sistema de recomendação de novos clientes')
    
    
    label_enc = LabelEncoder()

    base = pd.read_csv('base.zip')
    base['id'] = label_enc.fit_transform(base['id'])
    
    st.subheader('Visualização base de dados')

    st.markdown('A base possui ' + str(base.shape[0]) + ' empresas, com ' + str(base.shape[1]) + ' variáveis que diferenciam as empresas por faturamento, ramo de atividade e localização.')
    st.dataframe(base.head())

    
    base.groupby('sg_uf')['setor'].apply(pd.Series.value_counts).unstack().plot.bar(figsize = (10,5))
    plt.title('Distribuição dos setores nos estados')
    plt.xticks(rotation='horizontal')
    plt.xlabel('Estados')
    st.pyplot()
    
    dict_porte = {'DE R$ 1.500.000,01 A R$ 4.800.000,00' : 'Pequena',
       'DE R$ 81.000,01 A R$ 360.000,00': 'Micro',
       'ATE R$ 81.000,00': 'Micro',
       'SEM INFORMACAO': 'Micro',
       'DE R$ 360.000,01 A R$ 1.500.000,00': 'Pequena',
       'DE R$ 10.000.000,01 A R$ 30.000.000,00': 'Média',
       'DE R$ 4.800.000,01 A R$ 10.000.000,00': 'Média',
       'DE R$ 30.000.000,01 A R$ 100.000.000,00': 'Grande',
       'DE R$ 300.000.000,01 A R$ 500.000.000,00': 'Grande',
       'DE R$ 100.000.000,01 A R$ 300.000.000,00': 'Grande',
       'ACIMA DE 1 BILHAO DE REAIS': 'Grande',
       'DE R$ 500.000.000,01 A 1 BILHAO DE REAIS': 'Grande'}
    
    base_aux = base[['setor', 'de_faixa_faturamento_estimado']]
    base_aux['porte'] = base_aux['de_faixa_faturamento_estimado'].map(dict_porte)

    base_aux.groupby('setor')['porte'].apply(pd.Series.value_counts).unstack().plot.bar(figsize = (10,5), log=True)
    plt.title('Distribuição dos portes de empresa por setores')
    plt.xticks(rotation='horizontal')
    plt.xlabel('Estados')
    st.pyplot()

    #Feature engineering
    base.set_index(['id'], inplace=True)

    features_cat = ['sg_uf',	'setor', 'idade_emp_cat', 'nm_divisao', 'de_saude_tributaria',
                'de_saude_rescencia', 'de_nivel_atividade', 'nm_meso_regiao',
                'nm_micro_regiao', 'de_faixa_faturamento_estimado']
    base_dummies = pd.get_dummies(base, columns=features_cat)

    #Model training
    qtd_neighbors = 5
    model = NearestNeighbors(n_neighbors=qtd_neighbors, metric = 'cosine')
    model.fit(base_dummies)

    #Generating suggestions based on a portfolio
    st.subheader('Recomendação de novos clientes')
    st.markdown('Escolha o arquivo com o portfólio que deseja analisar (.csv)')
    file  = st.file_uploader(' ',type = 'csv')
    
    if file is not None:
        portfolio = pd.read_csv(file)
        portfolio['id'] = label_enc.transform(portfolio['id'])
        portfolio.set_index(['id'], inplace=True)
        portfolio = base.loc[portfolio.index.to_list()]

        st.markdown('A portfólio possui ' + str(portfolio.shape[0]) + ' empresas.')
        
        #Portfolio data visualization
        portfolio.groupby('sg_uf')['setor'].apply(pd.Series.value_counts).unstack().plot.bar(figsize = (10,5))
        plt.title('Portfólio - Distribuição dos setores nos estados')
        plt.xticks(rotation='horizontal')
        plt.xlabel('Estados')
        st.pyplot()

        port_aux = portfolio[['setor', 'de_faixa_faturamento_estimado']]
        port_aux['porte'] = port_aux['de_faixa_faturamento_estimado'].map(dict_porte)

        port_aux.groupby('setor')['porte'].apply(pd.Series.value_counts).unstack().plot.bar(figsize = (10,5), log=True)
        plt.title('Portfólio - Distribuição dos portes de empresa por setores')
        plt.xticks(rotation='horizontal')
        plt.xlabel('Estados')
        st.pyplot()

        portfolio_train = base_dummies.loc[portfolio.index.to_list()]

        previsao_port = model.kneighbors(portfolio_train, return_distance=False)

        leads_port = base.iloc[previsao_port.reshape(-1)]

        leads_port.reset_index(inplace=True)

        st.markdown('Primeiras 20 sugestões geradas pelo modelo.\n As marcadas em azul são clientes atuais e as quatro seguintes as sugestões de novos clientes.')
        st.dataframe(leads_port.head(20).style.apply(lambda x: ['background: #5cade2' if (x.name % 5 == 0) else ' ' for i in x], 
                    axis=1))

        
        #Adding latitude and longitude to the predicted data set for map visualization
        lat_long = pd.read_csv('lat_long_micro.csv')
        leads_port['lat'] = leads_port['nm_micro_regiao'].apply(lambda micro: lat_long[lat_long['nm_micro'] == micro]['lat'].values[0])
        leads_port['lng'] = leads_port['nm_micro_regiao'].apply(lambda micro: lat_long[lat_long['nm_micro'] == micro]['lng'].values[0])
        leads_port.reset_index(inplace=True)

        st.markdown('Mapa 30 primeiras recomendações')
        st.markdown('Marcadores vermelhos são atuais clientes. Marcadores azuis são recomendações.')
        st.pydeck_chart(create_map_deck(leads_port))
Example #35
class KNN(BaseDetector):
    # noinspection PyPep8
    """kNN class for outlier detection.
    For an observation, its distance to its kth nearest neighbor could be
    viewed as the outlying score. It could be viewed as a way to measure
    the density. See :cite:`ramaswamy2000efficient,angiulli2002fast` for
    details.

    Three kNN detectors are supported:
    largest: use the distance to the kth neighbor as the outlier score
    mean: use the average of all k neighbors as the outlier score
    median: use the median of the distance to k neighbors as the outlier score

    Parameters
    ----------
    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set,
        i.e. the proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function.

    n_neighbors : int, optional (default = 5)
        Number of neighbors to use by default for k neighbors queries.

    method : str, optional (default='largest')
        {'largest', 'mean', 'median'}

        - 'largest': use the distance to the kth neighbor as the outlier score
        - 'mean': use the average of all k neighbors as the outlier score
        - 'median': use the median of the distance to k neighbors as the
          outlier score

    radius : float, optional (default = 1.0)
        Range of parameter space to use by default for `radius_neighbors`
        queries.

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        Algorithm used to compute the nearest neighbors:

        - 'ball_tree' will use BallTree
        - 'kd_tree' will use KDTree
        - 'brute' will use a brute-force search.
        - 'auto' will attempt to decide the most appropriate algorithm
          based on the values passed to :meth:`fit` method.

        Note: fitting on sparse input will override the setting of
        this parameter, using brute force.

    leaf_size : int, optional (default = 30)
        Leaf size passed to BallTree or KDTree.  This can affect the
        speed of the construction and query, as well as the memory
        required to store the tree.  The optimal value depends on the
        nature of the problem.

    metric : string or callable, default 'minkowski'
        metric to use for distance computation. Any metric from scikit-learn
        or scipy.spatial.distance can be used.

        If metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The callable
        should take two arrays as input and return one value indicating the
        distance between them. This works for Scipy's metrics, but is less
        efficient than passing the metric name as a string.

        Distance matrices are not supported.

        Valid values for metric are:

        - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
          'manhattan']

        - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
          'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
          'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto',
          'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath',
          'sqeuclidean', 'yule']

        See the documentation for scipy.spatial.distance for details on these
        metrics.

    p : integer, optional (default = 2)
        Parameter for the Minkowski metric from
        sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
        equivalent to using manhattan_distance (l1), and euclidean_distance
        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
        See http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances

    metric_params : dict, optional (default = None)
        Additional keyword arguments for the metric function.

    n_jobs : int, optional (default = 1)
        The number of parallel jobs to run for neighbors search.
        If ``-1``, then the number of jobs is set to the number of CPU cores.
        Affects only kneighbors and kneighbors_graph methods.

    Attributes
    ----------
    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is
        fitted.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.
    """

    def __init__(self, contamination=0.1, n_neighbors=5, method='largest',
                 radius=1.0, algorithm='auto', leaf_size=30,
                 metric='minkowski', p=2, metric_params=None, n_jobs=1,
                 **kwargs):
        super(KNN, self).__init__(contamination=contamination)
        self.n_neighbors = n_neighbors
        self.method = method
        self.radius = radius
        self.algorithm = algorithm
        self.leaf_size = leaf_size
        self.metric = metric
        self.p = p
        self.metric_params = metric_params
        self.n_jobs = n_jobs

        self.neigh_ = NearestNeighbors(n_neighbors=self.n_neighbors,
                                       radius=self.radius,
                                       algorithm=self.algorithm,
                                       leaf_size=self.leaf_size,
                                       metric=self.metric,
                                       p=self.p,
                                       metric_params=self.metric_params,
                                       n_jobs=self.n_jobs,
                                       **kwargs)

    def fit(self, X, y=None):
        """Fit detector. y is optional for unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : numpy array of shape (n_samples,), optional (default=None)
            The ground truth of the input samples (labels).
        """

        # validate inputs X and y (optional)
        X = check_array(X)
        self._set_n_classes(y)

        self.tree_ = KDTree(X, leaf_size=self.leaf_size, metric=self.metric)
        self.neigh_.fit(X)

        dist_arr, _ = self.neigh_.kneighbors(n_neighbors=self.n_neighbors,
                                             return_distance=True)
        dist = self._get_dist_by_method(dist_arr)

        self.decision_scores_ = dist.ravel()
        self._process_decision_scores()

        return self

    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        The anomaly score of an input sample is computed based on different
        detector algorithms. For consistency, outliers are assigned with
        larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        check_is_fitted(self, ['tree_', 'decision_scores_',
                               'threshold_', 'labels_'])

        X = check_array(X)

        # initialize the output score
        pred_scores = np.zeros([X.shape[0], 1])

        for i in range(X.shape[0]):
            x_i = X[i, :]
            x_i = np.asarray(x_i).reshape(1, x_i.shape[0])

            # get the distance of the current point
            dist_arr, _ = self.tree_.query(x_i, k=self.n_neighbors)
            dist = self._get_dist_by_method(dist_arr)
            pred_score_i = dist[-1]

            # record the current item
            pred_scores[i, :] = pred_score_i

        return pred_scores.ravel()

    def _get_dist_by_method(self, dist_arr):
        """Internal function to decide how to process passed in distance array

        Parameters
        ----------
        dist_arr : numpy array of shape (n_samples, n_neighbors)
            Distance matrix.

        Returns
        -------
        dist : numpy array of shape (n_samples,)
            The outlier scores by distance.
        """

        if self.method == 'largest':
            return dist_arr[:, -1]
        elif self.method == 'mean':
            return np.mean(dist_arr, axis=1)
        elif self.method == 'median':
            return np.median(dist_arr, axis=1)
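A brief usage sketch for a detector with this pyod-style interface (fit, then read labels_/decision_scores_ or call decision_function, as described in the docstring); it assumes the surrounding pyod imports (BaseDetector, KDTree, check_array, check_is_fitted, NearestNeighbors) are available, and the toy data is made up:

import numpy as np

rng = np.random.RandomState(42)
X_train = np.vstack([rng.randn(95, 2), rng.randn(5, 2) + 8.0])   # 5 obvious outliers

clf = KNN(contamination=0.05, n_neighbors=5, method='largest')
clf.fit(X_train)
print(clf.labels_.sum())                       # roughly 5 training points flagged as outliers
print(clf.decision_function(X_train)[:3])      # raw kth-neighbour distances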
popularity_threshold = 50
rating_popular_movie = rating_with_totalRatingCount.query(
    'totalRatingCount >= @popularity_threshold')
#print(rating_popular_movie.head())

# creating the pivot table & filling na with 0 value
movie_features_df = rating_popular_movie.pivot_table(index='title',
                                                     columns='userId',
                                                     values='rating').fillna(0)
#print(movie_features_df.head())

# now creating the sparse matrix from the pivot table
movie_features_df_matrix = csr_matrix(movie_features_df.values)

# using nearest neighbours
knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(movie_features_df_matrix)

query_index = np.random.choice(movie_features_df.shape[0])
distance, indices = knn_model.kneighbors(
    movie_features_df.iloc[query_index, :].values.reshape(1, -1),
    n_neighbors=6)

for i in range(0, len(distance.flatten())):
    if i == 0:
        print('Recommendation for {0}:\n'.format(
            movie_features_df.index[query_index]))
    else:
        print('{0}: {1}, with distance of {2}'.format(
            i, movie_features_df.index[indices.flatten()[i]],
            distance.flatten()[i]))
def EpsDBSCAN(D, k):
    # Find the optimal eps based on the gradient of Nearest Neighbor
    nearest = NearestNeighbors(n_neighbors=k+1)
    nearest.fit(D)
    distances, indices = nearest.kneighbors(D)
    distances = np.delete(distances, 0, 1)
    Dist = distances.max(axis=1)
    #Array = sorted(Dist)
    AvgDist = distances.sum(axis=1)/k
    Avg_Array = sorted(AvgDist)

    N = len(Avg_Array)
    norm_Array = [0.0 for i in range(N)]
    minArray = min(Avg_Array)
    maxArray = max(Avg_Array)

    # normalization
    for i in range(N):
        norm_Array[i] = (Avg_Array[i]-minArray)/(maxArray-minArray)*(1.0-0.0)

    # binning
    bins = np.linspace(0, 1, 10)
    bin_indice = np.digitize(norm_Array, bins)
    Eps = []
    Avg_Array = np.array(Avg_Array)

    for i in range(10):
        count = len(np.where(bin_indice == i)[0])
        if count >= k:
            e = np.sum(Avg_Array[bin_indice == i], axis=0)/count
            Eps.append(e)

    N = len(Eps)
    num = len(Avg_Array)
    Eps_index = []

    # find Avg_Array index over each Eps value
    for i in range(N):
        for j in range(num):
            if Avg_Array[j] > Eps[i]:
                Eps_index.append(j)
                break

    ave_slope = (maxArray - minArray)/N
    Slopes = []
    
    # calculate slope of Eps value
    for i in range(N-1):
        slope = (Eps[i+1] - Eps[i]) / (Eps_index[i+1] - Eps_index[i])
        Slopes.append(slope)

    ave_slope = sum(Slopes)/len(Slopes)

    # find the point over average slope
    for i in range(N-1):
        if i > 0 and Slopes[i] > ave_slope:
            out = Eps[i]
            break
        else:
            out = Eps[i+1]

    return out
Example #38
def test_nearest_neighbors_execution(setup):
    rs = np.random.RandomState(0)
    raw_X = rs.rand(10, 5)
    raw_Y = rs.rand(8, 5)

    X = mt.tensor(raw_X, chunk_size=7)
    Y = mt.tensor(raw_Y, chunk_size=(5, 3))

    for algo in ['brute', 'ball_tree', 'kd_tree', 'auto']:
        for metric in ['minkowski', 'manhattan']:
            nn = NearestNeighbors(n_neighbors=3,
                                  algorithm=algo,
                                  metric=metric)
            nn.fit(X)

            ret = nn.kneighbors(Y)

            snn = SkNearestNeighbors(n_neighbors=3,
                                     algorithm=algo,
                                     metric=metric)
            snn.fit(raw_X)
            expected = snn.kneighbors(raw_Y)

            result = [r.fetch() for r in ret]
            np.testing.assert_almost_equal(result[0], expected[0])
            np.testing.assert_almost_equal(result[1], expected[1])

            if nn._tree is not None:
                assert isinstance(nn._tree.fetch(), type(snn._tree))

            # test return_distance=False
            ret = nn.kneighbors(Y, return_distance=False)

            result = ret.fetch()
            np.testing.assert_almost_equal(result, expected[1])

            # test y is x
            ret = nn.kneighbors()

            expected = snn.kneighbors()

            result = [r.fetch() for r in ret]
            np.testing.assert_almost_equal(result[0], expected[0])
            np.testing.assert_almost_equal(result[1], expected[1])

            # test y is x, and return_distance=False
            ret = nn.kneighbors(return_distance=False)

            result = ret.fetch()
            np.testing.assert_almost_equal(result, expected[1])

    # test callable metric
    metric = lambda u, v: np.sqrt(((u-v)**2).sum())
    for algo in ['brute', 'ball_tree']:
        nn = NearestNeighbors(n_neighbors=3,
                              algorithm=algo,
                              metric=metric)
        nn.fit(X)

        ret = nn.kneighbors(Y)

        snn = SkNearestNeighbors(n_neighbors=3,
                                 algorithm=algo,
                                 metric=metric)
        snn.fit(raw_X)
        expected = snn.kneighbors(raw_Y)

        result = [r.fetch() for r in ret]
        np.testing.assert_almost_equal(result[0], expected[0])
        np.testing.assert_almost_equal(result[1], expected[1])

    # test sparse
    raw_sparse_x = sps.random(10, 5, density=0.5, format='csr', random_state=rs)
    raw_sparse_y = sps.random(8, 5, density=0.4, format='csr', random_state=rs)

    X = mt.tensor(raw_sparse_x, chunk_size=7)
    Y = mt.tensor(raw_sparse_y, chunk_size=5)

    nn = NearestNeighbors(n_neighbors=3)
    nn.fit(X)

    ret = nn.kneighbors(Y)

    snn = SkNearestNeighbors(n_neighbors=3)
    snn.fit(raw_sparse_x)
    expected = snn.kneighbors(raw_sparse_y)

    result = [r.fetch() for r in ret]
    np.testing.assert_almost_equal(result[0], expected[0])
    np.testing.assert_almost_equal(result[1], expected[1])

    # test input with unknown shape
    X = mt.tensor(raw_X, chunk_size=7)
    X = X[X[:, 0] > 0.1]
    Y = mt.tensor(raw_Y, chunk_size=(5, 3))
    Y = Y[Y[:, 0] > 0.1]

    nn = NearestNeighbors(n_neighbors=3)
    nn.fit(X)

    ret = nn.kneighbors(Y)

    x2 = raw_X[raw_X[:, 0] > 0.1]
    y2 = raw_Y[raw_Y[:, 0] > 0.1]
    snn = SkNearestNeighbors(n_neighbors=3)
    snn.fit(x2)
    expected = snn.kneighbors(y2)

    result = ret.fetch()
    assert nn._fit_method == snn._fit_method
    np.testing.assert_almost_equal(result[0], expected[0])
    np.testing.assert_almost_equal(result[1], expected[1])

    # test fit a sklearn tree
    nn = NearestNeighbors(n_neighbors=3)
    nn.fit(snn._tree)

    ret = nn.kneighbors(Y)
    result = ret.fetch()
    assert nn._fit_method == snn._fit_method
    np.testing.assert_almost_equal(result[0], expected[0])
    np.testing.assert_almost_equal(result[1], expected[1])
# Select ratings from the top cities
us_city_user_rating = rating_popular_rest[
    rating_popular_rest['city'].str.contains(
        "Las Vegas|Pheonix|Toronto|Scattsdale|Charlotte|Tempe|Chandler|Cleveland|Madison|Gilbert"
    )]
#us_canada_user_rating.head()
us_city_user_rating = us_city_user_rating.drop_duplicates(['user_id', 'name'])
restaurant_features = us_city_user_rating.pivot(
    index='name', columns='user_id', values='restaurant_rating').fillna(0)

from scipy.sparse import csr_matrix
restaurant_features_matrix = csr_matrix(restaurant_features.values)

from sklearn.neighbors import NearestNeighbors
knn_recomm = NearestNeighbors(metric='cosine', algorithm='brute')
knn_recomm.fit(restaurant_features_matrix)

randomChoice = np.random.choice(restaurant_features.shape[0])
distances, indices = knn_recomm.kneighbors(
    restaurant_features.iloc[randomChoice].values.reshape(1, -1),
    n_neighbors=11)

for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for Restaurant {0} on priority basis:\n'.format(
            restaurant_features.index[randomChoice]))
    else:
        print('{0}: {1}'.format(
            i, restaurant_features.index[indices.flatten()[i]]))
Example #40
0
class KNearestNeighborsAssignment(object):
    """   """
    def __init__(self,
                 feature_name,
                 max_distance,
                 data_directory="",
                 n_neighbors=1,
                 algorithm="ball_tree",
                 weights="distance"):
        """
        """
        self.feature_dim = None
        self.feature_name = feature_name
        self.trained = False
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.nb_samples = 0
        self.algorithm = algorithm
        self.max_distance = max_distance
        if data_directory == "":
            self.data_directory = "/tmp"
        else:
            self.data_directory = data_directory
        self.model = NearestNeighbors(n_neighbors=self.n_neighbors,
                                      algorithm=self.algorithm)
        try:
            data = np.load(self.data_directory + "/" + self.feature_name +
                           "_knn_classif.npz")
            self.X = list(data["x"])
            self.Y = list(data["y"])
            self.nb_samples = len(self.X)
            self.feature_dim = len(self.X[0])
            if self.n_neighbors is None:
                self.n_neighbors = math.sqrt(len(self.X[0]))
            self.train()
        except Exception:
            self.X = []
            self.Y = []

    def train(self):
        """
        """
        self.model.fit(np.array(self.X))

    def update(self, feature, label):
        """
        """
        if self.feature_dim is None:
            self.feature_dim = len(feature)
        self.X.append(feature)
        self.Y.append(label)
        self.nb_samples += 1

    def predict(self, feature):
        """
        """
        distances, matchs = self.model.kneighbors([feature])
        distance = distances[0][0]
        if distance > self.max_distance:
            return False, "unknown", 0.0
        indice = matchs[0][0]
        label = self.Y[indice]
        return True, label, distance

    def __del__(self):
        """
        """
        pass
        # TODO: save data
    def save(self, file):
        """Pickle the fitted nearest-neighbour model to the given path."""
        with open(file, 'wb') as fh:
            pickle.dump(self.model, fh)

    def load(self, file):
        """Load a previously pickled nearest-neighbour model."""
        with open(file, 'rb') as fh:
            self.model = pickle.load(fh)
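
# A minimal usage sketch for the class above (illustrative only; the feature vectors and
# labels are hypothetical, and the imports used by the class -- numpy, math, pickle and
# sklearn's NearestNeighbors -- are assumed to be in scope). Queries whose nearest stored
# sample lies beyond max_distance are rejected.
assigner = KNearestNeighborsAssignment("demo_feature", max_distance=0.5)
assigner.update([0.1, 0.2, 0.3], "label_a")   # hypothetical (feature, label) pairs
assigner.update([0.9, 0.8, 0.7], "label_b")
assigner.train()
matched, label, dist = assigner.predict([0.12, 0.21, 0.29])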
Example #41
0
class SMOTE(object):
    """Implementation of Synthetic Minority Over-Sampling Technique (SMOTE).

    SMOTE performs oversampling of the minority class by picking target 
    minority class samples and their nearest minority class neighbors and 
    generating new samples that linearly combine features of each target 
    sample with features of its selected minority class neighbors [1].

    Parameters
    ----------
    k_neighbors : int, optional (default=5)
        Number of nearest neighbors.
    random_state : int or None, optional (default=None)
        If int, random_state is the seed used by the random number generator.
        If None, the random number generator is the RandomState instance used
        by np.random.

    References
    ----------
    .. [1] N. V. Chawla, K. W. Bowyer, L. O. Hall, and P. Kegelmeyer. "SMOTE:
           Synthetic Minority Over-Sampling Technique." Journal of Artificial
           Intelligence Research (JAIR), 2002.
    """

    def __init__(self, k_neighbors=5, random_state=None):
        self.k = k_neighbors
        self.random_state = random_state

    def sample(self, n_samples):
        """Generate samples.

        Parameters
        ----------
        n_samples : int
            Number of new synthetic samples.

        Returns
        -------
        S : array, shape = [n_samples, n_features]
            Returns synthetic samples.
        """
        np.random.seed(seed=self.random_state)

        S = np.zeros(shape=(n_samples, self.n_features))
        # Calculate synthetic samples.
        for i in range(n_samples):
            j = np.random.randint(0, self.X.shape[0])

            # Find the NN for each sample.
            # Exclude the sample itself.
            nn = self.neigh.kneighbors(self.X[j].reshape(1, -1),
                                       return_distance=False)[:, 1:]
            nn_index = np.random.choice(nn[0])

            dif = self.X[nn_index] - self.X[j]
            gap = np.random.random()

            S[i, :] = self.X[j, :] + gap * dif[:]

        return S

    def fit(self, X):
        """Train model based on input data.

        Parameters
        ----------
        X : array-like, shape = [n_minority_samples, n_features]
            Holds the minority samples.
        """
        self.X = X
        self.n_minority_samples, self.n_features = self.X.shape

        # Learn nearest neighbors.
        self.neigh = NearestNeighbors(n_neighbors=self.k + 1)
        self.neigh.fit(self.X)

        return self
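
# A minimal usage sketch for the SMOTE class above (illustrative only; X_minority is a
# hypothetical minority-class matrix, and numpy / sklearn's NearestNeighbors are assumed
# to be imported for the class itself).
import numpy as np

X_minority = np.random.RandomState(42).rand(20, 3)     # hypothetical minority samples
smote = SMOTE(k_neighbors=5, random_state=42).fit(X_minority)
X_synthetic = smote.sample(n_samples=40)                # 40 synthetic rows, shape (40, 3)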
Example #42
0
inputDataFrame = loadDataFrame()(inputDatabase, limitsFileName, inputTable, date_update, date_download)

inputDataFrame = limitDataUsingProcentiles(inputDataFrame)
print(inputDataFrame.describe())
#inputDataFrame = pd.read_csv("NizhnyNovgorod_dataframe.csv")
#print(inputDataFrame.describe())

if inputQuery == "":
    inputDataFrame = inputDataFrame[featureNames]
    DataFrame_values = inputDataFrame.values
    FEATURE_DEFAULTS = ((inputDataFrame.max() + inputDataFrame.min()) * 0.5).to_dict()
    preprocessor = StandardScaler()
    preprocessor.fit(DataFrame_values)
    DataFrame_values = preprocessor.transform(DataFrame_values)
    neigh = NearestNeighbors(algorithm='kd_tree',n_jobs=-1)
    neigh.fit(DataFrame_values)

    modelPacket = dict()
    modelPacket['model'] = neigh
    modelPacket['preprocessor'] = preprocessor
    modelPacket['feature_names'] = featureNames
    modelPacket['feature_defaults'] = FEATURE_DEFAULTS
    if modelFileName != "":
        with open(modelFileName, 'wb') as fid:
            cPickle.dump(modelPacket, fid)
else:
    with open(modelFileName, 'rb') as fid:
        modelPacket = cPickle.load(fid)

        NEIGH_MODEL = modelPacket['model']
        PREPROCESSOR = modelPacket['preprocessor']
Example #43
0
    def runPassModel(
        self, pass_model_type="knn", agg_level="player",
        knn_model_file=None, expected_knn_file=None
    ):

        # Build model on df_pass_model copy of df_passes
        df_pass_model = self.df_passes.copy()
        timestamp = datetime.datetime.strftime(
            datetime.datetime.today(), "%Y_%m_%d_%H_%M_%S")

        if pass_model_type == "knn":
            simple_model_cols = [
                "id", "duration", "pass_length", "pass_angle",
                "location_origin_x", "location_origin_y",
                "location_dest_x", "location_dest_y"]
            df_knn = self.df_passes[simple_model_cols].set_index("id")
            df_completions = self.df_passes[["id", "pass_outcome_name"]]\
                .set_index("id")
            completion_array = np.array(np.where(
                df_completions["pass_outcome_name"] == "Complete", 1, 0))

            # Scale Data and Build Model
            min_max_scaler = MinMaxScaler()
            df_knn_norm = min_max_scaler.fit_transform(df_knn)
            n_neighbors = int(np.floor(len(df_knn) / 100)) + 1

            if knn_model_file is not None:
                nn_model = pickle.load(open("data/" + knn_model_file, 'rb'))
            else:
                nn_model = NearestNeighbors(
                    algorithm='ball_tree', n_neighbors=n_neighbors, p=2,
                    metric="euclidean", metric_params=None)
                nn_model.fit(df_knn_norm)
                pickle.dump(
                    nn_model, open("data/knn_model_file_" + timestamp, 'wb'))

            if expected_knn_file is not None:
                expected_pass_rate = pickle.load(
                    open("data/" + expected_knn_file, 'rb'))
            else:
                completion_array = np.array(
                    df_pass_model["pass_outcome_name_binary"])
                expected_pass_rate = []
                passes_per_ep = []
                print(f"Total Number of Passes: {len(df_knn)}")
                n = 0
                for row in df_knn_norm:
                    sim_passes = self.get_similar_passes(
                        row.reshape(1, -1), df_knn_norm, nn_model, cutoff=.2)
                    passes_per_ep.append(len(sim_passes))
                    expected_value = completion_array[sim_passes].mean()
                    expected_pass_rate.append(expected_value)
                    n += 1
                    if n % 5000 == 0:
                        print(f"Progress: {n} of {len(df_knn_norm)}")
                pickle.dump(
                    expected_pass_rate,
                    open('expected_knn_file_' + timestamp, 'wb'))

            df_pass_model["xP"] = expected_pass_rate

        elif pass_model_type == "box_grid":
            origin_box, dest_box = [], []
            for i, x in self.df_passes[[
                "location_origin_x", "location_origin_y",
                "location_dest_x", "location_dest_y"
            ]].iterrows():
                x, y = self.make_pass_grid(
                    x[0], x[1], x[2], x[3],
                    nrows=np.linspace(0, 120, 13), ncols=np.linspace(0, 80, 9))
                origin_box.append(x)
                dest_box.append(y)
                if i % 5000 == 0:
                    print(f"Pass {i} of {len(self.df_passes)}: {round(100*float(i)/len(self.df_passes),2)}% ")

            df_pass_model.loc[:, "origin_box"] = origin_box
            df_pass_model.loc[:, "dest_box"] = dest_box

            df_pass_model["pass_desc"] = list(zip(
                df_pass_model["origin_box"], df_pass_model["dest_box"]))

            # Get expected value (average) for each grid combination

            pass_grid_dict = df_pass_model\
                .groupby("pass_desc")["pass_outcome_name_binary"]\
                .mean().to_dict()

            df_pass_model.loc[:, ("xP")] = df_pass_model["pass_desc"]\
                .map(pass_grid_dict)

        if agg_level == "player":
            # df_pass_model['pass_direction'] = df_pass_model['pass_angle']\
            #     .apply(self.pass_direction)
            df_pass_model["position_name_parsed"] = (
                df_pass_model["position_name"].apply(
                    self.position_base_parser))
            df_pass_model["position_detail_parsed"] = (
                df_pass_model["position_name"].apply(
                    self.position_detail_parser))

            passing_model = df_pass_model\
                .groupby(["team_name", "player_name"], as_index=False)\
                .agg({
                    "position_name_parsed": "max",
                    "position_detail_parsed": "max",
                    "pass_outcome_name_binary": ["count", "sum"],
                    "xP": ["sum", "mean"]})

            passing_model.columns = [
                "Team", "Player", "Position", "Position_Detail",
                "Passes", "Completed", "xP", "xP_Mean"]

            passing_model["xP_Rating"] = (
                passing_model["Completed"] / passing_model["xP"])
            passing_model["comp_pct"] = (
                passing_model["Completed"] / passing_model["Passes"])

        elif agg_level == "team":

            passing_model = df_pass_model\
                .groupby(["team_name"], as_index=False)\
                .agg({
                    "pass_outcome_name_binary": ["count", "sum"],
                    "xP": ["sum", "mean"]})

            passing_model.columns = [
                "Team", "Passes", "Completed", "xP", "xP_Mean"]

            passing_model["xP_Rating"] = (
                passing_model["Completed"] / passing_model["xP"])
            passing_model["comp_pct"] = (
                passing_model["Completed"] / passing_model["Passes"])
        else:
            # self.passing_model = None
            print("Choose player or team")
            return None

        self.df_pass_model = df_pass_model
        self.passing_model = passing_model
        return passing_model
    def _sample(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : ndarray, shape (n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new)
            The corresponding label of `X_resampled`

        idx_under : ndarray, shape (n_samples, )
            If `return_indices` is `True`, an array containing the indices of
            the samples selected is also returned.

        """

        random_state = check_random_state(self.random_state)

        # Start with the minority class
        X_min = X[y == self.min_c_]
        y_min = y[y == self.min_c_]

        # All the minority class samples will be preserved
        X_resampled = X_min.copy()
        y_resampled = y_min.copy()

        # If we need to offer support for the indices
        if self.return_indices:
            idx_under = np.flatnonzero(y == self.min_c_)

        # Loop over the other classes under picking at random
        for key in self.stats_c_.keys():

            # If the minority class is up, skip it
            if key == self.min_c_:
                continue

            # Randomly get one sample from the majority class
            # Generate the index to select
            idx_maj = np.flatnonzero(y == key)
            idx_maj_sample = idx_maj[random_state.randint(
                low=0, high=self.stats_c_[key], size=self.n_seeds_S)]
            maj_sample = X[idx_maj_sample]

            # Create the set C
            C_x = np.append(X_min, maj_sample, axis=0)
            C_y = np.append(y_min, [key] * self.n_seeds_S)

            # Create the set S with removing the seed from S
            # since that it will be added anyway
            idx_maj_extracted = np.delete(idx_maj, idx_maj_sample, axis=0)
            S_x = X[idx_maj_extracted]
            S_y = y[idx_maj_extracted]

            # Fit C into the knn
            self.estimator_.fit(C_x, C_y)

            # Classify on S
            pred_S_y = self.estimator_.predict(S_x)

            # Find the misclassified S_y
            sel_x = S_x[np.flatnonzero(pred_S_y != S_y), :]
            sel_y = S_y[np.flatnonzero(pred_S_y != S_y)]

            # If we need to offer support for the indices selected
            # We concatenate the misclassified samples with the seed and the
            # minority samples
            if self.return_indices:
                idx_tmp = idx_maj_extracted[np.flatnonzero(pred_S_y != S_y)]
                idx_under = np.concatenate(
                    (idx_under, idx_maj_sample, idx_tmp), axis=0)

            X_resampled = np.concatenate((X_resampled, maj_sample, sel_x),
                                         axis=0)
            y_resampled = np.concatenate(
                (y_resampled, [key] * self.n_seeds_S, sel_y), axis=0)

        # Find the nearest neighbour of every point
        nn = NearestNeighbors(n_neighbors=2, n_jobs=self.n_jobs)
        nn.fit(X_resampled)
        nns = nn.kneighbors(X_resampled, return_distance=False)[:, 1]

        # Send the information to is_tomek function to get boolean vector back
        self.logger.debug('Looking for majority Tomek links ...')
        links = TomekLinks.is_tomek(y_resampled, nns, self.min_c_)

        self.logger.info('Under-sampling performed: %s',
                         Counter(y_resampled[np.logical_not(links)]))

        # Check if the indices of the samples selected should be returned too
        if self.return_indices:
            # Return the indices of interest
            return (X_resampled[np.logical_not(links)],
                    y_resampled[np.logical_not(links)],
                    idx_under[np.logical_not(links)])
        else:
            # Return data set without majority Tomek links.
            return (X_resampled[np.logical_not(links)],
                    y_resampled[np.logical_not(links)])
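
# A minimal standalone sketch of the Tomek-link test used above (illustrative only; this is
# not the imbalanced-learn implementation). Two samples form a Tomek link when they belong
# to different classes and are each other's nearest neighbour.
import numpy as np
from sklearn.neighbors import NearestNeighbors


def tomek_link_mask(X, y):
    """Return a boolean mask marking samples that participate in a Tomek link."""
    nn = NearestNeighbors(n_neighbors=2).fit(X)
    # Column 0 is the sample itself, column 1 its nearest other sample.
    nns = nn.kneighbors(X, return_distance=False)[:, 1]
    links = np.zeros(len(y), dtype=bool)
    for i, j in enumerate(nns):
        if y[i] != y[j] and nns[j] == i:   # mutual nearest neighbours of different classes
            links[i] = True
            links[j] = True
    return links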
Example #45
0
def fit_knn_model(E_train_flatten, knn_neighbors, knn_metric):
    # Fit kNN model on training images
    ## print("Fitting k-nearest-neighbour model on training images...")
    knn = NearestNeighbors(n_neighbors=knn_neighbors, metric=knn_metric)
    knn.fit(E_train_flatten)
    return knn
Example #46
0
def finetune(novel_loader, n_query=15, freeze_backbone=False, n_way=5, n_support=5):

    iter_num = len(novel_loader)

    acc_all_ori = []
    acc_all_lp = []

    if params.use_saved:
        save_dir = '%s/saved' % configs.save_dir
    else:
        save_dir = '%s/checkpoints' % configs.save_dir

    P_matrix_file = os.path.join(save_dir, 'P_matrix.npy')
    P_matrix = torch.from_numpy(np.load(P_matrix_file)).float().cuda()

    for _, (x, y) in enumerate(novel_loader):
        pretrained_model = []
        classifier = []
        classifier_opt = []
        delta_opt = []
        ###############################################################################################
        # load pretrained model on miniImageNet
        for i in range(params.M):
            pretrained_model.append(PFinetune(model_dict[params.model], P_matrix[i]))
            checkpoint_dir = '%s/%s_%s' % (save_dir, params.model, params.method)
            if params.train_aug:
                checkpoint_dir += '_aug'
            modelfile = os.path.join(checkpoint_dir, '%s_%s_e%d.tar' % (params.model, params.method, i))
            tmp = torch.load(modelfile)
            pretrained_dict = tmp['state']

            new_dict = pretrained_model[i].state_dict()
            pretrained_dict = {u: v for u, v in pretrained_dict.items()
                               if u in new_dict}
            new_dict.update(pretrained_dict)
            pretrained_model[i].load_state_dict(new_dict)

            classifier.append(Classifier(pretrained_model[i].final_feat_dim, n_way))

            classifier_opt.append(torch.optim.SGD(
                classifier[i].parameters(),
                lr=0.01, momentum=0.9, dampening=0.9,
                weight_decay=0.001))

            if freeze_backbone is False:
                delta_opt.append(torch.optim.SGD(
                    filter(lambda p: p.requires_grad, pretrained_model[i].parameters()),
                    lr=0.01))

            pretrained_model[i].cuda()
            classifier[i].cuda()
            ###############################################################################################
            if freeze_backbone is False:
                pretrained_model[i].train()
            else:
                pretrained_model[i].eval()

            classifier[i].train()

        ###############################################################################################
        n_query = x.size(1) - n_support
        x = x.cuda()
        x_var = Variable(x)
    
        batch_size = 4
        support_size = n_way * n_support

        y_a_i = Variable(torch.from_numpy(np.repeat(range(n_way), n_support))).cuda()

        x_b_i = x_var[:, n_support:, :, :, :].contiguous().view(n_way * n_query, *x.size()[2:])
        x_a_i = x_var[:, :n_support, :, :, :].contiguous().view(n_way * n_support, *x.size()[2:])

        ###############################################################################################
        loss_fn = nn.CrossEntropyLoss().cuda()
        ###############################################################################################
        total_epoch = 100

        for epoch in range(total_epoch):
            rand_id = np.random.permutation(support_size)

            for j in range(0, support_size, batch_size):

                #####################################
                selected_id = torch.from_numpy(rand_id[j: min(j+batch_size, support_size)]).cuda()
               
                z_batch = x_a_i[selected_id]
                y_batch = y_a_i[selected_id] 
                #####################################
                for i in range(params.M):
                    classifier_opt[i].zero_grad()
                    if freeze_backbone is False:
                        delta_opt[i].zero_grad()
                    output = pretrained_model[i](z_batch)
                    output = classifier[i](output)

                    loss = loss_fn(output, y_batch)
                    #####################################
                    loss.backward()

                    classifier_opt[i].step()
                    if freeze_backbone is False:
                        delta_opt[i].step()

        scores_ori = 0
        scores_lp = 0

        y_query = np.repeat(range(n_way), n_query)

        n_lp = len(y_query)
        del_n = int(n_lp * (1.0 - params.delta))
        with torch.no_grad():
            for i in range(params.M):
                pretrained_model[i].eval()
                classifier[i].eval()

                output = pretrained_model[i](x_b_i)
                scores_i = classifier[i](output)
                scores_i = F.softmax(scores_i, 1)

                scores_ori += scores_i

                x_lp = output.cpu().numpy()
                y_lp = scores_i.cpu().numpy()
                neigh = NearestNeighbors(params.k_lp)
                neigh.fit(x_lp)
                d_lp, idx_lp = neigh.kneighbors(x_lp)
                d_lp = np.power(d_lp, 2)
                sigma2_lp = np.mean(d_lp)

                for i in range(n_way):
                    yi = y_lp[:, i]
                    top_del_idx = np.argsort(yi)[0:del_n]
                    y_lp[top_del_idx, i] = 0

                w_lp = np.zeros((n_lp, n_lp))
                for i in range(n_lp):
                    for j in range(params.k_lp):
                        xj = idx_lp[i, j]
                        w_lp[i, xj] = np.exp(-d_lp[i, j] / (2 * sigma2_lp))
                        w_lp[xj, i] = np.exp(-d_lp[i, j] / (2 * sigma2_lp))
                q_lp = np.diag(np.sum(w_lp, axis=1))
                q2_lp = sqrtm(q_lp)
                q2_lp = np.linalg.inv(q2_lp)
                L_lp = np.matmul(np.matmul(q2_lp, w_lp), q2_lp)
                a_lp = np.eye(n_lp) - params.alpha * L_lp
                a_lp = np.linalg.inv(a_lp)
                ynew_lp = np.matmul(a_lp, y_lp)

                scores_lp += ynew_lp

        count_this = len(y_query)

        topk_scores, topk_labels = scores_ori.data.topk(1, 1, True, True)
        topk_ind_ori = topk_labels.cpu().numpy()
        top1_correct_ori = np.sum(topk_ind_ori[:, 0] == y_query)
        correct_ori = float(top1_correct_ori)
        print('BSR (Ensemble): %f' % (correct_ori / count_this * 100))
        acc_all_ori.append((correct_ori / count_this * 100))

        topk_ind_lp = np.argmax(scores_lp, 1)
        top1_correct_lp = np.sum(topk_ind_lp == y_query)
        correct_lp = float(top1_correct_lp)
        print('BSR+LP (Ensemble): %f' % (correct_lp / count_this * 100))
        acc_all_lp.append((correct_lp / count_this * 100))
        ###############################################################################################

    acc_all_ori = np.asarray(acc_all_ori)
    acc_mean_ori = np.mean(acc_all_ori)
    acc_std_ori = np.std(acc_all_ori)
    print('BSR (Ensemble): %d Test Acc = %4.2f%% +- %4.2f%%' %
          (iter_num, acc_mean_ori, 1.96 * acc_std_ori / np.sqrt(iter_num)))

    acc_all_lp = np.asarray(acc_all_lp)
    acc_mean_lp = np.mean(acc_all_lp)
    acc_std_lp = np.std(acc_all_lp)
    print('BSR+LP (Ensemble): %d Test Acc = %4.2f%% +- %4.2f%%' %
          (iter_num, acc_mean_lp, 1.96 * acc_std_lp / np.sqrt(iter_num)))
Example #47
0
def select_coordinate(rule, x, A, b, args):
    """ Adaptive selection rules """

    n_params = x.size
    n_samples, n_features = A.shape

    g_func = args["g_func"]

    epoch = args["epoch"]
    lipschitz = args["lipschitz"]
    rng = args["rng"]

    if rule == 'cyclic':
        # Get the next coordinate
        coordinate = epoch % n_params

    elif rule == 'random':
        # Get a random coordinate from a uniform distribution
        coordinate = rng.randint(n_params)

    elif rule in ['heapGSL', "heapGS"]:
        # use heap to keep track of gradients
        if epoch == 0:
            if rule == "heapGS":
                scores = np.abs(g_func(x, A, b, args, None))

            elif rule == "heapGSL":
                g = g_func(x, A, b, args, None)
                gL = g / np.sqrt(lipschitz)
                scores = np.abs(gL)

            heap, h2l, l2h, n_elements = heapOps.create_heap(scores)
            d_matrix, d_index, max_depend = get_dependancy_matrix(A)

            args["d_matrix"] = d_matrix
            args["d_index"] = d_index
            args["heap"] = heap
            args["h2l"] = h2l
            args["l2h"] = l2h
            args["n_elements"] = n_elements

        h2l = args["h2l"]
        l2h = args["l2h"]

        return args["h2l"][0], args

    elif rule == 'lipschitz':

        if epoch == 0:
            L = args["lipschitz"]
            args["lipschitz_cumSum"] = np.cumsum(L / np.sum(L))

        value = rng.uniform(0, 1)
        coordinate = np.searchsorted(args["lipschitz_cumSum"], value)

    elif rule == 'GS':
        g = g_func(x, A, b, args, None)
        coordinate = np.argmax(np.abs(g))

    elif rule == 'GSL':
        lipschitz = args["lipschitz"]

        g = g_func(x, A, b, args, None)
        gL = g / np.sqrt(lipschitz)

        coordinate = np.argmax(np.abs(gL))

    elif rule in ["GSL-q", "GS-q"]:
        L = args["lipschitz"]
        L1 = args["L1"]

        if rule == "GS-q":
            Lprime = np.ones(n_params) * np.max(L)

        elif rule == "GSL-q":
            Lprime = L
        #block = np.unravel_index(block,  (n_features, n_classes))
        args["prox_lipschitz"] = Lprime
        dist = compute_dist(x, A, b, args)

        g = g_func(x, A, b, args, None)

        # Incorporating the L1 term values
        L1_next = L1 * abs(dist + x)
        L1_current = L1 * abs(x)

        # Complicated Equation in page 7 of the paper
        scores = g * dist + (Lprime / 2.) * dist * dist + L1_next - L1_current

        coordinate = np.argmin(scores)

    elif rule in ["GSL-r", "GS-r"]:
        L = args["lipschitz"]

        if rule == "GS-r":
            Lprime = np.ones(n_params) * np.max(L)

        elif rule == "GSL-r":
            Lprime = L
        #block = np.unravel_index(block,  (n_features, n_classes))
        args["prox_lipschitz"] = Lprime
        dist = compute_dist(x, A, b, args)

        # Complicated Equation in page 7 of the paper
        coordinate = np.argmax(np.abs(dist))

    elif rule in ["GSL-s", "GS-s"]:
        L1 = args["L1"]

        g = g_func(x, A, b, args, None)

        gP = np.zeros(g.shape)
        # Point zero-valued variables in the right direction
        ind_neg = g < -L1
        ind_pos = g > L1

        gP[ind_neg] = g[ind_neg] + L1
        gP[ind_pos] = g[ind_pos] - L1

        # Compute the real gradient for non zero-valued variables
        nonZeros = abs(x) > 1e-3
        gP[nonZeros] = g[nonZeros] + L1 * np.sign(x[nonZeros])

        if rule == 'GSL-s':
            gP /= np.sqrt(lipschitz)

        coordinate = np.argmax(abs(gP))

    elif rule in ["approxGS", "approxGSL"]:
        lipschitz = args["lipschitz"]
        if args["epoch"] == 0:
            knn = NearestNeighbors(n_neighbors=1,
                                   metric='euclidean',
                                   algorithm='ball_tree')
            if rule == "approxGS":
                A_scale = A
            elif rule == "approxGSL":
                A_scale = A / np.sqrt(lipschitz)

            A_ext = np.hstack([A_scale, -A_scale])

            args["knn"] = knn.fit(A_ext.T)

        r = args["b_pred"] - b
        coordinate = args["knn"].kneighbors(r)[1][0][0]

        coordinate %= n_features

    else:
        raise ValueError("selection rule %s doesn't exist" % rule)

    return coordinate, args
Example #48
0
def update_foot(RB, state_positions, i, order_idx, astar=None):
    '''
    This function moves Foot(i) to the position as close to the goal as possible, while remaining within all constraints.
    
    Strategy: Find initial reach space within max_foot_length, then remove all illegal configurations that break one of our constraints.
              From the remaining legal positions in the reach space, choose the one closest to the goal.
              
              Note: When removing illegal states from the list, we cannot use "for state in reach_space..." due to indexing issues
                    that arise when modifying the length of the reach_space list once already in the loop
                    (Also, without updating the current index you would skip over any state after one gets removed, and not check 
                    it for constraints)
    
    Parameters: RB - instance of ReachBot
                i - index of the current foot {1,2,3,4}, as pulled from the string name of the operator
                state_positions - list of all possible grid positions on the map, used to find all states within the max-radius
                order_idx - the index of this foot in the operators_ordered list, i.e. its turn in the movement cycle
                            used to determine if the foot is a "back foot", to make sure that it does not out-step the body
                astar - optional argument used to check an occupany grid, when we are working with obstacles
    
    Returns: ReachBot object with Foot(i) updated to its new position
    '''

    # Define the position of the body and all of its corners, used for connecting each foot to the body
    body_pos = RB.state[-1]
    corner_offset = [[0, 1], [1, 1], [0, 0], [1, 0]]
    corners = [
        np.array(body_pos) + corner_offset[c] for c in range(RB.num_feet)
    ]

    # Find initial reach space - list of grid squares within the radius of max_foot_length
    neigh = NearestNeighbors(radius=RB.max_foot_length)
    neigh.fit(state_positions)
    reach_space_indices = neigh.radius_neighbors([body_pos],
                                                 return_distance=False)
    reach_space = [state_positions[index] for index in reach_space_indices[0]]

    # Constraint: Check that the current foot is not occupying the same grid square as another foot nor the body
    occupied_coords = copy.copy(RB.state)
    # Remove the current foot from the illegal positions (staying put is an option)
    del occupied_coords[i - 1]
    m = len(reach_space)
    k = 0
    while k < m:
        candidate_state = reach_space[k]
        if candidate_state in occupied_coords:
            reach_space.remove(candidate_state)
            k -= 1  # The following states shift backward after a removal, so re-check the current index
            m -= 1  # Decrement the total length of the reach_space list
        k += 1

    # Constraint: Check that the current foot is not intersecting other feet
    # All feet except the current foot of interest
    other_feet = np.delete(np.arange(RB.num_feet), i - 1)

    for j in other_feet:
        # Set up line connecting the center of another foot to its corresponding body corner (as shown on plot)
        body1 = corners[j]
        foot1 = np.array(RB.state[j]) + [0.5, 0.5]

        m = len(reach_space)
        k = 0
        while k < m:
            # Set up line connecting the center of candidate foot position to the current corresponding body corner
            candidate_state = reach_space[k]
            body2 = corners[i - 1]
            foot2 = np.array(candidate_state) + [0.5, 0.5]

            if feet_intersecting(body1, foot1, body2, foot2):
                reach_space.remove(candidate_state)
                m -= 1
                k -= 1
            k += 1

    # Constraint: Check that foot does not intersect the body, using the center of the foot's current grid square
    m = len(reach_space)
    k = 0
    while k < m:
        candidate_state = reach_space[k]
        foot_pos = np.array(candidate_state) + [0.5, 0.5]
        if body_intersecting(np.array(body_pos), i - 1, foot_pos):
            reach_space.remove(candidate_state)
            k -= 1
            m -= 1
        k += 1

    # Constraint: Check that body is under proper tension
    # Method 1: Check that body is positioned inside min/max box defined by feet positions
    m = len(reach_space)
    k = 0
    while k < m:
        candidate_state = reach_space[k]
        if not check_tension(RB, i - 1, candidate_state):
            reach_space.remove(candidate_state)
            k -= 1
            m -= 1
        k += 1

    # Constraint: Check that body is under proper tension
    # Method 2: Make sure the back 2 feet are not outstepping (nearer to the goal than) the body or front 2 feet
    body_to_goal = np.linalg.norm(np.subtract(RB.state[-1], RB.goal[-1]))
    # For 4 feet, the order index is in {0,1,2,3,4}
    if (order_idx == 3) or (order_idx == 4):
        m = len(reach_space)
        k = 0
        while k < m:
            candidate_state = reach_space[k]
            foot_pos = np.array(candidate_state) + [0.5, 0.5]
            dist_to_goal = np.linalg.norm(np.subtract(foot_pos, RB.goal[-1]))
            if dist_to_goal <= body_to_goal:
                reach_space.remove(candidate_state)
                k -= 1
                m -= 1
            k += 1

    # Constraint: Check that foot is not placed inside an obstacle, using the center of the foot's current grid square
    if astar:
        m = len(reach_space)
        k = 0
        while k < m:
            candidate_state = reach_space[k]
            foot_pos = np.array(candidate_state) + [0.5, 0.5]
            if not astar.occupancy.is_free(foot_pos, buffer=0):
                reach_space.remove(candidate_state)
                k -= 1
                m -= 1
            k += 1

    # Check if there are any legal states available; if not, don't move
    if len(reach_space) == 0:
        return RB

    # Otherwise, move the foot as close as possible to the goal, from the remaining legal positions in its reach space
    else:
        foot_goal = RB.goal[i - 1]
        distances = [
            np.linalg.norm(np.subtract(foot_goal, state), ord=1)
            for state in reach_space
        ]
        p = np.argmin(distances)
        closest_position = reach_space[p]
        RB.move_foot(i - 1, closest_position)
        return RB
Example #49
0
def k_nearest_neighbors(coordinates,
                        neighbor_cutoff,
                        max_num_neighbors=None,
                        p_distance=2,
                        self_loops=False):
    """Find k nearest neighbors for each atom

    We do not guarantee that the edges are sorted according to the distance
    between atoms.

    Parameters
    ----------
    coordinates : numpy.ndarray of shape (N, D)
        The coordinates of atoms in the molecule. N for the number of atoms
        and D for the dimensions of the coordinates.
    neighbor_cutoff : float
        If the distance between a pair of nodes is larger than neighbor_cutoff,
        they will not be considered as neighboring nodes.
    max_num_neighbors : int or None.
        If not None, then this specifies the maximum number of neighbors
        allowed for each atom. Default to None.
    p_distance : int
        We compute the distance between neighbors using Minkowski (:math:`l_p`)
        distance. When ``p_distance = 1``, Minkowski distance is equivalent to
        Manhattan distance. When ``p_distance = 2``, Minkowski distance is
        equivalent to the standard Euclidean distance. Default to 2.
    self_loops : bool
        Whether to allow a node to be its own neighbor. Default to False.

    Returns
    -------
    srcs : list of int
        Source nodes.
    dsts : list of int
        Destination nodes, corresponding to ``srcs``.
    distances : list of float
        Distances between the end nodes, corresponding to ``srcs`` and ``dsts``.
    """
    num_atoms = coordinates.shape[0]
    model = NearestNeighbors(radius=neighbor_cutoff, p=p_distance)
    model.fit(coordinates)
    dists_, nbrs = model.radius_neighbors(coordinates)
    srcs, dsts, dists = [], [], []
    for i in range(num_atoms):
        dists_i = dists_[i].tolist()
        nbrs_i = nbrs[i].tolist()
        if not self_loops:
            dists_i.remove(0)
            nbrs_i.remove(i)
        if max_num_neighbors is not None and len(nbrs_i) > max_num_neighbors:
            packed_nbrs = list(zip(dists_i, nbrs_i))
            # Sort neighbors based on distance from smallest to largest
            packed_nbrs.sort(key=lambda tup: tup[0])
            dists_i, nbrs_i = map(list, zip(*packed_nbrs))
            dsts.extend([i for _ in range(max_num_neighbors)])
            srcs.extend(nbrs_i[:max_num_neighbors])
            dists.extend(dists_i[:max_num_neighbors])
        else:
            dsts.extend([i for _ in range(len(nbrs_i))])
            srcs.extend(nbrs_i)
            dists.extend(dists_i)

    return srcs, dsts, dists
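
# A minimal usage sketch for the function above (illustrative only; the coordinates are
# hypothetical, and numpy / sklearn's NearestNeighbors are assumed to be imported for the
# function itself): build a radius-limited neighbour list for a 5-atom molecule in 3-D.
import numpy as np

demo_coords = np.random.RandomState(0).rand(5, 3)   # hypothetical atom coordinates
srcs, dsts, dists = k_nearest_neighbors(demo_coords,
                                        neighbor_cutoff=1.5,
                                        max_num_neighbors=3)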
Example #50
0
    def search_neighbors(self):
        """
            Search neighbors based by minimizing Renyi-entropy
        caluculated from neighbors.
        """

        NN = NearestNeighbors(n_neighbors=self.max_k + 1,
                              metric="minkowski",
                              p=2)
        NN.fit(self.X)

        distance, index_list_list = NN.kneighbors(self.X,
                                                  n_neighbors=self.max_k + 1,
                                                  return_distance=True)

        neighbors_list = []

        for index_list in index_list_list:
            knn_neighbors = []
            if_first = True
            for index in index_list:
                if if_first:
                    if_first = False
                    continue
                else:
                    knn_neighbors.append(self.X[index])
            neighbors_list.append(knn_neighbors)

        QE_list = []

        #Kernel function
        def gaussian(a, b, band_width):
            norm = np.sqrt(np.dot(a - b, a - b))
            D = len(a)
            return (1 / np.sqrt(2 * np.pi * band_width)**(D / 2)) * np.exp(
                -norm**2 / (2 * (band_width**2)))

        def Quadratic_Entoropy(neighbors):
            QE = 0
            for x_i in neighbors:
                for x_j in neighbors:
                    QE += gaussian(x_i, x_j, 1)
            if QE == 0:
                QE += 1e-10
            QE = -math.log(QE / len(neighbors)**2)
            return QE

        true_neighbors_list, true_neighbors_index_list = [], []

        for neighbors, target in zip(tqdm(neighbors_list), self.X):
            QE = Quadratic_Entoropy(neighbors)
            # select optimal subset.
            potential_list = []

            for i in range(len(neighbors)):
                neighbors_copy = copy.deepcopy(neighbors)
                del (neighbors_copy[i])
                new_neighbors = neighbors_copy
                new_QE = Quadratic_Entoropy(new_neighbors)
                potential_list.append(QE - new_QE)

            # elements that contributed to the entropy, in descending order of contribution
            sorted_potential_index = np.argsort(potential_list)[-1::-1]
            average_potential = np.mean(potential_list)

            remove_index = []
            for index in sorted_potential_index:
                if potential_list[index] > average_potential:
                    if len(remove_index) >= (self.max_k - self.min_k):
                        break
                    remove_index.append(index)

                else:
                    break
            true_neighbor_index = list(
                set(sorted_potential_index) - set(remove_index))
            true_neighbors = [
                neighbors[index] for index in true_neighbor_index
            ]

            true_neighbors_list.append(np.array(true_neighbors))
            true_neighbors_index_list.append(true_neighbor_index)

            #QE = Quadratic_Entoropy(true_neighbors)
            #QE_list.append(QE)

        self.neighbors_list = true_neighbors_list
        self.neighbors_index_list = true_neighbors_index_list
        return np.array(true_neighbors_list)
Example #51
0
def get_nearest_neighbour_idx(df,
                              dim=1,
                              sample_every_n=2000,
                              neighbours=10000):

    # K Nearest Neighbours - find 10000 nearest points per source and return accuracy.
    # Whilst KNN fitting is pretty much instant, KNN querying is really slow per source for
    # large neighbour counts and has no multithread option, so a multithread version is
    # implemented here. This can take a long time, so always save the output to disk.

    # Choose data to fit the knn to - either the 1-D feature, or all 10 features:
    if dim == 1:  # reshape array in this case due to knn fitting implementation
        df_knnfit = df.loc[data_all['features_test'].index.
                           values]['feature_1D'].values.reshape(-1, 1)
    if dim == 10:
        df_knnfit = df.loc[data_all['features_test'].index.values][[
            *feature_columns
        ]]

    # Keep n_jobs=1, otherwise the multiprocess implementation of the knn_closest function
    # has issues; the n_jobs setting is kept within the knn object.
    knn = NearestNeighbors(n_neighbors=neighbours, algorithm='auto', n_jobs=1)
    knn.fit(df_knnfit)  # fit the knn to the data, this is very quick.

    # select data points to find nn to. Sample_every_n allows a much faster version of this to run.
    if dim == 1:
        # Reshape because it is a single feature; [0::sample_every_n] subsamples the sources.
        vals_closest = df.loc[data_all['features_test'].index.values][
            'feature_1D'][0::sample_every_n].values.reshape(-1, 1)
    if dim == 10:
        vals_closest = df.loc[data_all['features_test'].index.values][[
            *feature_columns
        ]][0::sample_every_n].values

    # Pass a copy of the df used for fitting knn to the knn_closest function for referencing indices
    df_vals = df.loc[data_all['features_test'].index.values]

    num_workers = multiprocessing.cpu_count()
    print(
        'Starting multiproc finding {0} nearest neighbours (in {1}-D) on {2} sources with {3} CPUs... could take 30mins or more if sources/CPUs is more than 100... (should linearly scale: 2X CPU = 1/2 time)'
        .format(neighbours, dim, len(vals_closest), num_workers))

    # Because knn needs more than one entry at a time in the input array (else it complains),
    # multiprocess won't work in the standard way. So the master array is split into segments
    # and fed in as an array of arrays. There may be a better way to do this, but it works with
    # virtually no decrease in speed, until sklearn builds multithreading into kneighbors().
    vals_closest_split = np.array_split(
        np.array(vals_closest),
        len(vals_closest) /
        20)  # split into chunks containing subsets of sources

    with multiprocessing.Pool(processes=num_workers) as pool:
        closest_dictionary_list = pool.starmap(
            knn_closest,
            zip(vals_closest_split, itertools.repeat(knn),
                itertools.repeat(df_vals), itertools.repeat(neighbours))
        )  #btw a starmap is a way to pass multiple arguments to a function using multi-process
        pool.close()
        pool.join()

    #closest_idx = knn_closest(vals_closest, knn, df_vals, neighbours) # Single core run? Not needed, multicore will work for all runs now.
    print('knn multiproc finished... sorting outputs...')
    # Now sort the output from multiproc
    # Initialise arrays of correct shape using first value
    closest_idx_concat = np.array(
        np.array(closest_dictionary_list[0]['closest_idx']))
    closest_idx_concat_int = np.array(
        np.array(closest_dictionary_list[0]['closest_int']))
    # Loop over the list of dictionaries, stacking the _idx and _int arrays together for all
    # sources that were initially split up in vals_closest_split during multiprocessing.
    # Start from 1 because the first entry is already initialised above.
    for i in range(1, len(closest_dictionary_list)):
        closest_idx_concat = np.vstack(
            (closest_idx_concat,
             np.array(closest_dictionary_list[i]['closest_idx'])))
        closest_idx_concat_int = np.vstack(
            (closest_idx_concat_int,
             np.array(closest_dictionary_list[i]['closest_int'])))

    # Save output to disk
    print('Saving knn indices to disk as "closest_idx_concat_' + str(dim) +
          '.pkl"... ')
    save_obj(closest_idx_concat, 'closest_idx_concat_' + str(dim) + 'D')
Example #52
0
class RankedMinorityOversampler(object):
    """Implementation of Ranked Minority Oversampling (RAMO).

    Oversample the minority class by picking samples according to a specified
    sampling distribution.

    Parameters
    ----------
    k_neighbors_1 : int, optional (default=5)
        Number of nearest neighbors used to adjust the sampling probability of
        the minority examples.
    k_neighbors_2 : int, optional (default=5)
        Number of nearest neighbors used to generate the synthetic data
        instances.
    alpha : float, optional (default=0.3)
        Scaling coefficient.
    random_state : int or None, optional (default=None)
        If int, random_state is the seed used by the random number generator.
        If None, the random number generator is the RandomState instance used
        by np.random.
    """
    def __init__(self,
                 k_neighbors_1=5,
                 k_neighbors_2=5,
                 alpha=0.3,
                 random_state=None):
        self.k_neighbors_1 = k_neighbors_1
        self.k_neighbors_2 = k_neighbors_2
        self.alpha = alpha
        self.random_state = random_state

    def sample(self, n_samples):
        """Generate samples.

        Parameters
        ----------
        n_samples : int
            Number of new synthetic samples.

        Returns
        -------
        S : array, shape = [n_samples, n_features]
            Returns synthetic samples.
        """
        np.random.seed(seed=self.random_state)

        S = np.zeros(shape=(n_samples, self.n_features))
        # Calculate synthetic samples.
        for i in range(n_samples):
            # Choose a sample according to the sampling distribution, r.
            j = np.random.choice(self.n_minority_samples, p=self.r)

            # Find the NN for each sample.
            # Exclude the sample itself.
            nn = self.neigh_2.kneighbors(self.X_min[j].reshape(1, -1),
                                         return_distance=False)[:, 1:]
            nn_index = np.random.choice(nn[0])

            dif = self.X_min[nn_index] - self.X_min[j]
            gap = np.random.random()

            S[i, :] = self.X_min[j, :] + gap * dif[:]

        return S

    def fit(self, X, y, sample_weight=None, minority_target=None):
        """Train model based on input data.

        Parameters
        ----------
        X : array-like, shape = [n_total_samples, n_features]
            Holds the majority and minority samples.
        y : array-like, shape = [n_total_samples]
            Holds the class targets for samples.
        sample_weight : array-like of shape = [n_samples], optional
            Sample weights multiplier. If None, the multiplier is 1.
        minority_target : int, optional (default=None)
            Minority class label.
        """
        if minority_target is None:
            # Determine the minority class label.
            stats_c_ = Counter(y)
            maj_c_ = max(stats_c_, key=stats_c_.get)
            min_c_ = min(stats_c_, key=stats_c_.get)
            self.minority_target = min_c_
        else:
            self.minority_target = minority_target

        self.X_min = X[y == self.minority_target]
        self.n_minority_samples, self.n_features = self.X_min.shape

        neigh_1 = NearestNeighbors(n_neighbors=self.k_neighbors_1 + 1)
        neigh_1.fit(X)
        nn = neigh_1.kneighbors(self.X_min, return_distance=False)[:, 1:]

        if sample_weight is None:
            sample_weight_min = np.ones(shape=(self.n_minority_samples,))
        else:
            assert (len(y) == len(sample_weight))
            sample_weight_min = sample_weight[y == self.minority_target]

        self.r = np.zeros(shape=(self.n_minority_samples))
        for i in range(self.n_minority_samples):
            majority_neighbors = 0
            for n in nn[i]:
                if y[n] != self.minority_target:
                    majority_neighbors += 1

            self.r[i] = 1. / (1 + np.exp(-self.alpha * majority_neighbors))

        self.r = (self.r * sample_weight_min).reshape(1, -1)
        self.r = np.squeeze(normalize(self.r, axis=1, norm='l1'))

        # Learn nearest neighbors.
        self.neigh_2 = NearestNeighbors(n_neighbors=self.k_neighbors_2 + 1)
        self.neigh_2.fit(self.X_min)

        return self
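
# A minimal usage sketch for the RAMO class above (illustrative only; the data and class
# labels are hypothetical, and the imports used by the class -- numpy, Counter, normalize
# and sklearn's NearestNeighbors -- are assumed to be in scope).
import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.rand(30, 4)
y_demo = np.array([0] * 22 + [1] * 8)          # class 1 is the minority class
ramo = RankedMinorityOversampler(k_neighbors_1=5, k_neighbors_2=3, random_state=0)
ramo.fit(X_demo, y_demo)
X_new = ramo.sample(n_samples=10)              # 10 synthetic minority-class rows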
Example #53
0

# changed from relative to full path
strains = pd.read_csv("https://raw.githubusercontent.com/Build-Week-Med-Cabinet-3/Data-Science/master/API/nn_model_strains.csv")

nlp=English()
tokenizer = Tokenizer(nlp.vocab)
tf = TfidfVectorizer(stop_words="english")
transformer = TfidfVectorizer(stop_words="english", min_df=0.025, max_df=0.98, ngram_range=(1,3))


dtm = transformer.fit_transform(strains['lemmas'])
dtm = pd.DataFrame(dtm.todense(), columns=transformer.get_feature_names())

model = NearestNeighbors(n_neighbors=10, algorithm='kd_tree')
model.fit(dtm)

def predict(request_text):
    '''Prediction for user'''
    transformed = transformer.transform([request_text])
    dense = transformed.todense()
    recommendations = model.kneighbors(dense)[1][0]
    output_array = []
    for recommendation in recommendations:
        strain = strains.iloc[recommendation]
        output = strain.drop(['Unnamed: 0', 'name', 'ailment', 'all_text', 'lemmas']).to_dict()
        output_array.append(output)
        print('output_array')
    return output_array

def create_app():
Example #54
0
    def interpolate(self, x0, d0, x1, *args):
        self.x0 = x0
        self.d0 = d0
        self.x1 = x1

        print(self.x0.shape)
        print(self.x1.shape)

        if self.method == 'scatteredinterpolant':
            F = LinearNDInterpolatorExt(points=(self.x0[:, 0], self.x0[:, 1],
                                                self.x0[:, 2]),
                                        values=self.d0)
            d1 = np.array(F(self.x1[:, 0], self.x1[:, 1],
                            self.x1[:, 2])).reshape(self.x1.shape[0], 1)

        elif self.method == 'rbf':
            # print('x0-shape',self.x0.shape)
            # print('d0-shape',self.d0.shape)
            op = rbfcreate(self.x0.T, self.d0.T, 'RBFFunction', 'multiquadric',
                           'RBFConstant', self.rbfConstant)
            rbfcheck(op)
            out = rbfinterp(self.x1.T, op)
            d1 = np.array(out[0]).reshape(self.x1.shape[0], 1)
            print(d1)
            print('d1-shape', d1.shape)

            # F = Rbf(self.x0[:,0],self.x0[:,1],self.x0[:,2],self.d0,function='linear',epsilon=self.rbfConstant)
            # d1 = F(self.x1[:,0],self.x1[:,1],self.x1[:,2])

        elif self.method == 'localSmoothing':
            pass

        else:
            print('Interpolation Method not Found')

        # Workout if there are any points on the surface that are < 0
        d1[d1 < 0] = 0

        #  work out which points on the surface are too far away from real data
        # Remove any interpolated values which are outwith the fill threshold

        from sklearn.neighbors import NearestNeighbors

        neigh = NearestNeighbors(n_neighbors=1,
                                 radius=1,
                                 algorithm='auto',
                                 leaf_size=30,
                                 metric='minkowski',
                                 p=2)

        neigh.fit(self.x0)

        id = neigh.kneighbors(self.x1, return_distance=False)
        cPts = self.x0[id]
        cPts = np.array(cPts[:, 0])

        d = distBetweenPoints(cPts, self.x1)

        thresholdDistance = np.zeros(shape=d.shape, dtype=bool)
        thresholdDistance[d > self.distanceThreshold] = True
        d1[thresholdDistance] = np.nan
        d1 = d1.flatten()

        return d1
Example #55
0
import numpy as np
from sklearn.neighbors import NearestNeighbors

def knn_distance(my_df, n):
    # For each row, sum the distances to its n nearest neighbors (the row itself
    # is included at distance 0); larger sums indicate more isolated rows.
    model = NearestNeighbors(n_jobs=7)
    model.fit(my_df)
    output = model.kneighbors(np.array(my_df), n_neighbors=n)
    output = output[0].sum(axis=1)
    return output
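
# A hedged usage sketch: knn_distance() sums, for each row, the distances to its
# n nearest neighbors, so larger sums indicate more isolated rows. The toy data
# below is illustrative only.
import numpy as np
import pandas as pd

toy = pd.DataFrame(np.random.RandomState(0).normal(size=(100, 3)))
scores = knn_distance(toy, n=5)
print(toy.index[np.argsort(scores)[-5:]])  # the five most isolated rows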
Example #56
0
#                                                         ###############
# 2- Predicting unknown rating for item i                 ############################
# for an active user u by calculating                     ##############################
# the weighted sum of all the users for the item          #####################################
#                                                         ##########################################
# 3- Recommending the new items to the user               ##############################################
#                                                         ################################################
#                                                         ###################################################
###########################################################
""")

# KNN part
k = 5
neighbor = NearestNeighbors(n_neighbors=k, metric='cosine')
# fit training data to KNN
neighbor.fit(ratings_train)
# Calculate the top five most similar users for each user and their similarity values, i.e. the distance between each pair of users
top_k_distances, top_k_users = neighbor.kneighbors(ratings_train,
                                                   return_distance=True)
print("Shape==============>", top_k_users.shape, top_k_distances.shape)
# top five users similar to user 1
print(top_k_users[0])
# Choose only the top five users for each user and use their rating information
# while predicting the ratings as the weighted sum of the ratings of these top five similar users
user_pred_k = np.zeros(ratings_train.shape)
for i in range(ratings_train.shape[0]):
    user_pred_k[i, :] = top_k_distances[i].dot(
        ratings_train[top_k_users[i]]) / np.abs(top_k_distances[i]).sum()

print(user_pred_k.shape)
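
# Purely illustrative restatement of what the loop above computes for a single
# user: a weighted average of the k nearest users' rating rows, with that user's
# neighbor distances as the weights.
i = 0
weights = top_k_distances[i]                       # shape (k,)
neighbor_ratings = ratings_train[top_k_users[i]]   # shape (k, n_items)
pred_i = weights.dot(neighbor_ratings) / np.abs(weights).sum()
assert np.allclose(pred_i, user_pred_k[i], equal_nan=True)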
Example #57
0
from __future__ import print_function
import argparse
import numpy as np
from sklearn.neighbors import NearestNeighbors

parser = argparse.ArgumentParser()
parser.add_argument("npy_file")
parser.add_argument("idx_file")
parser.add_argument("--n_neighbors", type=int, default=1)
parser.add_argument("--algorithm",
                    choices=['auto', 'ball_tree', 'kd_tree', 'brute'],
                    default='auto')
args = parser.parse_args()

# Load the data matrix from the .npy file given on the command line
X = np.load(args.npy_file)

nn = NearestNeighbors(n_neighbors=args.n_neighbors, algorithm=args.algorithm)
nn.fit(X)
Example #58
0
tfidftransformer = TfidfVectorizer(ngram_range=(1, 1),
                                   stop_words=text.ENGLISH_STOP_WORDS.union(
                                       Z[:10]))
X = tfidftransformer.fit_transform([
    m + ' ' + n for m, n in zip(df['description_clean'], df['title'])
])  # using both description and title to predict
print('shape', X.shape)

X = normalize(X, norm='l2', axis=1)
nbrs_brute = NearestNeighbors(n_neighbors=X.shape[0],
                              algorithm='brute',
                              metric='cosine')

print('fitting')
nbrs_brute.fit(X.todense())
print('fitted')

# del(X)
# gc.collect()

sow = open('input.txt', 'r').read()
sow = tfidftransformer.transform([sow])
sow = normalize(sow, norm='l2', axis=1)

print('scoring standards')
distances, indices = nbrs_brute.kneighbors(sow.todense())
distances = list(distances[0])
indices = list(indices[0])

for indx in indices[:10]:
Example #59
0
df_data.head()

#%%

df_data['key'] = df_data['key']/12
max_loudness = df_data['loudness'].max()
df_data['loudness'] = abs(df_data['loudness']/max_loudness)
max_tempo = df_data['tempo'].max()
df_data['tempo'] = df_data['tempo']/max_tempo
df_data.head()


#%%

knn = NearestNeighbors(n_neighbors=50, algorithm='ball_tree')
knn.fit(df_data.values)
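
# A hedged query sketch: with the scaled features above, the fitted model returns
# the 50 most similar rows to any track; row 0 is an arbitrary, illustrative choice.
distances, indices = knn.kneighbors(df_data.values[0].reshape(1, -1))
print(indices[0][:10])  # positions of the 10 closest rows, including the query itself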

#%%

def standardize(s):
    s = "".join([i.lower() for i in s if i not in frozenset(string.punctuation)])
    return s

#%%

def matching_genres(genres1, genres2):
    genres1 = literal_eval(genres1)
    matches = list(set(genres1) & set(genres2))
    return len(matches) > 0
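
# A small usage sketch (hypothetical genre strings), assuming literal_eval is
# imported from ast as in the original script: genres1 arrives as the string
# representation of a list, genres2 as an actual list.
print(matching_genres("['indie pop', 'folk']", ['folk', 'rock']))  # True
print(matching_genres("['metal']", ['folk', 'rock']))              # False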

#%%
movie_features_df = rating_popular_movie.pivot_table(index='title',
                                                     columns='userId',
                                                     values='rating').fillna(0)
movie_features_df.head()

from scipy.sparse import csr_matrix

movie_features_df_matrix = csr_matrix(movie_features_df.values)

from sklearn.neighbors import NearestNeighbors

# when the data is sparse a kd-tree can't be used; 'auto' picks the best algorithm
model_knn = NearestNeighbors(n_neighbors=6,
                             metric='euclidean',
                             algorithm='auto')
model_knn.fit(movie_features_df_matrix)

movie_features_df.shape

query_index = np.random.choice(movie_features_df.shape[0])
print(query_index)
distances, indices = model_knn.kneighbors(
    movie_features_df.iloc[query_index, :].values.reshape(1, -1),
    n_neighbors=6)
movie_features_df.head()

for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(
            movie_features_df.index[query_index]))
    else: