from sklearn.neighbors import NearestNeighbors


def _compute_centrality(loc_proto_lat, loc_proto_lon, locs_proto, df_c, file_name_out):
    # Centrality: how many prototype locations fall within each search radius
    # (1K, 5K and 15K, matching the output column names).
    distances = [1000, 5000, 15000]
    centrality = []
    for d in distances:
        c = []
        neigh = NearestNeighbors(radius=d, metric=spherical_distance)
        neigh.fit(locs_proto)
        for i in range(len(loc_proto_lat)):
            loc_x = loc_proto_lat[i]
            loc_y = loc_proto_lon[i]
            rng = neigh.radius_neighbors([[loc_x, loc_y]])
            nei_index = list(rng[1][0])
            c.append(len(nei_index))
        centrality.append(c)
    df_c = df_c.assign(centrality1K=centrality[0], centrality5K=centrality[1],
                       centrality15K=centrality[2])
    df_c.to_csv(file_name_out + "_coll.csv", mode="w", index=False)
    return df_c
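# `spherical_distance` is not defined in these helpers. A plausible
# haversine-style implementation (an assumption, not the original code) that
# returns metres for (lat, lon) pairs in decimal degrees could look like this;
# any callable with this two-point signature can be passed as the
# NearestNeighbors metric.
import math

def spherical_distance(a, b, earth_radius_m=6371000.0):
    lat1, lon1 = math.radians(a[0]), math.radians(a[1])
    lat2, lon2 = math.radians(b[0]), math.radians(b[1])
    dlat, dlon = lat2 - lat1, lon2 - lon1
    h = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    return 2 * earth_radius_m * math.asin(math.sqrt(h))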
def clusterpolate(points, values, targets, radius=1, kernel_factory=bump,
                  neighbors=None, num_jobs=None):
    """
    Clusterpolate data.

    ``points`` (array-like) are the data points and ``values`` (array-like)
    are the associated values. ``targets`` (array-like) are the points at
    which the data should be clusterpolated.

    ``radius`` (float) is the radius of each data point's kernel.

    ``kernel_factory`` is a function that takes a radius and returns a
    corresponding kernel function. The kernel function must accept an array
    of distances (>= 0) and return the corresponding kernel values. The
    kernel function must be normalized (a distance of 0 must yield a value
    of 1) and it should be zero for distances greater than ``radius``.

    Neighbor lookup is done using an instance of
    :py:class:`sklearn.neighbors.NearestNeighbors`, constructed with the
    default options. You can pass an instance that is configured to suit
    your data via the ``neighbors`` parameter.

    By default, computations are parallelized according to the number of
    available CPUs. Set ``num_jobs`` to a specific number to use more or
    fewer parallel processes.

    Returns two arrays. The first contains the predicted value for the
    corresponding target point, and the second contains the target point's
    degree of membership (a float between 0 and 1).
    """
    # Accept lists as inputs
    points = np.array(points)
    values = np.array(values)
    if points.shape[0] != values.shape[0]:
        raise ValueError('The numbers of points and values must match.')
    targets = np.array(targets)
    if neighbors is None:
        neighbors = sklearn.neighbors.NearestNeighbors(radius=radius)
    neighbors.fit(points)
    kernel = kernel_factory(radius)
    num_jobs = min(num_jobs or multiprocessing.cpu_count(), targets.shape[0])
    tasks = np.array_split(targets, num_jobs)
    values = _map(_worker, tasks, (neighbors, values, kernel))
    predictions = np.concatenate([v[0] for v in values])
    membership = np.concatenate([v[1] for v in values])
    return predictions, membership
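# A minimal usage sketch for clusterpolate(), based only on the docstring
# above. The random example data are illustrative assumptions, and the
# function's module-level helpers (bump, _map, _worker) are assumed to be
# in scope.
import numpy as np

example_rng = np.random.default_rng(0)
points = example_rng.uniform(0, 10, size=(200, 2))     # known data points
values = np.sin(points[:, 0]) + np.cos(points[:, 1])   # values at those points
targets = example_rng.uniform(0, 10, size=(400, 2))    # where to clusterpolate

predictions, membership = clusterpolate(points, values, targets, radius=2.0)
# predictions[i] is the estimated value at targets[i]; membership[i] in [0, 1]
# indicates how strongly targets[i] is covered by nearby data points.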
import gensim
import numpy as np
import sklearn.cluster
import sklearn.neighbors


def word2vec_model(unique_text: set, min_count: int = 1, window: int = 5,
                   n_neighbors: int = 2, min_samples: int = 5,
                   verbose: bool = False):
    """
    Returns a word2vec model, a DBSCAN instance, the cluster labels, the
    per-cluster word arrays, the number of clusters, and the amount of noise.
    """
    # Note: the `size` argument was renamed to `vector_size` in gensim 4.x.
    model = gensim.models.Word2Vec([unique_text], min_count=min_count,
                                   size=len(unique_text), window=window)
    vec = model.wv.vectors

    # Estimate epsilon from the distribution of nearest-neighbour distances
    neighbors = sklearn.neighbors.NearestNeighbors(n_neighbors=n_neighbors)
    nbrs = neighbors.fit(vec)
    distances, _ = nbrs.kneighbors(vec)
    distances = np.sort(distances[:, 1], axis=0)
    epsilon = np.average(distances[:len(distances) // 2])

    # Run cluster detection
    db = sklearn.cluster.DBSCAN(eps=epsilon, min_samples=min_samples).fit(vec)
    clusters = db.labels_
    n_clusters = len(set(clusters)) - (-1 in clusters)
    n_noise = list(clusters).count(-1)
    word_clusters = [
        np.array(list(unique_text))[clusters == i] for i in range(n_clusters)
    ]
    if verbose:
        print(f"Clusters: {n_clusters} | Noise: {n_noise}")
    return model, db, clusters, word_clusters, n_clusters, n_noise
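# A hedged usage sketch for word2vec_model(). The token set below is an
# illustrative assumption; with such a tiny vocabulary DBSCAN may well label
# everything as noise, so this only demonstrates the call signature and the
# returned values.
tokens = {"alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta"}
model, db, clusters, word_clusters, n_clusters, n_noise = word2vec_model(
    tokens, min_count=1, window=5, n_neighbors=2, min_samples=2, verbose=True)
print(n_clusters, n_noise)
print(word_clusters)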
def _compute_exclusivity(loc_proto_lat, loc_proto_lon, locs_proto, df_i, file_name_out):
    neigh = NearestNeighbors(radius=200, metric=spherical_distance)
    neigh.fit(locs_proto)
    exclusivity = []
    for i in range(len(loc_proto_lat)):
        loc_x = loc_proto_lat[i]
        loc_y = loc_proto_lon[i]
        sup_loc = df_i.iloc[i]["support"]
        rng = neigh.radius_neighbors([[loc_x, loc_y]])
        nei_index = list(rng[1][0])
        support_list = df_i.iloc[nei_index]["support"]
        tot_sup = sum(support_list)
        exclusivity.append(sup_loc / tot_sup)
    df_c = pd.DataFrame(exclusivity, columns=['exclusivity'])
    df_c.to_csv(file_name_out + "_coll.csv", mode="w", index=False)
    return df_c
def _compute_rev_centrality(locs_proto, df_c, file_name_out):
    neigh = NearestNeighbors(n_neighbors=25, metric=spherical_distance)
    neigh.fit(locs_proto)
    n_neigh = [1, 3, 5, 8, 10, 20]
    rev_centrality = []
    # kneighbors is queried on the fitted points themselves, so column 0 of
    # `distances` is each point's distance to itself (0); column n is the
    # distance to its n-th nearest other location.
    distances, _ = neigh.kneighbors(locs_proto)
    for n in n_neigh:
        r = []
        for d in distances:
            r.append(d[n])
        rev_centrality.append(r)
    df_c = df_c.assign(rev_centrality1=rev_centrality[0], rev_centrality3=rev_centrality[1],
                       rev_centrality5=rev_centrality[2], rev_centrality8=rev_centrality[3],
                       rev_centrality10=rev_centrality[4], rev_centrality20=rev_centrality[5])
    df_c.to_csv(file_name_out + "_coll.csv", mode="w", index=False)
    return df_c
from sklearn import tree, svm, neighbors
from sklearn.naive_bayes import GaussianNB

tree = tree.DecisionTreeClassifier()
bayes = GaussianNB()
neighbors = neighbors.KNeighborsClassifier()
supportvc = svm.SVC()

# Train your data with Decision Tree
# http://scikit-learn.org/stable/modules/tree.html
tree = tree.fit(train_X, train_Y)

# Train your data with Bayes
# http://scikit-learn.org/stable/modules/naive_bayes.html
bayes = bayes.fit(train_X, train_Y)

# Train your data with k neighbors
# http://scikit-learn.org/stable/modules/neighbors.html
neighbors = neighbors.fit(train_X, train_Y)

# Train your data with Support Vector classifier
# http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
supportvc = supportvc.fit(train_X, train_Y)

test_X = [[150, 40, 30], [176, 69, 43], [188, 92, 48], [184, 84, 44], [183, 83, 44],
          [166, 47, 36], [170, 60, 38], [172, 64, 39], [182, 80, 42], [180, 80, 43]]
test_Y = ['female', 'male', 'male', 'male', 'male', 'female', 'female',
          'female', 'male', 'male']

tree_prediction = tree.predict(test_X)
bayes_prediction = bayes.predict(test_X)
neighbors_prediction = neighbors.predict(test_X)
supportvc_prediction = supportvc.predict(test_X)
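# A possible follow-up (not in the original snippet): compare the four
# classifiers on the held-out test set with sklearn's accuracy_score.
from sklearn.metrics import accuracy_score

for name, prediction in [("DecisionTree", tree_prediction),
                         ("GaussianNB", bayes_prediction),
                         ("KNeighbors", neighbors_prediction),
                         ("SVC", supportvc_prediction)]:
    print("{}: {:.2f}".format(name, accuracy_score(test_Y, prediction)))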
import numpy as num
from sklearn import neighbors
from sklearn.model_selection import train_test_split

print(classLabelArray.shape)
print(dataValues.shape)
transposedataValues = dataValues.T
training_accuracy = []
testing_accuracy = []

# Splitting the dataset as 75% Training and 25% Testing
trainX, testX, trainY, testY = train_test_split(transposedataValues, classLabelArray,
                                                test_size=0.25)

# Candidate values of K for the accuracy sweep further below
neighbors_setting = range(1, 12)

# Array creation for K = 3
predictedArray = []

# Fitting the data
neighbors = neighbors.KNeighborsClassifier(n_neighbors=3, weights='uniform')
neighbors.fit(trainX, trainY)

# Predicting the data, one test sample at a time
print("KNN for K = 3")
for i in range(testX.shape[0]):
    testdatafetching = num.asarray(testX[i, :])
    testdatafetching = num.reshape(testdatafetching, (1, -1))
    pred = neighbors.predict(testdatafetching)
    predictedArray.insert(i, pred[0])
    print(pred)

print("Test Accuracy: {:.2f}".format(neighbors.score(testX, testY)))
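# A sketch (an assumption, reconstructing what training_accuracy,
# testing_accuracy and neighbors_setting above appear to be intended for):
# sweep K over neighbors_setting and record train / test accuracy.
from sklearn.neighbors import KNeighborsClassifier

for k in neighbors_setting:
    clf = KNeighborsClassifier(n_neighbors=k, weights='uniform')
    clf.fit(trainX, trainY)
    training_accuracy.append(clf.score(trainX, trainY))
    testing_accuracy.append(clf.score(testX, testY))

print("Best K on the test split:",
      list(neighbors_setting)[int(num.argmax(testing_accuracy))])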
def geographic_charac(df_poi, df_c, file_name_out):
    loc_proto_lat = df_c["loc_proto_lat"]
    loc_proto_lon = df_c["loc_proto_lon"]
    locs_proto = [list(x) for x in zip(loc_proto_lat, loc_proto_lon)]
    poi_lat = df_poi["lat"]
    poi_lon = df_poi["lon"]
    poi_coords = [list(x) for x in zip(poi_lat, poi_lon)]

    # get category list
    categories = [
        "gas", "parking", "pier", "hotel", "food", "leisure", "shop",
        "service", "supermarket"
    ]

    print("centrality poi")
    # count the number of POIs of each category within a radius of 500
    centrality_poi = []
    neigh = NearestNeighbors(radius=500, metric=spherical_distance)
    neigh.fit(poi_coords)
    for i in range(len(loc_proto_lat)):
        loc_x = loc_proto_lat[i]
        loc_y = loc_proto_lon[i]
        rng = neigh.radius_neighbors([[loc_x, loc_y]])
        nei_index = list(rng[1][0])
        count_c = dict.fromkeys(categories, 0)
        for j in nei_index:
            p_c = df_poi.iloc[j]["category"]
            count_c[p_c] += 1
        centrality_poi.append(list(count_c.values()))
    df_g1 = pd.DataFrame(centrality_poi, columns=["n_" + c for c in categories])
    df_g1.to_csv(file_name_out + "_geo.csv", mode="w", index=False)

    print("count_nearest_neighbour")
    # count how many POIs of each category appear among the 30 nearest neighbours
    knei_poi = []
    neigh = NearestNeighbors(n_neighbors=30, metric=spherical_distance)
    neigh.fit(poi_coords)
    distances, indices = neigh.kneighbors(locs_proto)
    # for each location, `indices` holds the indices of its 30 nearest POIs
    for nei_index in indices:
        count_c = dict.fromkeys(categories, 0)
        for j in nei_index:
            p_c = df_poi.iloc[j]["category"]
            count_c[p_c] += 1
        knei_poi.append(list(count_c.values()))
    df_g2 = pd.DataFrame(knei_poi, columns=["k_" + c for c in categories])
    df_g = pd.concat([df_g1, df_g2], axis=1)
    df_g.to_csv(file_name_out + "_geo.csv", mode="w", index=False)

    print("dist_nearest_neighbour")
    # take the minimum distance per category among the 30 nearest neighbours
    dist_poi = []
    for i, d in zip(indices, distances):
        dist_c = dict.fromkeys(categories, 10000)
        for j in range(len(d)):
            p_c = df_poi.iloc[i[j]]["category"]
            if d[j] < dist_c[p_c]:
                dist_c[p_c] = d[j]
        dist_poi.append(list(dist_c.values()))
    df_g3 = pd.DataFrame(dist_poi, columns=["d_" + c for c in categories])
    df_g = pd.concat([df_g, df_g3], axis=1)
    df_g.to_csv(file_name_out + "_geo.csv", mode="w", index=False)
    return df_g
from sklearn import tree, neighbors, ensemble

# [height, weight, shoe size]
X = [[181, 80, 44], [177, 70, 43], [160, 60, 38], [154, 54, 37], [166, 65, 40],
     [190, 90, 47], [175, 64, 39], [177, 70, 40], [159, 55, 39], [171, 75, 42],
     [181, 85, 43]]

# gender
Y = ['male', 'female', 'female', 'female', 'male', 'male', 'male', 'female',
     'male', 'female', 'male']

# create variables with classifiers
tree = tree.DecisionTreeClassifier()
neighbors = neighbors.KNeighborsClassifier()
randomForest = ensemble.RandomForestClassifier()

# fit them
tree = tree.fit(X, Y)
neighbors = neighbors.fit(X, Y)
randomForest = randomForest.fit(X, Y)

# create prediction variables for results
predictionForTree = tree.predict([[167, 63, 41]])
predictionForNeighbors = neighbors.predict([[167, 63, 41]])
predictionForRandomForest = randomForest.predict([[167, 63, 41]])

# print all the results
print("DecisionTree: ", predictionForTree)
print("NearestNeighbors: ", predictionForNeighbors)
print("RandomForest: ", predictionForRandomForest)