# Assumed imports for the helper functions in this listing; `spherical_distance`
# is a project-specific metric (a haversine sketch follows the function below).
import pandas as pd
from sklearn.neighbors import NearestNeighbors


def _compute_centrality(loc_proto_lat, loc_proto_lon, locs_proto, df_c,
                        file_name_out):
    # count, for each prototype location, how many other prototype
    # locations fall within each search radius (in metres)
    distances = [1000, 5000, 15000]
    centrality = []
    for d in distances:
        c = []

        neigh = NearestNeighbors(radius=d, metric=spherical_distance)
        neigh.fit(locs_proto)

        for i in range(len(loc_proto_lat)):
            loc_x = loc_proto_lat[i]
            loc_y = loc_proto_lon[i]

            # radius_neighbors returns (distances, indices); keep the indices
            rng = neigh.radius_neighbors([[loc_x, loc_y]])
            nei_index = list(rng[1][0])

            c.append(len(nei_index))

        centrality.append(c)

    df_c = df_c.assign(centrality1K=centrality[0],
                       centrality5K=centrality[1],
                       centrality15K=centrality[2])

    df_c.to_csv(file_name_out + "_coll.csv", mode="w", index=False)

    return df_c
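spherical_distance is not defined in this listing; the functions here assume a great-circle metric that returns distances in metres, which is why radii such as 200, 500 and 1000 appear below. A minimal haversine sketch of such a metric (an assumption, not the original implementation):

import math

def spherical_distance(a, b):
    # a and b are [lat, lon] pairs in degrees; returns the great-circle
    # distance in metres, assuming an Earth radius of ~6,371 km
    lat1, lon1, lat2, lon2 = map(math.radians, (a[0], a[1], b[0], b[1]))
    h = (math.sin((lat2 - lat1) / 2) ** 2
         + math.cos(lat1) * math.cos(lat2) * math.sin((lon2 - lon1) / 2) ** 2)
    return 2 * 6371000 * math.asin(math.sqrt(h))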
Example #2
import multiprocessing

import numpy as np
import sklearn.neighbors


def clusterpolate(points, values, targets, radius=1, kernel_factory=bump,
                  neighbors=None, num_jobs=None):
    """
    Clusterpolate data.

    ``points`` (array-like) are the data points and ``values``
    (array-like) are the associated values. ``targets`` (array-like) are
    the points at which the data should be clusterpolated.

    ``radius`` (float) is the radius of each data point's kernel.

    ``kernel_factory`` is a function that takes a radius and returns
    a corresponding kernel function. The kernel function must accept
    an array of distances (>= 0) and return the corresponding kernel
    values. The kernel function must be normalized (a distance of 0
    must yield a value of 1) and it should be zero for distances greater
    than ``radius``.

    Neighbor lookup is done using an instance of
    :py:class:`sklearn.neighbors.NearestNeighbors`, constructed with
    the default options. You can pass an instance that is configured
    to suit your data via the ``neighbors`` parameter.

    By default, computations are parallelized according to the number
    of available CPUs. Set ``num_jobs`` to a specific number to use
    more or fewer parallel processes.

    Returns two arrays: the first contains the predicted value for each
    target point, and the second contains each target point's degree of
    membership (a float between 0 and 1).
    """
    # Accept lists as inputs
    points = np.array(points)
    values = np.array(values)
    if points.shape[0] != values.shape[0]:
        raise ValueError('The numbers of points and values must match.')
    targets = np.array(targets)

    if neighbors is None:
        neighbors = sklearn.neighbors.NearestNeighbors(radius=radius)
    neighbors.fit(points)
    kernel = kernel_factory(radius)

    num_jobs = min(num_jobs or multiprocessing.cpu_count(), targets.shape[0])
    tasks = np.array_split(targets, num_jobs)
    values = _map(_worker, tasks, (neighbors, values, kernel))
    predictions = np.concatenate([v[0] for v in values])
    membership = np.concatenate([v[1] for v in values])
    return predictions, membership
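The default bump kernel factory and the internal _map/_worker helpers are not part of this excerpt. A minimal kernel factory that satisfies the contract described in the docstring (a value of 1 at distance 0, 0 at and beyond the radius), followed by a hypothetical call, might look like this (note that bump must be defined before clusterpolate, since default arguments are evaluated at definition time):

def bump(radius):
    # smooth, compactly supported kernel: 1 at distance 0, 0 for d >= radius
    def kernel(d):
        d = np.asarray(d, dtype=float)
        out = np.zeros_like(d)
        inside = d < radius
        x = d[inside] / radius
        out[inside] = np.exp(1.0 - 1.0 / (1.0 - x ** 2))
        return out
    return kernel

# hypothetical usage on random 2-D data
points = np.random.rand(100, 2)
values = np.random.rand(100)
targets = np.random.rand(20, 2)
predictions, membership = clusterpolate(points, values, targets, radius=0.2)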
Example #3
import gensim
import numpy as np
import sklearn.cluster
import sklearn.neighbors


def word2vec_model(unique_text: set,
                   min_count: int = 1,
                   window: int = 5,
                   n_neighbors: int = 2,
                   min_samples: int = 5,
                   verbose: bool = False):
    """
    Returns a word2vec model, a dbscan, clusters, the number of clusters, and the amount of noise
    """
    # the token set is treated as a single "sentence"; gensim < 4 uses
    # `size`, which gensim >= 4 renames to `vector_size`
    model = gensim.models.Word2Vec([unique_text],
                                   min_count=min_count,
                                   size=len(unique_text),
                                   window=window)
    vec = model.wv.vectors

    # Estimate DBSCAN's eps from the k-distance curve: column 0 of the
    # kneighbors distances is each point itself (distance 0), so use
    # column 1 and average the smaller half of the sorted distances
    neighbors = sklearn.neighbors.NearestNeighbors(n_neighbors=n_neighbors)
    nbrs = neighbors.fit(vec)
    distances, _ = nbrs.kneighbors(vec)
    distances = np.sort(distances[:, 1], axis=0)
    epsilon = np.average(distances[:len(distances) // 2])

    # Run cluster detection
    db = sklearn.cluster.DBSCAN(eps=epsilon, min_samples=min_samples).fit(vec)
    clusters = db.labels_
    n_clusters = len(set(clusters)) - (1 if -1 in clusters else 0)
    n_noise = list(clusters).count(-1)
    # Map cluster labels back to words; rows of model.wv.vectors follow the
    # model's vocabulary order (model.wv.index2word in gensim < 4)
    vocab = np.array(model.wv.index2word)
    word_clusters = [vocab[clusters == i] for i in range(n_clusters)]
    if verbose:
        print(f"Clusters: {n_clusters} | Noise: {n_noise}")

    return model, db, clusters, word_clusters, n_clusters, n_noise
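A hypothetical call on a small token set (parameter values chosen only for illustration):

tokens = {"alpha", "beta", "gamma", "delta", "epsilon", "zeta"}
model, db, clusters, word_clusters, n_clusters, n_noise = word2vec_model(
    tokens, min_samples=2, verbose=True)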
def _compute_exclusivity(loc_proto_lat, loc_proto_lon, locs_proto, df_i,
                         file_name_out):
    # exclusivity: a location's own support divided by the total support of
    # all prototype locations within a 200 m radius (including itself)
    neigh = NearestNeighbors(radius=200, metric=spherical_distance)
    neigh.fit(locs_proto)

    exclusivity = []

    for i in range(len(loc_proto_lat)):
        loc_x = loc_proto_lat[i]
        loc_y = loc_proto_lon[i]
        sup_loc = df_i.iloc[i]["support"]

        rng = neigh.radius_neighbors([[loc_x, loc_y]])
        nei_index = list(rng[1][0])
        support_list = df_i.iloc[nei_index]["support"]
        tot_sup = sum(support_list)

        exclusivity.append(sup_loc / tot_sup)

    df_c = pd.DataFrame(exclusivity, columns=['exclusivity'])

    df_c.to_csv(file_name_out + "_coll.csv", mode="w", index=False)

    return df_c
def _compute_rev_centrality(locs_proto, df_c, file_name_out):
    neigh = NearestNeighbors(n_neighbors=25, metric=spherical_distance)
    neigh.fit(locs_proto)

    # distance to the n-th nearest prototype location, for several n
    # (index 0 of each kneighbors row is the point itself, at distance 0)
    n_neigh = [1, 3, 5, 8, 10, 20]
    rev_centrality = []
    distances, _ = neigh.kneighbors(locs_proto)

    for n in n_neigh:
        r = []
        for d in distances:
            r.append(d[n])
        rev_centrality.append(r)

    df_c = df_c.assign(rev_centrality1=rev_centrality[0],
                       rev_centrality3=rev_centrality[1],
                       rev_centrality5=rev_centrality[2],
                       rev_centrality8=rev_centrality[3],
                       rev_centrality10=rev_centrality[4],
                       rev_centrality20=rev_centrality[5])

    df_c.to_csv(file_name_out + "_coll.csv", mode="w", index=False)

    return df_c
Example #6
from sklearn import neighbors, svm, tree
from sklearn.naive_bayes import GaussianNB

# train_X and train_Y are assumed to be defined earlier (not shown here)
tree = tree.DecisionTreeClassifier()
bayes = GaussianNB()
neighbors = neighbors.KNeighborsClassifier()
supportvc = svm.SVC()

# Train your data with Decision Tree 
# http://scikit-learn.org/stable/modules/tree.html
tree = tree.fit(train_X, train_Y)

# Train your data with Bayes
# http://scikit-learn.org/stable/modules/naive_bayes.html
bayes = bayes.fit(train_X, train_Y)

# Train your data with k neighbors 
# http://scikit-learn.org/stable/modules/neighbors.html
neighbors = neighbors.fit(train_X, train_Y)

# Train your data with Support Vector classifier
# http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
supportvc = supportvc.fit(train_X, train_Y)

test_X = [[150, 40, 30], [176, 69, 43], [188, 92, 48], [184, 84, 44],
          [183, 83, 44], [166, 47, 36], [170, 60, 38], [172, 64, 39],
          [182, 80, 42], [180, 80, 43]]
test_Y = ['female', 'male', 'male', 'male', 'male', 'female', 'female',
          'female', 'male', 'male']

tree_prediction = tree.predict(test_X)
bayes_prediction = bayes.predict(test_X)
neighbors_prediction = neighbors.predict(test_X)
supportvc_prediction = supportvc.predict(test_X)
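To compare the four classifiers, the predictions can be scored against test_Y; a minimal sketch using sklearn.metrics.accuracy_score:

from sklearn.metrics import accuracy_score

# compare the four classifiers on the held-out test set
for name, pred in [("DecisionTree", tree_prediction),
                   ("GaussianNB", bayes_prediction),
                   ("KNeighbors", neighbors_prediction),
                   ("SVC", supportvc_prediction)]:
    print("{}: {:.2f}".format(name, accuracy_score(test_Y, pred)))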
Example #7
import numpy as num
from sklearn import neighbors
from sklearn.model_selection import train_test_split

# classLabelArray and dataValues are assumed to be loaded earlier (not shown)
print(classLabelArray.shape)
print(dataValues.shape)
transposedataValues = dataValues.T
training_accuracy = []
testing_accuracy = []

# Splitting the dataset as 75% Training and 25% Testing

trainX, testX, trainY, testY = train_test_split(transposedataValues,
                                                classLabelArray,
                                                test_size=0.25)

# Array Creation for K = 3
predictedArray = []

# Fitting the Data
neighbors = neighbors.KNeighborsClassifier(n_neighbors=3, weights='uniform')
neighbors.fit(trainX, trainY)
# Predicting the Data, one test sample at a time
print("KNN for K = 3")
for i in range(testX.shape[0]):
    testdatafetching = num.asarray(testX[i, :])
    testdatafetching = num.reshape(testdatafetching, (1, -1))
    pred = neighbors.predict(testdatafetching)
    predictedArray.insert(i, pred[0])
    print(pred)

print("Test Accuracy: {:.2f}".format(neighbors.score(testX, testY)))
def geographic_charac(df_poi, df_c, file_name_out):

    loc_proto_lat = df_c["loc_proto_lat"]
    loc_proto_lon = df_c["loc_proto_lon"]
    locs_proto = [list(x) for x in zip(loc_proto_lat, loc_proto_lon)]

    poi_lat = df_poi["lat"]
    poi_lon = df_poi["lon"]
    poi_coords = [list(x) for x in zip(poi_lat, poi_lon)]

    # get category list
    categories = [
        "gas", "parking", "pier", "hotel", "food", "leisure", "shop",
        "service", "supermarket"
    ]

    print("centrality poi")
    # count the number of POIs of each category within a 500 m radius
    centrality_poi = []
    neigh = NearestNeighbors(radius=500, metric=spherical_distance)
    neigh.fit(poi_coords)

    for i in range(len(loc_proto_lat)):
        loc_x = loc_proto_lat[i]
        loc_y = loc_proto_lon[i]

        rng = neigh.radius_neighbors([[loc_x, loc_y]])
        nei_index = list(rng[1][0])

        count_c = dict.fromkeys(categories, 0)

        for j in nei_index:
            p_c = df_poi.iloc[j]["category"]

            count_c[p_c] += 1
        centrality_poi.append(list(count_c.values()))

    df_g1 = pd.DataFrame(centrality_poi,
                         columns=["n_" + c for c in categories])

    df_g1.to_csv(file_name_out + "_geo.csv", mode="w", index=False)

    print("count_nearest_neighbour")
    knei_poi = []
    # count how many POIs of each category appear among the 30 nearest neighbours
    neigh = NearestNeighbors(n_neighbors=30, metric=spherical_distance)
    neigh.fit(poi_coords)

    distances, indices = neigh.kneighbors(locs_proto)
    # for each location, take the indices of its 30 nearest neighbours
    for nei_index in indices:
        count_c = dict.fromkeys(categories, 0)

        for j in nei_index:
            p_c = df_poi.iloc[j]["category"]

            count_c[p_c] += 1

        knei_poi.append(list(count_c.values()))

    df_g2 = pd.DataFrame(knei_poi, columns=["k_" + c for c in categories])
    df_g = pd.concat([df_g1, df_g2], axis=1)

    df_g.to_csv(file_name_out + "_geo.csv", mode="w", index=False)

    print("dist_nearest_neighbour")
    dist_poi = []
    # take the minimum distance to each category among the 30 nearest
    # neighbours (default 10000 m if a category does not appear)
    for i, d in zip(indices, distances):
        dist_c = dict.fromkeys(categories, 10000)

        for j in range(len(d)):
            p_c = df_poi.iloc[i[j]]["category"]

            if d[j] < dist_c[p_c]:
                dist_c[p_c] = d[j]

        dist_poi.append(list(dist_c.values()))

    df_g3 = pd.DataFrame(dist_poi, columns=["d_" + c for c in categories])
    df_g = pd.concat([df_g, df_g3], axis=1)

    df_g.to_csv(file_name_out + "_geo.csv", mode="w", index=False)

    return df_g
from sklearn import ensemble, neighbors, tree

# [height, weight, shoe size]
X = [[181, 80, 44], [177, 70, 43], [160, 60, 38], [154, 54, 37], [166, 65, 40],
     [190, 90, 47], [175, 64, 39], [177, 70, 40], [159, 55, 39], [171, 75, 42],
     [181, 85, 43]]

#gender
Y = [
    'male', 'female', 'female', 'female', 'male', 'male', 'male', 'female',
    'male', 'female', 'male'
]

#create variables with classifiers
tree = tree.DecisionTreeClassifier()
neighbors = neighbors.KNeighborsClassifier()
randomForest = ensemble.RandomForestClassifier()

#fit them
tree = tree.fit(X, Y)
neighbors = neighbors.fit(X, Y)
randomForest = randomForest.fit(X, Y)

#create prediction variables for results
predictionForTree = tree.predict([[167, 63, 41]])
predictionForNeighbors = neighbors.predict([[167, 63, 41]])
predictionForRandomForest = randomForest.predict([[167, 63, 41]])

#print all the results
print("DecisionTree: ", predictionForTree)
print("NearestNeighbors: ", predictionForNeighbors)
print("RandomForest: ", predictionForRandomForest)