def iterateSpots():
    """Yield one KML result per connected group of mutually nearby locations.

    For every location id produced by
    ``locationsForUsIterator(minUniqueUsersCheckedInTheLocation)``, looks up
    the documents in ``locationsCollection`` whose ``'l'`` field falls inside
    a ``$center`` circle around that location (radius ``radiusInMiles``,
    converted with ``convertMilesToRadians``) and links each hit to the
    source id in an undirected graph. Connected components with at least
    ``minimumLocationsPerSpot`` members are handed to ``getKMLForCluster``.

    Yields:
        The return value of ``getKMLForCluster`` for each qualifying
        component.
    """
    proximityGraph = nx.Graph()
    for sourceLid in locationsForUsIterator(minUniqueUsersCheckedInTheLocation):
        # Geo query: everything within the circle centred on this location.
        # NOTE(review): presumably a MongoDB-style collection -- confirm.
        withinCircleQuery = {
            "l": {
                "$within": {
                    "$center": [
                        getLocationFromLid(sourceLid),
                        convertMilesToRadians(radiusInMiles),
                    ]
                }
            }
        }
        for neighbour in locationsCollection.find(withinCircleQuery):
            proximityGraph.add_edge(neighbour['_id'], sourceLid)
    for component in nx.connected_components(proximityGraph):
        # Components that are too small do not count as a spot.
        if len(component) < minimumLocationsPerSpot:
            continue
        yield getKMLForCluster(component)
def iterateSpots():
    """Yield one KML result per MCL cluster of closely spaced locations.

    Builds an undirected graph from the pair records returned by
    ``locationToLocationCollection.find()``. A pair becomes an edge when both
    of its location ids are among those produced by
    ``locationsForUsIterator(minUniqueUsersCheckedInTheLocation)`` and its
    ``'d'`` value is at most ``graphNodesDistanceInMiles``; the pair's ``'u'``
    value is stored on the edge as attribute ``'w'``.

    Every connected component with at least ``minimumLocationsPerSpot`` nodes
    is clustered with ``clusterUsingMCLClustering(..., inflation=20)``. A line
    with the component's node count, edge count and number of clusters is
    printed, and each cluster that also meets the size threshold is passed to
    ``getKMLForCluster``.

    Yields:
        The return value of ``getKMLForCluster`` for each qualifying cluster.
    """
    # set() consumes the iterator directly; no intermediate list is needed.
    locationsToCheck = set(locationsForUsIterator(minUniqueUsersCheckedInTheLocation))
    graph = nx.Graph()
    for e in locationToLocationCollection.find():
        # '_id' is whitespace separated: the first two tokens form one
        # location id and the remaining tokens form the other.
        idTokens = e['_id'].split()
        l1, l2 = ' '.join(idTokens[:2]), ' '.join(idTokens[2:])
        if l1 in locationsToCheck and l2 in locationsToCheck and e['d'] <= graphNodesDistanceInMiles:
            # Node names carry '_' instead of ' '; the spaces are restored
            # before the ids are handed to getKMLForCluster below.
            # NOTE(review): presumably the clustering step needs ids without
            # spaces -- confirm.
            # The attribute is passed as a keyword because a positional
            # attribute dict is only accepted by the networkx 1.x signature.
            graph.add_edge(l1.replace(' ', '_'), l2.replace(' ', '_'), w=e['u'])
    for locations in nx.connected_components(graph):
        if len(locations) < minimumLocationsPerSpot:
            continue
        # Build the component subgraph once and read its size before
        # clustering, so the logged counts describe the graph as it was
        # handed to the clustering step.
        component = graph.subgraph(locations)
        nodeCount = component.number_of_nodes()
        edgeCount = component.number_of_edges()
        clusters = clusterUsingMCLClustering(component, inflation=20)
        # Single pre-formatted argument: same output as the original
        # space-separated print statement, and valid on Python 2 and 3.
        print('%s %s %s' % (nodeCount, edgeCount, len(clusters)))
        for cluster in clusters:
            if len(cluster) >= minimumLocationsPerSpot:
                yield getKMLForCluster([c.replace('_', ' ') for c in cluster])
return dict((u['user'], dict(sorted(u['locations'].iteritems(), key=itemgetter(1), reverse=True)[:10000])) for u in filteredUserIterator(minLocationsTheUserHasCheckedin, minUniqueUsersCheckedInTheLocation, fullRecord = True)) def getDayBlockMeansForClusters(users, userClusterMap): completeDayBlockDistribution = defaultdict(list) for user in users: dayBlockDistributionForUser = [] for day in users[user]: dayBlockDistributionForUser+=[int(dayBlock) for dayBlock in users[user][day] for i in range(users[user][day][dayBlock])] completeDayBlockDistribution[userClusterMap[user]]+=dayBlockDistributionForUser return [(k, np.mean(completeDayBlockDistribution[k]), np.std(completeDayBlockDistribution[k])) for k in completeDayBlockDistribution] def getAverageDistanceBetweenClusters(meanDayblockValues): return np.mean([np.abs(m1-m2) for m1, m2 in combinations(meanDayblockValues,2)]) userVectors = getUserVectors() locationsInUS = set(list(locationsForUsIterator(minUniqueUsersCheckedInTheLocation))) def clusterLocation(location): dimensions = defaultdict(int) for u in location['users']: for lid in userVectors[u]: dimensions[lid]+=1 dimensions = [d for d in dimensions if dimensions[d]>=2] userVectorsToCluster = [(u, ' '.join([l.replace(' ', '_') for l in userVectors[u] if l in dimensions for j in range(userVectors[u][l])])) for u in location['users']] resultsForVaryingK = [] for k in range(2,6): try: cluster = KMeansClustering(userVectorsToCluster, k).cluster() userClusterMap = dict((k1,v) for k1,v in zip(location['users'], cluster)) dayBlockMeansForClusters = getDayBlockMeansForClusters(location['users'], userClusterMap) userClusterMap = dict([(str(k2), v) for k2, v in userClusterMap.iteritems()]) resultsForVaryingK.append([k, userClusterMap, zip(*dayBlockMeansForClusters)[1:], getAverageDistanceBetweenClusters(zip(*dayBlockMeansForClusters)[1])])