def getStats(spotsFile, userToLocationVector):
    """Compute per-spot assignment-error statistics for users assigned to spots.

    Reads the spots-with-users file (``spotsFile + "_users"``), maps every lid
    and every user to its spot, then for each user expands their location
    counts into per-checkin spot assignments and measures how many of those
    assignments disagree with the spot the user was actually placed in.

    NOTE(review): the reported ``accuracy`` is the mean of per-spot *error*
    rates (wrong / total) — the name looks inverted; confirm with callers.

    Args:
        spotsFile: path prefix; the input file is ``spotsFile + "_users"``.
        userToLocationVector: iterable of dicts with keys ``user`` and
            ``locations`` (a lid -> checkin-count mapping).

    Returns:
        dict with keys ``accuracy``, ``total_locations``, ``total_users``.

    Raises:
        ValueError: if a user appears more than once in userToLocationVector.
    """
    lidToSpotIdMap, userToSpotIdMap, spotMap = {}, {}, defaultdict(dict)
    spotsWithUsersFile = spotsFile + "_users"
    # Index every lid and every user by the spot it belongs to.
    for spot in FileIO.iterateJsonFromFile(spotsWithUsersFile):
        for location, _ in spot["lids"]:
            lidToSpotIdMap[getLidFromLocation(location)] = spot["id"]
        for user in spot["users"]:
            userToSpotIdMap[user] = spot["id"]
    observedUsers = set()
    for userVector in userToLocationVector:
        user = userVector["user"]
        # Was a bare assert; raise explicitly so the check survives `python -O`.
        if user in observedUsers:
            raise ValueError("duplicate user in userToLocationVector: %s" % user)
        if user in userToSpotIdMap:
            # Expand each known lid into `count` copies of the spot it maps to.
            assignment = [
                [lidToSpotIdMap[lid]] * userVector["locations"][lid]
                for lid in userVector["locations"] if lid in lidToSpotIdMap
            ]
            spotMap[userToSpotIdMap[user]][user] = [item for t in assignment for item in t]
        observedUsers.add(user)
    accuracyList = []
    for spotId, userMap in spotMap.iteritems():
        totalAssignments, wrongAssignments = 0.0, 0.0
        for user in userMap:
            for a in userMap[user]:
                if a != spotId:
                    wrongAssignments += 1
                totalAssignments += 1
        # Skip spots whose users contributed no assignments at all
        # (previously a ZeroDivisionError).
        if totalAssignments:
            accuracyList.append(wrongAssignments / totalAssignments)
    return {
        "accuracy": np.mean(accuracyList),
        "total_locations": len(lidToSpotIdMap),
        "total_users": len(userToSpotIdMap),
    }
def mapper(self, key, line):
    """Emit (checkin-record, 1) for every checkin inside the global boundary.

    The parsed record is stripped of its '_id', its 't' field is converted to
    a UNIX timestamp, and exact ('lid') and lattice ('llid', accuracy 0.015)
    location ids are attached before emission.
    """
    record = parseData(line)
    # Guard clause: skip unparseable lines and checkins outside the boundary.
    if not record or not isWithinBoundingBox(record['l'], boundary):
        return
    del record['_id']
    record['t'] = time.mktime(record['t'].timetuple())
    record['lid'] = getLidFromLocation(record['l'])
    record['llid'] = getLatticeLid(record['l'], accuracy=0.015)
    yield record, 1
def addVenuesMetaToDB(): i = 0 for data in open(venuesFile): data = data.strip().split('\t') # print data[10].replace('\\', ''), data[11].replace('\\', '') try: venuesMetaDataCollection.insert({'_id': getLidFromLocation([float(data[2]), float(data[3])]), 'c': data[10].replace('\\', ''), 't':data[11].replace('\\', '') }) except Exception as e: print i, 'Exception while processing:', data; i+=1
def clusterSpot(spot):
    # Cluster a spot's users by their visited-location "documents" and print
    # the per-cluster day-block means for k = 2..5.
    # NOTE(review): `resultsForVaryingK` is never populated and nothing is
    # returned — this looks like exploratory/debug code; confirm before reuse.
    # The lids that belong to this spot form the clustering dimensions.
    dimensions = [getLidFromLocation(l) for l, n in spot['lids']]
    # One space-separated pseudo-document per user: each in-spot lid (spaces
    # replaced by '_') repeated once per visit.
    # assumes module-level `userVectors` maps user -> {lid: count} — TODO confirm
    userVectorsToCluster = [(u, ' '.join([l.replace(' ', '_') for l in userVectors[u] if l in dimensions for j in range(userVectors[u][l])])) for u in spot['users']]
    resultsForVaryingK = []
    for k in range(2,6):
        # try:
        cluster = KMeansClustering(userVectorsToCluster, k).cluster()
        print '$$$$$$$$$$', cluster
        # Pair each user with its cluster label; `k1` avoids shadowing loop var `k`.
        userClusterMap = dict((k1,v) for k1,v in zip(spot['users'], cluster))
        dayBlockMeansForClusters = getDayBlockMeansForClusters(spot['users'], userClusterMap)
        print dayBlockMeansForClusters
def assignUserToSpots(spotsFile, userToLocationVector): lidToSpotIdMap, userDistributionInSpots, spotsWithUsersFile = {}, defaultdict(list), spotsFile + "_users" for spot in FileIO.iterateJsonFromFile(spotsFile): for location, _ in spot["spot"]: lidToSpotIdMap[getLidFromLocation(location)] = spot["id"] userDistributionInSpots[spot["id"]] = {"id": spot["id"], "lids": spot["spot"], "users": []} for userObject in userToLocationVector: userId, userVector = userObject["user"], userObject["locations"] for lid in userVector: if lid in lidToSpotIdMap: userDistributionInSpots[lidToSpotIdMap[lid]]["users"].append(userId) for spotId, object in userDistributionInSpots.iteritems(): print spotId FileIO.writeToFileAsJson(object, spotsWithUsersFile)
def plotHashtagSourcesOnMap(timeRange, outputFolder): i = 1 distribution = defaultdict(int) for hashtagObject in FileIO.iterateJsonFromFile(hashtagsFile%(outputFolder,'%s_%s'%timeRange)): occuranesInHighestActiveRegion, isFirstActiveRegion = getOccuranesInHighestActiveRegion(hashtagObject, True) if occuranesInHighestActiveRegion: source, count = getSourceLattice(occuranesInHighestActiveRegion) print i, source;i+=1 distribution[getLidFromLocation(source)]+=1 # if i==10: break points, colors = zip(*[(getLocationFromLid(k),v) for k, v in sorted(distribution.iteritems(), key=itemgetter(1))]) cm = matplotlib.cm.get_cmap('Paired') sc = plotPointsOnWorldMap(points, c=colors, cmap=cm, lw = 0) plt.colorbar(sc) plt.show()
def writeUserDistributionInSpots(spotsFile, userToLocationVector):
    """Assign each user to the single spot that covers most of their checkins
    and append every spot record (with its assigned users) to
    ``spotsFile + "_users"``.
    """
    spotsWithUsersFile = spotsFile + "_users"
    lidToSpotIdMap = {}
    userDistributionInSpots = defaultdict(list)
    # Index every lid by its spot and pre-create the output record per spot.
    for spot in FileIO.iterateJsonFromFile(spotsFile):
        spotId = spot["id"]
        for location, _ in spot["spot"]:
            lidToSpotIdMap[getLidFromLocation(location)] = spotId
        userDistributionInSpots[spotId] = {"id": spotId, "lids": spot["spot"], "users": []}
    for userObject in userToLocationVector:
        userId = userObject["user"]
        userVector = userObject["locations"]
        # Tally how many of this user's checkins fall inside each spot.
        checkinsPerSpot = defaultdict(int)
        for lid, checkinCount in userVector.iteritems():
            if lid in lidToSpotIdMap:
                checkinsPerSpot[lidToSpotIdMap[lid]] += checkinCount
        if checkinsPerSpot:
            # Keep the original selection: sort ascending by count, take last.
            bestSpotId = sorted(checkinsPerSpot.iteritems(), key=itemgetter(1))[-1][0]
            userDistributionInSpots[bestSpotId]["users"].append(userId)
    for spotId, spotInfo in userDistributionInSpots.iteritems():
        FileIO.writeToFileAsJson(spotInfo, spotsWithUsersFile)
def mapper(self, key, line):
    """Yield (lid, userId) for every parseable checkin line."""
    checkin = parseData(line)
    if not checkin:
        return
    yield getLidFromLocation(checkin['l']), checkin['u']
def collectLocationsMapper(self, key, line):
    """Yield (userId, lid) for every parseable checkin line."""
    checkin = parseData(line)
    if checkin:
        lid = getLidFromLocation(checkin['l'])
        yield checkin['u'], lid
def mapper(self, key, line):
    """Yield (lid, "user_weekday_dayblock") for every parseable checkin line.

    The day is split into 4-hour blocks via hour/4 (integer division under
    Python 2), giving block indices 0-5.
    """
    checkin = parseData(line)
    if not checkin:
        return
    checkinTime = checkin['t']
    timeKey = [str(checkin['u']), str(checkinTime.weekday()), str(checkinTime.hour/4)]
    yield getLidFromLocation(checkin['l']), '_'.join(timeKey)
def addVenuesToDB(): i = 0 for data in open(venuesFile): data = data.strip().split('\t') try: venuesCollection.insert({'_id': int(data[0]), 'n': data[1], 'l': [float(data[2]), float(data[3])], 'lid': getLidFromLocation([float(data[2]), float(data[3])]), 'm':' '.join(data[4:-2]), 'tp': int(data[-2]), 'tc': int(data[-1])}) except Exception as e: print i, 'Exception while processing:', data; i+=1
def addCheckinsToDB(): i = 0 for data in open(checkinsFile): data = data.strip().split('\t') try: if len(data)!=7: data.append(None) if len(data) == 7: checkinsCollection.insert({'_id':int(data[1]), 'u': int(data[0]), 'l': [float(data[2]), float(data[3])], 'lid': getLidFromLocation([float(data[2]), float(data[3])]), 't': dateutil.parser.parse(data[4]), 'x': data[5], 'pid': data[6]}) except Exception as e: print i, 'Exception while processing:', data; i+=1