def iterateFrequentLocationsFromFIMahout(
    minLocationsTheUserHasCheckedin,
    minUniqueUsersCheckedInTheLocation,
    minCalculatedSupport,
    minLocationsInItemset=0,
    extraMinSupport=minSupport,
    yieldSupport=False,
    lids=False,
):
    # Parse Mahout frequent-itemset-mining (FIM) output and yield frequent
    # location itemsets whose support clears `extraMinSupport` and whose size
    # clears `minLocationsInItemset`, filtered to the US bounding box.
    # NOTE(review): the `extraMinSupport` default binds the module-level
    # `minSupport` once, at definition time — confirm that global exists and
    # that def-time capture is intended.
    # for line in FileIO.iterateLinesFromFile(locationsFIMahoutOutputFile%(minUserLocations, minCalculatedSupport)):
    for line in FileIO.iterateLinesFromFile(
        locationsFIMahoutOutputFile % (minLocationsTheUserHasCheckedin, minUniqueUsersCheckedInTheLocation, minCalculatedSupport)
    ):
        # Only "Key: ... Value: ..." lines carry itemsets; the value payload looks
        # like "([lid1 lid2 ...],support)" with '_' standing in for spaces in lids.
        if line.startswith("Key:"):
            data = line.split("Value: ")[1][1:-1].split(",")
            if not lids:
                # Decode every lid into a location point.
                locationItemset, support = (
                    [getLocationFromLid(i.replace("_", " ")) for i in data[0][1:-1].split()],
                    int(data[1]),
                )
            else:
                # Keep the raw lid strings.
                locationItemset, support = [i.replace("_", " ") for i in data[0][1:-1].split()], int(data[1])
            if support >= extraMinSupport and len(locationItemset) >= minLocationsInItemset:
                if not yieldSupport:
                    # Yield the itemset alone, restricted to US locations.
                    yield [location for location in locationItemset if isWithinBoundingBox(location, us_boundary)]
                else:
                    # NOTE(review): this branch re-applies getLocationFromLid, which is
                    # only correct when `lids=True` (items are still strings); with
                    # lids=False it would receive an already-decoded location —
                    # confirm yieldSupport is never combined with lids=False.
                    yield [
                        location
                        for location in locationItemset
                        if isWithinBoundingBox(getLocationFromLid(location), us_boundary)
                    ], support
def load_tuo_location_and_boundary_influence_score(model_id, hashtag_tag, boundary=[[-90,-180], [90, 180]], noOfInfluencers=None):
    # Build [(location, signed_global_influence_score)] for locations inside
    # `boundary`. The sign encodes the dominant influence direction: negative
    # when incoming influence dominates, positive when outgoing dominates.
    # Naming convention (presumed from project style): mf_* = "map from",
    # tuo_* = "tuple of".
    # NOTE(review): `boundary` is a mutable default; it is never mutated here so
    # behavior is fine, but it is the classic shared-default hazard.
    mf_location_to_global_influence_score = {}
    mf_location_to_mf_influence_type_to_influence_score = defaultdict(dict)
    mf_location_to_tuo_neighbor_location_and_locations_influencing_score = \
        dict(Experiments.load_tuo_location_and_tuo_neighbor_location_and_locations_influence_score(model_id, hashtag_tag, noOfInfluencers, InfluenceMeasuringModels.TYPE_INCOMING_INFLUENCE))
    mf_location_to_tuo_neighbor_location_and_locations_influenced_score = \
        dict(Experiments.load_tuo_location_and_tuo_neighbor_location_and_locations_influence_score(model_id, hashtag_tag, noOfInfluencers, InfluenceMeasuringModels.TYPE_OUTGOING_INFLUENCE))
    # Drop locations outside the boundary. keys()[:] copies the key list
    # (Python 2), so deleting while iterating is safe.
    for location in mf_location_to_tuo_neighbor_location_and_locations_influenced_score.keys()[:]:
        if not isWithinBoundingBox(getLocationFromLid(location.replace('_', ' ')), boundary):
            if location in mf_location_to_tuo_neighbor_location_and_locations_influencing_score:
                del mf_location_to_tuo_neighbor_location_and_locations_influencing_score[location]
            del mf_location_to_tuo_neighbor_location_and_locations_influenced_score[location]
    no_of_locations = len(mf_location_to_tuo_neighbor_location_and_locations_influenced_score)
    # Average incoming influence per location; zip(*...)[1] extracts the score
    # column (Python 2 zip returns a list).
    # NOTE(review): Python 2 `/` truncates for ints — presumably the scores are
    # floats; verify upstream.
    for location, tuo_neighbor_location_and_locations_influencing_score in \
            mf_location_to_tuo_neighbor_location_and_locations_influencing_score.iteritems():
        mf_location_to_mf_influence_type_to_influence_score[location][InfluenceMeasuringModels.TYPE_INCOMING_INFLUENCE] \
            = sum(zip(*tuo_neighbor_location_and_locations_influencing_score)[1])/no_of_locations
    # Average outgoing influence per location.
    for location, tuo_neighbor_location_and_locations_influenced_score in \
            mf_location_to_tuo_neighbor_location_and_locations_influenced_score.iteritems():
        mf_location_to_mf_influence_type_to_influence_score[location][InfluenceMeasuringModels.TYPE_OUTGOING_INFLUENCE] \
            = sum(zip(*tuo_neighbor_location_and_locations_influenced_score)[1])/no_of_locations
    # Keep only the dominant influence type per location; incoming dominance is
    # encoded with a negated score.
    for location, mf_influence_type_to_influence_score in \
            mf_location_to_mf_influence_type_to_influence_score.iteritems():
        influence_type, influence_score = max(mf_influence_type_to_influence_score.iteritems(), key=itemgetter(1))
        if influence_type==InfluenceMeasuringModels.TYPE_INCOMING_INFLUENCE:
            mf_location_to_global_influence_score[location] = -influence_score
        else:
            mf_location_to_global_influence_score[location] = influence_score
    return mf_location_to_global_influence_score.items()
def mapper(self, key, line):
    """Emit (checkin-record, 1) for each parsed line that falls inside `boundary`."""
    record = parseData(line)
    if not record:
        return
    if not isWithinBoundingBox(record['l'], boundary):
        return
    # Drop the Mongo id and normalize the timestamp to epoch seconds.
    del record['_id']
    record['t'] = time.mktime(record['t'].timetuple())
    record['lid'] = getLidFromLocation(record['l'])
    record['llid'] = getLatticeLid(record['l'], accuracy=0.015)
    yield record, 1
def iterateHashtagObjectInstances(line):
    """Yield (hashtag, [lattice_id, bucketed_epoch]) for every hashtag in a tweet JSON line."""
    tweet = cjson.decode(line)
    # Prefer the exact geo point; otherwise fall back to the bounding box.
    location = tweet['geo'] if 'geo' in tweet else tweet['bb']
    epoch = GeneralMethods.approximateEpoch(
        time.mktime(getDateTimeObjectFromTweetTimestamp(tweet['t']).timetuple()),
        TIME_UNIT_IN_SECONDS,
    )
    if not isWithinBoundingBox(location, BOUNDARY):
        return
    lattice_id = getLatticeLid(location, LATTICE_ACCURACY)
    # Skip the degenerate origin lattice.
    if lattice_id != '0.0000_0.0000':
        for hashtag in tweet['h']:
            yield hashtag.lower(), [lattice_id, epoch]
def getHashtagWithoutEndingWindow(key, values, specificToArea=False):
    """Aggregate a hashtag's occurrences and return its summary record, or None.

    Collects [point, time] occurrences from `values`; when `specificToArea` is
    True only occurrences inside AREA_DETAILS[0] count and AREA_DETAILS[1] is
    the minimum-occurrence threshold, otherwise the module-level
    MIN_HASHTAG_OCCURENCES threshold applies. Returns a dict with the hashtag,
    total count, earliest/latest occurrence and the time-sorted occurrence list
    when the count clears the threshold and the earliest occurrence is at or
    after HASHTAG_STARTING_WINDOW; returns None otherwise.

    Bug fix: the original assigned MIN_HASHTAG_OCCURENCES inside the
    `specificToArea` branch, which made the name function-local everywhere and
    raised UnboundLocalError whenever specificToArea was False (the default)
    and occurrences existed. A separate local threshold avoids shadowing the
    module-level constant.
    """
    occurences = []
    # Pick the threshold up front without shadowing the global constant.
    min_occurences = AREA_DETAILS[1] if specificToArea else MIN_HASHTAG_OCCURENCES
    for instances in values:
        if not specificToArea:
            occurences += instances['oc']
        else:
            # Keep only occurrences that fall inside the configured area.
            for oc in instances['oc']:
                if isWithinBoundingBox(oc[0], AREA_DETAILS[0]):
                    occurences.append(oc)
    if occurences:
        # Earliest and latest occurrence by timestamp (index 1).
        e, l = min(occurences, key=lambda t: t[1]), max(occurences, key=lambda t: t[1])
        numberOfInstances = len(occurences)
        if numberOfInstances >= min_occurences and \
                e[1] >= HASHTAG_STARTING_WINDOW:
            return {'h': key, 't': numberOfInstances, 'e': e, 'l': l,
                    'oc': sorted(occurences, key=lambda t: t[1])}
def analyzeDataClusters(): regex = 'cafe' neighborLocationExtractionMethod = NeighborLocationsSelection.N_LOCATIONS inputFile = checkinSequenceLocationRegexAnalysisFolder+neighborLocationExtractionMethod+'/'+regex for line in FileIO.iterateJsonFromFile(inputFile): if line['parameters']['checkinsWindow']==10: for location, data in line['locations'].iteritems(): # data = line['locations']['41.895 -87.623'] if isWithinBoundingBox(getLocationFromLid(location), us_boundary): print venuesCollection.find_one({'lid': location})['n'], location,'\n' for l, _ in data['clusters'][:5]: print [i[0] for i in l] print '\n ********** \n'
def influence_clusters(model_ids, min_cluster_size=15):
    # For each model: build a directed location-similarity graph restricted to
    # PARTIAL_WORLD_BOUNDARY, cluster it with affinity propagation, color the 10
    # largest clusters, and plot cluster members plus their intra-cluster edges
    # on a world map.
    # NOTE(review): `min_cluster_size` is currently unused — the size filter is
    # commented out below in favor of a top-10-largest-clusters slice.
    influence_type = InfluenceMeasuringModels.TYPE_INCOMING_INFLUENCE
    for model_id in model_ids:
        digraph_of_location_and_location_similarity = nx.DiGraph()
        for line_count, (location, tuo_neighbor_location_and_mf_influence_type_and_similarity) in \
                enumerate(FileIO.iterateJsonFromFile(tuo_location_and_tuo_neighbor_location_and_mf_influence_type_and_similarity_file%model_id)):
            # print line_count
            for neighbor_location, mf_influence_type_to_similarity in tuo_neighbor_location_and_mf_influence_type_and_similarity:
                # Keep an edge only when both endpoints lie inside the boundary.
                if isWithinBoundingBox(getLocationFromLid(location.replace('_', ' ')), PARTIAL_WORLD_BOUNDARY) and \
                        isWithinBoundingBox(getLocationFromLid(neighbor_location.replace('_', ' ')), PARTIAL_WORLD_BOUNDARY):
                    digraph_of_location_and_location_similarity.add_edge(location, neighbor_location, {'w': mf_influence_type_to_similarity[influence_type]})
        no_of_clusters, tuo_location_and_cluster_id = clusterUsingAffinityPropagation(digraph_of_location_and_location_similarity)
        # Group (location, cluster_id) pairs by cluster id; groupby requires the
        # input sorted on the same key.
        tuo_cluster_id_to_locations = [
            (cluster_id, zip(*ito_tuo_location_and_cluster_id)[0])
            for cluster_id, ito_tuo_location_and_cluster_id in groupby(
                sorted(tuo_location_and_cluster_id, key=itemgetter(1)), key=itemgetter(1)
            )
        ]
        mf_location_to_cluster_id = dict(tuo_location_and_cluster_id)
        mf_cluster_id_to_cluster_color = dict([(i, GeneralMethods.getRandomColor()) for i in range(no_of_clusters)])
        mf_valid_locations_to_color = {}
        # Color only the 10 largest clusters.
        for cluster_id, locations in \
                sorted(tuo_cluster_id_to_locations, key=lambda (cluster_id, locations): len(locations))[-10:]:
            # if len(locations)>min_cluster_size:
            print cluster_id, len(locations)
            for location in locations:
                mf_valid_locations_to_color[location] \
                    = mf_cluster_id_to_cluster_color[mf_location_to_cluster_id[location]]
        locations, colors = zip(*mf_valid_locations_to_color.iteritems())
        locations = [getLocationFromLid(location.replace('_', ' ')) for location in locations]
        # s=0 hides the point markers; we mainly need the Basemap object back so
        # we can draw the arcs ourselves.
        _, m = plotPointsOnWorldMap(locations,
                                    blueMarble=False, bkcolor='#CFCFCF', c=colors, s=0, returnBaseMapObject=True, lw = 0)
        # Draw great-circle arcs for edges whose endpoints share a cluster,
        # tinted with that cluster's color.
        for u, v, data in digraph_of_location_and_location_similarity.edges(data=True):
            if u in mf_valid_locations_to_color and v in mf_valid_locations_to_color \
                    and mf_location_to_cluster_id[u]==mf_location_to_cluster_id[v]:
                color, u, v, w = mf_cluster_id_to_cluster_color[mf_location_to_cluster_id[u]], getLocationFromLid(u.replace('_', ' ')), getLocationFromLid(v.replace('_', ' ')), data['w']
                # drawgreatcircle takes (lon, lat) pairs; locations are [lat, lon].
                m.drawgreatcircle(u[1], u[0], v[1], v[0], color=color, alpha=0.6)
        plt.show()
def writeLocationToUserMap(place):
    # Dump, as JSON lines, every location inside place['boundary'] together with
    # its per-user check-in vectors, venue name, categories and tags.
    name, boundary = place['name'], place['boundary']
    # Start fresh: remove any previous dump for this place.
    GeneralMethods.runCommand('rm -rf %s'%placesLocationToUserMapFile%name)
    for location in filteredLocationToUserAndTimeMapIterator(minLocationsTheUserHasCheckedin, minUniqueUsersCheckedInTheLocation, inputFile=locationToUserAndExactTimeMapFile):
        # NOTE(review): despite the name, `lid` holds the decoded location point,
        # not the id string.
        lid=getLocationFromLid(location['location'])
        if isWithinBoundingBox(lid, boundary):
            location['categories'] = ''; location['tags'] = ''; location['name']=''
            # Enrich from the venue collections when records exist.
            title = venuesCollection.find_one({'lid':location['location']})
            if title: location['name'] = unicode(title['n']).encode("utf-8")
            meta = venuesMetaDataCollection.find_one({'_id':location['location']})
            if meta: location['categories'] = unicode(meta['c']).encode("utf-8"); location['tags'] = unicode(meta['t']).encode("utf-8")
            # Re-key users as strings. keys()[:] copies, so mutation is safe.
            # NOTE(review): assumes the original keys are non-strings; if a key
            # were already a str, the del would remove the freshly set entry —
            # verify against the upstream iterator.
            for user in location['users'].keys()[:]:
                location['users'][str(user)]=location['users'][user]; del location['users'][user]
            # Total check-ins across the user -> day -> db -> epochs nesting.
            location['noOfCheckins']=sum([len(epochs) for user, userVector in location['users'].iteritems() for day, dayVector in userVector.iteritems() for db, epochs in dayVector.iteritems()])
            # Honor an optional per-place minimum before writing.
            if location['noOfCheckins']>place.get('minLocationCheckins',0):
                FileIO.writeToFileAsJson(location, placesLocationToUserMapFile%name)
def writeGraphs(regex, neighborLocationExtractionMethod, **kwargs):
    # Build, prune and plot a neighboring-locations graph for each
    # checkin-sequence record matching `regex`. Only the heaviest
    # kwargs['percentageOfTopEdgesByWeight'] fraction of edges survives, and
    # isolated nodes are dropped.
    def getLocationName(lid):
        # Prefer the venue's display name; fall back to the raw lid.
        object = venuesCollection.find_one({'lid': lid})
        if object: return object['n']
        else: return lid
    inputFileName = checkinSequenceLocationRegexFolder+regex
    # NOTE(review): this outer assignment is overwritten inside the loop before
    # any use — it appears to be dead code.
    outputFileName = checkinSequenceLocationRegexAnalysisFolder+neighborLocationExtractionMethod+'/'+regex
    for data in FileIO.iterateJsonFromFile(inputFileName):
        if isWithinBoundingBox(getLocationFromLid(data['lid']), world_boundary):
            outputFileName = checkinSequenceLocationRegexAnalysisFolder+neighborLocationExtractionMethod+'/graph/'+regex+'/'+data['lid']+'_%s'%kwargs['checkinsWindow']
            print 'Analyzing:', kwargs['checkinsWindow'], data['lid'], outputFileName
            graph = NeigboringLocationsGraph.getLocationGraph(data, NeighborLocationsSelection.getMethod(neighborLocationExtractionMethod), **kwargs)
            labels, edgeWeights = {}, []
            # Drop self-loops and collect (weight, edge) for the rest.
            # edges()[:] copies the edge list, so removal while iterating is safe.
            for u, v in graph.edges()[:]:
                if u==v: graph.remove_edge(u, v)
                else: edgeWeights.append((graph.edge[u][v]['w'], (u,v)))
            # Everything after the top fraction (sorted heaviest first) goes.
            edgesToRemove = [i[1] for i in sorted(edgeWeights, key=itemgetter(0), reverse=True)[int(len(edgeWeights)*kwargs['percentageOfTopEdgesByWeight']):]]
            for u, v in edgesToRemove: graph.remove_edge(u, v)
            # Remove now-isolated nodes; label survivors with venue names.
            for u in graph.nodes():
                if graph.degree(u)==0: graph.remove_node(u)
                else: labels[u] = unicode(getLocationName(u)).encode('utf-8')
            plot(graph, node_color='#A0CBE2',width=4,edge_cmap=plt.cm.Blues,with_labels=True,labels=labels)
# Script: scan the 2011 geo-tweet gzip archives and extract US check-ins, one
# JSON line per check-in, into per-month files under checkins/.
import os, gzip, cjson
from library.twitter import TweetFiles  # NOTE(review): imported but unused here
from library.file_io import FileIO
from library.geo import isWithinBoundingBox
from settings import us_boundary

# Output path template; %s is the month number.
checkinsFile = 'checkins/%s'

def tweetFilesIterator():
    # Yield (per-month output file, gzip tweet file path) for every archived day.
    bdeDataFolder = '/mnt/chevron/bde/Data/TweetData/GeoTweets/2011/%s/%s/'
    for month in range(2, 12):  # February through November
        outputFile = checkinsFile%month
        for day in range(1, 32):  # invalid dates are skipped by the exists() check
            tweetsDayFolder = bdeDataFolder%(month, day)
            if os.path.exists(tweetsDayFolder):
                for _, _, files in os.walk(tweetsDayFolder):
                    for file in files:
                        yield outputFile, tweetsDayFolder+file

for outputFile, file in tweetFilesIterator():
    print 'Parsing: %s'%file
    for line in gzip.open(file, 'rb'):
        try:
            data = cjson.decode(line)
            # Only geo-tagged tweets inside the US bounding box become check-ins.
            if 'geo' in data and data['geo']!=None:
                if isWithinBoundingBox(data['geo']['coordinates'], us_boundary):
                    checkin = {'geo': data['geo']['coordinates'], 'user': {'id': data['user']['id'], 'l': data['user']['location']}, 'id': data['id'], 't': data['created_at'], 'h': [], 'tx': data['text']}
                    for h in data['entities']['hashtags']: checkin['h'].append(h['text'])
                    # print checkin
                    FileIO.writeToFileAsJson(checkin, outputFile)
        except Exception as e:
            # Best-effort: one malformed line must not abort the whole scan.
            print e
def locationsForUsIterator(minUniqueUsersCheckedInTheLocation):
    """Yield US location ids visited by at least the given number of unique users."""
    for record in FileIO.iterateJsonFromFile(locationByUserDistributionFile):
        if record['count'] < minUniqueUsersCheckedInTheLocation:
            continue
        if isWithinBoundingBox(getLocationFromLid(record['location']), us_boundary):
            yield record['location']
def latticeIdInValidAreas(latticeId):
    """Return True iff the lattice id's point lies inside any boundary in BOUNDARIES.

    `latticeId` is an underscore-encoded lid ('lat_lon'); it is decoded with
    getLocationFromLid after restoring the space separator.

    Fix: the original fell off the end and implicitly returned None when no
    boundary matched; this returns an explicit bool (truthiness-compatible for
    existing callers).
    """
    point = getLocationFromLid(latticeId.replace('_', ' '))
    return any(isWithinBoundingBox(point, boundary) for boundary in BOUNDARIES)
def filter_latticeObjectsByBoundaryOny(self, key, values):
    """Pass through (key, lattice-object) only when the lattice lies inside BOUNDARY."""
    lattice_object = list(values)[0]
    lattice_point = getLocationFromLid(lattice_object['llid'].replace('_', ' '))
    if not isWithinBoundingBox(lattice_point, BOUNDARY):
        return
    yield key, lattice_object
def filter_latticeObjects(self, key, values):
    """Pass through (key, lattice-object) only for lattices with enough check-ins inside BOUNDARY."""
    lattice_object = list(values)[0]
    # Cheap count check first; the geo test only runs when it passes.
    if len(lattice_object['c']) < MINIMUM_NO_OF_CHECKINS_PER_LOCATION:
        return
    lattice_point = getLocationFromLid(lattice_object['llid'].replace('_', ' '))
    if isWithinBoundingBox(lattice_point, BOUNDARY):
        yield key, lattice_object
def get_valid_location((location, mf_model_id_to_hashtags)): location = getLocationFromLid(location.replace('_', ' ')) return isWithinBoundingBox(location, US_BOUNDARY)