def mapper(self, key, line):
    data = parseData(line)
    if data and isWithinBoundingBox(data['l'], boundary):
        del data['_id']
        data['t'] = time.mktime(data['t'].timetuple())
        data['lid'] = getLidFromLocation(data['l'])
        data['llid'] = getLatticeLid(data['l'], accuracy=0.015)
        yield data, 1
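# getLatticeLid and getLocationFromLid are imported helpers, not defined in
# this file. The minimal sketch below only illustrates the behavior the code
# here relies on, under the assumption that a lid snaps a [lat, lng] point to
# a grid of the given accuracy and encodes it as a 'lat_lng' string with four
# decimals (hence the '0.0000_0.0000' null-island checks and the
# lid.replace('_', ' ') round-trips below); the real implementations may differ.
def _sketch_getLatticeLid(point, accuracy):
    # e.g. _sketch_getLatticeLid([30.42, -97.71], 0.145) -> '30.4500_-97.7300'
    return '%0.4f_%0.4f' % (accuracy * round(point[0] / accuracy),
                            accuracy * round(point[1] / accuracy))

def _sketch_getLocationFromLid(lid_with_spaces):
    # Inverse of the encoding above, once '_' has been replaced by ' '.
    return [float(t) for t in lid_with_spaces.split()]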
def iterate_hashtag_occurrences_with_high_accuracy_lid(line):
    data = cjson.decode(line)
    if 'geo' in data:
        l = data['geo']
    else:
        l = data['bb']
    t = time.mktime(getDateTimeObjectFromTweetTimestamp(data['t']).timetuple())
    lid = getLatticeLid(l, accuracy=0.0001)
    for h in data['h']:
        yield h.lower(), [lid, GeneralMethods.approximateEpoch(t, TIME_UNIT_IN_SECONDS)]
def get_user_from_checkins(self, key, line):
    data, hashtags = getCheckinObject(line), defaultdict(int)
    # if isWithinBoundingBox(data['l'], BOUNDARY):
    data['u'] = data['user']['id']
    for k in ['tx', 'user']:
        del data[k]
    data['llid'] = getLatticeLid(data['l'], accuracy=ACCURACY)
    for h in data['h']:
        hashtags[h.lower()] += 1
    data['h'] = hashtags
    yield data['u'], data
def filterLatticesByMinHashtagOccurencesPerLattice(h):
    # Bucket a hashtag's occurrence times by lattice lid, dropping the
    # null-island lid '0.0000_0.0000' (missing or bad geo data), and keep
    # only lattices with at least MIN_HASHTAG_OCCURENCES_PER_LATTICE occurrences.
    latticesToOccurancesMap = defaultdict(list)
    for l, oc in h['oc']:
        lid = getLatticeLid(l, LOCATION_ACCURACY)
        if lid != '0.0000_0.0000':
            latticesToOccurancesMap[lid].append(oc)
    return dict([(k, v) for k, v in latticesToOccurancesMap.iteritems()
                 if len(v) >= MIN_HASHTAG_OCCURENCES_PER_LATTICE])
def iterateHashtagObjectInstances(line):
    data = cjson.decode(line)
    if 'geo' in data:
        l = data['geo']
    else:
        l = data['bb']
    point = getLatticeLid(l, LATTICE_ACCURACY)
    # Note: unlike the variant below, this yields only the occurrences whose
    # location collapses to the null-island lid, together with the raw location.
    if point == '0.0000_0.0000':
        yield point, l
def iterateHashtagObjectInstances(line):
    data = cjson.decode(line)
    if 'geo' in data:
        l = data['geo']
    else:
        l = data['bb']
    t = GeneralMethods.approximateEpoch(
        time.mktime(getDateTimeObjectFromTweetTimestamp(data['t']).timetuple()),
        TIME_UNIT_IN_SECONDS)
    if isWithinBoundingBox(l, BOUNDARY):
        point = getLatticeLid(l, LATTICE_ACCURACY)
        if point != '0.0000_0.0000':
            for h in data['h']:
                yield h.lower(), [point, t]
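# GeneralMethods.approximateEpoch is also an imported helper. Judging from
# its use as a time bucketer throughout this file, it is assumed to snap an
# epoch timestamp to the start of its time unit; a plausible sketch, not the
# library implementation:
def _sketch_approximateEpoch(epoch, time_unit_in_seconds):
    # e.g. _sketch_approximateEpoch(1320007375, 3600) -> 1320004800,
    # i.e. the start of the hour containing the timestamp.
    return int(epoch / time_unit_in_seconds) * time_unit_in_seconds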
def load_ltuo_hashtag_and_ltuo_location_and_occurrence_time(
        startTime=datetime(2012, 1, 1), endTime=datetime(2012, 3, 31),
        outputFolder='complete_prop'):
    ltuo_hashtag_and_ltuo_location_and_occurrence_time = []
    input_file = f_hashtag_objects % (outputFolder,
                                      startTime.strftime('%Y-%m-%d'),
                                      endTime.strftime('%Y-%m-%d'))
    for hashtag_object in iterateJsonFromFile(input_file):
        ltuo_location_and_occurrence_time = [
            (getLatticeLid(point, LOCATION_ACCURACY), occurrence_time)
            for point, occurrence_time in hashtag_object['oc']
        ]
        ltuo_hashtag_and_ltuo_location_and_occurrence_time.append(
            [hashtag_object['h'], ltuo_location_and_occurrence_time])
    return ltuo_hashtag_and_ltuo_location_and_occurrence_time
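# Naming convention used throughout this codebase (inferred from usage):
# 'ltuo_' is a list of tuples of the named fields, 'tuo_' a single tuple,
# 'mf_x_to_y' a map (dict) from x to y, and 'ito_' an iterator of tuples.
# So ltuo_hashtag_and_ltuo_location_and_occurrence_time above is a list of
# [hashtag, [(location_lid, occurrence_time), ...]] pairs.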
def iterateHashtagObjectInstances(line, all_locations=False):
    data = cjson.decode(line)
    if 'geo' in data:
        l = data['geo']
    else:
        l = data['bb']
    t = time.mktime(getDateTimeObjectFromTweetTimestamp(data['t']).timetuple())
    point = getLattice(l, LOCATION_ACCURACY)
    if not all_locations:
        lattice_lid = getLatticeLid(point, LOCATION_ACCURACY)
        if lattice_lid in VALID_LOCATIONS_LIST:
            for h in data['h']:
                yield h.lower(), [point, t]
    else:
        for h in data['h']:
            yield h.lower(), [point, t]
def plotSharingProbabilityAndTemporalClosenessScoresOnMap(timeRange, outputFolder):
    i = 1
    for latticeObject in FileIO.iterateJsonFromFile(
            hashtagsLatticeGraphFile % (outputFolder, '%s_%s' % timeRange)):
        latticePoint = getLocationFromLid(latticeObject['id'].replace('_', ' '))
        latticeId = getLatticeLid([latticePoint[1], latticePoint[0]], LATTICE_ACCURACY)
        plt.subplot(211)
        plt.title(latticeId)
        LatticeGraphPlots.plotLatticeSharingProbabilityOnMap(
            LatticeGraph.typeSharingProbability, latticeObject)
        plt.subplot(212)
        LatticeGraphPlots.plotLatticeTemporalClosenessScoresOnMap(
            LatticeGraph.typeTemporalCloseness, latticeObject)
        plt.show()
        outputFile = hashtagsImagesGraphAnalysisFolder % outputFolder + \
            '%s_and_%s/%s.png' % (LatticeGraph.typeSharingProbability['id'],
                                  LatticeGraph.typeTemporalCloseness['id'],
                                  latticeId)
        FileIO.createDirectoryForFile(outputFile)
        print i, outputFile
        i += 1
def buildLatticeGraphMap(self, key, hashtagObject):
    hashtagObject['oc'] = getOccuranesInHighestActiveRegion(hashtagObject)
    lattices = filterLatticesByMinHashtagOccurencesPerLattice(hashtagObject).keys()
    latticesToOccranceTimeMap = {}
    for k, v in hashtagObject['oc']:
        lid = getLatticeLid(k, LATTICE_ACCURACY)
        if lid != '0.0000_0.0000' and lid in lattices:
            # Keep only the earliest-seen occurrence time per lattice.
            if lid not in latticesToOccranceTimeMap:
                latticesToOccranceTimeMap[lid] = v
    lattices = latticesToOccranceTimeMap.items()
    if lattices:
        hastagStartTime = min(lattices, key=itemgetter(1))[1]
        hastagEndTime = max(lattices, key=itemgetter(1))[1]
        hashtagTimePeriod = hastagEndTime - hastagStartTime
        for lattice in lattices:
            yield lattice[0], ['h', [[hashtagObject['h'], [lattice[1], hashtagTimePeriod]]]]
            yield lattice[0], ['n', lattices]
def buildLocationTemporalClosenessGraphMap(self, key, hashtagObject):
    occuranesInHighestActiveRegion, latticesToOccranceTimeMap = \
        getOccuranesInHighestActiveRegion(hashtagObject), {}
    hashtagObject['oc'] = occuranesInHighestActiveRegion
    validLattices = filterLatticesByMinHashtagOccurencesPerLattice(hashtagObject).keys()
    for k, v in occuranesInHighestActiveRegion:
        lid = getLatticeLid(k, ACCURACY)
        if lid in validLattices:
            if lid not in latticesToOccranceTimeMap:
                latticesToOccranceTimeMap[lid] = v
    if latticesToOccranceTimeMap:
        latticesOccranceTimeList = latticesToOccranceTimeMap.items()
        hastagStartTime = min(latticesOccranceTimeList, key=itemgetter(1))[1]
        hastagEndTime = max(latticesOccranceTimeList, key=itemgetter(1))[1]
        hashtagTimePeriod = hastagEndTime - hastagStartTime
        if hashtagTimePeriod:
            for l1, l2 in combinations(latticesOccranceTimeList, 2):
                score = temporalScore(np.abs(l1[1] - l2[1]), hashtagTimePeriod)
                if score >= MIN_TEMPORAL_CLOSENESS_SCORE:
                    yield l1[0], [hashtagObject['h'], [l2[0], score]]
                    yield l2[0], [hashtagObject['h'], [l1[0], score]]
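# temporalScore is defined elsewhere. Its call sites here pass the lag
# between two locations' first occurrences and the hashtag's active time
# period, and treat larger scores as "temporally closer". One plausible
# shape consistent with that usage (purely an assumption, not the repo's
# actual definition):
def _sketch_temporalScore(lag, hashtagTimePeriod):
    # Decay linearly from 1.0 (simultaneous arrival) to 0.0 (the lag spans
    # the hashtag's entire active period).
    return max(0.0, 1.0 - float(lag) / hashtagTimePeriod)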
def getLatticeThatGivesMinimumLocalityIndexAtKForAccuracy(occurances, accuracy):
    occurancesDistributionInHigherLattice, distanceMatrix = defaultdict(list), defaultdict(dict)
    for oc in occurances:
        occurancesDistributionInHigherLattice[getLatticeLid(oc, accuracy)].append(oc)
    higherLattices = sorted(occurancesDistributionInHigherLattice.iteritems(),
                            key=lambda t: len(t[1]), reverse=True)
    for hl1, hl2 in combinations(occurancesDistributionInHigherLattice, 2):
        distanceMatrix[hl1][hl2] = distanceMatrix[hl2][hl1] = getHaversineDistance(
            getLocationFromLid(hl1.replace('_', ' ')),
            getLocationFromLid(hl2.replace('_', ' ')))
    for k, v in distanceMatrix.iteritems():
        distanceMatrix[k] = sorted(v.iteritems(), key=itemgetter(1))
    occurancesToReturn = []
    # The empty tuple compares greater than any number in Python 2, so it
    # serves as an "infinite distance" sentinel for the first iteration.
    currentHigherLatticeSet, totalOccurances = {'distance': ()}, float(len(occurances))
    for hl, occs in higherLattices:
        higherLatticeSet = {'distance': 0, 'observedOccurances': len(occs),
                            'lattices': [hl], 'sourceLattice': hl}
        while currentHigherLatticeSet['distance'] > higherLatticeSet['distance'] and \
                higherLatticeSet['observedOccurances'] / totalOccurances < 0.5:
            (l, d) = distanceMatrix[hl][0]
            distanceMatrix[hl] = distanceMatrix[hl][1:]
            higherLatticeSet['distance'] += d
            higherLatticeSet['lattices'].append(l)
            higherLatticeSet['observedOccurances'] += len(occurancesDistributionInHigherLattice[l])
        if currentHigherLatticeSet['distance'] > higherLatticeSet['distance']:
            currentHigherLatticeSet = higherLatticeSet
    for l in currentHigherLatticeSet['lattices']:
        occurancesToReturn += occurancesDistributionInHigherLattice[l]
    # return {'distance': currentHigherLatticeSet['distance'], 'occurances': occurancesToReturn,
    #         'sourceLattice': getLocationFromLid(currentHigherLatticeSet['sourceLattice'].replace('_', ' '))}
    return {'occurances': occurancesToReturn,
            'sourceLattice': getLocationFromLid(currentHigherLatticeSet['sourceLattice'].replace('_', ' '))}
def getLocalityIndexAtK(occurances, kValue):
    '''
    Locality index at k: for a hashtag, the minimum radius that covers k
    percent of its occurrences. A high locality index suggests the hashtag
    was global, while a small one suggests it was local. Finding this index
    exactly requires the point closest to k percent of the occurrences,
    which is quadratic by brute force; instead, search over lattices of
    progressively finer accuracy.
    '''
    def getLatticeThatGivesMinimumLocalityIndexAtK():
        occurancesDict = {'occurances': occurances}
        for accuracy in [4, 2, 1, 0.5, ACCURACY]:
            occurancesDict = getLatticeThatGivesMinimumLocalityIndexAtKForAccuracy(
                occurancesDict['occurances'], accuracy)
        return occurancesDict['sourceLattice']

    def getLatticeThatGivesMinimumLocalityIndexAtKForAccuracy(occurances, accuracy):
        occurancesDistributionInHigherLattice, distanceMatrix = defaultdict(list), defaultdict(dict)
        for oc in occurances:
            occurancesDistributionInHigherLattice[getLatticeLid(oc, accuracy)].append(oc)
        higherLattices = sorted(occurancesDistributionInHigherLattice.iteritems(),
                                key=lambda t: len(t[1]), reverse=True)
        for hl1, hl2 in combinations(occurancesDistributionInHigherLattice, 2):
            distanceMatrix[hl1][hl2] = distanceMatrix[hl2][hl1] = getHaversineDistance(
                getLocationFromLid(hl1.replace('_', ' ')),
                getLocationFromLid(hl2.replace('_', ' ')))
        for k, v in distanceMatrix.iteritems():
            distanceMatrix[k] = sorted(v.iteritems(), key=itemgetter(1))
        occurancesToReturn = []
        # Empty tuple: "infinite distance" sentinel (compares greater than
        # any number in Python 2).
        currentHigherLatticeSet, totalOccurances = {'distance': ()}, float(len(occurances))
        for hl, occs in higherLattices:
            higherLatticeSet = {'distance': 0, 'observedOccurances': len(occs),
                                'lattices': [hl], 'sourceLattice': hl}
            while currentHigherLatticeSet['distance'] > higherLatticeSet['distance'] and \
                    higherLatticeSet['observedOccurances'] / totalOccurances < 0.5:
                (l, d) = distanceMatrix[hl][0]
                distanceMatrix[hl] = distanceMatrix[hl][1:]
                higherLatticeSet['distance'] += d
                higherLatticeSet['lattices'].append(l)
                higherLatticeSet['observedOccurances'] += len(occurancesDistributionInHigherLattice[l])
            if currentHigherLatticeSet['distance'] > higherLatticeSet['distance']:
                currentHigherLatticeSet = higherLatticeSet
        for l in currentHigherLatticeSet['lattices']:
            occurancesToReturn += occurancesDistributionInHigherLattice[l]
        # return {'distance': currentHigherLatticeSet['distance'], 'occurances': occurancesToReturn,
        #         'sourceLattice': getLocationFromLid(currentHigherLatticeSet['sourceLattice'].replace('_', ' '))}
        return {'occurances': occurancesToReturn,
                'sourceLattice': getLocationFromLid(currentHigherLatticeSet['sourceLattice'].replace('_', ' '))}

    occurancesDistributionInHigherLattice = defaultdict(int)
    for oc in occurances:
        occurancesDistributionInHigherLattice[getLatticeLid(oc, ACCURACY)] += 1
    totalOccurances, distance, observedOccuraces = float(len(occurances)), 0, 0
    lattice = getLatticeThatGivesMinimumLocalityIndexAtK()
    sortedLatticeObjects = sorted(
        [(getLocationFromLid(k.replace('_', ' ')),
          getHaversineDistance(lattice, getLocationFromLid(k.replace('_', ' '))), v)
         for k, v in occurancesDistributionInHigherLattice.iteritems()],
        key=itemgetter(1))
    for l, d, oc in sortedLatticeObjects:
        distance = d
        observedOccuraces += oc
        if observedOccuraces / totalOccurances >= kValue:
            break
    return (distance, lattice)
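# Illustrative call with synthetic data (assumes ACCURACY and the imported
# geo helpers are in scope; the numbers are made up). Half the occurrences
# sit in one lattice, so k=0.5 is covered with a near-zero radius and the
# source lattice is the dense one; a globally spread hashtag would instead
# return a radius of thousands of kilometers (getHaversineDistance is
# assumed to return kilometers):
#
#   occurances = [[30.45, -97.73]] * 5 + [[32.80, -96.80]] * 3 + [[40.71, -74.01]] * 2
#   radius_km, lattice = getLocalityIndexAtK(occurances, 0.5)
#   # -> radius_km close to 0, lattice near [30.45, -97.73]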
def buildLocationInAndOutTemporalClosenessGraphMap(self, key, hashtagObject):
    def getSourceLattice(occ):
        # The source lattice is the most frequent lattice among the earliest
        # fraction of occurrences.
        sortedOcc = occ[:int(PERCENTAGE_OF_EARLY_LIDS_TO_DETERMINE_SOURCE_LATTICE * len(occ))]
        if sortedOcc:
            return max([(lid, len(list(l)))
                        for lid, l in groupby(sorted([t[0] for t in sortedOcc]))],
                       key=lambda t: t[1])
    occuranesInHighestActiveRegion, latticesToOccranceTimeMap = \
        getOccuranesInHighestActiveRegion(hashtagObject), {}
    validLattices = filterLatticesByMinHashtagOccurencesPerLattice(hashtagObject).keys()
    occuranesInHighestActiveRegion = [(getLatticeLid(k, ACCURACY), v)
                                      for k, v in occuranesInHighestActiveRegion
                                      if getLatticeLid(k, ACCURACY) in validLattices]
    if occuranesInHighestActiveRegion:
        sourceLattice = getSourceLattice(occuranesInHighestActiveRegion)
        if sourceLattice:
            sourceLattice = sourceLattice[0]
            for lid, v in occuranesInHighestActiveRegion:
                if lid not in latticesToOccranceTimeMap:
                    latticesToOccranceTimeMap[lid] = v
            latticesOccranceTimeList = latticesToOccranceTimeMap.items()
            hastagStartTime = latticesToOccranceTimeMap[sourceLattice]
            hastagEndTime = max(latticesOccranceTimeList, key=itemgetter(1))[1]
            hashtagTimePeriod = hastagEndTime - hastagStartTime
            if hashtagTimePeriod:
                latticesOccranceTimeList = [
                    (t[0], temporalScore(t[1] - hastagStartTime, hashtagTimePeriod))
                    for t in latticesOccranceTimeList if t[1] > hastagStartTime]
                for lattice, score in latticesOccranceTimeList:
                    if score >= MIN_TEMPORAL_CLOSENESS_SCORE_FOR_IN_OUT_LINKS:
                        yield sourceLattice, [hashtagObject['h'], 'out_link', [lattice, score]]
                        yield lattice, [hashtagObject['h'], 'in_link', [sourceLattice, score]]
def plotHashtagsInOutGraphs(timeRange, outputFolder):
    def plotPoints(links, xlabel):
        cm = matplotlib.cm.get_cmap('cool')
        points, colors = zip(*sorted([(getLocationFromLid(k.replace('_', ' ')), v)
                                      for k, v in links.iteritems()], key=itemgetter(1)))
        sc = plotPointsOnWorldMap(points, c=colors, cmap=cm, lw=0, vmin=0, vmax=1)
        plotPointsOnWorldMap([getLocationFromLid(locationObject['id'].replace('_', ' '))],
                             c='k', s=20, lw=0)
        plt.xlabel(xlabel), plt.colorbar(sc)
    counter = 1
    for locationObject in FileIO.iterateJsonFromFile(
            hashtagLocationInAndOutTemporalClosenessGraphFile % (outputFolder, '%s_%s' % timeRange)):
        point = getLocationFromLid(locationObject['id'].replace('_', ' '))
        outputFile = hashtagsImagesLocationInfluencersFolder + \
            '%s.png' % getLatticeLid([point[1], point[0]], ACCURACY)
        FileIO.createDirectoryForFile(outputFile)
        print counter
        counter += 1
        if not os.path.exists(outputFile):
            if locationObject['in_link'] and locationObject['out_link']:
                print outputFile
                plt.subplot(211)
                plt.title(locationObject['id'])
                plotPoints(locationObject['in_link'], 'Gets hashtags from these locations')
                plt.subplot(212)
                plotPoints(locationObject['out_link'], 'Sends hashtags to these locations')
                # plt.show()
                plt.savefig(outputFile)
                plt.clf()
def plotHashtagFlowInTimeForWindowOfNLocations(hashTagObject):
    currentIndex, previousIndex, startingEpoch = 0, 0, None
    if not os.path.exists(hashtagsImagesFlowInTimeForWindowOfNLocationsFolder % hashTagObject['h']):
        validTimeUnits, latticesToOccranceMap = getValidTimeUnits(hashTagObject['oc']), defaultdict(list)
        fileNameIterator = getFileName()
        for l, t in hashTagObject['oc']:
            latticesToOccranceMap[getLatticeLid(l, ACCURACY)].append((l, t))
        for k in latticesToOccranceMap.keys()[:]:
            validOccurences = getOccurencesFilteredByDistributionInTimeUnits(
                latticesToOccranceMap[k], validTimeUnits)
            if validOccurences:
                latticesToOccranceMap[k] = validOccurences
            else:
                del latticesToOccranceMap[k]
        latticesSortedByTime = sorted([(k, min(zip(*v)[1]))
                                       for k, v in latticesToOccranceMap.iteritems()],
                                      key=itemgetter(1))
        while currentIndex < len(latticesSortedByTime):
            # Bare except in the original: the loop stops when the file-name
            # iterator is exhausted (or any plotting error occurs).
            try:
                outputFile = hashtagsImagesFlowInTimeForWindowOfNLocationsFolder % hashTagObject['h'] + \
                    fileNameIterator.next()
                createDirectoryForFile(outputFile)
                print currentIndex, hashTagObject['h'], outputFile
                currentIndex += LOCATION_WINDOW_SIZE
                if currentIndex > len(latticesSortedByTime):
                    currentIndex = len(latticesSortedByTime)
                occurences = []
                for l in latticesSortedByTime[previousIndex:currentIndex]:
                    occurences += latticesToOccranceMap[l[0]]
                startingEpoch = plotDistributionGraphs(
                    occurences, validTimeUnits,
                    '%s - Interval (%d - %d) of %d' % (hashTagObject['h'], previousIndex + 1,
                                                       currentIndex, len(latticesSortedByTime)),
                    startingEpoch)
                plt.show()
                # plt.savefig(outputFile); plt.clf()
                previousIndex = currentIndex
            except:
                break
def mapHashtagObjectsToLocationUnits(self, key, hashtagObject):
    if False:
        yield  # I'm a generator!
    hashtag = hashtagObject['h']
    for point, t in hashtagObject['oc']:
        self.locations[getLatticeLid(point, LOCATION_ACCURACY)].append([hashtag, t])
def filterLatticesByMinHashtagOccurencesPerLattice(h):
    latticesToOccurancesMap = defaultdict(list)
    for l, oc in h['oc']:
        latticesToOccurancesMap[getLatticeLid(l, ACCURACY)].append(oc)
    return dict([(k, v) for k, v in latticesToOccurancesMap.iteritems()
                 if len(v) >= MIN_HASHTAG_OCCURENCES_PER_LATTICE])
def getHashtagDistributionInLattice(self, key, hashtagObject):
    distribution = defaultdict(int)
    for l, _ in hashtagObject['oc']:
        distribution[getLatticeLid(l, accuracy=ACCURACY)] += 1
    yield key, {'h': hashtagObject['h'], 't': hashtagObject['t'], 'd': distribution.items()}
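# Schematic of the value emitted above (field names from the hashtag object;
# the counts are invented for illustration): the mapper reduces a hashtag
# object to per-lattice mention counts, e.g.
#   (key, {'h': 'sxsw', 't': ..., 'd': [('30.4500_-97.7300', 41), ...]})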
def plot_maps_for_every_minute():
    MINUTES = 1
    hashtags = ['ripstevejobs']
    map_from_epoch_lag_to_map_from_hashtag_to_tuples_of_location_and_epoch_lag = defaultdict(dict)
    for hashtag in hashtags:
        for hashtag_object in FileIO.iterateJsonFromFile('./data/%s.json' % hashtag):
            map_from_epoch_time_unit_to_tuples_of_location_and_epoch_occurrence_time = \
                getOccurranceDistributionInEpochs(getOccuranesInHighestActiveRegion(hashtag_object),
                                                  timeUnit=MINUTES * 60, fillInGaps=True,
                                                  occurancesCount=False)
            tuples_of_epoch_time_unit_and_tuples_of_location_and_epoch_occurrence_time = sorted(
                map_from_epoch_time_unit_to_tuples_of_location_and_epoch_occurrence_time.iteritems(),
                key=itemgetter(0))
            epoch_starting_time_unit = \
                tuples_of_epoch_time_unit_and_tuples_of_location_and_epoch_occurrence_time[0][0]
            epoch_ending_time_unit = epoch_starting_time_unit + 1 * 60 * 60
            for epoch_time_unit, tuples_of_location_and_epoch_occurrence_time in \
                    tuples_of_epoch_time_unit_and_tuples_of_location_and_epoch_occurrence_time:
                if epoch_time_unit <= epoch_ending_time_unit:
                    if tuples_of_location_and_epoch_occurrence_time:
                        epoch_lag = epoch_time_unit - epoch_starting_time_unit
                        tuples_of_location_and_epoch_occurrence_time = sorted(
                            tuples_of_location_and_epoch_occurrence_time, key=itemgetter(1))
                        map_from_epoch_lag_to_map_from_hashtag_to_tuples_of_location_and_epoch_lag[epoch_lag][hashtag] = \
                            [(getLatticeLid(location, 0.145), epoch_occurrence_time - epoch_starting_time_unit)
                             for location, epoch_occurrence_time in tuples_of_location_and_epoch_occurrence_time]
    map_from_hashtag_to_accumulated_tuples_of_location_and_epoch_lag = defaultdict(list)
    GeneralMethods.runCommand('rm -rf ./images/plot_maps_for_every_minute/')
    for epoch_lag in sorted(map_from_epoch_lag_to_map_from_hashtag_to_tuples_of_location_and_epoch_lag):
        file_world_map_plot = './images/plot_maps_for_every_minute/%s.png' % (epoch_lag)
        print file_world_map_plot
        map_from_hashtag_to_tuples_of_location_and_epoch_lag = \
            map_from_epoch_lag_to_map_from_hashtag_to_tuples_of_location_and_epoch_lag[epoch_lag]
        for hashtag, tuples_of_location_and_epoch_lag in \
                map_from_hashtag_to_tuples_of_location_and_epoch_lag.iteritems():
            map_from_hashtag_to_accumulated_tuples_of_location_and_epoch_lag[hashtag] += \
                tuples_of_location_and_epoch_lag
        for hashtag, accumulated_tuples_of_location_and_epoch_lag in \
                map_from_hashtag_to_accumulated_tuples_of_location_and_epoch_lag.iteritems():
            tuples_of_location_and_epoch_max_lag = [
                (location, max(zip(*iterator_of_tuples_of_location_and_epoch_lag)[1]))
                for location, iterator_of_tuples_of_location_and_epoch_lag in
                groupby(sorted(accumulated_tuples_of_location_and_epoch_lag, key=itemgetter(0)),
                        key=itemgetter(0))]
            locations, colors = zip(*[
                (getLocationFromLid(location.replace('_', ' ')),
                 (epoch_lag + MINUTES * 60) - epoch_max_lag)
                for location, epoch_max_lag in
                sorted(tuples_of_location_and_epoch_max_lag, key=itemgetter(1), reverse=True)])
            plotPointsOnWorldMap(locations, blueMarble=False, bkcolor='#CFCFCF', c=colors,
                                 cmap=matplotlib.cm.cool, lw=0, vmax=epoch_lag + MINUTES * 60)
            plt.title('%s (%s minutes)' % (hashtag, (epoch_lag + MINUTES * 60) / (60.)))
            # plt.show()
            FileIO.createDirectoryForFile(file_world_map_plot)
            plt.savefig(file_world_map_plot)
            plt.clf()
def temp():
    hashtags, MINUTES = [], 60
    for hashtagObject in FileIO.iterateJsonFromFile('americanhorrorstory'):
        if hashtagObject['h'] == 'americanhorrorstory':
            print unicode(hashtagObject['h']).encode('utf-8'), len(hashtagObject['oc'])
            occsDistributionInTimeUnits = getOccurranceDistributionInEpochs(
                getOccuranesInHighestActiveRegion(hashtagObject, timeUnit=60 * 60),
                timeUnit=MINUTES * 60, fillInGaps=True, occurancesCount=False)
            totalOccurances = []
            for interval, t in enumerate(sorted(occsDistributionInTimeUnits)):
                occs = occsDistributionInTimeUnits[t]
                if occs:
                    fileName = '../images/plotsOnMap/%s/%s.png' % (hashtagObject['h'],
                                                                   (interval + 1) * MINUTES)
                    FileIO.createDirectoryForFile(fileName)
                    # print interval, t, len(occs)
                    print fileName
                    occurancesGroupedByLattice = [
                        (getLocationFromLid(lid.replace('_', ' ')), 'm')
                        for lid, occ in groupby(sorted([(getLatticeLid(l, LATTICE_ACCURACY), t)
                                                        for l, t in occs], key=itemgetter(0)),
                                                key=itemgetter(0))]
                    occurancesGroupedByLattice = sorted(occurancesGroupedByLattice, key=itemgetter(1))
                    points, colors = zip(*occurancesGroupedByLattice)
                    plotPointsOnWorldMap(points, blueMarble=False, bkcolor='#CFCFCF', c=colors, lw=0)
                    # plt.show()
                    plt.savefig(fileName)
                    plt.clf()
            exit()
def plotDistributionGraphs(occurences, validTimeUnits, title, startingEpoch=None):
    occurences = getOccurencesFilteredByDistributionInTimeUnits(occurences, validTimeUnits)
    occurancesGroupedByLattice = [
        (getLocationFromLid(lid.replace('_', ' ')), sorted(zip(*occs)[1]))
        for lid, occs in groupby(sorted([(getLatticeLid(l, ACCURACY), t) for l, t in occurences],
                                        key=itemgetter(0)), key=itemgetter(0))]
    plt.subplot(211)
    pointsForNumberOfOccurances, numberOfOccurancesList = \
        zip(*sorted(occurancesGroupedByLattice, key=lambda t: len(t[1])))
    numberOfOccurancesList = [len(ocs) for ocs in numberOfOccurancesList]
    cm = matplotlib.cm.get_cmap('cool')
    sc = plotPointsOnWorldMap(pointsForNumberOfOccurances, c=numberOfOccurancesList,
                              cmap=cm, lw=0, alpha=1.0)
    plt.colorbar(sc), plt.title(title), plt.xlabel('Number of mentions')
    plt.subplot(212)
    pointsForNumberOfOccurances, occuranceTime = \
        zip(*sorted(occurancesGroupedByLattice, key=lambda t: min(t[1]), reverse=True))
    occuranceTime = [min(t) for t in occuranceTime]
    if not startingEpoch:
        startingEpoch = occuranceTime[-1]
    occuranceTime = [(t - startingEpoch) / TIME_UNIT_IN_SECONDS for t in occuranceTime]
    cm = matplotlib.cm.get_cmap('autumn')
    sc = plotPointsOnWorldMap(pointsForNumberOfOccurances, c=occuranceTime, cmap=cm, lw=0, alpha=1.0)
    plt.colorbar(sc), plt.xlabel('Speed of hashtag arrival')
    return startingEpoch
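# The sort-then-group idiom used above (and in several functions here)
# buckets (lid, time) pairs by lattice lid; itertools.groupby only merges
# adjacent equal keys, which is why the list is sorted on the lid first.
# A tiny self-contained illustration:
from itertools import groupby
from operator import itemgetter

_pairs = [('30.4500_-97.7300', 12), ('0.0000_0.0000', 5), ('30.4500_-97.7300', 10)]
_times_by_lid = [(lid, sorted(t for _, t in group))
                 for lid, group in groupby(sorted(_pairs, key=itemgetter(0)), key=itemgetter(0))]
# _times_by_lid == [('0.0000_0.0000', [5]), ('30.4500_-97.7300', [10, 12])]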
def map_rawData_to_reducedlatticeObjectUnits(self, key, line):
    data = getCheckinObject(line)
    data['u'] = data['user']['id']
    for k in ['tx', 'user']:
        del data[k]
    yield getLatticeLid(data['l'], accuracy=ACCURACY), data
def map_rawData_to_latticeObjectUnits(self, key, line):
    data = getCheckinObject(line)
    yield getLatticeLid(data['l'], accuracy=ACCURACY), data
def plotHastagClasses(timeRange, folderType):
    def getFileName():
        for i in combinations('abcdefghijklmnopqrstuvwxyz', 2):
            yield ''.join(i) + '.png'
    count = 1
    # for hashtagObject in FileIO.iterateJsonFromFile(hashtagsWithoutEndingWindowFile%(folderType,'%s_%s'%timeRange)):
    for hashtagObject in FileIO.iterateJsonFromFile(hashtagsFile % ('testing_world', '%s_%s' % (2, 11))):
        # HashtagsClassifier.classify(hashtagObject)
        print count
        count += 1
        # if hashtagObject['h']=='ripamy':
        classId = HashtagsClassifier.classify(hashtagObject)
        if classId != None:
            classId = 1
            outputFile = hashtagsImagesHashtagsClassFolder % folderType + \
                '%s/%s.png' % (classId, hashtagObject['h'])
            FileIO.createDirectoryForFile(outputFile)
            fileNameIterator = getFileName()
            timeUnits, timeSeries = getTimeUnitsAndTimeSeries(
                hashtagObject['oc'], timeUnit=HashtagsClassifier.CLASSIFIER_TIME_UNIT_IN_SECONDS)
            occurancesInActivityRegions = [[getOccuranesInHighestActiveRegion(hashtagObject), 'm']]
            # for hashtagPropagatingRegion in HashtagsClassifier._getActivityRegionsWithActivityAboveThreshold(hashtagObject):
            #     validTimeUnits = [timeUnits[i] for i in range(hashtagPropagatingRegion[0], hashtagPropagatingRegion[1]+1)]
            #     occurancesInActiveRegion = [(p,t) for p,t in hashtagObject['oc'] if GeneralMethods.approximateEpoch(t, TIME_UNIT_IN_SECONDS) in validTimeUnits]
            #     occurancesInActivityRegions.append([occurancesInActiveRegion, GeneralMethods.getRandomColor()])
            currentMainRangeId = 0
            for occurances1, color1 in occurancesInActivityRegions:
                # outputFile = outputFolder+fileNameIterator.next(); FileIO.createDirectoryForFile(outputFile)
                print outputFile
                ax = plt.subplot(312)
                subRangeId = 0
                for occurances, color in occurancesInActivityRegions:
                    if subRangeId == currentMainRangeId:
                        color = 'm'
                    timeUnits, timeSeries = getTimeUnitsAndTimeSeries(
                        occurances, timeUnit=HashtagsClassifier.CLASSIFIER_TIME_UNIT_IN_SECONDS)
                    # if len(timeUnits)<24:
                    #     difference = 24-len(timeUnits)
                    #     timeUnits = list(timeUnits)+[timeUnits[-1]+(i+1)*HashtagsClassifier.CLASSIFIER_TIME_UNIT_IN_SECONDS for i in range(difference)]
                    #     timeSeries = list(timeSeries)+[0 for i in range(difference)]
                    # print len(timeUnits[:24]), len(timeSeries[:24])
                    plt.plot_date([datetime.datetime.fromtimestamp(t) for t in timeUnits],
                                  timeSeries, '-o', c=color)
                    subRangeId += 1
                # plt.ylim(ymax=1)
                plt.setp(ax.get_xticklabels(), rotation=10, fontsize=7)
                ax = plt.subplot(313)
                subRangeId = 0
                timeUnits, timeSeries = getTimeUnitsAndTimeSeries(
                    hashtagObject['oc'], timeUnit=HashtagsClassifier.CLASSIFIER_TIME_UNIT_IN_SECONDS)
                plt.plot_date([datetime.datetime.fromtimestamp(t) for t in timeUnits], timeSeries, '-')
                for occurances, color in occurancesInActivityRegions:
                    if subRangeId == currentMainRangeId:
                        color = 'm'
                    timeUnits, timeSeries = getTimeUnitsAndTimeSeries(
                        occurances, timeUnit=HashtagsClassifier.CLASSIFIER_TIME_UNIT_IN_SECONDS)
                    plt.plot_date([datetime.datetime.fromtimestamp(t) for t in timeUnits],
                                  timeSeries, '-o', c=color)
                    subRangeId += 1
                plt.setp(ax.get_xticklabels(), rotation=10, fontsize=7)
                plt.subplot(311)
                occurancesGroupedByLattice = sorted(
                    [(getLocationFromLid(lid.replace('_', ' ')), len(list(occs)))
                     for lid, occs in groupby(sorted([(getLatticeLid(l, LATTICE_ACCURACY), t)
                                                      for l, t in occurances1], key=itemgetter(0)),
                                              key=itemgetter(0))],
                    key=itemgetter(1))
                points, colors = zip(*occurancesGroupedByLattice)
                cm = matplotlib.cm.get_cmap('cool')
                if len(points) > 1:
                    sc = plotPointsOnWorldMap(points, c=colors, cmap=cm, lw=0, alpha=1.0)
                    plt.colorbar(sc)
                else:
                    sc = plotPointsOnWorldMap(points, c='m', lw=0)
                plt.title(hashtagObject['h'] + '(%d)' % len(occurancesGroupedByLattice))
                # plt.show()
                try:
                    plt.savefig(outputFile)
                    plt.clf()
                except:
                    pass
                currentMainRangeId += 1
def plotGraphsForHashtag(hashtag):
    for hashtagObject in FileIO.iterateJsonFromFile(
            '/mnt/chevron/kykamath/data/geo/hashtags/analysis/all_world/2_11/hashtagsWithoutEndingWindow'):
        MINUTES = 5
        if hashtagObject['h'] == hashtag:
            print unicode(hashtagObject['h']).encode('utf-8'), len(hashtagObject['oc'])
            occsDistributionInTimeUnits = getOccurranceDistributionInEpochs(
                getOccuranesInHighestActiveRegion(hashtagObject), timeUnit=MINUTES * 60,
                fillInGaps=True, occurancesCount=False)
            totalOccurances = []
            for interval, t in enumerate(sorted(occsDistributionInTimeUnits)):
                occs = occsDistributionInTimeUnits[t]
                totalOccurances += occs
                if occs:
                    fileName = '../images/plotsOnMap/%s/%s.png' % (hashtagObject['h'],
                                                                   (interval + 1) * MINUTES)
                    FileIO.createDirectoryForFile(fileName)
                    print fileName
                    occurancesGroupedByLattice = [
                        (getLocationFromLid(lid.replace('_', ' ')), 'm')
                        for lid, occs in groupby(sorted([(getLatticeLid(l, LATTICE_ACCURACY), t)
                                                         for l, t in totalOccurances],
                                                        key=itemgetter(0)),
                                                 key=itemgetter(0))]
                    occurancesGroupedByLattice = sorted(occurancesGroupedByLattice, key=itemgetter(1))
                    points, colors = zip(*occurancesGroupedByLattice)
                    plotPointsOnWorldMap(points, blueMarble=False, bkcolor='#CFCFCF', c=colors, lw=0)
                    plt.show()
                    # plt.savefig(fileName)
                    plt.clf()
                if (interval + 1) * MINUTES >= 120:
                    break
            break
def mapper_hashtag_object_to_tuo_location_and_tuo_hashtag_and_occurrence_time(self, key, hashtag_object):
    if False:
        yield  # I'm a generator!
    for point, t in hashtag_object['oc']:
        location = getLatticeLid(point, LOCATION_ACCURACY)
        self.mf_location_to_tuo_hashtag_and_occurrence_time[location].append([hashtag_object['h'], t])
def get_tuo_hashtag_and_ltuo_occurrence_time_and_locations(mf_hashtag_to_ltuo_point_and_occurrence_time,
                                                           top_hashtags):
    hashtag_and_ltuo_occurrence_time_and_locations = [
        (top_hashtag.split()[0],
         [(GeneralMethods.approximateEpoch(occurrence_time, UNIT_TIME_UNIT_IN_SECONDS),
           getLatticeLid(point, UNIT_LATTICE_ACCURACY))
          for point, occurrence_time in
          mf_hashtag_to_ltuo_point_and_occurrence_time[top_hashtag.split()[0]]])
        for top_hashtag in top_hashtags]
    tuo_hashtag_and_ltuo_occurrence_time_and_locations = []
    for hashtag, ltuo_occurrence_time_and_locations in hashtag_and_ltuo_occurrence_time_and_locations:
        ltuo_occurrence_time_and_locations = [
            (occurrence_time, zip(*ito_ltuo_occurrence_time_and_locations)[1])
            for occurrence_time, ito_ltuo_occurrence_time_and_locations in
            groupby(sorted(ltuo_occurrence_time_and_locations, key=itemgetter(0)), key=itemgetter(0))]
        tuo_hashtag_and_ltuo_occurrence_time_and_locations.append(
            (hashtag, ltuo_occurrence_time_and_locations))
    return tuo_hashtag_and_ltuo_occurrence_time_and_locations
def get_larger_lid(lid):
    # Decode a lid back to a point and re-snap it to a coarse 10-degree lattice.
    return getLatticeLid(getLocationFromLid(lid.replace('_', ' ')), 10)

for model_id in model_ids:
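# Illustrative round-trip (using the lid encoding assumed in the sketch near
# the top of this file; the real encoding may differ): a fine-grained lid is
# decoded back to a point and re-snapped to a 10-degree lattice.
#   get_larger_lid('30.4500_-97.7300')  # -> '30.0000_-100.0000'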
def buildLatticeGraphMap(self, key, hashtagObject):
    def getOccurranceDistributionInEpochs(occ, timeUnit=TIME_UNIT_IN_SECONDS, fillInGaps=False,
                                          occurancesCount=True):
        if occurancesCount:
            occurranceDistributionInEpochs = filter(
                lambda t: t[1] > 2,
                [(k[0], len(list(k[1])))
                 for k in groupby(sorted([GeneralMethods.approximateEpoch(t, timeUnit)
                                          for t in zip(*occ)[1]]))])
        else:
            occurranceDistributionInEpochs = filter(
                lambda t: len(t[1]) > 2,
                [(k[0], [t[1] for t in k[1]])
                 for k in groupby(sorted([(GeneralMethods.approximateEpoch(t[1], timeUnit), t)
                                          for t in occ], key=itemgetter(0)), key=itemgetter(0))])
        if not fillInGaps:
            return occurranceDistributionInEpochs
        else:
            if occurranceDistributionInEpochs:
                startEpoch = min(occurranceDistributionInEpochs, key=itemgetter(0))[0]
                endEpoch = max(occurranceDistributionInEpochs, key=itemgetter(0))[0]
                # if not occurancesCount: startEpoch, endEpoch = startEpoch[0], endEpoch[0]
                dataX = range(startEpoch, endEpoch, timeUnit)
                occurranceDistributionInEpochs = dict(occurranceDistributionInEpochs)
                for x in dataX:
                    if x not in occurranceDistributionInEpochs:
                        if occurancesCount:
                            occurranceDistributionInEpochs[x] = 0
                        else:
                            occurranceDistributionInEpochs[x] = []
                return occurranceDistributionInEpochs
            else:
                return dict(occurranceDistributionInEpochs)

    def getActiveRegions(timeSeries):
        noOfZerosObserved, activeRegions = 0, []
        currentRegion, occurancesForRegion = None, 0
        for index, l in zip(range(len(timeSeries)), timeSeries):
            if l > 0:
                if noOfZerosObserved > MIN_NO_OF_TIME_UNITS_IN_INACTIVE_REGION or index == 0:
                    currentRegion = [None, None, None]
                    currentRegion[0] = index
                    occurancesForRegion = 0
                noOfZerosObserved = 0
                occurancesForRegion += l
            else:
                noOfZerosObserved += 1
                if noOfZerosObserved > MIN_NO_OF_TIME_UNITS_IN_INACTIVE_REGION and \
                        currentRegion and currentRegion[1] == None:
                    currentRegion[1] = index - MIN_NO_OF_TIME_UNITS_IN_INACTIVE_REGION - 1
                    currentRegion[2] = occurancesForRegion
                    activeRegions.append(currentRegion)
        if not activeRegions:
            activeRegions.append([0, len(timeSeries) - 1, sum(timeSeries)])
        else:
            # Close the trailing region (assumes the series ends while a region is open).
            currentRegion[1], currentRegion[2] = index, occurancesForRegion
            activeRegions.append(currentRegion)
        return activeRegions

    def getOccuranesInHighestActiveRegion(hashtagObject, checkIfItFirstActiveRegion=False,
                                          timeUnit=TIME_UNIT_IN_SECONDS,
                                          maxLengthOfHighestActiveRegion=None):
        occurancesInActiveRegion, timeUnits = [], []
        occurranceDistributionInEpochs = getOccurranceDistributionInEpochs(hashtagObject['oc'],
                                                                           fillInGaps=True)
        if occurranceDistributionInEpochs:
            timeUnits, timeSeries = zip(*sorted(occurranceDistributionInEpochs.iteritems(),
                                                key=itemgetter(0)))
            hashtagPropagatingRegion = max(getActiveRegions(timeSeries), key=itemgetter(2))
            if not maxLengthOfHighestActiveRegion:
                validTimeUnits = [timeUnits[i] for i in range(hashtagPropagatingRegion[0],
                                                              hashtagPropagatingRegion[1] + 1)]
            else:
                validTimeUnits = [timeUnits[i]
                                  for i in range(hashtagPropagatingRegion[0],
                                                 hashtagPropagatingRegion[1] + 1)][:maxLengthOfHighestActiveRegion]
            occurancesInActiveRegion = [(p, t) for p, t in hashtagObject['oc']
                                        if GeneralMethods.approximateEpoch(t, timeUnit) in validTimeUnits]
        if not checkIfItFirstActiveRegion:
            return occurancesInActiveRegion
        else:
            isFirstActiveRegion = False
            if timeUnits and timeUnits[0] == validTimeUnits[0]:
                isFirstActiveRegion = True
            return (occurancesInActiveRegion, isFirstActiveRegion)

    def filterLatticesByMinHashtagOccurencesPerLattice(h):
        latticesToOccurancesMap = defaultdict(list)
        for l, oc in h['oc']:
            lid = getLatticeLid(l, LOCATION_ACCURACY)
            if lid != '0.0000_0.0000':
                latticesToOccurancesMap[lid].append(oc)
        return dict([(k, v) for k, v in latticesToOccurancesMap.iteritems()
                     if len(v) >= MIN_HASHTAG_OCCURENCES_PER_LATTICE])

    hashtagObject['oc'] = getOccuranesInHighestActiveRegion(hashtagObject)
    lattices = filterLatticesByMinHashtagOccurencesPerLattice(hashtagObject).keys()
    # latticesToOccranceTimeMap = {}
    # for k, v in hashtagObject['oc']:
    #     lid = getLatticeLid(k, LOCATION_ACCURACY)
    #     if lid != '0.0000_0.0000' and lid in lattices:
    #         if lid not in latticesToOccranceTimeMap: latticesToOccranceTimeMap[lid] = v
    ###
    latticesToOccranceTimeMap = defaultdict(list)
    for k, v in hashtagObject['oc']:
        lid = getLatticeLid(k, LOCATION_ACCURACY)
        if lid != '0.0000_0.0000' and lid in lattices:
            latticesToOccranceTimeMap[lid].append(v)
    ###
    lattices = latticesToOccranceTimeMap.items()
    if lattices:
        # hastagStartTime, hastagEndTime = min(lattices, key=lambda (lid, occurrences): min(occurrences))[1], \
        #     max(lattices, key=lambda (lid, occurrences): max(occurrences))[1]
        # hastagStartTime, hastagEndTime = min(hastagStartTime), max(hastagEndTime)
        # hashtagTimePeriod = hastagEndTime - hastagStartTime
        hashtagTimePeriod = None
        for lattice in lattices:
            yield lattice[0], ['h', [[hashtagObject['h'], [lattice[1], hashtagTimePeriod]]]]
            yield lattice[0], ['n', lattices]