def mapper(self, key, line):
     # Map step: parse one raw checkin line and emit (record, 1) for every
     # record that falls inside the configured bounding box.
     # parseData returns a falsy value for unparseable lines, which are
     # silently dropped by the `data and ...` guard below.
     data = parseData(line)
     if data and isWithinBoundingBox(data['l'], boundary): 
         # Drop the storage id (presumably a MongoDB '_id' -- confirm).
         del data['_id']
         # 't' holds a datetime (it has .timetuple()); replace it with a
         # POSIX epoch in seconds so downstream math is numeric.
         data['t'] = time.mktime(data['t'].timetuple())
         data['lid'] = getLidFromLocation(data['l'])
         # Additional lattice id at a fixed 0.015-degree accuracy.
         data['llid'] = getLatticeLid(data['l'], accuracy=0.015)
         yield data, 1
Example #2
0
def iterate_hashtag_occurrences_with_high_accuracy_lid(line):
    """Yield (hashtag, [lattice_id, rounded_epoch]) pairs for one tweet line.

    Prefers the exact 'geo' point when present, otherwise falls back to the
    bounding-box location 'bb'. The lattice id is computed at a very fine
    accuracy (0.0001 degrees); the occurrence time is snapped to
    TIME_UNIT_IN_SECONDS buckets.
    """
    tweet = cjson.decode(line)
    location = tweet['geo'] if 'geo' in tweet else tweet['bb']
    epoch = time.mktime(getDateTimeObjectFromTweetTimestamp(tweet['t']).timetuple())
    lattice_id = getLatticeLid(location, accuracy=0.0001)
    rounded_epoch = GeneralMethods.approximateEpoch(epoch, TIME_UNIT_IN_SECONDS)
    for hashtag in tweet['h']:
        yield hashtag.lower(), [lattice_id, rounded_epoch]
Example #3
0
    def get_user_from_checkins(self, key, line):
        """Yield (user_id, checkin) with the user id lifted to 'u', raw
        'tx'/'user' fields dropped, a lattice id added under 'llid', and the
        hashtag list 'h' replaced by a lowercase hashtag -> count mapping."""
        checkin = getCheckinObject(line)
        checkin['u'] = checkin['user']['id']
        del checkin['tx']
        del checkin['user']
        checkin['llid'] = getLatticeLid(checkin['l'], accuracy=ACCURACY)
        tag_counts = defaultdict(int)
        for tag in checkin['h']:
            tag_counts[tag.lower()] += 1
        checkin['h'] = tag_counts
        yield checkin['u'], checkin
Example #4
0
 def filterLatticesByMinHashtagOccurencesPerLattice(h):
     """Group the hashtag object's occurrences by lattice id and return a
     {lattice_id: [occurrence, ...]} dict keeping only lattices with at least
     MIN_HASHTAG_OCCURENCES_PER_LATTICE occurrences. The degenerate
     '0.0000_0.0000' lattice is always dropped."""
     occurrences_by_lattice = defaultdict(list)
     for location, occurrence in h["oc"]:
         lattice_id = getLatticeLid(location, LOCATION_ACCURACY)
         if lattice_id == "0.0000_0.0000":
             continue
         occurrences_by_lattice[lattice_id].append(occurrence)
     return dict(
         (lattice_id, occurrences)
         for lattice_id, occurrences in occurrences_by_lattice.iteritems()
         if len(occurrences) >= MIN_HASHTAG_OCCURENCES_PER_LATTICE
     )
Example #5
0
def iterateHashtagObjectInstances(line):
    # Yield (lattice_id, raw_location) for tweets whose location snaps to the
    # degenerate '0.0000_0.0000' lattice.
    # NOTE(review): the sibling variants of this function filter such points
    # OUT with `point != "0.0000_0.0000"`; here the comparison is `==`, so
    # ONLY the zero lattice is emitted -- presumably a deliberate collector
    # of bad-geo tweets, but confirm against the caller before relying on it.
    data = cjson.decode(line)
    l = None
    # Prefer the exact 'geo' point; otherwise fall back to the bounding box.
    if "geo" in data:
        l = data["geo"]
    else:
        l = data["bb"]
    point = getLatticeLid(l, LATTICE_ACCURACY)
    if point == "0.0000_0.0000":
        yield point, l
Example #6
0
def iterateHashtagObjectInstances(line):
    """Yield (hashtag, [lattice_id, rounded_epoch]) pairs for one tweet line,
    restricted to tweets inside BOUNDARY and to non-degenerate lattices."""
    tweet = cjson.decode(line)
    location = tweet['geo'] if 'geo' in tweet else tweet['bb']
    rounded_epoch = GeneralMethods.approximateEpoch(
        time.mktime(getDateTimeObjectFromTweetTimestamp(tweet['t']).timetuple()),
        TIME_UNIT_IN_SECONDS)
    if not isWithinBoundingBox(location, BOUNDARY):
        return
    lattice_id = getLatticeLid(location, LATTICE_ACCURACY)
    if lattice_id == '0.0000_0.0000':
        return
    for tag in tweet['h']:
        yield tag.lower(), [lattice_id, rounded_epoch]
Example #7
0
 def load_ltuo_hashtag_and_ltuo_location_and_occurrence_time(startTime=datetime(2012, 1, 1), endTime=datetime(2012, 3, 31), outputFolder='complete_prop'):
     """Load hashtag objects for the given date range and return a list of
     [hashtag, [(lattice_id, occurrence_time), ...]] entries, with every
     occurrence point converted to its lattice id at LOCATION_ACCURACY."""
     input_file = f_hashtag_objects % (
         outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
     result = []
     for hashtag_object in iterateJsonFromFile(input_file):
         located_occurrences = [
             (getLatticeLid(point, LOCATION_ACCURACY), occurrence_time)
             for point, occurrence_time in hashtag_object['oc']
         ]
         result.append([hashtag_object['h'], located_occurrences])
     return result
Example #8
0
def iterateHashtagObjectInstances(line, all_locations = False):
    """Yield (hashtag, [lattice_point, epoch]) pairs for one tweet line.

    When all_locations is False, only points whose lattice id (at
    LOCATION_ACCURACY) appears in VALID_LOCATIONS_LIST are emitted; when
    True every point is emitted unconditionally.
    """
    tweet = cjson.decode(line)
    location = tweet['geo'] if 'geo' in tweet else tweet['bb']
    epoch = time.mktime(getDateTimeObjectFromTweetTimestamp(tweet['t']).timetuple())
    lattice_point = getLattice(location, LOCATION_ACCURACY)
    # Short-circuit: the lattice-id lookup only happens in the filtered case.
    if all_locations or getLatticeLid(lattice_point, LOCATION_ACCURACY) in VALID_LOCATIONS_LIST:
        for tag in tweet['h']:
            yield tag.lower(), [lattice_point, epoch]
Example #9
0
 def plotSharingProbabilityAndTemporalClosenessScoresOnMap(timeRange, outputFolder):
     # For every lattice object in the lattice-graph file, draw a two-panel
     # figure (sharing probability on top, temporal closeness below), show it
     # interactively, and prepare an output path for it.
     i = 1
     for latticeObject in FileIO.iterateJsonFromFile(hashtagsLatticeGraphFile%(outputFolder,'%s_%s'%timeRange)):
         latticePoint = getLocationFromLid(latticeObject['id'].replace('_', ' '))
         # The coordinates are swapped ([1], [0]) before re-deriving the id --
         # presumably converting between (lat, lon) and (lon, lat); confirm
         # against getLocationFromLid/getLatticeLid conventions.
         latticeId = getLatticeLid([latticePoint[1], latticePoint[0]], LATTICE_ACCURACY)
         plt.subplot(211)
         plt.title(latticeId)
         LatticeGraphPlots.plotLatticeSharingProbabilityOnMap(LatticeGraph.typeSharingProbability, latticeObject)
         plt.subplot(212)
         LatticeGraphPlots.plotLatticeTemporalClosenessScoresOnMap(LatticeGraph.typeTemporalCloseness, latticeObject)
         plt.show()
         outputFile = hashtagsImagesGraphAnalysisFolder%outputFolder+'%s_and_%s/%s.png'%(LatticeGraph.typeSharingProbability['id'], LatticeGraph.typeTemporalCloseness['id'], latticeId); FileIO.createDirectoryForFile(outputFile)
         print i, outputFile; i+=1
 def buildLatticeGraphMap(self, key, hashtagObject):
     # Map step for the lattice graph: restrict the hashtag's occurrences to
     # its highest-activity region, keep only qualifying lattices, and emit
     # per-lattice records plus the full neighbor list.
     hashtagObject['oc']=getOccuranesInHighestActiveRegion(hashtagObject)
     lattices = filterLatticesByMinHashtagOccurencesPerLattice(hashtagObject).keys()
     latticesToOccranceTimeMap = {}
     for k, v in hashtagObject['oc']:
         lid = getLatticeLid(k, LATTICE_ACCURACY)
         if lid!='0.0000_0.0000' and lid in lattices:
             # Keep only the FIRST occurrence time seen for each lattice.
             if lid not in latticesToOccranceTimeMap: latticesToOccranceTimeMap[lid]=v
     # NOTE: `lattices` is re-bound here from a list of ids to a list of
     # (lattice_id, first_occurrence_time) items.
     lattices = latticesToOccranceTimeMap.items()
     if lattices:
         # Hashtag lifetime = span between earliest and latest first-occurrence
         # times across the surviving lattices.
         hastagStartTime, hastagEndTime = min(lattices, key=itemgetter(1))[1], max(lattices, key=itemgetter(1))[1]
         hashtagTimePeriod = hastagEndTime - hastagStartTime
         for lattice in lattices: 
             # 'h': this hashtag's (first time, lifetime) at the lattice;
             # 'n': the complete neighbor list for the reduce side.
             yield lattice[0], ['h', [[hashtagObject['h'], [lattice[1], hashtagTimePeriod]]]]
             yield lattice[0], ['n', lattices]
Example #11
0
 def buildLocationTemporalClosenessGraphMap(self, key, hashtagObject):
     # Map step: for each pair of qualifying lattices, emit a symmetric
     # temporal-closeness score derived from the gap between their first
     # occurrence times relative to the hashtag's lifetime.
     occuranesInHighestActiveRegion, latticesToOccranceTimeMap = getOccuranesInHighestActiveRegion(hashtagObject), {}
     hashtagObject['oc']=occuranesInHighestActiveRegion
     validLattices = filterLatticesByMinHashtagOccurencesPerLattice(hashtagObject).keys()
     for k, v in occuranesInHighestActiveRegion:
         lid = getLatticeLid(k, ACCURACY)
         if lid in validLattices:
             # First occurrence time per lattice only.
             if lid not in latticesToOccranceTimeMap: latticesToOccranceTimeMap[lid]=v
     if latticesToOccranceTimeMap:
         latticesOccranceTimeList = latticesToOccranceTimeMap.items()
         hastagStartTime, hastagEndTime = min(latticesOccranceTimeList, key=itemgetter(1))[1], max(latticesOccranceTimeList, key=itemgetter(1))[1]
         hashtagTimePeriod = hastagEndTime - hastagStartTime
         # Zero lifetime would make the score undefined; skip those hashtags.
         if hashtagTimePeriod:
             for l1, l2 in combinations(latticesOccranceTimeList, 2):
                 score = temporalScore(np.abs(l1[1]-l2[1]),hashtagTimePeriod)
                 if score>=MIN_TEMPORAL_CLOSENESS_SCORE:
                     # Emit both directions so the graph is symmetric.
                     yield l1[0], [hashtagObject['h'], [l2[0], score]]
                     yield l2[0], [hashtagObject['h'], [l1[0], score]]
Example #12
0
 def getLatticeThatGivesMinimumLocalityIndexAtKForAccuracy(occurances, accuracy):
     # Greedy search: bucket occurrences into coarse lattice cells at the
     # given accuracy, then, starting from the most populated cells, grow a
     # neighborhood of nearest cells until it covers >= 50% of occurrences,
     # keeping the candidate with the smallest summed distance.
     occurancesDistributionInHigherLattice, distanceMatrix = defaultdict(list), defaultdict(dict)
     for oc in occurances: occurancesDistributionInHigherLattice[getLatticeLid(oc, accuracy)].append(oc)
     # Most populated cells first.
     higherLattices = sorted(occurancesDistributionInHigherLattice.iteritems(), key=lambda t: len(t[1]), reverse=True)
     # Symmetric pairwise haversine distances between cell centers, then each
     # row sorted nearest-first so the greedy step can pop the closest cell.
     for hl1, hl2 in combinations(occurancesDistributionInHigherLattice, 2): distanceMatrix[hl1][hl2] = distanceMatrix[hl2][hl1] = getHaversineDistance(getLocationFromLid(hl1.replace('_', ' ')), getLocationFromLid(hl2.replace('_', ' ')))
     for k,v in distanceMatrix.iteritems(): distanceMatrix[k] = sorted(v.iteritems(), key=itemgetter(1))
     occurancesToReturn = []
     # NOTE(review): the () sentinel relies on Python 2 ordering, where a
     # tuple compares greater than any number, so the first candidate always
     # replaces it; this comparison would raise TypeError on Python 3.
     currentHigherLatticeSet, totalOccurances = {'distance': ()}, float(len(occurances))
     for hl, occs  in higherLattices: 
         higherLatticeSet = {'distance': 0, 'observedOccurances': len(occs), 'lattices': [hl], 'sourceLattice': hl}
         while currentHigherLatticeSet['distance']>higherLatticeSet['distance'] and higherLatticeSet['observedOccurances']/totalOccurances<0.5:
             # Consume the nearest remaining cell for this source lattice.
             (l, d) = distanceMatrix[hl][0]; 
             distanceMatrix[hl]=distanceMatrix[hl][1:]
             higherLatticeSet['distance']+=d
             higherLatticeSet['lattices'].append(l)
             higherLatticeSet['observedOccurances']+=len(occurancesDistributionInHigherLattice[l])
         # The ==None arm is unreachable (the sentinel dict is never None).
         if currentHigherLatticeSet==None or currentHigherLatticeSet['distance']>higherLatticeSet['distance']: currentHigherLatticeSet=higherLatticeSet
     for l in currentHigherLatticeSet['lattices']: occurancesToReturn+=occurancesDistributionInHigherLattice[l]
 #    return {'distance': currentHigherLatticeSet['distance'], 'occurances': occurancesToReturn, 'sourceLattice': getLocationFromLid(currentHigherLatticeSet['sourceLattice'].replace('_', ' '))}
     return {'occurances': occurancesToReturn, 'sourceLattice': getLocationFromLid(currentHigherLatticeSet['sourceLattice'].replace('_', ' '))}
Example #13
0
def getLocalityIndexAtK(occurances, kValue):
    ''' Locality index at k - for a hashtag is the minimum radius that covers k percentage of occurrances.
            A high locality index suggests hashtag was global with a small index suggests it was local.
        To find locality index at k, I must find a point that is closest to k percentage of occurances. 
            Brute force requires nC2 complexity. 
            Hence, use lattices of bigger size technique.
    '''
    def getLatticeThatGivesMinimumLocalityIndexAtK():
        # Iteratively refine the candidate source lattice from coarse (4 deg)
        # down to the working ACCURACY, shrinking the occurrence set each pass.
        occurancesDict = {'occurances': occurances}
        for accuracy in [4, 2, 1, 0.5, ACCURACY]: occurancesDict = getLatticeThatGivesMinimumLocalityIndexAtKForAccuracy(occurancesDict['occurances'], accuracy)
        return occurancesDict['sourceLattice']
    def getLatticeThatGivesMinimumLocalityIndexAtKForAccuracy(occurances, accuracy):
        # NOTE(review): this is a verbatim copy of the module-level function of
        # the same name (greedy >=50%-coverage neighborhood search); consider
        # deduplicating.
        occurancesDistributionInHigherLattice, distanceMatrix = defaultdict(list), defaultdict(dict)
        for oc in occurances: occurancesDistributionInHigherLattice[getLatticeLid(oc, accuracy)].append(oc)
        higherLattices = sorted(occurancesDistributionInHigherLattice.iteritems(), key=lambda t: len(t[1]), reverse=True)
        for hl1, hl2 in combinations(occurancesDistributionInHigherLattice, 2): distanceMatrix[hl1][hl2] = distanceMatrix[hl2][hl1] = getHaversineDistance(getLocationFromLid(hl1.replace('_', ' ')), getLocationFromLid(hl2.replace('_', ' ')))
        for k,v in distanceMatrix.iteritems(): distanceMatrix[k] = sorted(v.iteritems(), key=itemgetter(1))
        occurancesToReturn = []
        # () sentinel: relies on Python 2 tuple-vs-number ordering (always >).
        currentHigherLatticeSet, totalOccurances = {'distance': ()}, float(len(occurances))
        for hl, occs  in higherLattices: 
            higherLatticeSet = {'distance': 0, 'observedOccurances': len(occs), 'lattices': [hl], 'sourceLattice': hl}
            while currentHigherLatticeSet['distance']>higherLatticeSet['distance'] and higherLatticeSet['observedOccurances']/totalOccurances<0.5:
                (l, d) = distanceMatrix[hl][0]; 
                distanceMatrix[hl]=distanceMatrix[hl][1:]
                higherLatticeSet['distance']+=d
                higherLatticeSet['lattices'].append(l)
                higherLatticeSet['observedOccurances']+=len(occurancesDistributionInHigherLattice[l])
            if currentHigherLatticeSet==None or currentHigherLatticeSet['distance']>higherLatticeSet['distance']: currentHigherLatticeSet=higherLatticeSet
        for l in currentHigherLatticeSet['lattices']: occurancesToReturn+=occurancesDistributionInHigherLattice[l]
    #    return {'distance': currentHigherLatticeSet['distance'], 'occurances': occurancesToReturn, 'sourceLattice': getLocationFromLid(currentHigherLatticeSet['sourceLattice'].replace('_', ' '))}
        return {'occurances': occurancesToReturn, 'sourceLattice': getLocationFromLid(currentHigherLatticeSet['sourceLattice'].replace('_', ' '))}
    # Count occurrences per working-accuracy lattice, then walk lattices by
    # increasing distance from the chosen source until kValue coverage is hit.
    occurancesDistributionInHigherLattice = defaultdict(int)
    for oc in occurances: occurancesDistributionInHigherLattice[getLatticeLid(oc, ACCURACY)]+=1
    totalOccurances, distance, observedOccuraces = float(len(occurances)), 0, 0
    lattice = getLatticeThatGivesMinimumLocalityIndexAtK()
    sortedLatticeObjects = sorted([(getLocationFromLid(k.replace('_', ' ')), getHaversineDistance(lattice, getLocationFromLid(k.replace('_', ' '))), v) for k, v in occurancesDistributionInHigherLattice.iteritems()],
                 key=itemgetter(1))
    for l, d, oc in sortedLatticeObjects:
        distance=d; observedOccuraces+=oc
        if observedOccuraces/totalOccurances>=kValue: break
    # NOTE(review): returns the loop variable `d` (last distance considered),
    # not `distance` -- equal after the loop body, but `d` is unbound if
    # sortedLatticeObjects is empty (NameError on empty input).
    return (d, lattice)
Example #14
0
 def buildLocationInAndOutTemporalClosenessGraphMap(self, key, hashtagObject):
     # Map step: pick a "source" lattice from the earliest slice of the
     # hashtag's occurrences, then emit directed in/out closeness edges from
     # that source to every later lattice.
     def getSourceLattice(occ):
         # Most frequent lattice id within the earliest fraction of
         # occurrences; returns (lid, count) or None when the slice is empty.
         sortedOcc = occ[:int(PERCENTAGE_OF_EARLY_LIDS_TO_DETERMINE_SOURCE_LATTICE*len(occ))]
         if sortedOcc: return max([(lid, len(list(l))) for lid, l in groupby(sorted([t[0] for t in sortedOcc]))], key=lambda t: t[1])
     occuranesInHighestActiveRegion, latticesToOccranceTimeMap = getOccuranesInHighestActiveRegion(hashtagObject), {}
     validLattices = filterLatticesByMinHashtagOccurencesPerLattice(hashtagObject).keys()
     # Convert points to lattice ids and keep only qualifying lattices.
     occuranesInHighestActiveRegion = [(getLatticeLid(k, ACCURACY), v) for k, v in occuranesInHighestActiveRegion if getLatticeLid(k, ACCURACY) in validLattices]
     if occuranesInHighestActiveRegion:
         sourceLattice = getSourceLattice(occuranesInHighestActiveRegion)
         if sourceLattice:
             sourceLattice = sourceLattice[0]
             # First occurrence time per lattice only.
             for lid, v in occuranesInHighestActiveRegion:
                 if lid not in latticesToOccranceTimeMap: latticesToOccranceTimeMap[lid]=v
             latticesOccranceTimeList = latticesToOccranceTimeMap.items()
             # Lifetime measured from the source lattice's first occurrence.
             hastagStartTime, hastagEndTime = latticesToOccranceTimeMap[sourceLattice], max(latticesOccranceTimeList, key=itemgetter(1))[1]
             hashtagTimePeriod = hastagEndTime - hastagStartTime
             if hashtagTimePeriod:
                 # Score only lattices that appeared strictly after the source.
                 latticesOccranceTimeList = [(t[0], temporalScore(t[1]-hastagStartTime, hashtagTimePeriod)) for t in latticesOccranceTimeList if t[1]>hastagStartTime]
                 for lattice, score in latticesOccranceTimeList:
                     if score>=MIN_TEMPORAL_CLOSENESS_SCORE_FOR_IN_OUT_LINKS:
                         yield sourceLattice, [hashtagObject['h'], 'out_link', [lattice, score]]
                         yield lattice, [hashtagObject['h'], 'in_link', [sourceLattice, score]]
Example #15
0
def plotHashtagsInOutGraphs(timeRange, outputFolder):
    # For each location object with both in- and out-links, draw a two-panel
    # world map (incoming sources on top, outgoing targets below) and save it;
    # already-rendered files are skipped.
    def plotPoints(links, xlabel):
        # Scatter linked locations colored by their closeness score (0..1),
        # with the current location itself marked in black.
        cm = matplotlib.cm.get_cmap('cool')
        points, colors = zip(*sorted([(getLocationFromLid(k.replace('_', ' ')), v)for k, v in links.iteritems()], key=itemgetter(1)))
        sc = plotPointsOnWorldMap(points, c=colors, cmap=cm, lw=0, vmin=0, vmax=1)
        plotPointsOnWorldMap([getLocationFromLid(locationObject['id'].replace('_', ' '))], c='k', s=20, lw=0)
        plt.xlabel(xlabel), plt.colorbar(sc)
    counter=1
    for locationObject in FileIO.iterateJsonFromFile(hashtagLocationInAndOutTemporalClosenessGraphFile%(outputFolder, '%s_%s'%timeRange)): 
        point = getLocationFromLid(locationObject['id'].replace('_', ' '))
        # Coordinates swapped ([1], [0]) to build the file name's lattice id --
        # presumably a (lat, lon)/(lon, lat) conversion; confirm.
        outputFile = hashtagsImagesLocationInfluencersFolder+'%s.png'%getLatticeLid([point[1], point[0]], ACCURACY); FileIO.createDirectoryForFile(outputFile)
        print counter;counter+=1
        if not os.path.exists(outputFile):
            if locationObject['in_link'] and locationObject['out_link']:
                print outputFile
                plt.subplot(211)
                plt.title(locationObject['id'])
                plotPoints(locationObject['in_link'], 'Gets hashtags from these locations')
                plt.subplot(212)
                plotPoints(locationObject['out_link'], 'Sends hashtags to these locations')
#                plt.show()
                plt.savefig(outputFile); plt.clf()
def plotHashtagFlowInTimeForWindowOfNLocations(hashTagObject):
    currentIndex, previousIndex, startingEpoch = 0, 0, None
    if not os.path.exists(hashtagsImagesFlowInTimeForWindowOfNLocationsFolder%hashTagObject['h']):
        validTimeUnits, latticesToOccranceMap = getValidTimeUnits(hashTagObject['oc']), defaultdict(list)
        fileNameIterator = getFileName()
        for l, t in hashTagObject['oc']: latticesToOccranceMap[getLatticeLid(l, ACCURACY)].append((l,t))
        for k in latticesToOccranceMap.keys()[:]: 
            validOccurences = getOccurencesFilteredByDistributionInTimeUnits(latticesToOccranceMap[k], validTimeUnits)
            if validOccurences: latticesToOccranceMap[k] =  validOccurences
            else: del latticesToOccranceMap[k]
        latticesSortedByTime = sorted([(k, min(zip(*v)[1])) for k, v in latticesToOccranceMap.iteritems()], key=itemgetter(1))
        while currentIndex<len(latticesSortedByTime):
            try:
                outputFile = hashtagsImagesFlowInTimeForWindowOfNLocationsFolder%hashTagObject['h']+fileNameIterator.next(); createDirectoryForFile(outputFile)
                print currentIndex, hashTagObject['h'], outputFile
                currentIndex+=LOCATION_WINDOW_SIZE
                if currentIndex>len(latticesSortedByTime): currentIndex=len(latticesSortedByTime)
                occurences = []
                for l in latticesSortedByTime[previousIndex:currentIndex]: occurences+=latticesToOccranceMap[l[0]]
                startingEpoch = plotDistributionGraphs(occurences, validTimeUnits, '%s - Interval (%d - %d) of %d'%(hashTagObject['h'], previousIndex+1, currentIndex, len(latticesSortedByTime)), startingEpoch)
                plt.show()
#                plt.savefig(outputFile); plt.clf()
                previousIndex=currentIndex
            except: break
Example #17
0
 def mapHashtagObjectsToLocationUnits(self, key, hashtagObject):
     # Accumulate [hashtag, time] pairs into self.locations, keyed by the
     # lattice id of each occurrence point.
     # The unused local `hashtag = hashtagObject['h']` from the original was
     # removed; the value is read directly where needed.
     if False: yield # I'm a generator!
     for point, t in hashtagObject['oc']: 
         self.locations[getLatticeLid(point, LOCATION_ACCURACY)].append([hashtagObject['h'], t])
Example #18
0
def filterLatticesByMinHashtagOccurencesPerLattice(h):
    """Group the hashtag object's occurrences by lattice id (at ACCURACY) and
    return only the lattices holding at least
    MIN_HASHTAG_OCCURENCES_PER_LATTICE occurrences."""
    grouped = defaultdict(list)
    for location, occurrence in h['oc']:
        grouped[getLatticeLid(location, ACCURACY)].append(occurrence)
    return {lid: occs for lid, occs in grouped.iteritems() if len(occs) >= MIN_HASHTAG_OCCURENCES_PER_LATTICE}
Example #19
0
 def getHashtagDistributionInLattice(self,  key, hashtagObject):
     """Yield (key, summary) where summary carries the hashtag, its time, and
     'd': a list of (lattice_id, occurrence_count) pairs."""
     counts = defaultdict(int)
     for location, _ in hashtagObject['oc']:
         counts[getLatticeLid(location, accuracy=ACCURACY)] += 1
     yield key, {'h': hashtagObject['h'], 't': hashtagObject['t'], 'd': counts.items()}
Example #20
0
 def plot_maps_for_every_minute():
     # For each hashtag, bucket occurrences into MINUTES-sized time units
     # within the first hour of its active region, then render one world map
     # per elapsed lag showing, per location, how recently it was reached
     # (colors encode recency; accumulation makes the spread cumulative).
     MINUTES = 1
     hashtags = ['ripstevejobs']
     map_from_epoch_lag_to_map_from_hashtag_to_tuples_of_location_and_epoch_lag = defaultdict(dict)
     for hashtag in hashtags:
         for hashtag_object in FileIO.iterateJsonFromFile('./data/%s.json'%hashtag):
             map_from_epoch_time_unit_to_tuples_of_location_and_epoch_occurrence_time =  getOccurranceDistributionInEpochs(getOccuranesInHighestActiveRegion(hashtag_object), timeUnit=MINUTES*60, fillInGaps=True, occurancesCount=False)
             tuples_of_epoch_time_unit_and_tuples_of_location_and_epoch_occurrence_time = sorted(map_from_epoch_time_unit_to_tuples_of_location_and_epoch_occurrence_time.iteritems(), key=itemgetter(0))
             epoch_starting_time_unit = tuples_of_epoch_time_unit_and_tuples_of_location_and_epoch_occurrence_time[0][0]
             # Only the first hour after the starting time unit is plotted.
             epoch_ending_time_unit = epoch_starting_time_unit+1*60*60
             for epoch_time_unit, tuples_of_location_and_epoch_occurrence_time in tuples_of_epoch_time_unit_and_tuples_of_location_and_epoch_occurrence_time:
                 if epoch_time_unit<=epoch_ending_time_unit:
                     if tuples_of_location_and_epoch_occurrence_time:
                         epoch_lag = epoch_time_unit - epoch_starting_time_unit
                         tuples_of_location_and_epoch_occurrence_time = sorted(tuples_of_location_and_epoch_occurrence_time, key=itemgetter(1))
                         # 0.145-degree lattice ids; times stored as lag from start.
                         map_from_epoch_lag_to_map_from_hashtag_to_tuples_of_location_and_epoch_lag[epoch_lag][hashtag] = [(getLatticeLid(location, 0.145), epoch_occurrence_time-epoch_starting_time_unit)for location, epoch_occurrence_time in tuples_of_location_and_epoch_occurrence_time]
     map_from_hashtag_to_accumulated_tuples_of_location_and_epoch_lag = defaultdict(list)
     # Wipe any previous render of this image series.
     GeneralMethods.runCommand('rm -rf ./images/plot_maps_for_every_minute/')
     for epoch_lag in sorted(map_from_epoch_lag_to_map_from_hashtag_to_tuples_of_location_and_epoch_lag):
         file_world_map_plot = './images/plot_maps_for_every_minute/%s.png'%(epoch_lag)
         print file_world_map_plot
         map_from_hashtag_to_tuples_of_location_and_epoch_lag = map_from_epoch_lag_to_map_from_hashtag_to_tuples_of_location_and_epoch_lag[epoch_lag]
         # Accumulate so each frame shows all locations reached so far.
         for hashtag, tuples_of_location_and_epoch_lag in map_from_hashtag_to_tuples_of_location_and_epoch_lag.iteritems():
             map_from_hashtag_to_accumulated_tuples_of_location_and_epoch_lag[hashtag]+=tuples_of_location_and_epoch_lag
         for hashtag, accumulated_tuples_of_location_and_epoch_lag in map_from_hashtag_to_accumulated_tuples_of_location_and_epoch_lag.iteritems():
             # Per location, keep the largest (most recent) lag seen so far.
             tuples_of_location_and_epoch_max_lag= [(location, max(zip(*iterator_of_tuples_of_location_and_epoch_lag)[1]))
                                for location, iterator_of_tuples_of_location_and_epoch_lag in 
                                groupby(sorted(accumulated_tuples_of_location_and_epoch_lag, key=itemgetter(0)), key=itemgetter(0))
                             ]
             locations, colors = zip(*[(getLocationFromLid(location.replace('_', ' ')), (epoch_lag+MINUTES*60)-epoch_max_lag) for location, epoch_max_lag in sorted(tuples_of_location_and_epoch_max_lag, key=itemgetter(1), reverse=True)])
             plotPointsOnWorldMap(locations, blueMarble=False, bkcolor='#CFCFCF', c=colors, cmap=matplotlib.cm.cool, lw = 0, vmax=epoch_lag+MINUTES*60)
             plt.title('%s      (%s minutes)'%(hashtag, (epoch_lag+MINUTES*60)/(60.)))
 #        plt.show()
         FileIO.createDirectoryForFile(file_world_map_plot)
         plt.savefig(file_world_map_plot)
         plt.clf()
Example #21
0
    def temp():
        # Ad-hoc exploration helper: for the 'americanhorrorstory' hashtag,
        # render one world map per MINUTES-sized interval showing the lattices
        # active in that interval, then exit() after the first matching object.
        hashtags, MINUTES = [], 60
        for hashtagObject in FileIO.iterateJsonFromFile('americanhorrorstory'):
            if hashtagObject['h']=='americanhorrorstory':
                print unicode(hashtagObject['h']).encode('utf-8'), len(hashtagObject['oc'])
                occsDistributionInTimeUnits = getOccurranceDistributionInEpochs(getOccuranesInHighestActiveRegion(hashtagObject, timeUnit=60*60), timeUnit=MINUTES*60, fillInGaps=True, occurancesCount=False)
                totalOccurances = []
                for interval, t in enumerate(sorted(occsDistributionInTimeUnits)):
                    occs = occsDistributionInTimeUnits[t]
                    if occs:
                        fileName = '../images/plotsOnMap/%s/%s.png'%(hashtagObject['h'], (interval+1)*MINUTES); FileIO.createDirectoryForFile(fileName)
#                        print interval, t, len(occs)
                        print fileName
                        # One point per distinct lattice, all colored 'm'.
                        occurancesGroupedByLattice = [(getLocationFromLid(lid.replace('_', ' ')), 'm') for lid, occ in groupby(sorted([(getLatticeLid(l, LATTICE_ACCURACY), t) for l, t in occs], key=itemgetter(0)), key=itemgetter(0))]
                        occurancesGroupedByLattice = sorted(occurancesGroupedByLattice, key=itemgetter(1))
                        points, colors = zip(*occurancesGroupedByLattice)
                        plotPointsOnWorldMap(points, blueMarble=False, bkcolor='#CFCFCF', c=colors, lw = 0)
#                        plt.show()
                        plt.savefig(fileName)
                        plt.clf()
                # Stop after the first (only) matching hashtag object.
                exit()
def plotDistributionGraphs(occurences, validTimeUnits, title, startingEpoch=None):
        # Two stacked world maps for the given occurrences: the top panel
        # colors lattices by mention count, the bottom by arrival time
        # relative to startingEpoch. Returns the startingEpoch used so the
        # caller can keep a consistent baseline across successive calls.
        occurences = getOccurencesFilteredByDistributionInTimeUnits(occurences, validTimeUnits)
        # Per lattice: (center point, sorted occurrence times).
        occurancesGroupedByLattice = [(getLocationFromLid(lid.replace('_', ' ')), sorted(zip(*occs)[1])) for lid, occs in groupby(sorted([(getLatticeLid(l, ACCURACY), t) for l, t in occurences], key=itemgetter(0)), key=itemgetter(0))]
        plt.subplot(211)
        pointsForNumberOfOccurances, numberOfOccurancesList = zip(*sorted(occurancesGroupedByLattice, key=lambda t: len(t[1])))
        numberOfOccurancesList = [len(ocs) for ocs in numberOfOccurancesList]
        cm = matplotlib.cm.get_cmap('cool')
        sc = plotPointsOnWorldMap(pointsForNumberOfOccurances, c=numberOfOccurancesList, cmap=cm, lw = 0, alpha=1.0)
        plt.colorbar(sc), plt.title(title), plt.xlabel('Number of mentions')
        
        plt.subplot(212)
        # Latest-arriving lattices first so early ones are drawn on top.
        pointsForNumberOfOccurances, occuranceTime = zip(*sorted(occurancesGroupedByLattice, key=lambda t: min(t[1]), reverse=True))
        occuranceTime=[min(t) for t in occuranceTime]
        # Default baseline: the earliest arrival time in this batch.
        if not startingEpoch: startingEpoch = occuranceTime[-1]
        occuranceTime=[(t-startingEpoch)/TIME_UNIT_IN_SECONDS for t in occuranceTime]
        cm = matplotlib.cm.get_cmap('autumn')
        sc = plotPointsOnWorldMap(pointsForNumberOfOccurances, c=occuranceTime, cmap=cm, lw = 0, alpha=1.0)
        plt.colorbar(sc), plt.xlabel('Speed of hashtag arrival')
        return startingEpoch
Example #23
0
 def map_rawData_to_reducedlatticeObjectUnits(self, key, line):
     """Yield (lattice_id, reduced_checkin): the user id is lifted to 'u' and
     the raw 'tx'/'user' fields are removed before emitting."""
     checkin = getCheckinObject(line)
     checkin['u'] = checkin['user']['id']
     del checkin['tx']
     del checkin['user']
     yield getLatticeLid(checkin['l'], accuracy=ACCURACY), checkin
Example #24
0
 def map_rawData_to_latticeObjectUnits(self, key, line):
     """Yield (lattice_id, checkin_object) for one raw checkin line, keyed by
     the checkin location's lattice id at ACCURACY."""
     checkin = getCheckinObject(line)
     lattice_id = getLatticeLid(checkin['l'], accuracy=ACCURACY)
     yield lattice_id, checkin
Example #25
0
def plotHastagClasses(timeRange, folderType):
    """For every classifiable hashtag, plot its time series and spatial spread.

    For each hashtag object read from the hashtags file, draws a 3-row figure:
    (311) lattices colored by occurrence count, (312) the time series of the
    highest-active region, (313) the full time series overlaid with the active
    region, then saves it under the class-specific images folder.

    NOTE(review): despite the parameters, the input file is hard-coded to
    ('testing_world', '2_11') and classId is overwritten to 1 below --
    presumably leftovers from an experiment; confirm before relying on
    timeRange/folderType.
    """
    # Infinite-ish supply of unique two-letter image names; currently unused
    # (fileNameIterator below is created but never consumed).
    def getFileName():
        for i in combinations('abcedfghijklmnopqrstuvwxyz',2): yield ''.join(i)+'.png'
    count=1
#    for hashtagObject in FileIO.iterateJsonFromFile(hashtagsWithoutEndingWindowFile%(folderType,'%s_%s'%timeRange)):
    for hashtagObject in FileIO.iterateJsonFromFile(hashtagsFile%('testing_world','%s_%s'%(2,11))):
#        HashtagsClassifier.classify(hashtagObject)
        # Progress indicator (Python 2 print statement).
        print count; count+=1
#        if hashtagObject['h']=='ripamy':
        classId = HashtagsClassifier.classify(hashtagObject)
        if classId!=None:
            # All classified hashtags are forced into class 1 here.
            classId = 1
            outputFile = hashtagsImagesHashtagsClassFolder%folderType+'%s/%s.png'%(classId, hashtagObject['h']); FileIO.createDirectoryForFile(outputFile)
            fileNameIterator = getFileName()
            timeUnits, timeSeries = getTimeUnitsAndTimeSeries(hashtagObject['oc'], timeUnit=HashtagsClassifier.CLASSIFIER_TIME_UNIT_IN_SECONDS)
            # Only the highest-active region (magenta) is plotted; per-region
            # plotting is commented out below.
            occurancesInActivityRegions = [[getOccuranesInHighestActiveRegion(hashtagObject), 'm']]
#            for hashtagPropagatingRegion in HashtagsClassifier._getActivityRegionsWithActivityAboveThreshold(hashtagObject):
#                validTimeUnits = [timeUnits[i] for i in range(hashtagPropagatingRegion[0], hashtagPropagatingRegion[1]+1)]
#                occurancesInActiveRegion = [(p,t) for p,t in hashtagObject['oc'] if GeneralMethods.approximateEpoch(t, TIME_UNIT_IN_SECONDS) in validTimeUnits]
#                occurancesInActivityRegions.append([occurancesInActiveRegion, GeneralMethods.getRandomColor()])
            
            currentMainRangeId = 0
            # One figure per activity region (currently a single iteration).
            for occurances1, color1 in occurancesInActivityRegions:
#                outputFile=outputFolder+fileNameIterator.next();FileIO.createDirectoryForFile(outputFile)
                print outputFile
                # Middle row: time series of each region; the current main
                # region is highlighted in magenta.
                ax = plt.subplot(312)
                subRangeId = 0
                for occurances, color in occurancesInActivityRegions:
                    if subRangeId==currentMainRangeId: color='m'
                    timeUnits, timeSeries = getTimeUnitsAndTimeSeries(occurances, timeUnit=HashtagsClassifier.CLASSIFIER_TIME_UNIT_IN_SECONDS)
#                    if len(timeUnits)<24: 
#                        difference = 24-len(timeUnits)
#                        timeUnits=list(timeUnits)+[timeUnits[-1]+(i+1)*HashtagsClassifier.CLASSIFIER_TIME_UNIT_IN_SECONDS for i in range(difference)]
#                        timeSeries=list(timeSeries)+[0 for i in range(difference)]
#                    print len(timeUnits[:24]), len(timeSeries[:24])
                    plt.plot_date([datetime.datetime.fromtimestamp(t) for t in timeUnits], timeSeries, '-o', c=color)
                    subRangeId+=1
#                plt.ylim(ymax=1)
                plt.setp(ax.get_xticklabels(), rotation=10, fontsize=7)
            
                # Bottom row: full occurrence time series overlaid with the
                # active-region series.
                ax=plt.subplot(313)
                subRangeId = 0
                timeUnits, timeSeries = getTimeUnitsAndTimeSeries(hashtagObject['oc'], timeUnit=HashtagsClassifier.CLASSIFIER_TIME_UNIT_IN_SECONDS)
                plt.plot_date([datetime.datetime.fromtimestamp(t) for t in timeUnits], timeSeries, '-')
                for occurances, color in occurancesInActivityRegions:
                    if subRangeId==currentMainRangeId: color='m'
                    timeUnits, timeSeries = getTimeUnitsAndTimeSeries(occurances, timeUnit=HashtagsClassifier.CLASSIFIER_TIME_UNIT_IN_SECONDS)
                    plt.plot_date([datetime.datetime.fromtimestamp(t) for t in timeUnits], timeSeries, '-o', c=color)
                    subRangeId+=1
                plt.setp(ax.get_xticklabels(), rotation=10, fontsize=7)
                
                # Top row: world map of lattices, colored by occurrence count.
                plt.subplot(311)
                occurancesGroupedByLattice = sorted(
                                                    [(getLocationFromLid(lid.replace('_', ' ')), len(list(occs))) for lid, occs in groupby(sorted([(getLatticeLid(l, LATTICE_ACCURACY), t) for l, t in occurances1], key=itemgetter(0)), key=itemgetter(0))],
                                                    key=itemgetter(1)
                                                    )
                points, colors = zip(*occurancesGroupedByLattice)
                cm = matplotlib.cm.get_cmap('cool')
                # A colorbar needs at least two points; with one, plot it plain magenta.
                if len(points)>1: 
                    sc = plotPointsOnWorldMap(points, c=colors, cmap=cm, lw=0, alpha=1.0)
                    plt.colorbar(sc)
                else: sc = plotPointsOnWorldMap(points, c='m', lw=0)
                plt.title(hashtagObject['h']+ '(%d)'%len(occurancesGroupedByLattice))
#                plt.show()
                # Best-effort save: some hashtag names produce invalid paths,
                # so failures are deliberately ignored.
                try:
                    plt.savefig(outputFile); plt.clf()
                except: pass
                currentMainRangeId+=1
Example #26
0
 def plotGraphsForHashtag(hashtag):
     for hashtagObject in FileIO.iterateJsonFromFile('/mnt/chevron/kykamath/data/geo/hashtags/analysis/all_world/2_11/hashtagsWithoutEndingWindow'):
         MINUTES = 5
         if hashtagObject['h']==hashtag:
             print unicode(hashtagObject['h']).encode('utf-8'), len(hashtagObject['oc'])
             occsDistributionInTimeUnits = getOccurranceDistributionInEpochs(getOccuranesInHighestActiveRegion(hashtagObject), timeUnit=MINUTES*60, fillInGaps=True, occurancesCount=False)
             totalOccurances = []
             for interval, t in enumerate(sorted(occsDistributionInTimeUnits)):
                 occs = occsDistributionInTimeUnits[t]
                 totalOccurances+=occs
                 if occs:
                     fileName = '../images/plotsOnMap/%s/%s.png'%(hashtagObject['h'], (interval+1)*MINUTES); FileIO.createDirectoryForFile(fileName)
                     print fileName
                     occurancesGroupedByLattice = [(getLocationFromLid(lid.replace('_', ' ')), 'm') for lid, occs in groupby(sorted([(getLatticeLid(l, LATTICE_ACCURACY), t) for l, t in totalOccurances], key=itemgetter(0)), key=itemgetter(0))]
                     occurancesGroupedByLattice = sorted(occurancesGroupedByLattice, key=itemgetter(1))
                     points, colors = zip(*occurancesGroupedByLattice)
                     plotPointsOnWorldMap(points, blueMarble=False, bkcolor='#CFCFCF', c=colors, lw = 0)
                     plt.show()
 #                    plt.savefig(fileName)
                     plt.clf()
                 if (interval+1)*MINUTES>=120: break
             break
Example #27
0
 def mapper_hashtag_object_to_tuo_location_and_tuo_hashtag_and_occurrence_time(self, key, hashtag_object):
     # Bucket every occurrence of this hashtag into the in-memory map keyed by
     # lattice id. The unreachable yield below keeps this method a generator,
     # which the map/reduce framework expects of a mapper.
     if False:
         yield  # I'm a generator!
     for point, occurrence_time in hashtag_object["oc"]:
         lattice = getLatticeLid(point, LOCATION_ACCURACY)
         self.mf_location_to_tuo_hashtag_and_occurrence_time[lattice].append([hashtag_object["h"], occurrence_time])
 def get_tuo_hashtag_and_ltuo_occurrence_time_and_locations(mf_hashtag_to_ltuo_point_and_occurrence_time, top_hashtags):
     """For each top hashtag, group its occurrences by approximate epoch.

     Each entry of top_hashtags is a string whose first whitespace-separated
     token is the hashtag. Returns a list of
     (hashtag, [(occurrence_time, (location, location, ...)), ...]) tuples,
     where occurrence times are bucketed to UNIT_TIME_UNIT_IN_SECONDS and
     locations are lattice ids at UNIT_LATTICE_ACCURACY.
     """
     tuo_hashtag_and_ltuo_occurrence_time_and_locations = []
     for top_hashtag in top_hashtags:
         hashtag = top_hashtag.split()[0]
         # Map every raw (point, time) pair to (bucketed epoch, lattice id).
         time_and_location_pairs = [
             (GeneralMethods.approximateEpoch(occurrence_time, UNIT_TIME_UNIT_IN_SECONDS), getLatticeLid(point, UNIT_LATTICE_ACCURACY))
             for point, occurrence_time in mf_hashtag_to_ltuo_point_and_occurrence_time[hashtag]
         ]
         # groupby needs its input pre-sorted on the grouping key.
         time_and_location_pairs.sort(key=itemgetter(0))
         ltuo_occurrence_time_and_locations = []
         for occurrence_time, pairs_at_time in groupby(time_and_location_pairs, key=itemgetter(0)):
             # zip(*pairs)[1] yields the tuple of lattice ids for this epoch.
             ltuo_occurrence_time_and_locations.append((occurrence_time, zip(*pairs_at_time)[1]))
         tuo_hashtag_and_ltuo_occurrence_time_and_locations.append((hashtag, ltuo_occurrence_time_and_locations))
     return tuo_hashtag_and_ltuo_occurrence_time_and_locations
Example #29
0
 def get_larger_lid(lid):
     # Re-bucket a lattice id string into the coarser accuracy-10 grid.
     location = getLocationFromLid(lid.replace('_', ' '))
     return getLatticeLid(location, 10)
 for model_id in model_ids:
Example #30
0
    def buildLatticeGraphMap(self, key, hashtagObject):
        """Emit lattice-graph map-side records for one hashtag object.

        Restricts the hashtag's occurrences to its highest-active region,
        keeps only lattices with at least MIN_HASHTAG_OCCURENCES_PER_LATTICE
        occurrences, then for every surviving lattice yields:
          (lattice_id, ['h', [[hashtag, [occurrence_times, hashtagTimePeriod]]]])
          (lattice_id, ['n', all_lattice_items])  # neighbor info for the graph
        """
        # Bucket occurrences into epochs of `timeUnit` seconds. Returns either
        # epoch -> count (occurancesCount=True) or epoch -> list of occurrences;
        # epochs with fewer than 3 occurrences are dropped. With fillInGaps the
        # missing epochs between the first and last are filled with 0 / [].
        def getOccurranceDistributionInEpochs(occ, timeUnit=TIME_UNIT_IN_SECONDS, fillInGaps=False, occurancesCount=True): 
            if occurancesCount: occurranceDistributionInEpochs = filter(lambda t:t[1]>2, [(k[0], len(list(k[1]))) for k in groupby(sorted([GeneralMethods.approximateEpoch(t, timeUnit) for t in zip(*occ)[1]]))])
            else: occurranceDistributionInEpochs = filter(lambda t:len(t[1])>2, [(k[0], [t[1] for t in k[1]]) for k in groupby(sorted([(GeneralMethods.approximateEpoch(t[1], timeUnit), t) for t in occ], key=itemgetter(0)), key=itemgetter(0))])
            if not fillInGaps: return occurranceDistributionInEpochs
            else:
                if occurranceDistributionInEpochs:
                    startEpoch, endEpoch = min(occurranceDistributionInEpochs, key=itemgetter(0))[0], max(occurranceDistributionInEpochs, key=itemgetter(0))[0]
        #            if not occurancesCount: startEpoch, endEpoch = startEpoch[0], endEpoch[0]
                    dataX = range(startEpoch, endEpoch, timeUnit)
                    occurranceDistributionInEpochs = dict(occurranceDistributionInEpochs)
                    for x in dataX: 
                        if x not in occurranceDistributionInEpochs: 
                            if occurancesCount: occurranceDistributionInEpochs[x]=0
                            else: occurranceDistributionInEpochs[x]=[]
                    return occurranceDistributionInEpochs
                else: return dict(occurranceDistributionInEpochs)
        # Split a count time series into active regions: maximal runs of
        # activity separated by more than MIN_NO_OF_TIME_UNITS_IN_INACTIVE_REGION
        # zeros. Each region is [startIndex, endIndex, totalOccurrences].
        def getActiveRegions(timeSeries):
            noOfZerosObserved, activeRegions = 0, []
            currentRegion, occurancesForRegion = None, 0
            for index, l in zip(range(len(timeSeries)),timeSeries):
                if l>0: 
                    if noOfZerosObserved>MIN_NO_OF_TIME_UNITS_IN_INACTIVE_REGION or index==0:
                        currentRegion = [None, None, None]
                        currentRegion[0] = index
                        occurancesForRegion = 0
                    noOfZerosObserved = 0
                    occurancesForRegion+=l
                else: 
                    noOfZerosObserved+=1
                    if noOfZerosObserved>MIN_NO_OF_TIME_UNITS_IN_INACTIVE_REGION and currentRegion and currentRegion[1]==None:
                        currentRegion[1] = index-MIN_NO_OF_TIME_UNITS_IN_INACTIVE_REGION-1
                        currentRegion[2] = occurancesForRegion
                        activeRegions.append(currentRegion)
            # No region was ever closed: treat the whole series as one region;
            # otherwise close the still-open trailing region at the last index.
            if not activeRegions: activeRegions.append([0, len(timeSeries)-1, sum(timeSeries)])
            else: 
                currentRegion[1], currentRegion[2] = index, occurancesForRegion
                activeRegions.append(currentRegion)
            return activeRegions
        # Return the occurrences that fall inside the most active region (the
        # region with the highest occurrence total), optionally truncated and
        # optionally flagged with whether that region is the first one.
        def getOccuranesInHighestActiveRegion(hashtagObject, checkIfItFirstActiveRegion=False, timeUnit=TIME_UNIT_IN_SECONDS, maxLengthOfHighestActiveRegion=None):
            occurancesInActiveRegion, timeUnits = [], []
            occurranceDistributionInEpochs = getOccurranceDistributionInEpochs(hashtagObject['oc'], fillInGaps=True)
            if occurranceDistributionInEpochs:
                timeUnits, timeSeries = zip(*sorted(occurranceDistributionInEpochs.iteritems(), key=itemgetter(0)))
                hashtagPropagatingRegion = max(getActiveRegions(timeSeries), key=itemgetter(2))
                if not maxLengthOfHighestActiveRegion: validTimeUnits = [timeUnits[i] for i in range(hashtagPropagatingRegion[0], hashtagPropagatingRegion[1]+1)]
                else: validTimeUnits = [timeUnits[i] for i in range(hashtagPropagatingRegion[0], hashtagPropagatingRegion[1]+1)][:maxLengthOfHighestActiveRegion]
                occurancesInActiveRegion = [(p,t) for p,t in hashtagObject['oc'] if GeneralMethods.approximateEpoch(t, timeUnit) in validTimeUnits]
            if not checkIfItFirstActiveRegion: return occurancesInActiveRegion
            else:
                isFirstActiveRegion=False
                if timeUnits and timeUnits[0]==validTimeUnits[0]: isFirstActiveRegion=True
                return (occurancesInActiveRegion, isFirstActiveRegion)
        # Keep only lattices (excluding the null 0.0000_0.0000 cell) with at
        # least MIN_HASHTAG_OCCURENCES_PER_LATTICE occurrence times.
        def filterLatticesByMinHashtagOccurencesPerLattice(h):
            latticesToOccurancesMap = defaultdict(list)
            for l, oc in h['oc']:
                lid = getLatticeLid(l, LOCATION_ACCURACY)
                if lid!='0.0000_0.0000': latticesToOccurancesMap[lid].append(oc)
            return dict([(k,v) for k, v in latticesToOccurancesMap.iteritems() if len(v)>=MIN_HASHTAG_OCCURENCES_PER_LATTICE])
        # Restrict the object to its highest-active region, then find the
        # lattices that pass the per-lattice occurrence threshold.
        hashtagObject['oc']=getOccuranesInHighestActiveRegion(hashtagObject)
        lattices = filterLatticesByMinHashtagOccurencesPerLattice(hashtagObject).keys()
#        latticesToOccranceTimeMap = {}
#        for k, v in hashtagObject['oc']:
#            lid = getLatticeLid(k, LOCATION_ACCURACY)
#            if lid!='0.0000_0.0000' and lid in lattices:
#                if lid not in latticesToOccranceTimeMap: latticesToOccranceTimeMap[lid]=v
        ###
        
        # Collect every occurrence time per surviving lattice (the commented
        # variant above kept only the first occurrence time per lattice).
        latticesToOccranceTimeMap = defaultdict(list)
        for k, v in hashtagObject['oc']:
            lid = getLatticeLid(k, LOCATION_ACCURACY)
            if lid!='0.0000_0.0000' and lid in lattices:
                latticesToOccranceTimeMap[lid].append(v)
        
        ###
        lattices = latticesToOccranceTimeMap.items()
        if lattices:
#            hastagStartTime, hastagEndTime = min(lattices, key=lambda (lid, occurrences): min(occurrences) )[1], max(lattices, key=lambda (lid, occurrences): max(occurrences) )[1]
#            hastagStartTime, hastagEndTime = min(hastagStartTime), max(hastagEndTime)
#            hashtagTimePeriod = hastagEndTime - hastagStartTime
            # Time-period computation is disabled; downstream receives None.
            hashtagTimePeriod = None
            for lattice in lattices: 
                yield lattice[0], ['h', [[hashtagObject['h'], [lattice[1], hashtagTimePeriod]]]]
                yield lattice[0], ['n', lattices]