def test_append(self):
     self.crowd.append(self.cluster, test_time+timedelta(days=1))
     self.assertEqual(
         [GeneralMethods.getEpochFromDateTimeObject(test_time),
          GeneralMethods.getEpochFromDateTimeObject(test_time+timedelta(days=1))],
         sorted(self.crowd.clusters.keys()))
     self.assertEqual(StreamCluster, type(self.crowd.clusters[GeneralMethods.getEpochFromDateTimeObject(test_time)]))
     self.assertEqual(2, self.crowd.lifespan)
     self.assertEqual(getStringRepresentationForTweetTimestamp(test_time), getStringRepresentationForTweetTimestamp(self.crowd.startTime))
     self.assertEqual(getStringRepresentationForTweetTimestamp(test_time+timedelta(days=1)), getStringRepresentationForTweetTimestamp(self.crowd.endTime))
 def dimensionsUpdateFrequencyEstimation(estimationObject, currentMessageTime):
     '''
     Observe the new dimensions that get added to the current dimension list when the
     dimensions are updated at regular intervals. For example, count the number of
     dimensions added after 10m, 20m, ... 5 hours.
     As time increases, the number of 'decayed' dimensions increases, so the current
     dimension list accumulates many unwanted decayed dimensions. Use this information
     to identify the time interval best suited for refreshing dimensions.
     Tentative: pick the time interval at which the rate of decay is maximum.
     '''
     def updatePhraseScore(phraseObject): 
         phraseObject.updateScore(currentMessageTime, 0, **estimationObject.stream_settings)
         return phraseObject
     dimensions = estimationObject.stream_settings['dimensions']
     newList = [p.text for p in Phrase.sort((updatePhraseScore(p) for p in estimationObject.phraseTextToPhraseObjectMap.itervalues()), reverse=True)][:dimensions]
     print currentMessageTime, len(newList)
     if len(newList) >= dimensions:
         idsOfDimensionsListToCompare = [(i, GeneralMethods.approximateToNearest5Minutes(currentMessageTime - i)) for i in estimationObject.dimensionUpdateTimeDeltas if GeneralMethods.approximateToNearest5Minutes(currentMessageTime - i) in estimationObject.dimensionListsMap]
         dimensionsUpdateFrequency = {}
         for td, id in idsOfDimensionsListToCompare:
             oldList = estimationObject.dimensionListsMap[id]
             dimensionsUpdateFrequency[str(td.seconds)] = len(set(newList).difference(oldList))
         print len(estimationObject.dimensionListsMap), currentMessageTime, len(newList), [(k, dimensionsUpdateFrequency[k]) for k in sorted(dimensionsUpdateFrequency)]
         iterationData = {
                          'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
                          'total_number_of_phrases': len(estimationObject.phraseTextToPhraseObjectMap),
                          'settings': pprint.pformat(estimationObject.stream_settings),
                           ParameterEstimation.dimensionsUpdateFrequencyId:dimensionsUpdateFrequency
                          }
         FileIO.writeToFileAsJson(iterationData, estimationObject.dimensionsUpdateFrequencyFile)
         estimationObject.dimensionListsMap[GeneralMethods.approximateToNearest5Minutes(currentMessageTime)] = newList[:]
         for key in estimationObject.dimensionListsMap.keys()[:]:
             if currentMessageTime - key > estimationObject.dimensionUpdateTimeDeltas[-1]: del estimationObject.dimensionListsMap[key]
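A minimal, self-contained sketch of the idea above (hypothetical names, not the project's API): compare the current top dimensions against snapshots taken at earlier offsets and count how many dimensions are new at each offset.

from datetime import timedelta

def dimension_turnover(current_dimensions, snapshots, current_time, offsets):
    # snapshots: dict mapping snapshot time -> list of dimension strings.
    # Returns, per offset, how many current dimensions were absent at that snapshot.
    turnover = {}
    for offset in offsets:  # e.g. [timedelta(minutes=10), timedelta(minutes=20), ...]
        snapshot_time = current_time - offset
        if snapshot_time in snapshots:
            turnover[offset] = len(set(current_dimensions) - set(snapshots[snapshot_time]))
    return turnover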
Example #3
    def generate_tuo_location_and_tuo_neighbor_location_and_pure_influence_score(models_ids, startTime, endTime, outputFolder, hashtag_tag):
        for model_id in models_ids:
#            if w_extra_hashtags: output_file = tuo_location_and_tuo_neighbor_location_and_pure_influence_score_file%(model_id, hashtag_tag)
#            else: output_file = tuo_location_and_tuo_neighbor_location_and_pure_influence_score_file%(model_id, wout_extra_hashtags_tag)
            output_file = tuo_location_and_tuo_neighbor_location_and_pure_influence_score_file%(model_id, hashtag_tag)
            GeneralMethods.runCommand('rm -rf %s'%output_file)
            for line_count, location_object in enumerate(iterateJsonFromFile(
                     location_objects_file%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
                     )):
                print line_count, model_id
                tuo_neighbor_location_and_pure_influence_score = []
                location_hashtag_set = set(location_object['hashtags'])
                for neighbor_location, mf_hashtag_to_tuo_occurrences_and_time_range in location_object['links'].iteritems():
                    pure_influence_scores = []
                    for hashtag, (neighbor_location_occurrences, time_range) in mf_hashtag_to_tuo_occurrences_and_time_range.iteritems():
                        if hashtag in location_object['hashtags']:
                            location_occurrences = location_object['hashtags'][hashtag][0]
                            pure_influence_scores.append(MF_INFLUENCE_MEASURING_MODELS_TO_MODEL_ID[model_id](location_occurrences, neighbor_location_occurrences))
                    neighbor_location_hashtag_set = set(mf_hashtag_to_tuo_occurrences_and_time_range.keys())
                    if hashtag_tag==w_extra_hashtags_tag:
                        for hashtag in location_hashtag_set.difference(neighbor_location_hashtag_set): pure_influence_scores.append(1.0)
                        for hashtag in neighbor_location_hashtag_set.difference(location_hashtag_set): pure_influence_scores.append(-1.0)
                    mean_pure_influence_score = np.mean(pure_influence_scores)
                    tuo_neighbor_location_and_pure_influence_score.append([neighbor_location, mean_pure_influence_score])
                tuo_neighbor_location_and_pure_influence_score = sorted(tuo_neighbor_location_and_pure_influence_score, key=itemgetter(1))
                FileIO.writeToFileAsJson([location_object['id'], tuo_neighbor_location_and_pure_influence_score], output_file)
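MF_INFLUENCE_MEASURING_MODELS_TO_MODEL_ID maps each model id to a callable that scores one location's influence on another from their occurrence data (lists of occurrence times in the later examples), with 1.0 and -1.0 as the extreme values used for the extra-hashtag cases above. A purely illustrative stand-in, not one of the project's models:

def first_occurrence_influence(location_occurrence_times, neighbor_occurrence_times):
    # Illustrative only: +1.0 if this location saw the hashtag no later than its
    # neighbor, -1.0 otherwise.
    if min(location_occurrence_times) <= min(neighbor_occurrence_times):
        return 1.0
    return -1.0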
Example #5
 def generate_tuo_location_and_tuo_neighbor_location_and_mf_influence_type_and_similarity(model_ids, startTime, endTime, outputFolder):
     def location_similarity(location_vector_1, location_vector_2): 
         return reduce(lambda total, k: total+(location_vector_1.get(k,0)*location_vector_2.get(k,0)), set(location_vector_1.keys()).union(location_vector_2.keys()),0.)
     influence_types=[InfluenceMeasuringModels.TYPE_COMPLETE_INFLUENCE, InfluenceMeasuringModels.TYPE_OUTGOING_INFLUENCE, InfluenceMeasuringModels.TYPE_INCOMING_INFLUENCE]
     for model_id in model_ids:
         mf_location_to_mf_influence_type_to_influence_vector = dict(Experiments.load_tuo_location_and_mf_influence_type_to_influence_vector(model_id))
         GeneralMethods.runCommand('rm -rf %s'%tuo_location_and_tuo_neighbor_location_and_mf_influence_type_and_similarity_file%model_id)
         for line_count, location_object in enumerate(iterateJsonFromFile(
                      location_objects_file%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
                  )):
             print line_count
             location = location_object['id']
             tuo_neighbor_location_and_mf_influence_type_and_similarity = []
             for neighbor_location in location_object['links'].keys(): 
                 mf_influence_type_and_similarity = {}
                 for influence_type in influence_types:
                     similarity = location_similarity( 
                                                          mf_location_to_mf_influence_type_to_influence_vector[location][influence_type],
                                                          mf_location_to_mf_influence_type_to_influence_vector[neighbor_location][influence_type]
                                                   )
                     mf_influence_type_and_similarity[influence_type] = similarity
                 so_hashtags_for_location = set(location_object['hashtags'].keys())
                 so_hashtags_for_neighbor_location = set(location_object['links'][neighbor_location].keys())
                 numerator = len(so_hashtags_for_location.intersection(so_hashtags_for_neighbor_location)) + 0.
                 denominator = len(so_hashtags_for_location.union(so_hashtags_for_neighbor_location)) + 0.
                 mf_influence_type_and_similarity[JACCARD_SIMILARITY] = numerator/denominator                
                 tuo_neighbor_location_and_mf_influence_type_and_similarity.append([neighbor_location, mf_influence_type_and_similarity])
             FileIO.writeToFileAsJson(
                                      [location, tuo_neighbor_location_and_mf_influence_type_and_similarity],
                                      tuo_location_and_tuo_neighbor_location_and_mf_influence_type_and_similarity_file%model_id
                                      )
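Two similarity measures appear above: location_similarity is a dot product over sparse vectors stored as dicts, and the Jaccard similarity is computed over hashtag sets. Standalone versions of both, using plain dicts and sets rather than the project's data files:

def sparse_dot(vector_1, vector_2):
    # Missing keys count as 0, so summing over the key intersection is equivalent
    # to the sum over the union of keys that location_similarity computes.
    return sum(vector_1[k] * vector_2[k] for k in set(vector_1) & set(vector_2))

def jaccard(set_1, set_2):
    union = set_1 | set_2
    return len(set_1 & set_2) / float(len(union)) if union else 0.0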
Example #6
 def generate_hashtag_specific_location_and_pure_influence_scores(test_models_ids):
     for test_model_id in test_models_ids:
         output_file = f_ltuo_hashtag_and_ltuo_location_and_pure_influence_score%(test_model_id)
         GeneralMethods.runCommand('rm -rf %s'%output_file)
         ltuo_hashtag_and_ltuo_location_and_occurrence_time = Experiments.load_ltuo_hashtag_and_ltuo_location_and_occurrence_time()
         for hashtag_count, (hashtag, ltuo_location_and_occurrence_time) in\
                 enumerate(ltuo_hashtag_and_ltuo_location_and_occurrence_time):
             ltuo_location_and_occurrence_times = [(location, sorted(zip(*ito_location_and_occurrence_time)[1]))
                                                     for location, ito_location_and_occurrence_time in
                                                         groupby(
                                                                 sorted(ltuo_location_and_occurrence_time, key=itemgetter(0)),
                                                                 key=itemgetter(0)
                                                         )
                                                 ] 
             print hashtag_count, test_model_id
             ltuo_location_and_pure_influence_score = []
             for location, location_occurrence_times in ltuo_location_and_occurrence_times:
                 pure_influence_scores = []
                 for neighbor_location, neighbor_location_occurrence_times in ltuo_location_and_occurrence_times:
                     if location!=neighbor_location:
                         pure_influence_score = MF_INFLUENCE_MEASURING_MODELS_TO_MODEL_ID[test_model_id](neighbor_location_occurrence_times, location_occurrence_times)
                         pure_influence_scores.append(pure_influence_score)
                 ltuo_location_and_pure_influence_score.append([location, np.mean(pure_influence_scores)])
             ltuo_location_and_pure_influence_score = sorted(ltuo_location_and_pure_influence_score, key=itemgetter(1))
             FileIO.writeToFileAsJson([hashtag, ltuo_location_and_pure_influence_score], output_file)
Example #7
    def combineLocationGraphs(graphMap, startingGraphId, startingTime, intervalInSeconds, linear=True, **kwargs):
        if intervalInSeconds%TIME_UNIT_IN_SECONDS==0 and int(intervalInSeconds/TIME_UNIT_IN_SECONDS)!=0: numberOfGraphs = int(intervalInSeconds/TIME_UNIT_IN_SECONDS)
        else: numberOfGraphs = int(intervalInSeconds/TIME_UNIT_IN_SECONDS)+1
        graphId = GeneralMethods.approximateEpoch(GeneralMethods.getEpochFromDateTimeObject(startingTime), TIME_UNIT_IN_SECONDS)
        currentLogarithmicId = LocationGraphs.getLogarithmicGraphId(startingGraphId, graphId)
        currentCollectedGraphs = 0
        graphIdsToCombine = []
        while currentCollectedGraphs!=numberOfGraphs and currentLogarithmicId>0:
            numberOfGraphsToCollect = 2**int(math.log(numberOfGraphs-currentCollectedGraphs,2))
            if not linear and currentLogarithmicId%2==0: 
                indices = [1]+map(lambda j: 2**j, filter(lambda j: currentLogarithmicId%(2**j)==0, range(1, int(math.log(currentLogarithmicId+1,2))+1)))
                if max(indices)>numberOfGraphsToCollect and numberOfGraphsToCollect in indices: index = numberOfGraphsToCollect
                else: index = max(indices)
            else: index=1
            logGraphId = '%s_%s'%(LocationGraphs.getGraphId(startingGraphId, currentLogarithmicId), index)
            if logGraphId in graphMap: graphIdsToCombine.append(logGraphId)
            currentLogarithmicId-=index
            currentCollectedGraphs+=index
        graphIdsToCombine = sorted(graphIdsToCombine, key=lambda id:int(id.split('_')[1]), reverse=True)
#        print graphIdsToCombine
#        for i in graphIdsToCombine:
#            ep, l = i.split('_')
#            print i, datetime.datetime.fromtimestamp(float(ep)), l, graphMap[i].number_of_nodes()
        graphsToCombine = [graphMap[id] for id in graphIdsToCombine]
        return combineGraphList(graphsToCombine, **kwargs)
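combineGraphList is assumed here to merge the selected graphs into a single graph; a minimal networkx-based stand-in (it ignores whatever edge-weight aggregation the project actually applies):

import networkx as nx

def combine_graph_list(graphs_to_combine, **kwargs):
    # Union of nodes and edges; attributes from later graphs win on conflict.
    if not graphs_to_combine:
        return nx.Graph()
    return nx.compose_all(graphs_to_combine)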
Example #8
    def analyzeQuality(graphs, graphType):
        def getQualityScore(graphMap, edgesToKeep, timeDifference):
            dataToReturn = []
            for j, intervalInSeconds in enumerate([1]):
                intervalInSeconds*=timeDifference
                linearGraph = LocationGraphs.combineLocationGraphs(graphMap, startingGraphId, datetime.datetime.fromtimestamp(endingGraphId+1), intervalInSeconds, linear=True, edgesToKeep=edgesToKeep)
                logGraph = LocationGraphs.combineLocationGraphs(graphMap, startingGraphId, datetime.datetime.fromtimestamp(endingGraphId+1), intervalInSeconds, linear=False, edgesToKeep=edgesToKeep)
                linearClusters = [[str(c), [l[0]for l in lst]] for c, lst in groupby(sorted(clusterUsingAffinityPropagation(linearGraph)[1], key=itemgetter(1)), key=itemgetter(1))]
                logarithmicClusters = [[str(c), [l[0]for l in lst]] for c, lst in groupby(sorted(clusterUsingAffinityPropagation(logGraph)[1], key=itemgetter(1)), key=itemgetter(1))]
                score = LocationGraphs.getClusterQualityScore(linearClusters, logarithmicClusters)
                print intervalInSeconds, edgesToKeep, score
                dataToReturn.append(score)
            return dataToReturn
        graphFile = qualityMetricsFolder%graphType
        print graphFile
        GeneralMethods.runCommand('rm -rf %s'%graphFile)
        for edgesToKeep in range(1,11): 
#        for edgesToKeep in [1,10]: 
            edgesToKeep*=0.1
            graphMap = dict(graphs[:])
            startingGraphId, endingGraphId = min(graphMap.keys()), max(graphMap.keys())
            timeDifference = endingGraphId-startingGraphId
            LocationGraphs.updateLogarithmicGraphs(graphMap, edgesToKeep=edgesToKeep)
#            print {'edgesToKeep': edgesToKeep, 'score': np.mean(getQualityScore(graphMap, edgesToKeep, timeDifference))}
            FileIO.writeToFileAsJson({'edgesToKeep': edgesToKeep, 'score': np.mean(getQualityScore(graphMap, edgesToKeep, timeDifference))}, graphFile)
Example #9
def trendCurves():
    model = MixedUsersModel()
    experimentFileName = spamModelFolder+model.id
    conf = {'model': model, 'addUsersMethod': User.addUsersUsingRatio, 'analysisMethods': [(Analysis.trendCurves, 1)], 'ratio': {'normal': 0.985, 'spammer': 0.015},
            'experimentFileName': experimentFileName}
    GeneralMethods.runCommand('rm -rf %s'%experimentFileName); run(**conf)
    Analysis.trendCurves(experimentFileName=experimentFileName)
 def mapper(self, key, hashtag_object):
     ltuo_occ_time_and_occ_location = hashtag_object['ltuo_occ_time_and_occ_location']
     if ltuo_occ_time_and_occ_location:
         ltuo_intvl_time_and_occ_location = [(
                                            GeneralMethods.approximateEpoch(occ_time, TIME_UNIT_IN_SECONDS),
                                            occ_location
                                             ) 
                                           for occ_time, occ_location in ltuo_occ_time_and_occ_location]
         ltuo_intvl_time_and_items =\
                                 GeneralMethods.group_items_by(ltuo_intvl_time_and_occ_location, key=itemgetter(0))
         ltuo_intvl_time_and_items.sort(key=itemgetter(0))
         first_time = ltuo_intvl_time_and_items[0][0]
         intvl_method = lambda (t, it): ((t-first_time)/TIME_UNIT_IN_SECONDS, (t, len(it)))
         ltuo_iid_and_tuo_interval_and_occurrence_count = map(intvl_method, ltuo_intvl_time_and_items)
         peak_tuo_iid_and_tuo_interval_and_occurrence_count = \
                                                         max(
                                                             ltuo_iid_and_tuo_interval_and_occurrence_count,
                                                             key=lambda (_, (__, occurrence_count)): occurrence_count
                                                         )
         peak_iid = peak_tuo_iid_and_tuo_interval_and_occurrence_count[0]
         current_val = 0.0
         total_occurrences = sum(data[1][1] for data in ltuo_iid_and_tuo_interval_and_occurrence_count)
         for iid, (_, occurrence_count) in ltuo_iid_and_tuo_interval_and_occurrence_count:
             is_peak = 0.0
             if iid==peak_iid: is_peak=1.0
             current_val+=occurrence_count
             yield iid, [is_peak, occurrence_count/total_occurrences, current_val/total_occurrences]
 def mapper(self, key, hashtag_object):
     hashtag = hashtag_object['hashtag']
     ltuo_occ_time_and_occ_location = hashtag_object['ltuo_occ_time_and_occ_location']
     if ltuo_occ_time_and_occ_location:
         ltuo_intvl_time_and_occ_location = [(
                                            GeneralMethods.approximateEpoch(occ_time, TIME_UNIT_IN_SECONDS),
                                            occ_location
                                             ) 
                                           for occ_time, occ_location in ltuo_occ_time_and_occ_location]
         points = [UTMConverter.getLatLongUTMIdInLatLongForm(loc) for _, loc in ltuo_occ_time_and_occ_location]
         ltuo_intvl_time_and_items =\
                                 GeneralMethods.group_items_by(ltuo_intvl_time_and_occ_location, key=itemgetter(0))
         ltuo_intvl_time_and_items.sort(key=itemgetter(0))
         first_time = ltuo_intvl_time_and_items[0][0]
         ltuo_iid_and_occ_count = map(lambda (t, it): ((t-first_time)/TIME_UNIT_IN_SECONDS, len(it)), ltuo_intvl_time_and_items)
         ltuo_location_and_items =\
                                 GeneralMethods.group_items_by(ltuo_intvl_time_and_occ_location, key=itemgetter(1))
         mf_location_to_occ_count = dict(map(lambda (l, it): (l, len(it)), ltuo_location_and_items))
         spatial_metrics = {
                              'hashtag': hashtag,
                              'num_of_occurrenes': len(ltuo_occ_time_and_occ_location),
                              'peak_iid': max(ltuo_iid_and_occ_count, key=itemgetter(1))[0],
                              'focus': focus(mf_location_to_occ_count),
                              'entropy': entropy(mf_location_to_occ_count, as_bits=False),
                              'spread': getRadiusOfGyration(points)
                          }
         yield hashtag, spatial_metrics
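The focus and entropy helpers used above are imported from elsewhere in the project; a plausible reading, assuming focus returns the dominant location together with its share of occurrences and entropy is the Shannon entropy of the occurrence distribution:

import math

def focus(mf_location_to_occ_count):
    total = float(sum(mf_location_to_occ_count.values()))
    location, count = max(mf_location_to_occ_count.items(), key=lambda kv: kv[1])
    return location, count / total

def entropy(mf_location_to_occ_count, as_bits=True):
    total = float(sum(mf_location_to_occ_count.values()))
    probabilities = [count / total for count in mf_location_to_occ_count.values()]
    log = (lambda p: math.log(p, 2)) if as_bits else math.log
    return -sum(p * log(p) for p in probabilities if p > 0)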
Example #12
 def writeUserClustersFile(place):
     print 'Generating clusters...'
     userVectors = GenerateDataFiles.getUserVectors(place)
     GeneralMethods.runCommand('rm -rf %s'%placesUserClustersFile%place['name'])
     clusterAssignments = Clustering.cluster(Clustering.EM, placesARFFFile%place['name'], userVectors, '-N -1')
 #    clusterAssignments = Clustering.cluster(Clustering.KMeans, placesARFFFile%place['name'], userVectors, '-N 2')
     for userId, userVector in userVectors.iteritems(): userVectors[userId] = {'userVector': userVector, 'clusterId': clusterAssignments[userId]}
     for data in userVectors.iteritems(): FileIO.writeToFileAsJson(data, placesUserClustersFile%place['name'])
Example #13
def performanceWithSpamFilteringForLatestMessages(generateData):
    experimentData = defaultdict(dict)
    for iteration in range(10):
#        for spammerPercentage in range(1,21):
##            spammerPercentage = 20
#            spammerPercentage = spammerPercentage*0.05
#        for spammerPercentage in range(1,11):
#            spammerPercentage = spammerPercentage*0.02
#        for spammerPercentage in range(1,201):
#            spammerPercentage = spammerPercentage* 0.005
        l1 = [spammerPercentage* 0.001 for spammerPercentage in range(1,51)]
        l2 = [spammerPercentage* 0.05 for spammerPercentage in range(1,21)]
        l3 = [0.01]+l2
        for spammerPercentage in l1:
            experimentFileName = spamModelFolder+'performanceWithSpamFilteringForLatestMessages/%s/%0.3f'%(iteration,spammerPercentage)
            print experimentFileName
            if generateData:
                model = MixedUsersModel()
                conf = {'model': model, 'numberOfTimeSteps': 10, 'addUsersMethod': User.addUsersUsingRatio, 'analysisMethods': [(Analysis.measureRankingQuality, 1)], 'ratio': {'normal': 1-spammerPercentage, 'spammer': spammerPercentage},
                        'rankingMethods':[RankingModel.latestMessages, RankingModel.latestMessagesSpamFiltered],
                        'experimentFileName': experimentFileName,
#                        'noOfPayloadsPerSpammer': 1, 'noOfTopics': 10
                        }
                
#                conf = {'model': model, 'numberOfTimeSteps': 10, 'addUsersMethod': User.addUsersUsingRatio, 'analysisMethods': [(Analysis.measureRankingQuality, 1)], 'ratio': {'normal': 1-spammerPercentage, 'spammer': spammerPercentage},
#                        'rankingMethods':[RankingModel.latestMessages, RankingModel.latestMessagesDuplicatesRemoved, RankingModel.popularMessages],
#                        'experimentFileName': experimentFileName}
                
                GeneralMethods.runCommand('rm -rf %s'%experimentFileName);run(**conf)
            else:
                tempData = defaultdict(list)
                for data in FileIO.iterateJsonFromFile(experimentFileName):
                    for ranking_id in data['spammmess']:
                        tempData[ranking_id]+=data['spammmess'][ranking_id]
                experimentData[iteration][spammerPercentage]=tempData
    if not generateData:
        realDataY = defaultdict(dict)
        for iteration in experimentData:
            dataY = defaultdict(list)
            dataX = []
            for perct in sorted(experimentData[iteration]):
                dataX.append(perct)
                for ranking_id, values in experimentData[iteration][perct].iteritems(): dataY[ranking_id].append(np.mean(values))
            dataX=sorted(dataX)
            for ranking_id in dataY:
                for x, y in zip(dataX, dataY[ranking_id]): 
                    if x not in realDataY[ranking_id]: realDataY[ranking_id][x]=[] 
                    realDataY[ranking_id][x].append(y)
        for ranking_id in dataY: plt.plot(dataX, [np.mean(realDataY[ranking_id][x]) for x in dataX], label=labels[ranking_id], lw=1, marker=RankingModel.marker[ranking_id])
        plt.xlabel('Percentage of Spammers', fontsize=16, fontweight='bold')
        plt.ylabel('Spamness', fontsize=16, fontweight='bold')
#        plt.title('Performance with spam filtering')
        plt.legend(loc=2)
#        plt.show()
        plt.xlim(xmax=0.05)
        plt.savefig('performanceWithSpamFilteringForLatestMessages.png')
        plt.clf()
Example #14
 def messageSelectionMethod(self, currentTimeStep, user, currentTopics, **conf):
     message = None
     if GeneralMethods.trueWith(user.messagingProbability):
         if GeneralMethods.trueWith(user.newTopicProbability):
             topic = Topic(len(currentTopics))
             currentTopics.append(topic)
             message = user.generateMessage(currentTimeStep, topic)
         else:
             message = user.generateMessage(currentTimeStep, random.choice(currentTopics))
     return message
Example #15
def performanceWithSpamDetection(generateData):
    experimentData = defaultdict(dict)
    ratios = [0.0,0.4,0.9]
    marker = dict([(0.0, 's'), (0.4, 'o'), (0.9, 'd')])
#    spammerPercentages = [0.2, 0.01, 0.01]
    spammerPercentages = [0.015, 0.015, 0.015]
    for iteration in range(10):
        for spamDetectionRatio, spammerPercentage in zip(ratios, spammerPercentages):
            experimentFileName = spamModelFolder+'performanceWithSpamDetection/%s/%0.3f'%(iteration,spamDetectionRatio)
            print experimentFileName
            if generateData:
                model = MixedUsersModel()
                conf = {'model': model, 'numberOfTimeSteps': 100, 'addUsersMethod': User.addUsersUsingRatioWithSpamDetection, 'analysisMethods': [(Analysis.measureRankingQuality, 1)], 'ratio': {'normal': 1-spammerPercentage, 'spammer': spammerPercentage},
    #                        'spammerMessagingProbability': spammerBudget,
                        'rankingMethods':[RankingModel.latestMessages, RankingModel.latestMessagesSpamFiltered, RankingModel.popularMessages, RankingModel.popularMessagesSpamFiltered],
                        'spamDetectionRatio': spamDetectionRatio,
                        'experimentFileName': experimentFileName}
                GeneralMethods.runCommand('rm -rf %s'%experimentFileName);run(**conf)
            else:
                for data in FileIO.iterateJsonFromFile(experimentFileName):
                    for ranking_id in data['spammmess']:
                        if data['currentTimeStep'] not in experimentData[spamDetectionRatio]: experimentData[spamDetectionRatio][data['currentTimeStep']]=defaultdict(list)
                        experimentData[spamDetectionRatio][data['currentTimeStep']][ranking_id]+=data['spammmess'][ranking_id]
    if not generateData:
        sdr = {}
        for spamDetectionRatio in sorted(experimentData.keys()):
            dataToPlot = defaultdict(list)
            for timeUnit in experimentData[spamDetectionRatio]:
                dataToPlot['x'].append(timeUnit)
                for ranking_id in experimentData[spamDetectionRatio][timeUnit]: dataToPlot[ranking_id].append(np.mean(experimentData[spamDetectionRatio][timeUnit][ranking_id]))
            sdr[spamDetectionRatio]=dataToPlot
        for ranking_id in [RankingModel.LATEST_MESSAGES_SPAM_FILTERED, RankingModel.POPULAR_MESSAGES_SPAM_FILTERED]:
#        for ranking_id in [RankingModel.LATEST_MESSAGES, RankingModel.POPULAR_MESSAGES]:
            for spamDetectionRatio in ratios:
                print ranking_id, spamDetectionRatio
                dataY = smooth(sdr[spamDetectionRatio][ranking_id],8)[:len(sdr[spamDetectionRatio]['x'])]
                dataX, dataY = sdr[spamDetectionRatio]['x'][10:], dataY[10:]
                print 'x', [x-10 for x in dataX]
                if spamDetectionRatio==0.0: 
                    print ranking_id, dataY
                    plt.plot([x-10 for x in dataX], dataY, label='%s'%(labels[ranking_id]), lw=1, marker=marker[spamDetectionRatio])
                else: 
                    print ranking_id, dataY
                    plt.plot([x-10 for x in dataX], dataY, label='%s (%d'%(labels[ranking_id].replace('Filtering', 'Detection'),spamDetectionRatio*100)+'%)', lw=1, marker=marker[spamDetectionRatio])
            plt.ylim(ymin=0, ymax=1)
            plt.xlim(xmin=0, xmax=75)
#            plt.title(ranking_id)
            plt.legend()
            plt.xlabel('Time', fontsize=16, fontweight='bold')
            plt.ylabel('Spamness', fontsize=16, fontweight='bold')
#            plt.show()
#            plt.savefig('performanceWithSpamDetection_%s.png'%ranking_id)
            savefig('performanceWithSpamDetection_%s.png'%ranking_id)
            plt.clf()
Example #16
 def __init__(self, id):
     self.id = id
     self.totalCount = 0
     self.countDistribution = defaultdict(int)
     self.age = 0
     self.topicClass = random.choice(topicClasses)
     self.decayCoefficient = -3
     if GeneralMethods.trueWith(0.05): self.stickiness = random.uniform(stickinessLowerThreshold, 1.0)
     else: self.stickiness = random.uniform(0.0, 0.1)
     self.payloads = PayLoad.generatePayloads(self.id, noOfPayloadsPerTopic)
     #Non-modeling attributes.
     self.color = GeneralMethods.getRandomColor()
 def reducer(self, location, it_performance_values):
     performance_values = list(chain(*it_performance_values))
     performance_summary = defaultdict(list)
     for prediction_method, pvs_for_prediction_method in \
                             GeneralMethods.group_items_by(performance_values, key=itemgetter('prediction_method')):
         for metric, pvs_for_prediction_method_and_metric in \
                         GeneralMethods.group_items_by(pvs_for_prediction_method, key=itemgetter('metric')):
             performance_summary[metric].append([
                                                 prediction_method,
                                                 pvs_for_prediction_method_and_metric[0]['metric_value']
                                         ])
     yield '', dict(location=location, performance_summary=performance_summary)
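GeneralMethods.group_items_by is used throughout these examples; a minimal stand-in (an assumption about its behaviour, not the library's code) built on itertools.groupby, which needs the items pre-sorted by the same key:

from itertools import groupby

def group_items_by(items, key):
    # Returns [(key_value, [items with that key_value]), ...].
    return [(k, list(group)) for k, group in groupby(sorted(items, key=key), key=key)]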
Example #18
def writeARFFFile(place):
    userVectors = defaultdict(dict)
    locationToUserMap = dict((l['location'], l) for l in locationToUserMapIterator(place, minCheckins=50))
    for lid in locationToUserMap:
        for user in locationToUserMap[lid]['users']: 
            userVectors[user][lid.replace(' ', '_')]=sum(len(locationToUserMap[lid]['users'][user][d][db]) for d in locationToUserMap[lid]['users'][user] for db in locationToUserMap[lid]['users'][user][d])
    for user in userVectors.keys()[:]: 
        if sum(userVectors[user].itervalues())<place['minUserCheckins']: del userVectors[user]
    arffFile=ARFF.writeARFFForClustering(userVectors, place['name'])
    outputFileName = getARFFFileName(place)
    FileIO.createDirectoryForFile(outputFileName)
    GeneralMethods.runCommand('mv %s %s'%(arffFile, outputFileName))
Example #19
    def messageSelectionMethod(self, currentTimeStep, user, currentTopics, **conf):
        if self.lastObservedTimeStep!=currentTimeStep: self._updateTopicProbabilities(currentTimeStep, currentTopics, **conf)
        message = None
        if GeneralMethods.trueWith(user.messagingProbability):
            if GeneralMethods.trueWith(user.newTopicProbability): topic = Topic(len(currentTopics)); currentTopics.append(topic); message=user.generateMessage(currentTimeStep, topic)
            else: 
                if GeneralMethods.trueWith(user.probabilityOfPickingPopularTopic):
                        if user.topicClass!=None:
                            topicIndex = GeneralMethods.weightedChoice([i[1] for i in self.topicProbabilities[user.topicClass]])
                            topic = self.topicProbabilities[user.topicClass][topicIndex][0]
                            message=user.generateMessage(currentTimeStep, topic)
                            if not GeneralMethods.trueWith(topic.stickiness): message = None
                        else: 
                            topicIndex = GeneralMethods.weightedChoice([i[1] for i in self.topTopics])
                            topic = self.topTopics[topicIndex][0]
                            message=user.generateMessage(currentTimeStep, topic)
                else: 
                    if user.topicClass!=None: 
                        stickinesses = [topic[0].stickiness for topic in self.topicProbabilities[user.topicClass]]
                        total_stickiness = sum(stickinesses)
                        stickinesses = [s/total_stickiness for s in stickinesses]
                        topicIndex = GeneralMethods.weightedChoice(stickinesses)
                        topic = self.topicProbabilities[user.topicClass][topicIndex][0]
#                        message=user.generateMessage(currentTimeStep, random.choice(self.topicProbabilities[user.topicClass])[0])
                        message=user.generateMessage(currentTimeStep, topic)
                    else:
#                        topicIndex = GeneralMethods.weightedChoice([i[1] for i in self.allTopics])
                        stickinesses = [topic[0].stickiness for tc in self.topicProbabilities for topic in self.topicProbabilities[tc]]
                        total_stickiness = sum(stickinesses)
                        stickinesses = [s/total_stickiness for s in stickinesses]
                        topicIndex = GeneralMethods.weightedChoice(stickinesses)
#                        print len(self.allTopics), len(stickinesses)
                        topic = self.allTopics[topicIndex][0]
                        message=user.generateMessage(currentTimeStep, topic)
        return message
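GeneralMethods.weightedChoice is assumed to return an index drawn with probability proportional to the supplied weights; a small sketch of that behaviour:

import random

def weighted_choice(weights):
    threshold = random.uniform(0, sum(weights))
    running = 0.0
    for index, weight in enumerate(weights):
        running += weight
        if running >= threshold:
            return index
    return len(weights) - 1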
Example #20
 def writeLocationToUserMap(place):
     name, boundary = place['name'], place['boundary']
     GeneralMethods.runCommand('rm -rf %s'%placesLocationToUserMapFile%name)
     for location in filteredLocationToUserAndTimeMapIterator(minLocationsTheUserHasCheckedin, minUniqueUsersCheckedInTheLocation, inputFile=locationToUserAndExactTimeMapFile):
         lid=getLocationFromLid(location['location'])
         if isWithinBoundingBox(lid, boundary): 
             location['categories'] = ''; location['tags'] = ''; location['name']=''
             title = venuesCollection.find_one({'lid':location['location']})
             if title: location['name'] = unicode(title['n']).encode("utf-8")
             meta = venuesMetaDataCollection.find_one({'_id':location['location']})
             if meta: location['categories'] = unicode(meta['c']).encode("utf-8"); location['tags'] = unicode(meta['t']).encode("utf-8")
             for user in location['users'].keys()[:]: location['users'][str(user)]=location['users'][user]; del location['users'][user]
             location['noOfCheckins']=sum([len(epochs) for user, userVector in location['users'].iteritems() for day, dayVector in userVector.iteritems() for db, epochs in dayVector.iteritems()])
             if location['noOfCheckins']>place.get('minLocationCheckins',0): FileIO.writeToFileAsJson(location, placesLocationToUserMapFile%name)
Example #21
 def writeTopClusterFeatures(place):
     locationNames = {}
     def getLocationName(lid): 
         if lid not in locationNames:
             locationObject = venuesCollection.find_one({'lid':lid})
             if locationObject: locationNames[lid] = unicode(locationObject['n']).encode("utf-8")
             else: locationNames[lid] = ''
         return locationNames[lid]
     GeneralMethods.runCommand('rm -rf %s'%placesUserClusterFeaturesFile%place['name'])
     documents = [userVector.values() for user, userVector in FileIO.iterateJsonFromFile(placesUserClustersFile%place['name'])]
     for data in getTopFeaturesForClass(documents, 1000): 
         clusterId, features = data
         modifiedFeatures = []
         for feature in features: modifiedFeatures.append(list(feature) + [getLocationName(feature[0].replace('_', ' '))])
         FileIO.writeToFileAsJson([clusterId, GeneralMethods.getRandomColor(), modifiedFeatures], placesUserClusterFeaturesFile%place['name'])
Example #22
 def __init__(self, cluster, clusterFormationTime):
     self.crowdId = cluster.clusterId
     self.clusters = {
         GeneralMethods.getEpochFromDateTimeObject(clusterFormationTime):
         cluster
     }
     self.ends, self.inComingCrowds, self.outGoingCrowd = False, [], None
 def mapper(self, key, value):
     if False: yield # I'm a generator!
     hashtag_object = cjson.decode(value)
     if 'num_of_occurrences' in hashtag_object and\
             hashtag_object['num_of_occurrences'] >= MIN_HASHTAG_OCCURRENCES_FOR_PROPAGATION_ANALYSIS:
         ltuo_bucket_occ_time_and_occ_utm_id =\
                                     map(
                                            lambda (t, utm_id):
                                                 (GeneralMethods.approximateEpoch(t, TIME_UNIT_IN_SECONDS), utm_id),
                                            hashtag_object['ltuo_occ_time_and_occ_utm_id']
                                        )
         ltuo_bucket_occ_time_and_occ_utm_id.sort(key=itemgetter(1))
         ltuo_utm_id_and_bucket_occ_times =\
             [ (occ_utm_id,map(itemgetter(0), it_bucket_occ_time_and_occ_utm_id))
              for occ_utm_id, it_bucket_occ_time_and_occ_utm_id in
                 groupby(ltuo_bucket_occ_time_and_occ_utm_id, key=itemgetter(1))
             ]
         ltuo_utm_id_and_bucket_occ_times =\
                                         filter(
                                                lambda (_, occ_times): len(occ_times)>10,
                                                ltuo_utm_id_and_bucket_occ_times
                                            )
         for _, bucket_occ_times in ltuo_utm_id_and_bucket_occ_times:
             gap_perct = 0.05
             gaps = np.arange(gap_perct,1+gap_perct,gap_perct)
             bucket_occ_times = filter_outliers(bucket_occ_times)
             bucket_occ_times_at_gaps = get_items_at_gap(bucket_occ_times, gap_perct)
             start_time = float(bucket_occ_times_at_gaps[0])
             life_time = bucket_occ_times_at_gaps[-1] - start_time
             if life_time>0:
                 norm_num_of_occurrences =\
                                         map(lambda t: int(((t-start_time)/life_time)*100), bucket_occ_times_at_gaps)
                 for gap, norm_num_of_occurrence in zip(gaps, norm_num_of_occurrences):
                     self.mf_gap_to_norm_num_of_occurrences['%0.2f'%gap]+=norm_num_of_occurrence
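filter_outliers and get_items_at_gap come from elsewhere in the project; get_items_at_gap is assumed to sample the sorted occurrence times at regular fractional positions (5%, 10%, ..., 100%), which is what pairing its result with the gaps array above relies on. A sketch under that assumption:

def get_items_at_gap(sorted_items, gap_perct):
    # Pick the item sitting at each gap_perct-th fraction of the list.
    n = len(sorted_items)
    num_gaps = int(round(1.0 / gap_perct))
    return [sorted_items[min(n - 1, max(0, int(round(gap_perct * i * n)) - 1))]
            for i in range(1, num_gaps + 1)]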
Example #24
 def ef_plot():
     output_file = fld_data_analysis_results%GeneralMethods.get_method_id()+'.png'
     data = [d for d in FileIO.iterateJsonFromFile(f_hashtag_spatial_metrics, remove_params_dict=True)]
     ltuo_hashtag_and_entropy_and_focus = map(itemgetter('hashtag', 'entropy', 'focus'), data)
     mf_norm_focus_to_entropies = defaultdict(list)
     for _, entropy, (_, focus) in ltuo_hashtag_and_entropy_and_focus:
         mf_norm_focus_to_entropies[round(focus, 2)].append(entropy)
     plt.figure(num=None, figsize=(6,3))
     x_focus, y_entropy = zip(*[(norm_focus, np.mean(entropies))
                                 for norm_focus, entropies in mf_norm_focus_to_entropies.iteritems()
                                 if len(entropies)>0])
     plt.subplots_adjust(bottom=0.2, top=0.9, wspace=0, hspace=0)
     plt.scatter(x_focus, y_entropy, s=50, lw=0, c='k')
     plt.xlim(xmin=-0.1, xmax=1.1)
     plt.ylim(ymin=-1, ymax=9)
     plt.xlabel('Mean hashtag focus')
     plt.ylabel('Mean hashtag entropy')
     plt.grid(True)
     savefig(output_file)
     ltuo_hashtag_and_r_entropy_and_focus =\
                                         sorted(ltuo_hashtag_and_entropy_and_focus, key=itemgetter(1), reverse=True)
     ltuo_hashtag_and_r_entropy_and_s_focus = sorted(ltuo_hashtag_and_r_entropy_and_focus, key=itemgetter(2))
     hashtags = zip(*ltuo_hashtag_and_r_entropy_and_s_focus)[0]
     print list(hashtags[:20])
     print list(reversed(hashtags))[:20]
Example #25
 def getOccuranesInHighestActiveRegion(
     hashtagObject,
     checkIfItFirstActiveRegion=False,
     timeUnit=TIME_UNIT_IN_SECONDS,
     maxLengthOfHighestActiveRegion=None,
 ):
     occurancesInActiveRegion, timeUnits = [], []
     occurranceDistributionInEpochs = getOccurranceDistributionInEpochs(hashtagObject["oc"], fillInGaps=True)
     if occurranceDistributionInEpochs:
         timeUnits, timeSeries = zip(*sorted(occurranceDistributionInEpochs.iteritems(), key=itemgetter(0)))
         hashtagPropagatingRegion = max(getActiveRegions(timeSeries), key=itemgetter(2))
         if not maxLengthOfHighestActiveRegion:
             validTimeUnits = [
                 timeUnits[i] for i in range(hashtagPropagatingRegion[0], hashtagPropagatingRegion[1] + 1)
             ]
         else:
             validTimeUnits = [
                 timeUnits[i] for i in range(hashtagPropagatingRegion[0], hashtagPropagatingRegion[1] + 1)
             ][:maxLengthOfHighestActiveRegion]
         occurancesInActiveRegion = [
             (p, t)
             for p, t in hashtagObject["oc"]
             if GeneralMethods.approximateEpoch(t, timeUnit) in validTimeUnits
         ]
     if not checkIfItFirstActiveRegion:
         return occurancesInActiveRegion
     else:
         isFirstActiveRegion = False
         if timeUnits and timeUnits[0] == validTimeUnits[0]:
             isFirstActiveRegion = True
         return (occurancesInActiveRegion, isFirstActiveRegion)
Example #26
    def temporal_affinity_vs_distance():
        output_file = fld_data_analysis_results%GeneralMethods.get_method_id() + '.png'
        DataAnalysis._plot_affinities('adoption_lag')
        plt.xlabel('Distance (miles)')
        plt.ylabel('Hashtag adoption lag (hours)')
#        plt.show()
        savefig(output_file)
Example #27
def writeLocationsWithClusterInfoFile(place):
    GeneralMethods.runCommand('rm -rf %s'%placesLocationWithClusterInfoFile%place['name'])
    for clustering in iteraterUserClusterings(place):
        dataToWrite, userClusterMap = {}, {}
        for clusterId, users in clustering[2]['clusters'].iteritems(): 
            for user in users: userClusterMap[user]=clusterId
        locationMap = defaultdict(dict)
        for location in locationToUserMapIterator(place):
            locationMap[location['location']] = {'name':unicode(location['name']).encode("utf-8"), 'checkins':defaultdict(list)}
            for user, userVector in location['users'].iteritems():
                if user in userClusterMap:
                    for day, dayVector in userVector.iteritems():
                        for db, epochs in dayVector.iteritems():
                            locationMap[location['location']]['checkins'][userClusterMap[user]]+=epochs
            dataToWrite[str(clustering[0])]=locationMap
        FileIO.writeToFileAsJson(dataToWrite,placesLocationWithClusterInfoFile%place['name']) 
Example #28
def getOccuranesInHighestActiveRegion(hashtagObject):
    def getActiveRegions(timeSeries):
        noOfZerosObserved, activeRegions = 0, []
        currentRegion, occurancesForRegion = None, 0
        for index, l in zip(range(len(timeSeries)),timeSeries):
            if l>0: 
                if noOfZerosObserved>MIN_NO_OF_TIME_UNITS_IN_INACTIVE_REGION or index==0:
                    currentRegion = [None, None, None]
                    currentRegion[0] = index
                    occurancesForRegion = 0
                noOfZerosObserved = 0
                occurancesForRegion+=l
            else: 
                noOfZerosObserved+=1
                if noOfZerosObserved>MIN_NO_OF_TIME_UNITS_IN_INACTIVE_REGION and currentRegion and currentRegion[1]==None:
                    currentRegion[1] = index-MIN_NO_OF_TIME_UNITS_IN_INACTIVE_REGION-1
                    currentRegion[2] = occurancesForRegion
                    activeRegions.append(currentRegion)
        if not activeRegions: activeRegions.append([0, len(timeSeries)-1, sum(timeSeries)])
        else: 
            currentRegion[1], currentRegion[2] = index, occurancesForRegion
            activeRegions.append(currentRegion)
        return activeRegions
    occurranceDistributionInEpochs = getOccurranceDistributionInEpochs(hashtagObject['oc'])
    startEpoch, endEpoch = min(occurranceDistributionInEpochs, key=itemgetter(0))[0], max(occurranceDistributionInEpochs, key=itemgetter(0))[0]
    dataX = range(startEpoch, endEpoch, TIME_UNIT_IN_SECONDS)
    occurranceDistributionInEpochs = dict(occurranceDistributionInEpochs)
    for x in dataX: 
        if x not in occurranceDistributionInEpochs: occurranceDistributionInEpochs[x]=0
    timeUnits, timeSeries = zip(*sorted(occurranceDistributionInEpochs.iteritems(), key=itemgetter(0)))
#    for k, v in zip(timeUnits, timeSeries):
#        print k, v
    hashtagPropagatingRegion = max(getActiveRegions(timeSeries), key=itemgetter(2))
    validTimeUnits = [timeUnits[i] for i in range(hashtagPropagatingRegion[0], hashtagPropagatingRegion[1]+1)]
    return [(p,t) for p,t in hashtagObject['oc'] if GeneralMethods.approximateEpoch(t, TIME_UNIT_IN_SECONDS) in validTimeUnits]
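getOccurranceDistributionInEpochs is assumed to bucket the raw occurrence times into fixed-width epochs and return (epoch, count) pairs, which the caller then pads with zero-count epochs; a small stand-in sketch under that assumption:

from collections import defaultdict

def occurrence_distribution_in_epochs(occurrences, time_unit):
    # occurrences: iterable of (point, epoch_time) pairs, as in hashtagObject['oc'].
    distribution = defaultdict(int)
    for _, occurrence_time in occurrences:
        distribution[int(occurrence_time // time_unit) * time_unit] += 1
    return sorted(distribution.items())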
Example #29
    def iid_vs_cumulative_distribution_and_peak_distribution():
        TIME_UNIT_IN_SECONDS = 10.*60.
        output_file_format = fld_data_analysis_results%GeneralMethods.get_method_id()+'/%s.png'
        ltuo_iid_and_interval_stats = [data for data in 
                                        FileIO.iterateJsonFromFile(f_iid_spatial_metrics, remove_params_dict=True)]
        ltuo_s_iid_and_interval_stats = sorted(ltuo_iid_and_interval_stats, key=itemgetter(0))
        ltuo_s_iid_and_tuo_is_peak_and_cumulative_percentage_of_occurrences = [(data[0], (data[1][0], data[1][2])) for data in ltuo_s_iid_and_interval_stats]
        total_peaks = sum([data[1][0] for data in ltuo_s_iid_and_tuo_is_peak_and_cumulative_percentage_of_occurrences])+0.0
        x_iids = []
        y_is_peaks = []
        z_cumulative_percentage_of_occurrencess = []
        for (iid, (is_peak, cumulative_percentage_of_occurrences)) in ltuo_s_iid_and_tuo_is_peak_and_cumulative_percentage_of_occurrences[:100]: 
            print (iid, (is_peak, cumulative_percentage_of_occurrences)) 
            x_iids.append((iid+1)*TIME_UNIT_IN_SECONDS/60)
            y_is_peaks.append(is_peak/total_peaks)
            z_cumulative_percentage_of_occurrencess.append(cumulative_percentage_of_occurrences)
        plt.figure(num=None, figsize=(4.3,3))
        plt.subplots_adjust(bottom=0.2, top=0.9, wspace=0, hspace=0)
        plt.plot(x_iids, y_is_peaks, marker='o', c='k')
        plt.ylabel('Distribution of hashtags')
        plt.xlabel('Hashtag peak (minutes)')
        plt.grid(True)
        plt.xlim(xmax=600)
        savefig(output_file_format%'peaks')
        plt.clf()
        plt.figure(num=None, figsize=(6,3))
        plt.subplots_adjust(bottom=0.2, top=0.9, wspace=0, hspace=0)
        plt.plot(x_iids, z_cumulative_percentage_of_occurrencess, lw=0, marker='o', c='k')
#        plt.xlabel('Minutes')
        plt.ylabel('CDF of occurrences')
        plt.xlabel('Time (Minutes)')
        plt.grid(True)
        plt.xlim(xmax=600)
        savefig(output_file_format%'cdf_occurrences_peak')
Example #30
 def significant_nei_utm_ids():
     output_folder = fld_google_drive_data_analysis%GeneralMethods.get_method_id()+'/%s.png'
     for i, data in enumerate(FileIO.iterateJsonFromFile(f_significant_nei_utm_ids, remove_params_dict=True)):
         utm_lat_long = UTMConverter.getLatLongUTMIdInLatLongForm(data['utm_id'])
         nei_utm_lat_longs = map(
                           lambda nei_utm_id: UTMConverter.getLatLongUTMIdInLatLongForm(nei_utm_id),
                           data['nei_utm_ids']
                         )
         if nei_utm_lat_longs:
             output_file = output_folder%('%s_%s'%(utm_lat_long))
             plotPointsOnWorldMap(nei_utm_lat_longs,
                                  blueMarble=False,
                                  bkcolor='#CFCFCF',
                                  lw = 0,
                                  color = '#EA00FF',
                                  alpha=1.)
             _, m = plotPointsOnWorldMap([utm_lat_long],
                                  blueMarble=False,
                                  bkcolor='#CFCFCF',
                                  lw = 0,
                                  color = '#2BFF00',
                                  s = 40,
                                  returnBaseMapObject=True,
                                  alpha=1.)
             for nei_utm_lat_long in nei_utm_lat_longs:
                 m.drawgreatcircle(utm_lat_long[1],
                                   utm_lat_long[0],
                                   nei_utm_lat_long[1],
                                   nei_utm_lat_long[0],
                                   color='#FFA600',
                                   lw=1.5,
                                   alpha=1.0)
             print 'Saving %s'%(i+1)
             savefig(output_file)
def getStreamStats(streamTweetsIterator):
    ''' 30-day
        Experts stats:
        # of users:  4804
        # of tweets:  1614510
        # of tweets per tu (mean, var):  186.497631974 7860.12570191
        
        Houston stats
        # of users:  107494
        # of tweets:  15946768
        # of tweets per tu (mean, var):  1730.33506944 4834419.37341
        
        10-day
        Experts stats
        # of users:  4674
        # of tweets:  608798
        # of tweets per tu (mean, var):  190.726190476 8132.75460228
        Houston stats
        # of users:  39618
        # of tweets:  2139829
        # of tweets per tu (mean, var):  619.163483796 94450.7334004

    '''
    numberOfTweets, users, distributionPerTU = 0, set(), defaultdict(int)
    for tweet in streamTweetsIterator: 
        users.add(tweet['user']['screen_name'])
        distributionPerTU[GeneralMethods.getEpochFromDateTimeObject(getDateTimeObjectFromTweetTimestamp(tweet['created_at']))//300]+=1
        numberOfTweets+=1
    print '# of users: ', len(users)
    print '# of tweets: ', numberOfTweets 
    print '# of tweets per tu (mean, var): ', np.mean(distributionPerTU.values()), np.var(distributionPerTU.values())
Example #32
    def content_affinity_vs_distance():
        output_file = fld_data_analysis_results%GeneralMethods.get_method_id() + '.png'
        DataAnalysis._plot_affinities('similarity')
        plt.xlabel('Distance (miles)')
        plt.ylabel('Hashtags sharing similarity')
#        plt.show()
        savefig(output_file)
Example #33
 def addLocationPointsWithTitles(self, points, color=None):
     if not color:
         color = GeneralMethods.getRandomColor()
     for point, title in ((list(reversed(point)), title) for point, title in points):
         pnt = self.kml.newpoint(description=title, coords=[point])
         pnt.iconstyle.icon.href = "http://maps.google.com/mapfiles/kml/shapes/shaded_dot.png"
         pnt.iconstyle.color = "ff" + color[1:]
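A short usage sketch, assuming the surrounding class wraps a simplekml.Kml object as self.kml and mirroring the attribute access used above. KML expects coordinates as (longitude, latitude), which is why each stored (lat, long) point is reversed:

import simplekml

kml = simplekml.Kml()
lat_long, title = (37.77, -122.42), 'San Francisco'   # stored as (lat, long)
pnt = kml.newpoint(description=title, coords=[tuple(reversed(lat_long))])
pnt.iconstyle.icon.href = "http://maps.google.com/mapfiles/kml/shapes/shaded_dot.png"
pnt.iconstyle.color = "ff" + "#00ff00"[1:]            # alpha prefix + colour without '#'
kml.save('points.kml')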
Example #34
 def test_append(self):
     self.crowd.append(self.cluster, test_time + timedelta(days=1))
     self.assertEqual([
         GeneralMethods.getEpochFromDateTimeObject(test_time),
         GeneralMethods.getEpochFromDateTimeObject(test_time +
                                                   timedelta(days=1))
     ], sorted(self.crowd.clusters.keys()))
     self.assertEqual(
         StreamCluster,
         type(self.crowd.clusters[GeneralMethods.getEpochFromDateTimeObject(
             test_time)]))
     self.assertEqual(2, self.crowd.lifespan)
     self.assertEqual(
         getStringRepresentationForTweetTimestamp(test_time),
         getStringRepresentationForTweetTimestamp(self.crowd.startTime))
     self.assertEqual(
         getStringRepresentationForTweetTimestamp(test_time +
                                                  timedelta(days=1)),
         getStringRepresentationForTweetTimestamp(self.crowd.endTime))
    def sampleCrowds(self):
        # Set dates for experts as startingDay=datetime(2011,3,19), endingDay=datetime(2011,3, 30) with a minimum of 7 users at a time.
        AnalyzeData.reset(), AnalyzeData.constructCrowdDataStructures(self.stream_settings['data_iterator'])
        fig = plt.figure(); ax = fig.gca()
#        expectedTags = set(['#redsox', '#mlb', '#sfgiants', '#49ers', '#mariners', '#twins', '#springtraining', '#mets', '#reds'])
#        expectedTags = set(['#ctia']); title = 'CTIA 2011'
#        expectedTags = set(['#55', '#hcr', '#hcrbday', '#oklahomas', '#aca', '#hcworks', '#npr', '#teaparty'])
#        expectedTags = set(['#budget11', '#taxdodgers', '#budget', '#pmqs', '#budget11', '#indybudget'])
#        expectedTags = set(['#egypt2dc', '#libyan', '#yemen', '#egypt', '#syria', '#gaddaficrimes', '#damascus', '#jan25', 
#                '#daraa', '#feb17', '#gaddafi', '#libya', '#feb17', '#gadhafi', '#muslimbrotherhood', '#gaddafis']); title = 'Middle East'
        expectedTags = set(['#libya']); title = 'Libya'
        for crowd in self._filteredCrowdIterator():
            if expectedTags.intersection(set(list(crowd.hashtagDimensions))):
                x, y = zip(*[(datetime.fromtimestamp(clusterGenerationTime), len(crowd.clusters[clusterGenerationTime].documentsInCluster)) for clusterGenerationTime in sorted(crowd.clusters)])
                plt.plot_date(x, y, '-', color=GeneralMethods.getRandomColor(), lw=2, label=' '.join([crowd.crowdId]+list(crowd.hashtagDimensions)[:1]))
        fig.autofmt_xdate(rotation=30)
        ax.xaxis.set_major_locator(matplotlib.dates.HourLocator(interval=24))
        ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter('%a %d %b'))
#        plt.legend()
        plt.xlim((datetime(2011, 3, 19), datetime(2011, 3, 30)))
        plt.title(getLatexForString('Crowds for '+title))
        plt.ylabel(getLatexForString('Crowd size'))
        plt.show()
def iteratePhrases():
    for tweet in TweetFiles.iterateTweetsFromGzip('/mnt/chevron/kykamath/data/twitter/tweets_by_trends/2011_2_6.gz'):
        message = TwitterCrowdsSpecificMethods.convertTweetJSONToMessage(tweet, **settings)
        if message.vector:
            for phrase in message.vector: 
                if phrase!='': yield (phrase, GeneralMethods.approximateEpoch(GeneralMethods.getEpochFromDateTimeObject(message.timeStamp), 60))
Example #37
 def append(self, cluster, clusterFormationTime):
     self.clusters[GeneralMethods.getEpochFromDateTimeObject(
         clusterFormationTime)] = cluster
def copy_file(input_file, output_file):
    command = 'cp %s %s' % (input_file, output_file)
    GeneralMethods.runCommand(command)
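For reference, the same copy without going through the shell (standard library only; the shell version above needs both paths to be shell-safe):

import shutil

def copy_file_without_shell(input_file, output_file):
    shutil.copy(input_file, output_file)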