Example #1
    def measureRankingQuality(iterationData=None, experimentFileName=None):
        #        def getTopTopics(model, noOfTopics):
        #            topics = set()
        #            topTopics = model.topTopics[:]
        #            while True:
        #                topicIndex = GeneralMethods.weightedChoice([i[1] for i in topTopics])
        #                topic = topTopics[topicIndex][0].id
        #                del topTopics[topicIndex]
        #                if topic not in topics: topics.add(topic)
        #                if len(topics)==noOfTopics or len(topics)==len(model.topTopics): break
        #            return [(t, 0) for t in topics]

        if iterationData:
            currentTimeStep, model, _, _, finalCall, conf = iterationData
            if not finalCall:
                rankingMethods = conf["rankingMethods"]
                experimentFileName = conf["experimentFileName"]
                topTopics = sorted(model.topicsDistributionInTheTimeSet.iteritems(), key=itemgetter(1), reverse=True)[
                    :10
                ]
                #                topTopics = getTopTopics(model, 10)
                #                topTopics = random.sample(sorted(model.topicsDistributionInTheTimeSet.iteritems(), key=itemgetter(1), reverse=True)[:10], min(len(model.topicsDistributionInTheTimeSet),5))
                #                topTopics = random.sample(model.topicsDistributionInTheTimeSet.items(), min(len(model.topicsDistributionInTheTimeSet),5))
                iterationData = {"currentTimeStep": currentTimeStep, "spammmess": defaultdict(list)}
                for rankingMethod in rankingMethods:
                    for queryTopic, _ in topTopics:
                        ranking_id, messages = rankingMethod(queryTopic, model.topicToMessagesMap, **conf)
                        #                        if spammness(messages, norm_k)==0:
                        #                            print 'c'
                        #                        print rankingMethod, spammness(messages, norm_k)
                        iterationData["spammmess"][ranking_id].append(spammness(messages, norm_k))
                #                        print ranking_id, spammness(messages, norm_k)
                FileIO.writeToFileAsJson(iterationData, experimentFileName)
                model.topicsDistributionInTheTimeSet = defaultdict(int)
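Almost every example below appends one record per call with FileIO.writeToFileAsJson and reads records back with FileIO.iterateJsonFromFile. A minimal sketch of JSON-lines helpers with that assumed behavior (the real FileIO utilities may also create directories and handle extra types):

    import json

    def writeToFileAsJson(obj, fileName):
        # Assumed behavior: append obj as a single JSON line.
        with open(fileName, 'a') as f:
            f.write(json.dumps(obj) + '\n')

    def iterateJsonFromFile(fileName):
        # Assumed behavior: yield one decoded object per non-empty line.
        with open(fileName) as f:
            for line in f:
                line = line.strip()
                if line:
                    yield json.loads(line)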
Example #2
    def analyzeQuality(graphs, graphType):
        def getQualityScore(graphMap, edgesToKeep, timeDifference):
            dataToReturn = []
            for j, intervalInSeconds in enumerate([1]):
                intervalInSeconds*=timeDifference
                linearGraph = LocationGraphs.combineLocationGraphs(graphMap, startingGraphId, datetime.datetime.fromtimestamp(endingGraphId+1), intervalInSeconds, linear=True, edgesToKeep=edgesToKeep)
                logGraph = LocationGraphs.combineLocationGraphs(graphMap, startingGraphId, datetime.datetime.fromtimestamp(endingGraphId+1), intervalInSeconds, linear=False, edgesToKeep=edgesToKeep)
                linearClusters = [[str(c), [l[0]for l in lst]] for c, lst in groupby(sorted(clusterUsingAffinityPropagation(linearGraph)[1], key=itemgetter(1)), key=itemgetter(1))]
                logarithmicClusters = [[str(c), [l[0]for l in lst]] for c, lst in groupby(sorted(clusterUsingAffinityPropagation(logGraph)[1], key=itemgetter(1)), key=itemgetter(1))]
                score = LocationGraphs.getClusterQualityScore(linearClusters, logarithmicClusters)
                print intervalInSeconds, edgesToKeep, score
                dataToReturn.append(score)
            return dataToReturn
        graphFile = qualityMetricsFolder%graphType
        print graphFile
        GeneralMethods.runCommand('rm -rf %s'%graphFile)
        for edgesToKeep in range(1,11): 
#        for edgesToKeep in [1,10]: 
            edgesToKeep*=0.1
            graphMap = dict(graphs[:])
            startingGraphId, endingGraphId = min(graphMap.keys()), max(graphMap.keys())
            timeDifference = endingGraphId-startingGraphId
            LocationGraphs.updateLogarithmicGraphs(graphMap, edgesToKeep=edgesToKeep)
#            print {'edgesToKeep': edgesToKeep, 'score': np.mean(getQualityScore(graphMap, edgesToKeep, timeDifference))}
            FileIO.writeToFileAsJson({'edgesToKeep': edgesToKeep, 'score': np.mean(getQualityScore(graphMap, edgesToKeep, timeDifference))}, graphFile)
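The linearClusters and logarithmicClusters lists above use the common sort-then-groupby idiom over (node, clusterId)-style pairs produced by the clustering step. A small stand-alone illustration of that idiom (the pairs are made up):

    from itertools import groupby
    from operator import itemgetter

    assignments = [('u1', 0), ('u2', 1), ('u3', 0), ('u4', 1)]
    clusters = [[str(c), [node for node, _ in grp]]
                for c, grp in groupby(sorted(assignments, key=itemgetter(1)), key=itemgetter(1))]
    print(clusters)
    # [['0', ['u1', 'u3']], ['1', ['u2', 'u4']]]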
Example #3
    def trendCurves(iterationData=None, experimentFileName=None):
        if iterationData: 
            currentTimeStep, _, currentTopics, _, finalCall, conf = iterationData
            experimentFileName = conf['experimentFileName']
            if not finalCall:
                topicDistribution = dict((str(topic.id), {'total': topic.totalCount, 'timeStep': topic.countDistribution[currentTimeStep]}) for topic in currentTopics)
#                print currentTimeStep
                FileIO.writeToFileAsJson({'t':currentTimeStep, 'topics':topicDistribution}, experimentFileName)
            else:
                iterationInfo  = {'trending_topics': [topic.id for topic in currentTopics if topic.stickiness>=stickinessLowerThreshold],
                      'topic_colors': dict((str(topic.id), topic.color) for topic in currentTopics),
                      'conf': conf}
                del iterationInfo['conf']['spamDectectionMethod']
                FileIO.writeToFileAsJson(iterationInfo, experimentFileName)
        else:
            topicsDataX = defaultdict(list)
            topicsDataY = defaultdict(list)
            for data in FileIO.iterateJsonFromFile(experimentFileName):
                if 'conf' not in data:
                    for topic in data['topics']: topicsDataX[topic].append(data['t']), topicsDataY[topic].append(data['topics'][topic]['timeStep'])
                else: topicColorMap=data['topic_colors']; trendingTopics=data['trending_topics']
            for topic in topicsDataX: plt.fill_between(topicsDataX[topic], topicsDataY[topic], color=topicColorMap[str(topic)], alpha=1.0)
            plt.figure()
            for topic in trendingTopics: plt.fill_between(topicsDataX[str(topic)], topicsDataY[str(topic)], color=topicColorMap[str(topic)], alpha=1.0)
            plt.ylabel('Number of Contents', fontsize=16, fontweight='bold')
            plt.show()
 def dimensionsEstimation(estimationObject, currentMessageTime):
     '''
     This method estimates the number of dimensions in the stream. To estimate it we calculate
     the number of phrases that need to be added every iteration for different dimensions.
     The dimension at which the number of phrases added stabilizes is the number of dimensions
     for the stream.

     Why do we need this?
     The aim is to get dimensions that don't change too often and, at the same time, are not very large.
     This experiment gives us an approximate idea of the number of dimensions. Randomly picking
     a small value will result in dimensions that are not good, and picking too big a value will
     result in inefficiency.
     '''
     def updatePhraseScore(phraseObject): 
         phraseObject.updateScore(currentMessageTime, 0, **estimationObject.stream_settings)
         return phraseObject
     topDimensionsDuringCurrentIteration = [p.text for p in Phrase.sort((updatePhraseScore(p) for p in estimationObject.phraseTextToPhraseObjectMap.itervalues()), reverse=True)]
     oldList, newList = estimationObject.topDimensionsDuringPreviousIteration, topDimensionsDuringCurrentIteration
     if estimationObject.topDimensionsDuringPreviousIteration:
         dimensions_estimation = {}
         for boundary in estimationObject.boundaries:
             if boundary < len(estimationObject.phraseTextToPhraseObjectMap): dimensions_estimation[str(boundary)] = len(set(newList[:boundary]).difference(oldList[:boundary]))
         print currentMessageTime, len(estimationObject.phraseTextToPhraseObjectMap)
         iterationData = {
                          'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
                          'total_number_of_phrases': len(estimationObject.phraseTextToPhraseObjectMap),
                          'settings': estimationObject.stream_settings.convertToSerializableObject(),
                          ParameterEstimation.dimensionsEstimationId:dimensions_estimation
                          }
         FileIO.writeToFileAsJson(iterationData, estimationObject.dimensionsEstimationFile)
     estimationObject.topDimensionsDuringPreviousIteration = topDimensionsDuringCurrentIteration[:]
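The core of dimensionsEstimation is the boundary-wise set difference between the current and previous top-dimension lists. A self-contained illustration of that metric (the lists and boundaries are invented for the example):

    def newDimensionsAtBoundaries(oldList, newList, boundaries):
        # For each boundary b, count how many of the current top-b dimensions
        # were absent from the previous top-b list.
        return dict((str(b), len(set(newList[:b]).difference(oldList[:b])))
                    for b in boundaries)

    oldList = ['a', 'b', 'c', 'd', 'e']
    newList = ['a', 'c', 'b', 'f', 'g']
    print(newDimensionsAtBoundaries(oldList, newList, [2, 5]))
    # {'2': 1, '5': 2}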
Example #5
    def generate_tuo_location_and_tuo_neighbor_location_and_pure_influence_score(models_ids, startTime, endTime, outputFolder, hashtag_tag):
        for model_id in models_ids:
#            if w_extra_hashtags: output_file = tuo_location_and_tuo_neighbor_location_and_pure_influence_score_file%(model_id, hashtag_tag)
#            else: output_file = tuo_location_and_tuo_neighbor_location_and_pure_influence_score_file%(model_id, wout_extra_hashtags_tag)
            output_file = tuo_location_and_tuo_neighbor_location_and_pure_influence_score_file%(model_id, hashtag_tag)
            GeneralMethods.runCommand('rm -rf %s'%output_file)
            for line_count, location_object in enumerate(iterateJsonFromFile(
                     location_objects_file%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
                     )):
                print line_count, model_id
                tuo_neighbor_location_and_pure_influence_score = []
                location_hashtag_set = set(location_object['hashtags'])
                for neighbor_location, mf_hashtag_to_tuo_occurrences_and_time_range in location_object['links'].iteritems():
                    pure_influence_scores = []
                    for hashtag, (neighbor_location_occurrences, time_range) in mf_hashtag_to_tuo_occurrences_and_time_range.iteritems():
                        if hashtag in location_object['hashtags']:
                            location_occurrences = location_object['hashtags'][hashtag][0]
                            pure_influence_scores.append(MF_INFLUENCE_MEASURING_MODELS_TO_MODEL_ID[model_id](location_occurrences, neighbor_location_occurrences))
                    neighbor_location_hashtag_set = set(mf_hashtag_to_tuo_occurrences_and_time_range.keys())
                    if hashtag_tag==w_extra_hashtags_tag:
                        for hashtag in location_hashtag_set.difference(neighbor_location_hashtag_set): pure_influence_scores.append(1.0)
                        for hashtag in neighbor_location_hashtag_set.difference(location_hashtag_set): pure_influence_scores.append(-1.0)
                    mean_pure_influence_score = np.mean(pure_influence_scores)
                    tuo_neighbor_location_and_pure_influence_score.append([neighbor_location, mean_pure_influence_score])
                tuo_neighbor_location_and_pure_influence_score = sorted(tuo_neighbor_location_and_pure_influence_score, key=itemgetter(1))
                FileIO.writeToFileAsJson([location_object['id'], tuo_neighbor_location_and_pure_influence_score], output_file)
def build(numberOfTimeUnits=24):
    validLattices = set()
    for data in FileIO.iterateJsonFromFile(hashtagsLatticeGraphFile%('world','%s_%s'%(2,11))): validLattices.add(data['id'])
    documents, lattices = [], set()
    for h in FileIO.iterateJsonFromFile(hashtagsFile%('training_world','%s_%s'%(2,11))): 
        hashtag, document = Hashtag(h), []
        if hashtag.isValidObject():
            for timeUnit, occs in enumerate(hashtag.getOccrancesEveryTimeWindowIterator(HashtagsClassifier.CLASSIFIER_TIME_UNIT_IN_SECONDS)):
                occs = filter(lambda t: t[0] in validLattices, occs)
                occs = sorted(occs, key=itemgetter(0))
                if occs: 
                    for lattice in zip(*occs)[0]: lattices.add(lattice)
                document.append([timeUnit, [(k, len(list(i))) for k, i in groupby(occs, key=itemgetter(0))]])
            if document: documents.append(document)
    lattices = sorted(list(lattices))
    print len(lattices)
    documents = [(d, TargetSelectionRegressionClassifier.getPercentageDistributionInLattice(d)) for d in documents]
    documents = documents[:int(len(documents)*0.80)]
    for decisionTimeUnit in range(1, numberOfTimeUnits+1):
        for latticeCount, predictingLattice in enumerate(lattices):
            print decisionTimeUnit, latticeCount,
            inputVectors, outputValues = [], []
            for rawDocument, processedDocument in documents:
                documentForTimeUnit = TargetSelectionRegressionClassifier.getPercentageDistributionInLattice(rawDocument[:decisionTimeUnit])
                if documentForTimeUnit and processedDocument:
                    vector =  [documentForTimeUnit.get(l, 0) for l in lattices]
                    inputVectors.append(vector), outputValues.append(float(processedDocument.get(predictingLattice, 0)))
#            TargetSelectionRegressionClassifier(decisionTimeUnit=decisionTimeUnit, predictingLattice=predictingLattice).build(zip(inputVectors, outputValues))
            TargetSelectionRegressionSVMRBFClassifier(decisionTimeUnit=decisionTimeUnit, predictingLattice=predictingLattice).build(zip(inputVectors, outputValues))
Example #7
 def generate_tuo_location_and_tuo_neighbor_location_and_mf_influence_type_and_similarity(model_ids, startTime, endTime, outputFolder):
     def location_similarity(location_vector_1, location_vector_2): 
         return reduce(lambda total, k: total+(location_vector_1.get(k,0)*location_vector_2.get(k,0)), set(location_vector_1.keys()).union(location_vector_2.keys()),0.)
     influence_types=[InfluenceMeasuringModels.TYPE_COMPLETE_INFLUENCE, InfluenceMeasuringModels.TYPE_OUTGOING_INFLUENCE, InfluenceMeasuringModels.TYPE_INCOMING_INFLUENCE]
     for model_id in model_ids:
         mf_location_to_mf_influence_type_to_influence_vector = dict(Experiments.load_tuo_location_and_mf_influence_type_to_influence_vector(model_id))
         GeneralMethods.runCommand('rm -rf %s'%tuo_location_and_tuo_neighbor_location_and_mf_influence_type_and_similarity_file%model_id)
         for line_count, location_object in enumerate(iterateJsonFromFile(
                      location_objects_file%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
                  )):
             print line_count
             location = location_object['id']
             tuo_neighbor_location_and_mf_influence_type_and_similarity = []
             for neighbor_location in location_object['links'].keys(): 
                 mf_influence_type_and_similarity = {}
                 for influence_type in influence_types:
                     similarity = location_similarity( 
                                                          mf_location_to_mf_influence_type_to_influence_vector[location][influence_type],
                                                          mf_location_to_mf_influence_type_to_influence_vector[neighbor_location][influence_type]
                                                   )
                     mf_influence_type_and_similarity[influence_type] = similarity
                 so_hashtags_for_location = set(location_object['hashtags'].keys())
                 so_hashtags_for_neighbor_location = set(location_object['links'][neighbor_location].keys())
                 numerator = len(so_hashtags_for_location.intersection(so_hashtags_for_neighbor_location)) + 0.
                 denominator = len(so_hashtags_for_location.union(so_hashtags_for_neighbor_location)) + 0.
                 mf_influence_type_and_similarity[JACCARD_SIMILARITY] = numerator/denominator                
                 tuo_neighbor_location_and_mf_influence_type_and_similarity.append([neighbor_location, mf_influence_type_and_similarity])
             FileIO.writeToFileAsJson(
                                      [location, tuo_neighbor_location_and_mf_influence_type_and_similarity],
                                      tuo_location_and_tuo_neighbor_location_and_mf_influence_type_and_similarity_file%model_id
                                      )
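location_similarity above is an unnormalized dot product over sparse hashtag-weight vectors, and the Jaccard score is computed directly from the two hashtag sets. A compact, self-contained sketch of both computations (the dictionary contents are illustrative only):

    def location_similarity(v1, v2):
        # Unnormalized dot product over the union of keys of two sparse vectors.
        return sum(v1.get(k, 0.0) * v2.get(k, 0.0) for k in set(v1).union(v2))

    def jaccard_similarity(s1, s2):
        # |intersection| / |union| of two hashtag sets.
        return len(s1 & s2) / float(len(s1 | s2))

    v1, v2 = {'#a': 1.0, '#b': 2.0}, {'#b': 3.0, '#c': 1.0}
    print(location_similarity(v1, v2))             # 6.0
    print(jaccard_similarity(set(v1), set(v2)))    # 0.333...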
 def generate(self):
     i=0
     for tweet in TwitterIterators.iterateTweetsFromExperts(): 
         FileIO.writeToFileAsJson(tweet, self.fileName)
         i+=1
         if i==self.length: break;
     os.system('gzip %s'%self.fileName)
 def generateStatsForMRKMeansClusteringQuality():
     for i in [90000, 100000, 200000, 300000, 400000, 500000]: 
         print 'Generating stats for: ',i
         tf = TweetsFile(i, **experts_twitter_stream_settings)
         FileIO.writeToFileAsJson({'mr_k_means': tf.generateStatsForKMeansMRClustering(), 
                                   'settings': Settings.getSerialzedObject(tf.stream_settings)}, 
                                   TweetsFile.mr_stats_file)
 def dimensionsUpdateFrequencyEstimation(estimationObject, currentMessageTime):
     '''
     Observe the new dimensions that get added to the current dimensions if the dimensions
     are being updated at regular intervals.
     For example, the number of dimensions being added after 10m, 20m, ... 5 hours.
     As time increases, the number of 'decayed' dimensions increases. The current dimensions
     have a lot of unwanted decayed dimensions. Using this information, identify the time
     interval that is best suited to refresh dimensions.
     Tentative: we decide to pick the time interval at which the rate of decay is maximum.
     '''
     def updatePhraseScore(phraseObject): 
         phraseObject.updateScore(currentMessageTime, 0, **estimationObject.stream_settings)
         return phraseObject
     dimensions = estimationObject.stream_settings['dimensions']
     newList = [p.text for p in Phrase.sort((updatePhraseScore(p) for p in estimationObject.phraseTextToPhraseObjectMap.itervalues()), reverse=True)][:dimensions]
     print currentMessageTime, len(newList)
     if len(newList) >= dimensions:
         idsOfDimensionsListToCompare = [(i, GeneralMethods.approximateToNearest5Minutes(currentMessageTime - i)) for i in estimationObject.dimensionUpdateTimeDeltas if GeneralMethods.approximateToNearest5Minutes(currentMessageTime - i) in estimationObject.dimensionListsMap]
         dimensionsUpdateFrequency = {}
         for td, id in idsOfDimensionsListToCompare:
             oldList = estimationObject.dimensionListsMap[id]
             dimensionsUpdateFrequency[str(td.seconds)] = len(set(newList).difference(oldList))
         print len(estimationObject.dimensionListsMap), currentMessageTime, len(newList), [(k, dimensionsUpdateFrequency[k]) for k in sorted(dimensionsUpdateFrequency)]
         iterationData = {
                          'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
                          'total_number_of_phrases': len(estimationObject.phraseTextToPhraseObjectMap),
                          'settings': pprint.pformat(estimationObject.stream_settings),
                           ParameterEstimation.dimensionsUpdateFrequencyId:dimensionsUpdateFrequency
                          }
         FileIO.writeToFileAsJson(iterationData, estimationObject.dimensionsUpdateFrequencyFile)
         estimationObject.dimensionListsMap[GeneralMethods.approximateToNearest5Minutes(currentMessageTime)] = newList[:]
         for key in estimationObject.dimensionListsMap.keys()[:]:
             if currentMessageTime - key > estimationObject.dimensionUpdateTimeDeltas[-1]: del estimationObject.dimensionListsMap[key]
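dimensionsUpdateFrequencyEstimation keys its history map by GeneralMethods.approximateToNearest5Minutes(currentMessageTime). A minimal sketch of such a helper, assuming it truncates a datetime to the previous 5-minute boundary (the real implementation may round differently):

    import datetime

    def approximateToNearest5Minutes(dt):
        # Assumed behavior: truncate to the previous 5-minute boundary.
        return dt.replace(minute=dt.minute - dt.minute % 5, second=0, microsecond=0)

    print(approximateToNearest5Minutes(datetime.datetime(2011, 3, 19, 10, 43, 27)))
    # 2011-03-19 10:40:00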
Example #11
    def generate_data_for_significant_nei_utm_ids():
        output_file = GeneralMethods.get_method_id()+'.json'
        so_hashtags, mf_utm_id_to_valid_nei_utm_ids = set(), {}
        for utm_object in \
                FileIO.iterateJsonFromFile(f_hashtags_by_utm_id, True):
            for hashtag, count in utm_object['mf_hashtag_to_count'].iteritems():
                if hashtag!='total_num_of_occurrences': so_hashtags.add(hashtag)
            mf_utm_id_to_valid_nei_utm_ids[utm_object['utm_id']] =\
                                                            utm_object['mf_nei_utm_id_to_common_h_count'].keys()
        hashtags = sorted(list(so_hashtags))
        mf_utm_id_to_vector = {}
        for utm_object in FileIO.iterateJsonFromFile(f_hashtags_by_utm_id, True):
#                print i, utm_object['utm_id']
            utm_id_vector =  map(lambda hashtag: utm_object['mf_hashtag_to_count'].get(hashtag, 0.0),
                                 hashtags)
            mf_utm_id_to_vector[utm_object['utm_id']] = robjects.FloatVector(utm_id_vector)
        for i, (utm_id, vector) in enumerate(mf_utm_id_to_vector.iteritems()):
            print '%s of %s'%(i+1, len(mf_utm_id_to_vector))
            ltuo_utm_id_and_vector = [(utm_id, vector)]
            for valid_nei_utm_id in mf_utm_id_to_valid_nei_utm_ids[utm_id]:
                if valid_nei_utm_id in mf_utm_id_to_vector and valid_nei_utm_id!=utm_id:
                    ltuo_utm_id_and_vector.append((valid_nei_utm_id, mf_utm_id_to_vector[valid_nei_utm_id]))
            od = rlc.OrdDict(sorted(ltuo_utm_id_and_vector, key=itemgetter(0)))
            df_utm_vectors = robjects.DataFrame(od)
            df_utm_vectors_json = R_Helper.get_json_for_data_frame(df_utm_vectors)
            dfm_dict = cjson.decode(df_utm_vectors_json)
            mf_utm_ids_to_utm_colnames = dict(zip(zip(*ltuo_utm_id_and_vector)[0], df_utm_vectors.colnames))
            utm_id_colname = mf_utm_ids_to_utm_colnames[utm_id]
            dfm_dict['prediction_variable'] = utm_id_colname
            dfm_dict['predictor_variables'] = filter(lambda colname: colname!=utm_id_colname,
                                                     df_utm_vectors.colnames)
            dfm_dict['mf_utm_colnames_to_utm_ids'] = dict(zip(df_utm_vectors.colnames, zip(*ltuo_utm_id_and_vector)[0]))
            FileIO.writeToFileAsJson(dfm_dict, output_file)
Example #12
 def generate_hashtag_specific_location_and_pure_influence_scores(test_models_ids):
     for test_model_id in test_models_ids:
         output_file = f_ltuo_hashtag_and_ltuo_location_and_pure_influence_score%(test_model_id)
         GeneralMethods.runCommand('rm -rf %s'%output_file)
         ltuo_hashtag_and_ltuo_location_and_occurrence_time = Experiments.load_ltuo_hashtag_and_ltuo_location_and_occurrence_time()
         for hashtag_count, (hashtag, ltuo_location_and_occurrence_time) in\
                 enumerate(ltuo_hashtag_and_ltuo_location_and_occurrence_time):
             ltuo_location_and_occurrence_times = [(location, sorted(zip(*ito_location_and_occurrence_time)[1]))
                                                     for location, ito_location_and_occurrence_time in
                                                         groupby(
                                                                 sorted(ltuo_location_and_occurrence_time, key=itemgetter(0)),
                                                                 key=itemgetter(0)
                                                         )
                                                 ] 
             print hashtag_count, test_model_id
             ltuo_location_and_pure_influence_score = []
             for location, location_occurrence_times in ltuo_location_and_occurrence_times:
                 pure_influence_scores = []
                 for neighbor_location, neighbor_location_occurrence_times in ltuo_location_and_occurrence_times:
                     if location!=neighbor_location:
                         pure_influence_score = MF_INFLUENCE_MEASURING_MODELS_TO_MODEL_ID[test_model_id](neighbor_location_occurrence_times, location_occurrence_times)
                         pure_influence_scores.append(pure_influence_score)
                 ltuo_location_and_pure_influence_score.append([location, np.mean(pure_influence_scores)])
             ltuo_location_and_pure_influence_score = sorted(ltuo_location_and_pure_influence_score, key=itemgetter(1))
             FileIO.writeToFileAsJson([hashtag, ltuo_location_and_pure_influence_score], output_file)
 def plotQualityWithKMeansAndSSA():
     del plotSettings["ssa_mr"]
     speedStats = dict([(k, {"f1": [], "nmi": [], "purity": []}) for k in plotSettings])
     for data in FileIO.iterateJsonFromFile(TweetsFile.stats_file):
         for k in speedStats:
             for metric in speedStats["ssa"]:
                 speedStats[k][metric].append(data[k][metric])
     for k in speedStats:
         del speedStats[k]["f1"]
     speedStats.update(dict([(k, {"f1": [], "nmi": [], "purity": []}) for k in kMeansPlotSettings]))
     k = "k_means"
     for data in FileIO.iterateJsonFromFile(TweetsFile.combined_stats_file):
         for metric in speedStats["k_means"]:
             speedStats[k][metric].append(data[k][metric])
     for k in speedStats:
         if "f1" in speedStats[k]:
             del speedStats[k]["f1"]
     dataForPlot = dict([(k, []) for k in speedStats])
     for k in speedStats:
         for k1 in speedStats[k]:
             dataForPlot[k] += [np.mean(speedStats[k][k1])]
     #        del dataForPlot['k_means']
     print dataForPlot
     ind, width = np.arange(2), 0.1
     rects, i = [], 1
     plotSettings.update(kMeansPlotSettings)
     for k in dataForPlot:
         rects.append(plt.bar(ind + i * width, dataForPlot[k], width, color=plotSettings[k]["color"]))
         i += 1
     plt.ylabel(getLatexForString("Score"))
     plt.title(getLatexForString("Clustering quality comparison for Streaming LSH with SSA"))
     plt.xticks(ind + 2 * width, ("$Purity$", "$NMI$"))
     plt.legend([r[0] for r in rects], [plotSettings[k]["label"] for k in plotSettings], loc=4)
     #        plt.show()
     plt.savefig("qualityComparisonAll.pdf")
Example #14
def drawAllCheckinPlotsByVisitingClassesUsingDemography(model, **conf):
    plotsFolder = conf['plotsFolder']+'byVisitingClassesUsingDemography/'
    for locationId, location in model.locationsCheckinsMap.iteritems():
        if location['checkins']: 
            locationObject = Location.getObjectFromDict(location['object'])
            plotsFile = '%s%s/%s'%(plotsFolder, Location.getLocationClassBasedOnVisitingProbability(locationObject),locationId+'.png')
            FileIO.createDirectoryForFile(plotsFile)
            checkinsByBinsAndDemographies = defaultdict(dict)
            demographColorMap = {}
            for day, binData in location['checkins'].iteritems():
                for bin, checkins in binData.iteritems():
                    bin=int(bin)
                    for user in checkins:
                        demographyId = model.userMap[user]['object']['demography_id']
                        demographColorMap[demographyId] = model.userMap[user]['object']['demography_color']
                        if bin not in checkinsByBinsAndDemographies[demographyId]: checkinsByBinsAndDemographies[demographyId][bin]=0
                        checkinsByBinsAndDemographies[demographyId][bin]+=1
#            for bin in checkinsByBinsAndDemographies:
#                for demographyId in demographColorMap:
#                    plt.scatter([bin], [checkinsByBinsAndDemographies[bin][demographyId]], color=demographColorMap[demographyId])
            for demographyId, data in checkinsByBinsAndDemographies.iteritems():
#                print smooth([data[k] for k in sorted(data)], 4)
                plt.fill_between(sorted(data.keys()), smooth([data[k] for k in sorted(data)], 10)[:len(data)], color=demographColorMap[demographyId], alpha=0.65)
#               plt.hist([k for k, v in checkinsByBins.iteritems() for i in range(v)], conf['noOfBinsPerDay'], normed=True)
            plt.title(str(locationObject.visitingProbability))
            plt.savefig(plotsFile)
            plt.clf()
 def modifiedClusterAnalysisMethod(hdStreamClusteringObject, currentMessageTime):
     global evaluation, previousTime
     currentTime = time.time()
     documentClusters = [
         cluster.documentsInCluster.keys()
         for k, cluster in hdStreamClusteringObject.clusters.iteritems()
         if len(cluster.documentsInCluster.keys()) >= experts_twitter_stream_settings["cluster_filter_threshold"]
     ]
     iteration_data = evaluation.getEvaluationMetrics(
         documentClusters,
         currentTime - previousTime,
         {
             "type": experts_twitter_stream_settings["dimensions_performance_type"],
             "dimensions": experts_twitter_stream_settings["dimensions"],
         },
     )
     iteration_data["no_of_observed_dimensions"] = len(hdStreamClusteringObject.phraseTextToPhraseObjectMap)
     previousTime = time.time()
     FileIO.writeToFileAsJson(iteration_data, JustifyDimensionsEstimation.stats_file)
     del iteration_data["clusters"]
     print currentMessageTime, iteration_data
     if experts_twitter_stream_settings["dimensions"] != 76819 and 2 * experts_twitter_stream_settings[
         "dimensions"
     ] <= len(hdStreamClusteringObject.phraseTextToPhraseObjectMap):
         raise Exception
Example #16
def mr_data_analysis(input_files_start_time, input_files_end_time, min_hashtag_occurrences):
#    output_file = f_tuo_normalized_occurrence_count_and_distribution_value%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tweet_count_stats%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_lid_and_distribution_value%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
    output_file = f_tuo_hashtag_and_occurrence_count_and_entropy_and_focus_and_coverage_and_peak%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_rank_and_average_percentage_of_occurrences%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)

#    output_file = f_tuo_iid_and_interval_stats%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_iid_and_perct_change_of_occurrences%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_normalized_iid_and_tuo_prct_of_occurrences_and_entropy_and_focus_and_coverage%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)

#    output_file = f_hashtag_objects%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)

#    output_file = f_tuo_lid_and_ltuo_other_lid_and_temporal_distance%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_lid_and_ltuo_other_lid_and_no_of_co_occurrences%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_high_accuracy_lid_and_distribution%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_no_of_hashtags_and_count%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_no_of_locations_and_count%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)

#    output_file = f_tuo_no_of_peak_lids_and_count%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)

    print PARAMS_DICT
#    runMRJob(MRAnalysis, output_file, getInputFiles(input_files_start_time, input_files_end_time), jobconf={'mapred.reduce.tasks':300})
    runMRJob(MRAnalysis, output_file, getPreprocessedHashtagsFile(), jobconf={'mapred.reduce.tasks':300})
    FileIO.writeToFileAsJson(PARAMS_DICT, output_file)
Example #17
def getLocationDistributionPlots(place):
    for clustering in iteraterUserClusterings(place):
        for location in locationToUserMapIterator(place): 
            print clustering[0], location['location']
            fileName=placesImagesFolder%place['name']+str(clustering[0])+'/'+ location['location'].replace(' ', '_').replace('.', '+')+'.png'
            FileIO.createDirectoryForFile(fileName)
            getPerLocationDistributionPlots(clustering, location, fileName)
Example #18
def generateRadiusSpots(radiusInMiles):
    graph = nx.Graph()
    spotsFile = radiusSpotsFolder+'%s'%(radiusInMiles)
    print 'Creating:', spotsFile
    for lid in locationIterator():
        for location in nearbyLocations(lid, radiusInMiles): graph.add_edge(location['_id'], lid)
    for locations in nx.connected_components(graph): FileIO.writeToFileAsJson({'venues': locations}, spotsFile)
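generateRadiusSpots groups venues into spots by building a graph of nearby locations and taking its connected components. A tiny self-contained networkx illustration of that grouping step (the venue ids are made up):

    import networkx as nx

    graph = nx.Graph()
    graph.add_edge('venue_1', 'venue_2')
    graph.add_edge('venue_2', 'venue_3')
    graph.add_edge('venue_4', 'venue_5')
    for component in nx.connected_components(graph):
        print(sorted(component))
    # ['venue_1', 'venue_2', 'venue_3']
    # ['venue_4', 'venue_5']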
 def writeTweetsForDay(currentDay):
     fileName = houston_data_folder+FileIO.getFileByDay(currentDay)
     for tweet in tweets.find({'ca': {'$gt':currentDay, '$lt': currentDay+timedelta(seconds=86399)}}, fields=['ca', 'tx', 'uid']):
         screenName = GenerateHoustonTweetsData.getScreenName(tweet['uid'])
         if screenName!=None: 
             data = {'id': tweet['_id'], 'text': tweet['tx'], 'created_at':getStringRepresentationForTweetTimestamp(tweet['ca']), 'user':{'screen_name': GenerateHoustonTweetsData.getScreenName(tweet['uid'])}}
             FileIO.writeToFileAsJson(data, fileName) 
     os.system('gzip %s'%fileName)
 def generateStatsForDefaultStreamSettings():
     for i in [10**3, 10**4, 10**5]: 
         for j in range(1, 10):
             print 'Generating stats for: ',i*j
             tf = TweetsFile(i*j, **default_experts_twitter_stream_settings)
             FileIO.writeToFileAsJson({'streaming_lsh': tf.generateStatsForStreamingLSHClustering(), 
                                       'settings': Settings.getSerialzedObject(tf.stream_settings)}, 
                                       TweetsFile.default_stats_file)
Example #21
 def writeHashtagsFile():
     hashtags = []
     for hashtagObject in FileIO.iterateJsonFromFile('/mnt/chevron/kykamath/data/geo/hashtags/analysis/all_world/2_11/hashtagsWithoutEndingWindow'):
         print hashtagObject.keys()
         exit()
         hashtags.append(hashtagObject['h'])
     hashtags=sorted(hashtags)
     for h in hashtags: FileIO.writeToFile(unicode(h).encode('utf-8'), 'hashtags')
Example #22
def writeCheckinSequenceGraphFile():   
    userSet = set([userVector['user'] for userVector in filteredUserIterator(minLocationsTheUserHasCheckedin, minUniqueUsersCheckedInTheLocation, fullRecord = True)])
    count, total = 1, len(userSet)
    for user in userSet:
        print user, count, total
        checkins = [(c['_id'], c['lid'], time.mktime(c['t'].timetuple())) for c in checkinsCollection.find({'u': user})]
        for i in GeneralMethods.getElementsInWindow(checkins, 2): FileIO.writeToFileAsJson([user, i], checkinSequenceGraphFile)
        count+=1
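writeCheckinSequenceGraphFile relies on GeneralMethods.getElementsInWindow(checkins, 2) to emit consecutive pairs of check-ins. A minimal sliding-window sketch with that assumed behavior (the helper's semantics are inferred from the call above):

    def getElementsInWindow(iterable, windowSize):
        # Assumed behavior: yield every run of windowSize consecutive elements.
        items = list(iterable)
        for i in range(len(items) - windowSize + 1):
            yield items[i:i + windowSize]

    print(list(getElementsInWindow([1, 2, 3, 4], 2)))
    # [[1, 2], [2, 3], [3, 4]]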
Example #23
 def run_job_on_hashtags_in_dfs(mr_class, output_file):
     job_conf={'mapred.reduce.tasks':500, 'mapred.task.timeout': 86400000}
     print 'Running map reduce with the following params:'
     pprint(PARAMS_DICT)
     print 'Hadoop job conf:'
     pprint(job_conf)
     runMRJob(mr_class, output_file, [f_hdfs_hashtags], jobconf=job_conf)
     FileIO.writeToFileAsJson(PARAMS_DICT, output_file)
 def generateStatsForQualityComparisonWithSSA():
     #        for length in [i*j for i in 10**3, 10**4, 10**5 for j in range(1, 10)]:
     for length in [1000000]:
         print "Generating stats for: ", length
         tf = TweetsFile(length, **experts_twitter_stream_settings)
         #            stats = {'ssa': tf.getStatsForSSA(), 'ssa_mr': tf.getStatsForSSAMR(), 'streaming_lsh': KMeansTweetsFile(length, **experts_twitter_stream_settings).generateStatsForStreamingLSHClustering(), 'settings': Settings.getSerialzedObject(tf.stream_settings)}
         stats = {"ssa_mr": tf.getStatsForSSAMR(), "settings": Settings.getSerialzedObject(tf.stream_settings)}
         FileIO.writeToFileAsJson(stats, TweetsFile.stats_file)
 def writeClusters(hdStreamClusteringObject, currentMessageTime):
     print '\n\n\nEntering:', currentMessageTime, len(hdStreamClusteringObject.phraseTextAndDimensionMap), len(hdStreamClusteringObject.phraseTextToPhraseObjectMap), len(hdStreamClusteringObject.clusters)
     iterationData = {'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
                      'clusters': map(TwitterCrowdsSpecificMethods.getClusterInMapFormat, [cluster for cluster, _ in sorted(StreamCluster.iterateByAttribute(hdStreamClusteringObject.clusters.values(), 'length'), key=itemgetter(1), reverse=True)]),
                      'settings': Settings.getSerialzedObject(hdStreamClusteringObject.stream_settings)
                      }
     FileIO.writeToFileAsJson(iterationData, hdStreamClusteringObject.stream_settings['lsh_clusters_folder']+FileIO.getFileByDay(currentMessageTime))
     print 'Leaving: ', currentMessageTime, len(hdStreamClusteringObject.phraseTextAndDimensionMap), len(hdStreamClusteringObject.phraseTextToPhraseObjectMap), len(hdStreamClusteringObject.clusters)
Example #26
    def run():
        for graphType, method in [\
#                                  (RandomGraphGenerator.fast_gnp_random_graph, RandomGraphGenerator.fastGnp),
#                                  (RandomGraphGenerator.erdos_renyi_graph, RandomGraphGenerator.erdosRenyi),
#                                  (RandomGraphGenerator.newman_watts_strogatz_graph, RandomGraphGenerator.nWS),
                                (RandomGraphGenerator.powerlaw_cluster_graph, RandomGraphGenerator.powerlawClusterGraph),
                                  ]:
            for i in range(1, 11): FileIO.writeToFileAsJson({'n': 100*i, 'graphs': method(1000*i)}, randomGraphsFolder%graphType)
Example #27
def generateLocationClusterData():
#    p = Pool()
#    totalLocations = len(list(locationClusterIterator()))
#    i=1
    for location in locationClusterIterator():
        location = clusterLocation(location)
#        print '%s of %s'%(i,totalLocations)
        FileIO.writeToFileAsJson(location, locationClustersFile)
Example #28
 def writeClusterKML():
     kml = SpotsKML()
     outputKMLFile='%s/clusters.kml'%placesAnalysisFolder%place['name']
     for data in FileIO.iterateJsonFromFile(placesUserClusterFeaturesFile%place['name']):
         clusterId, color, features = data
         kml.addLocationPointsWithTitles([(getLocationFromLid(f[0].replace('_', ' ')), f[2]) for f in features[:noOfFeatures]], color=color)
         FileIO.createDirectoryForFile(outputKMLFile)
         kml.write(outputKMLFile)
Example #29
 def writeUserClustersFile(place):
     print 'Generating clusters...'
     userVectors = GenerateDataFiles.getUserVectors(place)
     GeneralMethods.runCommand('rm -rf %s'%placesUserClustersFile%place['name'])
     clusterAssignments = Clustering.cluster(Clustering.EM, placesARFFFile%place['name'], userVectors, '-N -1')
 #    clusterAssignments = Clustering.cluster(Clustering.KMeans, placesARFFFile%place['name'], userVectors, '-N 2')
     for userId, userVector in userVectors.iteritems(): userVectors[userId] = {'userVector': userVector, 'clusterId': clusterAssignments[userId]}
     for data in userVectors.iteritems(): FileIO.writeToFileAsJson(data, placesUserClustersFile%place['name'])
 def run_job(mr_class, output_file, input_files_start_time, input_files_end_time):
     PARAMS_DICT['input_files_start_time'] = time.mktime(input_files_start_time.timetuple())
     PARAMS_DICT['input_files_end_time'] = time.mktime(input_files_end_time.timetuple())
     print 'Running map reduce with the following params:', pprint(PARAMS_DICT)
     runMRJob(mr_class,
              output_file,
              MRAnalysis.get_input_files_with_tweets(input_files_start_time, input_files_end_time),
              jobconf={'mapred.reduce.tasks':500})
     FileIO.writeToFileAsJson(PARAMS_DICT, output_file)
Example #31
 def iterateTweetsFromHouston(houstonDataStartTime=datetime(2010, 11, 1),
                              houstonDataEndTime=datetime(2011, 5, 30)):
     currentTime = houstonDataStartTime
     while currentTime <= houstonDataEndTime:
         for tweet in TwitterIterators.iterateFromFile(
                 houston_twitter_stream_settings.twitter_users_tweets_folder
                 + '%s.gz' % FileIO.getFileByDay(currentTime)):
             yield tweet
         currentTime += timedelta(days=1)
Example #32
def getStatsForSSA():
    batchSize = 10000
    default_experts_twitter_stream_settings['ssa_threshold'] = 0.75
    for id in range(21, 50):
        fileName = time_to_process_points + '%s/%s' % (batchSize, id)
        ts = time.time()
        sstObject = SimilarStreamAggregation(
            dict(iterateUserDocuments(fileName)),
            default_experts_twitter_stream_settings['ssa_threshold'])
        sstObject.estimate()
        #    documentClusters = list(sstObject.iterateClusters())
        iteration_data = {
            'iteration_time': time.time() - ts,
            'type': 'ssa',
            'number_of_messages': batchSize * (id + 1),
            'batch_size': batchSize
        }
        FileIO.writeToFileAsJson(iteration_data, ssa_stats_file)
def loadExperimentsData(experimentsData, file):
    for data in FileIO.iterateJsonFromFile(file):
        if data['purity'] > 0 and data['purity'] < 1:
            experimentsData[data['iteration_parameters']
                            ['type']]['iteration_time'].append(
                                data['iteration_time'])
            experimentsData[data['iteration_parameters']
                            ['type']]['quality'].append(data['purity'])
            experimentsData[data['iteration_parameters']
                            ['type']]['total_clusters'].append(
                                data['iteration_parameters']['total_clusters'])
Example #34
    def plotDimensionsEstimation(self, returnAxisValuesOnly=True):
        def calculateDimensionsFor(params, percentageOfNewDimensions):
            '''
            numberOfTimeUnits=10*24*12
            Experts stream [  1.17707899e+03   1.03794580e+00] 76819
            Houston stream [  2.73913900e+03   1.02758516e+00] 195731
            '''
            print getSmallestPrimeNumberGreaterThan(
                int(
                    CurveFit.inverseOfDecreasingExponentialFunction(
                        params, percentageOfNewDimensions)))

        dataDistribution = defaultdict(list)
        for line in FileIO.iterateJsonFromFile(self.dimensionsEstimationFile):
            for k, v in line[
                    ParameterEstimation.dimensionsEstimationId].iteritems():
                k = int(k)
                if k not in dataDistribution: dataDistribution[k] = [0., 0.]
                dataDistribution[k][0] += v
                dataDistribution[k][1] += 1
        x, y = [], []
        [(x.append(k),
          y.append((dataDistribution[k][0] / dataDistribution[k][1]) / k))
         for k in sorted(dataDistribution) if k > 1000]
        x, y = x[:numberOfTimeUnits], y[:numberOfTimeUnits]
        exponentialCurveParams = CurveFit.getParamsAfterFittingData(
            x, y, CurveFit.decreasingExponentialFunction, [1., 1.])
        print self.stream_settings[
            'plot_label'], exponentialCurveParams, calculateDimensionsFor(
                exponentialCurveParams, 0.01)
        plt.ylabel(getLatexForString('\% of decaying dimensions')), plt.xlabel(
            getLatexForString('\# of dimensions')
        ), plt.title(
            getLatexForString(
                'Dimension stability with increasing number of dimensions.'))
        plt.semilogy(
            x,
            y,
            'o',
            color=self.stream_settings['plot_color'],
            label=getLatexForString(self.stream_settings['plot_label']) +
            getLatexForString(' (%0.2fx^{-%0.2f})') %
            (exponentialCurveParams[0], exponentialCurveParams[1]),
            lw=2)
        plt.semilogy(x,
                     CurveFit.getYValues(
                         CurveFit.decreasingExponentialFunction,
                         exponentialCurveParams, x),
                     color=self.stream_settings['plot_color'],
                     lw=2)
        plt.legend()
        if returnAxisValuesOnly: plt.show()
Example #35
    def plotCDFClustersLagDistribution(self, returnAxisValuesOnly=True):
        '''
        This determines the time after which a cluster can be considered 
        decayed and hence removed.
        
        Experts stream [ 0.66002386  0.07035227] 0.1 82
        Houston stream [ 0.73800037  0.05890473] 0.1 29
        
        458 (# of time units) Experts stream [ 0.66002386  0.07035227] 0.2 15
        71 (# of time units) Houston stream [ 0.73756656  0.05883258] 0.2 3
        
        '''
        def calculateInActivityTimeFor(params, probabilityOfInactivity):
            return int(
                CurveFit.inverseOfIncreasingExponentialFunction(
                    params, 1 - probabilityOfInactivity))

        data = list(
            FileIO.iterateJsonFromFile(self.hdsClustering.stream_settings[
                '%s_file' %
                ClusteringParametersEstimation.clusterLagDistributionId]))[-1]
        total = float(
            sum(data['lag_between_streams_added_to_cluster'].values()))
        x = sorted(
            map(int, data['lag_between_streams_added_to_cluster'].keys()))
        y = getCumulativeDistribution([
            data['lag_between_streams_added_to_cluster'][str(i)] / total
            for i in x
        ])
        exponentialCurveParams = CurveFit.getParamsAfterFittingData(
            x, y, CurveFit.increasingExponentialFunction, [1., 1.])
        print self.stream_settings[
            'plot_label'], exponentialCurveParams, calculateInActivityTimeFor(
                exponentialCurveParams, 0.2)
        plt.plot(x,
                 y,
                 'o',
                 label=getLatexForString(self.stream_settings['plot_label']) +
                 getLatexForString(' (%0.2fx^{%0.2f})') %
                 (exponentialCurveParams[0], exponentialCurveParams[1]),
                 color=self.stream_settings['plot_color'])
        plt.plot(x,
                 CurveFit.getYValues(CurveFit.increasingExponentialFunction,
                                     exponentialCurveParams, x),
                 color=self.stream_settings['plot_color'],
                 lw=2)
        plt.ylabel(r'$P\ (\ lag\ \leq\ TU\ )$'), plt.xlabel(
            getLatexForString(xlabelTimeUnits)), plt.title(
                getLatexForString('CDF for clusters lag distribution.'))
        plt.ylim((0, 1.2))
        plt.legend(loc=4)
        if returnAxisValuesOnly: plt.show()
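This plot, and plotPercentageOfDimensionsWithinALag further below, turn normalized per-bucket values into a CDF with getCumulativeDistribution. A running-sum sketch, assuming that is all the helper does:

    def getCumulativeDistribution(probabilities):
        # Assumed behavior: running sum of already-normalized probabilities.
        total, cdf = 0.0, []
        for p in probabilities:
            total += p
            cdf.append(total)
        return cdf

    print(getCumulativeDistribution([0.25, 0.25, 0.5]))
    # [0.25, 0.5, 1.0]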
Example #36
 def thresholdForDocumentToBeInCluterEstimation(stats_file,
                                                **stream_settings):
     ''' Estimate the threshold for the clusters by varying the threshold_for_document_to_be_in_cluster value.
     Run this on a document set of size 100K. 
     '''
     for length in [
             i * j for i in 10**3, 10**4, 10**5 for j in range(1, 10)
     ]:
         #            for t in range(1, 16):
         for t in range(16, 21):
             stream_settings[
                 'threshold_for_document_to_be_in_cluster'] = t * 0.05
             print length, stream_settings[
                 'threshold_for_document_to_be_in_cluster']
             stats = {
                 'streaming_lsh':
                 KMeansTweetsFile(length, **stream_settings).
                 generateStatsForStreamingLSHClustering(),
                 'settings':
                 Settings.getSerialzedObject(stream_settings)
             }
             FileIO.writeToFileAsJson(stats, stats_file)
Example #37
 def writeARFFForClustering(data, relationName):
     keyToIdMap = {}
     fileName = '/tmp/' + relationName + '.arff'
     os.system('rm -rf %s' % fileName)
     for docId in sorted(data):
         docVector = data[docId]
         for k, v in docVector.iteritems():
             if k not in keyToIdMap: keyToIdMap[k] = len(keyToIdMap)
     FileIO.writeToFile(ARFF.getRelationLine(relationName), fileName)
     for attributeName in keyToIdMap:
         FileIO.writeToFile(ARFF.getAttributeLine(attributeName), fileName)
     FileIO.writeToFile('@data', fileName)
     for d in data.iteritems():
         FileIO.writeToFile(ARFF.getDataLine(d, keyToIdMap), fileName)
     return fileName
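writeARFFForClustering emits a Weka ARFF file through small ARFF.* helpers. A hedged sketch of what the header helpers presumably return, based on the standard ARFF layout (the exact output of ARFF.getDataLine, including its sparse encoding, is left as an assumption and not shown):

    def getRelationLine(relationName):
        # Standard ARFF header line.
        return '@relation %s' % relationName

    def getAttributeLine(attributeName):
        # Every attribute written by the method above is numeric.
        return '@attribute %s numeric' % attributeName

    print(getRelationLine('tweets'))      # @relation tweets
    print(getAttributeLine('phrase_1'))   # @attribute phrase_1 numeric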
 def plotClusteringSpeed(saveFig=True):
     plotSettings = {
         'k_means': {
             'label': 'Iterative k-means',
             'color': '#FD0006'
         },
         'mr_k_means': {
             'label': 'MR k-means',
             'color': '#5AF522'
         },
         'streaming_lsh': {
             'label': 'Stream CDA',
             'color': '#7109AA'
         },
     }
     dataToPlot = {
         'k_means': {
             'x': [],
             'y': []
         },
         'mr_k_means': {
             'x': [],
             'y': []
         },
         'streaming_lsh': {
             'x': [],
             'y': []
         }
     }
     for data in FileIO.iterateJsonFromFile(TweetsFile.combined_stats_file):
         for k in plotSettings:
             dataToPlot[k]['x'].append(data[k]['no_of_documents'])
             dataToPlot[k]['y'].append(data[k]['iteration_time'])
     for k in plotSettings:
         plt.loglog(dataToPlot[k]['x'],
                    dataToPlot[k]['y'],
                    label=plotSettings[k]['label'],
                    color=plotSettings[k]['color'],
                    lw=2)
     plt.legend(loc=4)
     if saveFig:
         plt.xlabel(getLatexForString('\# of documents'))
         plt.ylabel(getLatexForString('Running time (s)'))
         plt.title(
             getLatexForString(
                 'Running time comparison for Streaming LSH with k-Means'))
     plt.xlim(xmin=800, xmax=100000)
     plt.xticks([])
     #        plt.show()
     if saveFig: plt.savefig('speedComparisonWithKMeans.pdf')
Example #39
 def dimensionInActivityTimeEstimation(estimationObject,
                                       currentMessageTime):
     phrasesLagDistribution = defaultdict(int)
     for phraseObject in estimationObject.phraseTextToPhraseObjectMap.itervalues(
     ):
         lag = DateTimeAirthematic.getDifferenceInTimeUnits(
             currentMessageTime, phraseObject.latestOccuranceTime,
             estimationObject.stream_settings['time_unit_in_seconds'].
             seconds)
         phrasesLagDistribution[str(lag)] += 1
     print currentMessageTime
     iterationData = {
         'time_stamp':
         getStringRepresentationForTweetTimestamp(currentMessageTime),
         'settings':
         pprint.pformat(estimationObject.stream_settings),
         ParameterEstimation.dimensionInActivityTimeId:
         estimationObject.lagBetweenMessagesDistribution,
         'phrases_lag_distribution':
         phrasesLagDistribution
     }
     FileIO.writeToFileAsJson(iterationData,
                              estimationObject.dimensionInActivityTimeFile)
Example #40
 def iterateTweetsFromExperts(expertsDataStartTime=datetime(2011, 3, 19),
                              expertsDataEndTime=datetime(2011, 4, 12)):
     experts = getExperts()
     currentTime = expertsDataStartTime
     while currentTime <= expertsDataEndTime:
         for tweet in TwitterIterators.iterateFromFile(
                 experts_twitter_stream_settings.twitter_users_tweets_folder
                 + '%s.gz' % FileIO.getFileByDay(currentTime)):
             if tweet['user']['id_str'] in experts:
                 if getDateTimeObjectFromTweetTimestamp(
                         tweet['created_at']) <= expertsDataEndTime:
                     yield tweet
                 else:
                     return
         currentTime += timedelta(days=1)
 def modifiedClusterAnalysisMethod(hdStreamClusteringObject,
                                   currentMessageTime):
     global evaluation, previousTime
     currentTime = time.time()
     documentClusters = [
         cluster.documentsInCluster.keys()
         for k, cluster in hdStreamClusteringObject.clusters.iteritems()
         if len(cluster.documentsInCluster.keys()) >=
         experts_twitter_stream_settings['cluster_filter_threshold']
     ]
     iteration_data = evaluation.getEvaluationMetrics(
         documentClusters, currentTime - previousTime, {
             'type':
             experts_twitter_stream_settings['trie_type'],
             'total_clusters':
             len(hdStreamClusteringObject.clusters),
             'current_time':
             getStringRepresentationForTweetTimestamp(currentMessageTime)
         })
     previousTime = time.time()
     FileIO.writeToFileAsJson(iteration_data, JustifyTrie.stats_file)
     del iteration_data['clusters']
     print getStringRepresentationForTweetTimestamp(
         currentMessageTime), iteration_data
Example #42
def iterateUserDocuments(fileName):
    dataForAggregation = defaultdict(Vector)
    textToIdMap = defaultdict(int)
    for tweet in FileIO.iterateJsonFromFile(fileName):
        textVector = TwitterCrowdsSpecificMethods.convertTweetJSONToMessage(
            tweet, **default_experts_twitter_stream_settings).vector
        textIdVector = Vector()
        for phrase in textVector:
            if phrase not in textToIdMap:
                textToIdMap[phrase] = str(len(textToIdMap))
            textIdVector[textToIdMap[phrase]] = textVector[phrase]
        dataForAggregation[tweet['user']
                           ['screen_name'].lower()] += textIdVector
    for k, v in dataForAggregation.iteritems():
        yield k, v
Example #43
def streamingLSHClusteringDemo():
    clustering_settings = {'dimensions': 53,
                            'signature_length': 13,
                            'number_of_permutations': 5,
                            'threshold_for_document_to_be_in_cluster': 0.2}
    clustering=StreamingLSHClustering(**clustering_settings)
    docId = 0
    docsToOriginalClusterMap = {}
    for line in FileIO.iterateLinesFromFile('../data/streaming.dat'):
        document = createDocumentFromLine(docId, line)
        docsToOriginalClusterMap[docId] = document.clusterId
        docId+=1
        clustering.getClusterAndUpdateExistingClusters(document)
    clusterLabels = []
    for k, cluster in clustering.clusters.iteritems(): clusterLabels.append([docsToOriginalClusterMap[doc.docId] for doc in cluster.iterateDocumentsInCluster()])
    return EvaluationMetrics.getValueForClusters(clusterLabels, EvaluationMetrics.purity)
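The demo scores its clusters with EvaluationMetrics.purity over lists of ground-truth labels per cluster. A small sketch of the standard purity metric, assuming that is what EvaluationMetrics.getValueForClusters computes:

    from collections import Counter

    def purity(clusterLabels):
        # Fraction of documents that carry the majority ground-truth label
        # of the cluster they were assigned to.
        totalDocs = sum(len(labels) for labels in clusterLabels)
        majorityDocs = sum(Counter(labels).most_common(1)[0][1] for labels in clusterLabels)
        return majorityDocs / float(totalDocs)

    print(purity([['a', 'a', 'b'], ['b', 'b']]))
    # 0.8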
 def plotICDFDimensionsInactivityThreshold(self, returnAxisValuesOnly=True):
     ''' Plot P(inactivity > threshold time unit)
         Find time unit at which probability is low.
         Experts stream 0.25 129
         Houston stream 0.25 144
     '''
     dataX, dataY, total = set(), defaultdict(list), []
     for line in list(FileIO.iterateJsonFromFile(self.dimensionInActivityTimeFile)):
         data = dict((int(k), v) for k,v in line[ParameterEstimation.dimensionInActivityTimeId].iteritems())
         total.append(sum(data.values()))
         for i in data: dataY[i].append(data[i]); dataX.add(i)
     totalInstancesObserved=float(sum(total))
     x = sorted(dataX)
     y = getInverseCumulativeDistribution([sum(dataY[k])/totalInstancesObserved for k in x])
     plt.plot(x, y, label=getLatexForString(self.stream_settings['plot_label']), color=self.stream_settings['plot_color'], lw=2)
     plt.ylabel(r'$P\ (\ inactivity\ duration\ \geq\ \ inactivity\ duration\ threshold )$'), plt.xlabel(getLatexForString('Inactivity duration threshold')), plt.title(getLatexForString('Inactivity analysis for dimensions.'))
     plt.legend()
     if returnAxisValuesOnly: plt.show()
 def plotClusteringQuality():
     del plotSettings['ssa_mr']
     speedStats = dict([(k, {
         'f1': [],
         'nmi': [],
         'purity': []
     }) for k in plotSettings])
     for data in FileIO.iterateJsonFromFile(TweetsFile.stats_file):
         for k in speedStats:
             for metric in speedStats['ssa']:
                 speedStats[k][metric].append(data[k][metric])
     dataForPlot = dict([(k, []) for k in plotSettings])
     for k, v in speedStats.iteritems():
         print k
         for k1, v1 in v.iteritems():
             if type(v1[0]) != type([]):
                 print k1, '(%0.2f %0.2f)' % (np.mean(v1), np.var(v1))
                 dataForPlot[k] += [np.mean(v1)]
             else:
                 print k1, [
                     '(%0.2f %0.2f)' % (np.mean(z), np.var(z))
                     for z in zip(*v1)
                 ]
                 dataForPlot[k] += [np.mean(z) for z in zip(*v1)]
     ind, width = np.arange(5), 0.1
     rects, i = [], 0
     for k in dataForPlot:
         rects.append(
             plt.bar(ind + i * width,
                     dataForPlot[k],
                     width,
                     color=plotSettings[k]['color']))
         i += 1
     plt.ylabel(getLatexForString('Score'))
     plt.title(
         getLatexForString(
             'Clustering quality comparison for Streaming LSH with SSA'))
     plt.xticks(ind + width,
                ('$F$', '$Precision$', '$Recall$', '$Purity$', '$NMI$'))
     plt.legend([r[0] for r in rects],
                [plotSettings[k]['label'] for k in plotSettings],
                loc=4)
     #        plt.show()
     plt.savefig('qualityComparisonWithSSA.pdf')
Example #46
0
 def plotPercentageOfDimensionsWithinALag(self, returnAxisValuesOnly=True):
     '''
     This gives us the percentage of phrases we can lose every time we prune phrases.
     
     Measures the percentage of dimensions having lag less than TU.
     
     So at the end of the 10th day, almost y% of phrases can be removed, with some
     probability that they will not occur again.
     
     numberOfTimeUnits=10*24*12
     With 75% probability.
     Experts stream [ 0.0097055   0.81888514] 107 0.554497397565
     Houston stream [ 0.00943499  0.825918  ] 126 0.487757815615
     With 90% probability.
     Experts stream [ 0.0097055   0.81888514] 223 0.187150798756
     Houston stream [ 0.00943499  0.825918  ] 228 0.164007589276
     '''
     def calculatePercentageOfDecayedPhrasesFor(params, timeUnit): return 1 - CurveFit.increasingExponentialFunction(params, timeUnit)
     dataDistribution = {}
     currentTimeUnit = 0
     for data in list(FileIO.iterateJsonFromFile(self.dimensionInActivityTimeFile))[:numberOfTimeUnits]:
         totalDimensions = float(sum(data['phrases_lag_distribution'].values()))
         tempArray = []
         for k, v in data['phrases_lag_distribution'].iteritems():
             k = int(k)
             if k not in dataDistribution: dataDistribution[k] = [0] * numberOfTimeUnits
             dataDistribution[k][currentTimeUnit] = v / totalDimensions
             tempArray.append(v / totalDimensions)
         currentTimeUnit += 1
     x = sorted(dataDistribution)
     y = getCumulativeDistribution([np.mean(dataDistribution[k]) for k in x])
     params = CurveFit.getParamsAfterFittingData(x, y, CurveFit.increasingExponentialFunction, [1., 1.])
     print self.stream_settings['plot_label'], params,
     def subPlot(id, timeUnit):
         plt.subplot(id)
         print timeUnit, calculatePercentageOfDecayedPhrasesFor(params, timeUnit)
         plt.plot(x, y, 'o', label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{%0.2f})') % (params[0], params[1]), color=self.stream_settings['plot_color'])
         plt.plot(x, CurveFit.getYValues(CurveFit.increasingExponentialFunction, params, x), color=self.stream_settings['plot_color'], lw=2)
     if self.stream_settings['stream_id'] == 'experts_twitter_stream': subPlot(111, 107); plt.title(getLatexForString('Percentage of phrases within a lag'))
     else: subPlot(111, 126); plt.xlabel(getLatexForString(xlabelTimeUnits))
     plt.ylabel(r'$\%\ of\ phrases\ with\ lag\ \leq\ TU$')
     plt.legend(loc=4)
     if returnAxisValuesOnly: plt.show()
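The docstring's numbers can be reproduced directly from the fitted parameters, assuming CurveFit.increasingExponentialFunction has the power-law form a * x**b (which is what the '(%0.2fx^{%0.2f})' legend label suggests):

    def increasing_exponential(params, x):
        a, b = params
        return a * x ** b

    experts_params = [0.0097055, 0.81888514]  # Experts stream fit quoted above
    # Fraction of phrases NOT yet decayed after 107 time units:
    print(1 - increasing_exponential(experts_params, 107))  # ~0.554, as reported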
Example #47
0
 def plotICDFClustersLagDistribution(self, returnAxisValuesOnly=True):
     '''
     Experts stream 0.25 199
     Houston stream 0.25 152
     '''
     self.stream_settings['%s_file' % ClusteringParametersEstimation.clusterLagDistributionId] = self.stream_settings['parameter_estimation_folder'] + ClusteringParametersEstimation.clusterLagDistributionId
     dataX, dataY, total = set(), defaultdict(list), []
     for line in list(FileIO.iterateJsonFromFile(self.stream_settings['%s_file' % ClusteringParametersEstimation.clusterLagDistributionId])):
         print line.keys()
         data = dict((int(k), v) for k,v in line[ClusteringParametersEstimation.clusterLagDistributionId].iteritems())
         total.append(sum(data.values()))
         for i in data: dataY[i].append(data[i]); dataX.add(i)
     totalInstancesObserved=float(sum(total))
     x = sorted(dataX)
     y = getInverseCumulativeDistribution([sum(dataY[k])/totalInstancesObserved for k in x])
     plt.plot(x, y, label=getLatexForString(self.stream_settings['plot_label']), color=self.stream_settings['plot_color'], lw=2)
     if self.stream_settings['plot_label']=='Houston stream': plt.plot([0,x[-1]], [1, 0], '--', color='#5AF522', lw=2)
     plt.ylabel(r'$P\ (\ inactivity\ duration\ \geq\ \ inactivity\ duration\ threshold )$'), plt.xlabel(getLatexForString('Inactivity duration threshold')), plt.title(getLatexForString('Inactivity analysis for crowds.'))
     plt.legend()
     if returnAxisValuesOnly: plt.show()
Example #48
0
 def plotThresholdForDocumentToBeInCluster(self, statsFile):
     dataToPlot = dict(('%0.2f' % (t * 0.05), {'iteration_time':[], 'purity': [], 'nmi': []}) for t in range(1, 21))
     for data in FileIO.iterateJsonFromFile(statsFile):
         threshold = '%0.2f' % data['settings']['threshold_for_document_to_be_in_cluster']
         for k in dataToPlot[threshold]: dataToPlot[threshold][k] += [data['streaming_lsh'][k]]
     for t in dataToPlot:
         for k in dataToPlot[t]: dataToPlot[t][k] = np.mean(dataToPlot[t][k]) 
     dataX = sorted([float(i) for i in dataToPlot])[:-1]
     print dataX
     # Plot iteration time.
     plt.subplot(211)
     plt.plot(dataX, [dataToPlot['%0.2f' % x]['iteration_time'] for x in dataX], lw=2, color='k')
     plt.ylabel(getLatexForString('Time (s)'))
     plt.title(getLatexForString('Estimation of \epsilon^\prime for Stream SSA'))
     plt.subplot(212)
     for metric, label, color in [('nmi', 'NMI', '#F60018'), ('purity', 'Purity', '#25D500')]: plt.plot(dataX, [dataToPlot['%0.2f' % x][metric] for x in dataX], label=label, color=color, lw=2)
     plt.ylabel(getLatexForString('Score'))
     plt.xlabel(getLatexForString('Similarity threshold (\epsilon^\prime)'))
     plt.legend(loc=4)
     plt.show()
Example #49
0
 def plotJustifyDimensionsEstimation(self):
     runningTimeData, purityData = defaultdict(list), defaultdict(list)
     for data in FileIO.iterateJsonFromFile(JustifyDimensionsEstimation.stats_file):
         if data['iteration_parameters']['dimensions']<data['no_of_observed_dimensions']:
             no_of_dimensions = data['iteration_parameters']['dimensions']
             runningTimeData[no_of_dimensions].append(data['iteration_time']), purityData[no_of_dimensions].append(data['purity'])
     plt.subplot(111)
     dataX, dataY = [], []
     del purityData[169991]; del purityData[39989]
     plt.title('Impact of dimension estimation')
     for k in sorted(purityData): dataX.append(k), dataY.append(np.mean(purityData[k])) 
     plt.semilogx(dataX, [0.96]*len(dataX), '--', label='Top n dimensions', color='#7109AA', lw=2)
     plt.semilogx(dataX, [np.mean(dataY)]*len(dataX), '--', color='#5AF522', lw=2)
     plt.semilogx(dataX, dataY, '-x', label='Fixed dimensions', color='#5AF522', lw=2)
     plt.ylim(0.8, 1.0)
     plt.xlim(7000, 203000)
     plt.xlabel('# of dimensions')
     plt.ylabel('Purity')
     plt.legend(loc=3)
     plt.savefig('justifyDimensionsEstimation.pdf')
Example #50
0
 def plotCDFDimensionsLagDistribution(self, returnAxisValuesOnly=True):
     '''
     Inactivity time is the time after which there is a high probability that a
     dimension will not appear. Find time_unit that gives this probability. 
     
     Cumulative distribution function (http://en.wikipedia.org/wiki/Cumulative_distribution_function)
      lag = time between occurrences of two dimensions (similar to inactivity_time)
     
     F(time_unit) = P(lag<=time_unit)
     time_unit = F_inv(P(lag<=time_unit))
     
      Given P(inactivity_time > time_unit), determine time_unit as shown:
      P(inactivity_time <= time_unit) = 1 - P(inactivity_time > time_unit)
      inactivity_time = F_inv(P(inactivity_time <= time_unit))
     
     numberOfTimeUnits=10*24*12
     
     Experts stream [ 0.23250341  0.250209  ] 0.25 107
     Houston stream [ 0.16948096  0.30751358] 0.25 126
     
     Experts stream [ 0.23250341  0.250209  ] 0.1, 223
     Houston stream [ 0.16948096  0.30751358] 0.1, 228
     
      Compared to other values, these values are pretty close to each
      other. This is expected: irrespective of the size of the streams,
      the phrases have the same lifetime and hence decay close to each other.
     '''
     def calculateInActivityTimeFor(params, probabilityOfInactivity): return int(CurveFit.inverseOfIncreasingExponentialFunction(params, 1 - probabilityOfInactivity))
     data = list(FileIO.iterateJsonFromFile(self.dimensionInActivityTimeFile))[numberOfTimeUnits]
     total = float(sum(data[ParameterEstimation.dimensionInActivityTimeId].values()))
     x = sorted(map(int, data[ParameterEstimation.dimensionInActivityTimeId].keys()))
     y = getCumulativeDistribution([data[ParameterEstimation.dimensionInActivityTimeId][str(i)] / total for i in x])
     print len(x)
     exponentialCurveParams = CurveFit.getParamsAfterFittingData(x, y, CurveFit.increasingExponentialFunction, [1., 1.])
     print self.stream_settings['plot_label'], exponentialCurveParams, calculateInActivityTimeFor(exponentialCurveParams, 0.1) 
     plt.plot(x, y, 'o', label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{%0.2f})') % (exponentialCurveParams[0], exponentialCurveParams[1]), color=self.stream_settings['plot_color'])
     plt.plot(x, CurveFit.getYValues(CurveFit.increasingExponentialFunction, exponentialCurveParams, x), color=self.stream_settings['plot_color'], lw=2)
     plt.ylabel(r'$P\ (\ lag\ \leq\ TU\ )$'), plt.xlabel(getLatexForString(xlabelTimeUnits)), plt.title(getLatexForString('CDF for dimension lag distribution.'))
     plt.ylim((0, 1.2))
     plt.legend(loc=4)
     if returnAxisValuesOnly: plt.show()
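The inactivity threshold quoted in the docstring can be checked by inverting the fitted power law by hand, assuming CurveFit.inverseOfIncreasingExponentialFunction solves y = a * x**b for x, i.e. x = (y / a) ** (1 / b):

    def inverse_increasing_exponential(params, y):
        a, b = params
        return (y / a) ** (1.0 / b)

    experts_params = [0.23250341, 0.250209]  # Experts stream fit quoted above
    # Time unit at which P(inactivity > time_unit) drops to 0.25:
    print(int(inverse_increasing_exponential(experts_params, 1 - 0.25)))  # 107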
Example #51
0
    def plotJustifyDimensionsEstimation2(self):
        pltInfo =  {JustifyDimensionsEstimation.top_n_dimension: {'label': 'Temporally significant', 'color': '#7109AA', 'type': '-', 'marker': 'x'}, JustifyDimensionsEstimation.first_n_dimension: {'label': 'By occurrence', 'color': '#5AF522', 'type': '-', 'marker': 'o'}}
#        experimentsData = {JustifyMemoryPruning.with_memory_pruning: {'iteration_time': [], 'quality': [], 'total_clusters': []}, JustifyMemoryPruning.without_memory_pruning: {'iteration_time': [], 'quality': [], 'total_clusters': []}}
        experimentsData = {JustifyDimensionsEstimation.top_n_dimension: defaultdict(dict), JustifyDimensionsEstimation.first_n_dimension: defaultdict(dict)}
        for data in FileIO.iterateJsonFromFile(JustifyDimensionsEstimation.stats_file_2):
#        for data in FileIO.iterateJsonFromFile('temp/dimensions_need_analysis_2'):
#            if 'dimensions' in data['iteration_parameters']: 
            dimension = data['iteration_parameters']['dimensions']
            type = data['iteration_parameters']['type']
            if dimension not in experimentsData[type]: experimentsData[type][dimension] = {'iteration_time': [], 'quality': [], 'total_clusters': []}
            experimentsData[type][dimension]['iteration_time'].append(data['iteration_time']), experimentsData[type][dimension]['quality'].append(data['purity']), experimentsData[type][dimension]['total_clusters'].append(data['no_of_clusters'])
        lshData = dict([(k, np.mean(experimentsData[JustifyDimensionsEstimation.top_n_dimension][76819][k])) for k in experimentsData[JustifyDimensionsEstimation.top_n_dimension][76819]])
        del experimentsData[JustifyDimensionsEstimation.top_n_dimension][76819]
        print lshData
        plotData = {JustifyDimensionsEstimation.top_n_dimension: defaultdict(list), JustifyDimensionsEstimation.first_n_dimension: defaultdict(list)}
        for type in experimentsData:
            for dimension in sorted(experimentsData[type]): plotData[type]['dataX'].append(dimension); [plotData[type][k].append(np.mean(experimentsData[type][dimension][k])) for k in experimentsData[type][dimension]]
        plt.subplot(311); 
        for type in experimentsData:
            plt.semilogy([x/10**3 for x in plotData[type]['dataX']], movingAverage(plotData[type]['total_clusters'], 4), color=pltInfo[type]['color'], label=pltInfo[type]['label'], lw=2);
        plt.semilogy([x/10**3 for x in plotData[JustifyDimensionsEstimation.top_n_dimension]['dataX']], [lshData['total_clusters']]*len(plotData[JustifyDimensionsEstimation.top_n_dimension]['dataX']), '--', color='#FF1300', label=getLatexForString('Top-76819 dimensions'), lw=2);
        plt.ylim(ymin=1)
        
        plt.subplot(312); 
        for type in experimentsData:
            plt.semilogy([x/10**3 for x in plotData[type]['dataX']], movingAverage(plotData[type]['iteration_time'], 4), color=pltInfo[type]['color'], label=pltInfo[type]['label'], lw=2);
        plt.semilogy([x/10**3 for x in plotData[JustifyDimensionsEstimation.top_n_dimension]['dataX']], [lshData['iteration_time']]*len(plotData[JustifyDimensionsEstimation.top_n_dimension]['dataX']), '--', color='#FF1300', label=getLatexForString('Top-76819'), lw=2);
        plt.ylim(ymin=1, ymax=1500)
        plt.legend(loc=2, ncol=2)
        plt.subplot(313); 
        for type in experimentsData:
            plt.plot([x/10**3 for x in plotData[type]['dataX']], movingAverage(plotData[type]['quality'], 4), color=pltInfo[type]['color'], label=pltInfo[type]['label'], lw=2, marker=pltInfo[type]['marker']);
        plt.ylabel('Mean purity per iteration', fontsize=20); 
#        plt.title(getLatexForString('Impact of dimension ranking'))
        plt.xlabel('# number of dimensions $(10^3)$', fontsize=20)
#        plt.plot([x/10**3 for x in plotData[JustifyDimensionsEstimation.top_n_dimension]['dataX']], [lshData['quality']]*len(plotData[JustifyDimensionsEstimation.top_n_dimension]['dataX']), '--', color='#FF1300', label=getLatexForString('Top-76819 dimensions'), lw=2);
        plt.ylim(ymin=0.80,ymax=1.0)
        plt.legend()
        plt.savefig('justifyDimensionsEstimation2.png')
Example #52
0
    def plotPercentageOfClustersWithinALag(self, returnAxisValuesOnly=True):
        '''
        458 Experts stream [ 0.01860266  0.70639136] 15 0.874004297177
        80 Houston stream [ 0.0793181   0.47644004] 3 0.866127308876
        '''
        def calculatePercentageOfDecayedPhrasesFor(params, timeUnit): return 1 - CurveFit.increasingExponentialFunction(params, timeUnit)
        dataDistribution = {}
        currentTimeUnit = 0
#        file='/mnt/chevron/kykamath/data/twitter/lsh_crowds/houston_stream/parameter_estimation/cluster_lag_distribution'
        file = self.hdsClustering.stream_settings['%s_file' % ClusteringParametersEstimation.clusterLagDistributionId]
        lines = list(FileIO.iterateJsonFromFile(file))
        numberOfTimeUnits = len(lines)
        for data in lines:
            totalClusters = float(sum(data[ClusteringParametersEstimation.clusterLagDistributionId].values()))
            tempArray = []
            for k, v in data[ClusteringParametersEstimation.clusterLagDistributionId].iteritems():
                k = int(k)
                if k not in dataDistribution: dataDistribution[k] = [0] * numberOfTimeUnits
                dataDistribution[k][currentTimeUnit] = v / totalClusters
                tempArray.append(v / totalClusters)
            currentTimeUnit += 1
        x = sorted(dataDistribution)
        print numberOfTimeUnits,
        y = getCumulativeDistribution([np.mean(dataDistribution[k]) for k in x])
        params = CurveFit.getParamsAfterFittingData(x, y, CurveFit.increasingExponentialFunction, [1., 1.])
        print self.stream_settings['plot_label'], params,
        def subPlot(id, timeUnit):
            plt.subplot(id)
            print timeUnit, calculatePercentageOfDecayedPhrasesFor(params, timeUnit)
            plt.plot(x, y, 'o', label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{%0.2f})') % (params[0], params[1]), color=self.stream_settings['plot_color'])
            plt.plot(x, CurveFit.getYValues(CurveFit.increasingExponentialFunction, params, x), color=self.stream_settings['plot_color'], lw=2)
        if self.stream_settings['stream_id'] == 'experts_twitter_stream': subPlot(111, 15); plt.title(getLatexForString('Percentage of clusters within a lag'))
        else: subPlot(111, 3); plt.xlabel(getLatexForString(xlabelTimeUnits))
        plt.ylabel(r'$\%\ of\ clusters\ with\ lag\ \leq\ TU$')
        plt.legend(loc=4)
        if returnAxisValuesOnly: plt.show()
Example #53
0
 def plotClusteringSpeed(saveFig=True):
     dataToPlot = dict([(k, {'x': [], 'y': []}) for k in plotSettings])
     for data in FileIO.iterateJsonFromFile(TweetsFile.stats_file):
         for k in plotSettings:
             dataToPlot[k]['x'].append(data[k]['no_of_documents'])
             dataToPlot[k]['y'].append(data[k]['iteration_time'])
     for k in plotSettings:
         plt.loglog(dataToPlot[k]['x'],
                    movingAverage(dataToPlot[k]['y'], 1),
                    label=plotSettings[k]['label'],
                    color=plotSettings[k]['color'],
                    lw=2)
     print dataToPlot['streaming_lsh']['x'][10]
     print dataToPlot['streaming_lsh']['y'][10]
     plt.legend(loc=4)
     if saveFig:
         plt.xlabel(getLatexForString('\# of documents'))
         plt.ylabel(getLatexForString('Running time (s)'))
         plt.title(
             getLatexForString(
                  'Running time comparison for Streaming LSH with SSA'))
     plt.xlim(xmin=500, xmax=600000)
     #        plt.show()
     if saveFig: plt.savefig('speedComparisonWithSSA.pdf')
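The speed curves are smoothed with movingAverage (window 1 here, window 4 in the dimension plots). A minimal stand-in, assuming it is a trailing sliding mean that keeps the output the same length as its input; the actual helper may be implemented differently:

    def moving_average(values, window):
        smoothed = []
        for i in range(len(values)):
            chunk = values[max(0, i - window + 1):i + 1]  # trailing window
            smoothed.append(sum(chunk) / float(len(chunk)))
        return smoothed

    print(moving_average([1, 2, 3, 4], 2))  # [1.0, 1.5, 2.5, 3.5]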
Example #54
0
def getIterator(id):
    for line in FileIO.iterateJsonFromFile(time_to_process_points +
                                           'stats/%s' % id):
        yield line
Example #55
0
def fileIterator():
    for id in xrange(20):
        yield FileIO.iterateJsonFromFile(time_to_process_points + '%s' % id)
Example #56
0
 def kmeans():
     for data in FileIO.iterateJsonFromFile(
             clustering_quality_experts_folder + 'combined_stats_file'):
         yield data['k_means']
Example #57
0
 def kmeansmr():
     for data in FileIO.iterateJsonFromFile(
             clustering_quality_experts_folder + 'mr_quality_stats'):
         yield data['mr_k_means']
Example #58
0
 def cdait():
     for data in FileIO.iterateJsonFromFile(
             clustering_quality_experts_ssa_folder + 'quality_stats'):
         if 'ssa' in data: yield data['ssa']
Example #59
0
 def cdamr():
     for data in FileIO.iterateJsonFromFile(
             clustering_quality_experts_ssa_folder + 'quality_stats'):
         yield data['ssa_mr']
Example #60
0
 def unoptimized():
     for data in FileIO.iterateJsonFromFile(
             hd_clustering_performance_folder + 'cda_unopt'):
         yield data['streaming_lsh']
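Examples #56-#60 are thin generators that yield one dict of metrics per iteration. A hedged usage sketch showing how such an iterator might be aggregated ('purity' is an illustrative key, not necessarily present in every stats file):

    import numpy as np

    def mean_metric(stat_iterator, metric='purity'):
        values = [data[metric] for data in stat_iterator if metric in data]
        return np.mean(values) if values else float('nan')

    # e.g. mean_metric(kmeans()) would average the purity reported per iteration.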