def measureRankingQuality(iterationData=None, experimentFileName=None):
#    def getTopTopics(model, noOfTopics):
#        topics = set()
#        topTopics = model.topTopics[:]
#        while True:
#            topicIndex = GeneralMethods.weightedChoice([i[1] for i in topTopics])
#            topic = topTopics[topicIndex][0].id
#            del topTopics[topicIndex]
#            if topic not in topics: topics.add(topic)
#            if len(topics)==noOfTopics or len(topics)==len(model.topTopics): break
#        return [(t, 0) for t in topics]
    if iterationData:
        currentTimeStep, model, _, _, finalCall, conf = iterationData
        if not finalCall:
            rankingMethods = conf['rankingMethods']
            experimentFileName = conf['experimentFileName']
            topTopics = sorted(model.topicsDistributionInTheTimeSet.iteritems(), key=itemgetter(1), reverse=True)[:10]
#            topTopics = getTopTopics(model, 10)
#            topTopics = random.sample(sorted(model.topicsDistributionInTheTimeSet.iteritems(), key=itemgetter(1), reverse=True)[:10], min(len(model.topicsDistributionInTheTimeSet), 5))
#            topTopics = random.sample(model.topicsDistributionInTheTimeSet.items(), min(len(model.topicsDistributionInTheTimeSet), 5))
            iterationData = {'currentTimeStep': currentTimeStep, 'spammmess': defaultdict(list)}
            for rankingMethod in rankingMethods:
                for queryTopic, _ in topTopics:
                    ranking_id, messages = rankingMethod(queryTopic, model.topicToMessagesMap, **conf)
#                    if spammness(messages, norm_k)==0: print 'c'
#                    print rankingMethod, spammness(messages, norm_k)
                    iterationData['spammmess'][ranking_id].append(spammness(messages, norm_k))
#                    print ranking_id, spammness(messages, norm_k)
            FileIO.writeToFileAsJson(iterationData, experimentFileName)
            model.topicsDistributionInTheTimeSet = defaultdict(int)
def analyzeQuality(graphs, graphType):
    def getQualityScore(graphMap, edgesToKeep, timeDifference):
        dataToReturn = []
        for j, intervalInSeconds in enumerate([1]):
            intervalInSeconds *= timeDifference
            linearGraph = LocationGraphs.combineLocationGraphs(graphMap, startingGraphId, datetime.datetime.fromtimestamp(endingGraphId+1), intervalInSeconds, linear=True, edgesToKeep=edgesToKeep)
            logGraph = LocationGraphs.combineLocationGraphs(graphMap, startingGraphId, datetime.datetime.fromtimestamp(endingGraphId+1), intervalInSeconds, linear=False, edgesToKeep=edgesToKeep)
            linearClusters = [[str(c), [l[0] for l in lst]] for c, lst in groupby(sorted(clusterUsingAffinityPropagation(linearGraph)[1], key=itemgetter(1)), key=itemgetter(1))]
            logarithmicClusters = [[str(c), [l[0] for l in lst]] for c, lst in groupby(sorted(clusterUsingAffinityPropagation(logGraph)[1], key=itemgetter(1)), key=itemgetter(1))]
            score = LocationGraphs.getClusterQualityScore(linearClusters, logarithmicClusters)
            print intervalInSeconds, edgesToKeep, score
            dataToReturn.append(score)
        return dataToReturn
    graphFile = qualityMetricsFolder%graphType
    print graphFile
    GeneralMethods.runCommand('rm -rf %s'%graphFile)
    for edgesToKeep in range(1, 11):
#    for edgesToKeep in [1, 10]:
        edgesToKeep *= 0.1
        graphMap = dict(graphs[:])
        startingGraphId, endingGraphId = min(graphMap.keys()), max(graphMap.keys())
        timeDifference = endingGraphId - startingGraphId
        LocationGraphs.updateLogarithmicGraphs(graphMap, edgesToKeep=edgesToKeep)
#        print {'edgesToKeep': edgesToKeep, 'score': np.mean(getQualityScore(graphMap, edgesToKeep, timeDifference))}
        FileIO.writeToFileAsJson({'edgesToKeep': edgesToKeep, 'score': np.mean(getQualityScore(graphMap, edgesToKeep, timeDifference))}, graphFile)
def trendCurves(iterationData=None, experimentFileName=None):
    if iterationData:
        currentTimeStep, _, currentTopics, _, finalCall, conf = iterationData
        experimentFileName = conf['experimentFileName']
        if not finalCall:
            topicDistribution = dict((str(topic.id), {'total': topic.totalCount, 'timeStep': topic.countDistribution[currentTimeStep]}) for topic in currentTopics)
#            print currentTimeStep
            FileIO.writeToFileAsJson({'t': currentTimeStep, 'topics': topicDistribution}, experimentFileName)
        else:
            iterationInfo = {'trending_topics': [topic.id for topic in currentTopics if topic.stickiness >= stickinessLowerThreshold],
                             'topic_colors': dict((str(topic.id), topic.color) for topic in currentTopics),
                             'conf': conf}
            del iterationInfo['conf']['spamDectectionMethod']
            FileIO.writeToFileAsJson(iterationInfo, experimentFileName)
    else:
        topicsDataX = defaultdict(list)
        topicsDataY = defaultdict(list)
        for data in FileIO.iterateJsonFromFile(experimentFileName):
            if 'conf' not in data:
                for topic in data['topics']:
                    topicsDataX[topic].append(data['t']), topicsDataY[topic].append(data['topics'][topic]['timeStep'])
            else:
                topicColorMap = data['topic_colors']
                trendingTopics = data['trending_topics']
        for topic in topicsDataX:
            plt.fill_between(topicsDataX[topic], topicsDataY[topic], color=topicColorMap[str(topic)], alpha=1.0)
        plt.figure()
        for topic in trendingTopics:
            plt.fill_between(topicsDataX[str(topic)], topicsDataY[str(topic)], color=topicColorMap[str(topic)], alpha=1.0)
        plt.ylabel('Number of Contents', fontsize=16, fontweight='bold')
        plt.show()
def dimensionsEstimation(estimationObject, currentMessageTime):
    '''
    This method is used to estimate the number of dimensions in the stream. To estimate it, we
    calculate the number of phrases that need to be added every iteration for different dimensions.
    The dimension at which the number of phrases added stabilizes is the number of dimensions for
    the stream.

    Why do we need this?
    The aim is to get dimensions that don't change too often and at the same time are not very huge.
    This experiment gives us an approximate idea of the number of dimensions. Randomly picking a
    small value will result in dimensions that are not good, and picking too big a value will result
    in inefficiency.
    '''
    def updatePhraseScore(phraseObject):
        phraseObject.updateScore(currentMessageTime, 0, **estimationObject.stream_settings)
        return phraseObject
    topDimensionsDuringCurrentIteration = [p.text for p in Phrase.sort((updatePhraseScore(p) for p in estimationObject.phraseTextToPhraseObjectMap.itervalues()), reverse=True)]
    oldList, newList = estimationObject.topDimensionsDuringPreviousIteration, topDimensionsDuringCurrentIteration
    if estimationObject.topDimensionsDuringPreviousIteration:
        dimensions_estimation = {}
        for boundary in estimationObject.boundaries:
            if boundary < len(estimationObject.phraseTextToPhraseObjectMap):
                dimensions_estimation[str(boundary)] = len(set(newList[:boundary]).difference(oldList[:boundary]))
        print currentMessageTime, len(estimationObject.phraseTextToPhraseObjectMap)
        iterationData = {
                         'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
                         'total_number_of_phrases': len(estimationObject.phraseTextToPhraseObjectMap),
                         'settings': estimationObject.stream_settings.convertToSerializableObject(),
                         ParameterEstimation.dimensionsEstimationId: dimensions_estimation
                         }
        FileIO.writeToFileAsJson(iterationData, estimationObject.dimensionsEstimationFile)
    estimationObject.topDimensionsDuringPreviousIteration = topDimensionsDuringCurrentIteration[:]
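# Illustrative sketch (not part of the original pipeline): dimensionsEstimation() above ranks the
# phrases and then, for each candidate boundary, counts how many of the current top-`boundary`
# phrases were absent from the previous iteration's top-`boundary` phrases. The toy lists below are
# hypothetical ranked phrase lists; only the set-difference counting mirrors the function above.
def _sketch_new_dimension_counts(oldList, newList, boundaries):
    return dict((str(boundary), len(set(newList[:boundary]).difference(oldList[:boundary])))
                for boundary in boundaries)

# Example: at boundary 3 only 'd' is new (count 1), at boundary 5 both 'd' and 'e' are new (count 2);
# the boundary at which this count stabilizes is taken as the number of dimensions for the stream.
# print _sketch_new_dimension_counts(['a', 'b', 'c', 'x', 'y'], ['a', 'd', 'b', 'e', 'c'], [3, 5])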
def generate_tuo_location_and_tuo_neighbor_location_and_pure_influence_score(models_ids, startTime, endTime, outputFolder, hashtag_tag):
    for model_id in models_ids:
#        if w_extra_hashtags: output_file = tuo_location_and_tuo_neighbor_location_and_pure_influence_score_file%(model_id, hashtag_tag)
#        else: output_file = tuo_location_and_tuo_neighbor_location_and_pure_influence_score_file%(model_id, wout_extra_hashtags_tag)
        output_file = tuo_location_and_tuo_neighbor_location_and_pure_influence_score_file%(model_id, hashtag_tag)
        GeneralMethods.runCommand('rm -rf %s'%output_file)
        for line_count, location_object in enumerate(iterateJsonFromFile(
                    location_objects_file%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
                )):
            print line_count, model_id
            tuo_neighbor_location_and_pure_influence_score = []
            location_hashtag_set = set(location_object['hashtags'])
            for neighbor_location, mf_hashtag_to_tuo_occurrences_and_time_range in location_object['links'].iteritems():
                pure_influence_scores = []
                for hashtag, (neighbor_location_occurrences, time_range) in mf_hashtag_to_tuo_occurrences_and_time_range.iteritems():
                    if hashtag in location_object['hashtags']:
                        location_occurrences = location_object['hashtags'][hashtag][0]
                        pure_influence_scores.append(MF_INFLUENCE_MEASURING_MODELS_TO_MODEL_ID[model_id](location_occurrences, neighbor_location_occurrences))
                neighbor_location_hashtag_set = set(mf_hashtag_to_tuo_occurrences_and_time_range.keys())
                if hashtag_tag==w_extra_hashtags_tag:
                    for hashtag in location_hashtag_set.difference(neighbor_location_hashtag_set): pure_influence_scores.append(1.0)
                    for hashtag in neighbor_location_hashtag_set.difference(location_hashtag_set): pure_influence_scores.append(-1.0)
                mean_pure_influence_score = np.mean(pure_influence_scores)
                tuo_neighbor_location_and_pure_influence_score.append([neighbor_location, mean_pure_influence_score])
            tuo_neighbor_location_and_pure_influence_score = sorted(tuo_neighbor_location_and_pure_influence_score, key=itemgetter(1))
            FileIO.writeToFileAsJson([location_object['id'], tuo_neighbor_location_and_pure_influence_score], output_file)
def build(numberOfTimeUnits=24):
    validLattices = set()
    for data in FileIO.iterateJsonFromFile(hashtagsLatticeGraphFile%('world', '%s_%s'%(2, 11))): validLattices.add(data['id'])
    documents, lattices = [], set()
    for h in FileIO.iterateJsonFromFile(hashtagsFile%('training_world', '%s_%s'%(2, 11))):
        hashtag, document = Hashtag(h), []
        if hashtag.isValidObject():
            for timeUnit, occs in enumerate(hashtag.getOccrancesEveryTimeWindowIterator(HashtagsClassifier.CLASSIFIER_TIME_UNIT_IN_SECONDS)):
                occs = filter(lambda t: t[0] in validLattices, occs)
                occs = sorted(occs, key=itemgetter(0))
                if occs:
                    for lattice in zip(*occs)[0]: lattices.add(lattice)
                    document.append([timeUnit, [(k, len(list(i))) for k, i in groupby(occs, key=itemgetter(0))]])
            if document: documents.append(document)
    lattices = sorted(list(lattices))
    print len(lattices)
    documents = [(d, TargetSelectionRegressionClassifier.getPercentageDistributionInLattice(d)) for d in documents]
    documents = documents[:int(len(documents)*0.80)]
    for decisionTimeUnit in range(1, numberOfTimeUnits+1):
        for latticeCount, predictingLattice in enumerate(lattices):
            print decisionTimeUnit, latticeCount,
            inputVectors, outputValues = [], []
            for rawDocument, processedDocument in documents:
                documentForTimeUnit = TargetSelectionRegressionClassifier.getPercentageDistributionInLattice(rawDocument[:decisionTimeUnit])
                if documentForTimeUnit and processedDocument:
                    vector = [documentForTimeUnit.get(l, 0) for l in lattices]
                    inputVectors.append(vector), outputValues.append(float(processedDocument.get(predictingLattice, 0)))
#            TargetSelectionRegressionClassifier(decisionTimeUnit=decisionTimeUnit, predictingLattice=predictingLattice).build(zip(inputVectors, outputValues))
            TargetSelectionRegressionSVMRBFClassifier(decisionTimeUnit=decisionTimeUnit, predictingLattice=predictingLattice).build(zip(inputVectors, outputValues))
def generate_tuo_location_and_tuo_neighbor_location_and_mf_influence_type_and_similarity(model_ids, startTime, endTime, outputFolder):
    def location_similarity(location_vector_1, location_vector_2):
        return reduce(lambda total, k: total+(location_vector_1.get(k, 0)*location_vector_2.get(k, 0)), set(location_vector_1.keys()).union(location_vector_2.keys()), 0.)
    influence_types = [InfluenceMeasuringModels.TYPE_COMPLETE_INFLUENCE, InfluenceMeasuringModels.TYPE_OUTGOING_INFLUENCE, InfluenceMeasuringModels.TYPE_INCOMING_INFLUENCE]
    for model_id in model_ids:
        mf_location_to_mf_influence_type_to_influence_vector = dict(Experiments.load_tuo_location_and_mf_influence_type_to_influence_vector(model_id))
        GeneralMethods.runCommand('rm -rf %s'%tuo_location_and_tuo_neighbor_location_and_mf_influence_type_and_similarity_file%model_id)
        for line_count, location_object in enumerate(iterateJsonFromFile(
                    location_objects_file%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
                )):
            print line_count
            location = location_object['id']
            tuo_neighbor_location_and_mf_influence_type_and_similarity = []
            for neighbor_location in location_object['links'].keys():
                mf_influence_type_and_similarity = {}
                for influence_type in influence_types:
                    similarity = location_similarity(
                                                     mf_location_to_mf_influence_type_to_influence_vector[location][influence_type],
                                                     mf_location_to_mf_influence_type_to_influence_vector[neighbor_location][influence_type]
                                                     )
                    mf_influence_type_and_similarity[influence_type] = similarity
                so_hashtags_for_location = set(location_object['hashtags'].keys())
                so_hashtags_for_neighbor_location = set(location_object['links'][neighbor_location].keys())
                numerator = len(so_hashtags_for_location.intersection(so_hashtags_for_neighbor_location)) + 0.
                denominator = len(so_hashtags_for_location.union(so_hashtags_for_neighbor_location)) + 0.
                mf_influence_type_and_similarity[JACCARD_SIMILARITY] = numerator/denominator
                tuo_neighbor_location_and_mf_influence_type_and_similarity.append([neighbor_location, mf_influence_type_and_similarity])
            FileIO.writeToFileAsJson(
                                     [location, tuo_neighbor_location_and_mf_influence_type_and_similarity],
                                     tuo_location_and_tuo_neighbor_location_and_mf_influence_type_and_similarity_file%model_id
                                     )
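# Illustrative sketch (assumption, not from the original code): the function above combines two
# similarities per neighbor pair: location_similarity(), an unnormalized dot product over sparse
# hashtag-count vectors, and a Jaccard similarity |H_l ∩ H_n| / |H_l ∪ H_n| over hashtag sets.
# The toy inputs below are hypothetical; only the formulas mirror the computation above.
def _sketch_jaccard(hashtags_1, hashtags_2):
    hashtags_1, hashtags_2 = set(hashtags_1), set(hashtags_2)
    return len(hashtags_1.intersection(hashtags_2)) / float(len(hashtags_1.union(hashtags_2)))

def _sketch_dot_product(vector_1, vector_2):
    return sum(vector_1.get(k, 0.) * vector_2.get(k, 0.) for k in set(vector_1).union(vector_2))

# Example: {'#a', '#b', '#c'} vs {'#b', '#c', '#d'} gives Jaccard 2/4 = 0.5, and
# {'#a': 1, '#b': 2} . {'#b': 3, '#d': 1} gives 6.0.
# print _sketch_jaccard(['#a', '#b', '#c'], ['#b', '#c', '#d']), _sketch_dot_product({'#a': 1, '#b': 2}, {'#b': 3, '#d': 1})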
def generate(self):
    i = 0
    for tweet in TwitterIterators.iterateTweetsFromExperts():
        FileIO.writeToFileAsJson(tweet, self.fileName)
        i += 1
        if i == self.length: break
    os.system('gzip %s'%self.fileName)
def generateStatsForMRKMeansClusteringQuality():
    for i in [90000, 100000, 200000, 300000, 400000, 500000]:
        print 'Generating stats for: ', i
        tf = TweetsFile(i, **experts_twitter_stream_settings)
        FileIO.writeToFileAsJson({'mr_k_means': tf.generateStatsForKMeansMRClustering(),
                                  'settings': Settings.getSerialzedObject(tf.stream_settings)},
                                 TweetsFile.mr_stats_file)
def dimensionsUpdateFrequencyEstimation(estimationObject, currentMessageTime):
    '''
    Observe the new dimensions that get added to the current dimensions if the dimensions are being
    updated at regular intervals. For example, the number of dimensions being added after 10m, 20m,
    ... 5 hours.

    As time increases, the number of 'decayed' dimensions increases. The current dimensions contain
    a lot of unwanted decayed dimensions. Using this information, identify the time interval that is
    best suited to refresh dimensions.

    Tentative: We decide to pick the time interval at which the rate of decay is maximum.
    '''
    def updatePhraseScore(phraseObject):
        phraseObject.updateScore(currentMessageTime, 0, **estimationObject.stream_settings)
        return phraseObject
    dimensions = estimationObject.stream_settings['dimensions']
    newList = [p.text for p in Phrase.sort((updatePhraseScore(p) for p in estimationObject.phraseTextToPhraseObjectMap.itervalues()), reverse=True)][:dimensions]
    print currentMessageTime, len(newList)
    if len(newList) >= dimensions:
        idsOfDimensionsListToCompare = [(i, GeneralMethods.approximateToNearest5Minutes(currentMessageTime - i))
                                        for i in estimationObject.dimensionUpdateTimeDeltas
                                        if GeneralMethods.approximateToNearest5Minutes(currentMessageTime - i) in estimationObject.dimensionListsMap]
        dimensionsUpdateFrequency = {}
        for td, id in idsOfDimensionsListToCompare:
            oldList = estimationObject.dimensionListsMap[id]
            dimensionsUpdateFrequency[str(td.seconds)] = len(set(newList).difference(oldList))
        print len(estimationObject.dimensionListsMap), currentMessageTime, len(newList), [(k, dimensionsUpdateFrequency[k]) for k in sorted(dimensionsUpdateFrequency)]
        iterationData = {
                         'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
                         'total_number_of_phrases': len(estimationObject.phraseTextToPhraseObjectMap),
                         'settings': pprint.pformat(estimationObject.stream_settings),
                         ParameterEstimation.dimensionsUpdateFrequencyId: dimensionsUpdateFrequency
                         }
        FileIO.writeToFileAsJson(iterationData, estimationObject.dimensionsUpdateFrequencyFile)
    estimationObject.dimensionListsMap[GeneralMethods.approximateToNearest5Minutes(currentMessageTime)] = newList[:]
    for key in estimationObject.dimensionListsMap.keys()[:]:
        if currentMessageTime - key > estimationObject.dimensionUpdateTimeDeltas[-1]: del estimationObject.dimensionListsMap[key]
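# Illustrative sketch (assumption): dimensionsUpdateFrequencyEstimation() above measures, for each
# candidate refresh interval, how many of the current top dimensions were absent from the snapshot
# taken that long ago. The snapshots below are hypothetical; only the counting logic is mirrored.
def _sketch_decay_per_interval(current_top, mf_interval_to_old_top):
    return dict((interval, len(set(current_top).difference(old_top)))
                for interval, old_top in mf_interval_to_old_top.iteritems())

# Example: comparing against a 10-minute-old and a 60-minute-old snapshot; the longer the interval,
# the more of the current dimensions are "new", i.e. the more decay has happened in between.
# print _sketch_decay_per_interval(['a', 'b', 'c', 'd'],
#                                  {'600': ['a', 'b', 'c', 'x'], '3600': ['a', 'x', 'y', 'z']})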
def generate_data_for_significant_nei_utm_ids():
    output_file = GeneralMethods.get_method_id()+'.json'
    so_hashtags, mf_utm_id_to_valid_nei_utm_ids = set(), {}
    for utm_object in FileIO.iterateJsonFromFile(f_hashtags_by_utm_id, True):
        for hashtag, count in utm_object['mf_hashtag_to_count'].iteritems():
            if hashtag!='total_num_of_occurrences': so_hashtags.add(hashtag)
        mf_utm_id_to_valid_nei_utm_ids[utm_object['utm_id']] = utm_object['mf_nei_utm_id_to_common_h_count'].keys()
    hashtags = sorted(list(so_hashtags))
    mf_utm_id_to_vector = {}
    for utm_object in FileIO.iterateJsonFromFile(f_hashtags_by_utm_id, True):
#        print i, utm_object['utm_id']
        utm_id_vector = map(lambda hashtag: utm_object['mf_hashtag_to_count'].get(hashtag, 0.0), hashtags)
        mf_utm_id_to_vector[utm_object['utm_id']] = robjects.FloatVector(utm_id_vector)
    for i, (utm_id, vector) in enumerate(mf_utm_id_to_vector.iteritems()):
        print '%s of %s'%(i+1, len(mf_utm_id_to_vector))
        ltuo_utm_id_and_vector = [(utm_id, vector)]
        for valid_nei_utm_id in mf_utm_id_to_valid_nei_utm_ids[utm_id]:
            if valid_nei_utm_id in mf_utm_id_to_vector and valid_nei_utm_id!=utm_id:
                ltuo_utm_id_and_vector.append((valid_nei_utm_id, mf_utm_id_to_vector[valid_nei_utm_id]))
        od = rlc.OrdDict(sorted(ltuo_utm_id_and_vector, key=itemgetter(0)))
        df_utm_vectors = robjects.DataFrame(od)
        df_utm_vectors_json = R_Helper.get_json_for_data_frame(df_utm_vectors)
        dfm_dict = cjson.decode(df_utm_vectors_json)
        mf_utm_ids_to_utm_colnames = dict(zip(zip(*ltuo_utm_id_and_vector)[0], df_utm_vectors.colnames))
        utm_id_colname = mf_utm_ids_to_utm_colnames[utm_id]
        dfm_dict['prediction_variable'] = utm_id_colname
        dfm_dict['predictor_variables'] = filter(lambda colname: colname!=utm_id_colname, df_utm_vectors.colnames)
        dfm_dict['mf_utm_colnames_to_utm_ids'] = dict(zip(df_utm_vectors.colnames, zip(*ltuo_utm_id_and_vector)[0]))
        FileIO.writeToFileAsJson(dfm_dict, output_file)
def generate_hashtag_specific_location_and_pure_influence_scores(test_models_ids):
    for test_model_id in test_models_ids:
        output_file = f_ltuo_hashtag_and_ltuo_location_and_pure_influence_score%(test_model_id)
        GeneralMethods.runCommand('rm -rf %s'%output_file)
        ltuo_hashtag_and_ltuo_location_and_occurrence_time = Experiments.load_ltuo_hashtag_and_ltuo_location_and_occurrence_time()
        for hashtag_count, (hashtag, ltuo_location_and_occurrence_time) in\
                enumerate(ltuo_hashtag_and_ltuo_location_and_occurrence_time):
            ltuo_location_and_occurrence_times = [(location, sorted(zip(*ito_location_and_occurrence_time)[1]))
                                                  for location, ito_location_and_occurrence_time in groupby(
                                                          sorted(ltuo_location_and_occurrence_time, key=itemgetter(0)),
                                                          key=itemgetter(0)
                                                  )]
            print hashtag_count, test_model_id
            ltuo_location_and_pure_influence_score = []
            for location, location_occurrence_times in ltuo_location_and_occurrence_times:
                pure_influence_scores = []
                for neighbor_location, neighbor_location_occurrence_times in ltuo_location_and_occurrence_times:
                    if location!=neighbor_location:
                        pure_influence_score = MF_INFLUENCE_MEASURING_MODELS_TO_MODEL_ID[test_model_id](neighbor_location_occurrence_times, location_occurrence_times)
                        pure_influence_scores.append(pure_influence_score)
                ltuo_location_and_pure_influence_score.append([location, np.mean(pure_influence_scores)])
            ltuo_location_and_pure_influence_score = sorted(ltuo_location_and_pure_influence_score, key=itemgetter(1))
            FileIO.writeToFileAsJson([hashtag, ltuo_location_and_pure_influence_score], output_file)
def plotQualityWithKMeansAndSSA():
    del plotSettings["ssa_mr"]
    speedStats = dict([(k, {"f1": [], "nmi": [], "purity": []}) for k in plotSettings])
    for data in FileIO.iterateJsonFromFile(TweetsFile.stats_file):
        for k in speedStats:
            for metric in speedStats["ssa"]: speedStats[k][metric].append(data[k][metric])
    for k in speedStats: del speedStats[k]["f1"]
    speedStats.update(dict([(k, {"f1": [], "nmi": [], "purity": []}) for k in kMeansPlotSettings]))
    k = "k_means"
    for data in FileIO.iterateJsonFromFile(TweetsFile.combined_stats_file):
        for metric in speedStats["k_means"]: speedStats[k][metric].append(data[k][metric])
    for k in speedStats:
        if "f1" in speedStats[k]: del speedStats[k]["f1"]
    dataForPlot = dict([(k, []) for k in speedStats])
    for k in speedStats:
        for k1 in speedStats[k]: dataForPlot[k] += [np.mean(speedStats[k][k1])]
#    del dataForPlot['k_means']
    print dataForPlot
    ind, width = np.arange(2), 0.1
    rects, i = [], 1
    plotSettings.update(kMeansPlotSettings)
    for k in dataForPlot:
        rects.append(plt.bar(ind + i * width, dataForPlot[k], width, color=plotSettings[k]["color"]))
        i += 1
    plt.ylabel(getLatexForString("Score"))
    plt.title(getLatexForString("Clustering quality comparison for Streaming LSH with SSA"))
    plt.xticks(ind + 2 * width, ("$Purity$", "$NMI$"))
    plt.legend([r[0] for r in rects], [plotSettings[k]["label"] for k in plotSettings], loc=4)
#    plt.show()
    plt.savefig("qualityComparisonAll.pdf")
def drawAllCheckinPlotsByVisitingClassesUsingDemography(model, **conf):
    plotsFolder = conf['plotsFolder']+'byVisitingClassesUsingDemography/'
    for locationId, location in model.locationsCheckinsMap.iteritems():
        if location['checkins']:
            locationObject = Location.getObjectFromDict(location['object'])
            plotsFile = '%s%s/%s'%(plotsFolder, Location.getLocationClassBasedOnVisitingProbability(locationObject), locationId+'.png')
            FileIO.createDirectoryForFile(plotsFile)
            checkinsByBinsAndDemographies = defaultdict(dict)
            demographColorMap = {}
            for day, binData in location['checkins'].iteritems():
                for bin, checkins in binData.iteritems():
                    bin = int(bin)
                    for user in checkins:
                        demographyId = model.userMap[user]['object']['demography_id']
                        demographColorMap[demographyId] = model.userMap[user]['object']['demography_color']
                        if bin not in checkinsByBinsAndDemographies[demographyId]: checkinsByBinsAndDemographies[demographyId][bin] = 0
                        checkinsByBinsAndDemographies[demographyId][bin] += 1
#            for bin in checkinsByBinsAndDemographies:
#                for demographyId in demographColorMap:
#                    plt.scatter([bin], [checkinsByBinsAndDemographies[bin][demographyId]], color=demographColorMap[demographyId])
            for demographyId, data in checkinsByBinsAndDemographies.iteritems():
#                print smooth([data[k] for k in sorted(data)], 4)
                plt.fill_between(sorted(data.keys()), smooth([data[k] for k in sorted(data)], 10)[:len(data)], color=demographColorMap[demographyId], alpha=0.65)
#            plt.hist([k for k, v in checkinsByBins.iteritems() for i in range(v)], conf['noOfBinsPerDay'], normed=True)
            plt.title(str(locationObject.visitingProbability))
            plt.savefig(plotsFile)
            plt.clf()
def modifiedClusterAnalysisMethod(hdStreamClusteringObject, currentMessageTime):
    global evaluation, previousTime
    currentTime = time.time()
    documentClusters = [cluster.documentsInCluster.keys()
                        for k, cluster in hdStreamClusteringObject.clusters.iteritems()
                        if len(cluster.documentsInCluster.keys()) >= experts_twitter_stream_settings["cluster_filter_threshold"]]
    iteration_data = evaluation.getEvaluationMetrics(documentClusters,
                                                     currentTime - previousTime,
                                                     {"type": experts_twitter_stream_settings["dimensions_performance_type"],
                                                      "dimensions": experts_twitter_stream_settings["dimensions"]})
    iteration_data["no_of_observed_dimensions"] = len(hdStreamClusteringObject.phraseTextToPhraseObjectMap)
    previousTime = time.time()
    FileIO.writeToFileAsJson(iteration_data, JustifyDimensionsEstimation.stats_file)
    del iteration_data["clusters"]
    print currentMessageTime, iteration_data
    if experts_twitter_stream_settings["dimensions"] != 76819 and \
            2 * experts_twitter_stream_settings["dimensions"] <= len(hdStreamClusteringObject.phraseTextToPhraseObjectMap):
        raise Exception
def mr_data_analysis(input_files_start_time, input_files_end_time, min_hashtag_occurrences):
#    output_file = f_tuo_normalized_occurrence_count_and_distribution_value%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tweet_count_stats%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_lid_and_distribution_value%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
    output_file = f_tuo_hashtag_and_occurrence_count_and_entropy_and_focus_and_coverage_and_peak%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_rank_and_average_percentage_of_occurrences%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_iid_and_interval_stats%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_iid_and_perct_change_of_occurrences%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_normalized_iid_and_tuo_prct_of_occurrences_and_entropy_and_focus_and_coverage%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_hashtag_objects%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_lid_and_ltuo_other_lid_and_temporal_distance%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_lid_and_ltuo_other_lid_and_no_of_co_occurrences%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_high_accuracy_lid_and_distribution%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_no_of_hashtags_and_count%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_no_of_locations_and_count%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_no_of_peak_lids_and_count%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
    print PARAMS_DICT
#    runMRJob(MRAnalysis, output_file, getInputFiles(input_files_start_time, input_files_end_time), jobconf={'mapred.reduce.tasks': 300})
    runMRJob(MRAnalysis, output_file, getPreprocessedHashtagsFile(), jobconf={'mapred.reduce.tasks': 300})
    FileIO.writeToFileAsJson(PARAMS_DICT, output_file)
def getLocationDistributionPlots(place):
    for clustering in iteraterUserClusterings(place):
        for location in locationToUserMapIterator(place):
            print clustering[0], location['location']
            fileName = placesImagesFolder%place['name']+str(clustering[0])+'/'+location['location'].replace(' ', '_').replace('.', '+')+'.png'
            FileIO.createDirectoryForFile(fileName)
            getPerLocationDistributionPlots(clustering, location, fileName)
def generateRadiusSpots(radiusInMiles):
    graph = nx.Graph()
    spotsFile = radiusSpotsFolder+'%s'%(radiusInMiles)
    print 'Creating:', spotsFile
    for lid in locationIterator():
        for location in nearbyLocations(lid, radiusInMiles): graph.add_edge(location['_id'], lid)
    for locations in nx.connected_components(graph): FileIO.writeToFileAsJson({'venues': locations}, spotsFile)
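# Illustrative sketch (assumption, hypothetical inputs): generateRadiusSpots() above links every
# location to the locations within `radiusInMiles` of it and treats each connected component of that
# graph as one "spot". nx.connected_components is the same call used above; the edge list is made up.
import networkx as nx

def _sketch_spots_from_nearby_pairs(nearby_pairs):
    graph = nx.Graph()
    for lid_1, lid_2 in nearby_pairs:
        graph.add_edge(lid_1, lid_2)
    return list(nx.connected_components(graph))

# Example: A-B and B-C are nearby, D-E are nearby, so two spots form: {A, B, C} and {D, E}.
# print _sketch_spots_from_nearby_pairs([('A', 'B'), ('B', 'C'), ('D', 'E')])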
def writeTweetsForDay(currentDay):
    fileName = houston_data_folder+FileIO.getFileByDay(currentDay)
    for tweet in tweets.find({'ca': {'$gt': currentDay, '$lt': currentDay+timedelta(seconds=86399)}}, fields=['ca', 'tx', 'uid']):
        screenName = GenerateHoustonTweetsData.getScreenName(tweet['uid'])
        if screenName != None:
            data = {'id': tweet['_id'],
                    'text': tweet['tx'],
                    'created_at': getStringRepresentationForTweetTimestamp(tweet['ca']),
                    'user': {'screen_name': GenerateHoustonTweetsData.getScreenName(tweet['uid'])}}
            FileIO.writeToFileAsJson(data, fileName)
    os.system('gzip %s'%fileName)
def generateStatsForDefaultStreamSettings():
    for i in [10**3, 10**4, 10**5]:
        for j in range(1, 10):
            print 'Generating stats for: ', i*j
            tf = TweetsFile(i*j, **default_experts_twitter_stream_settings)
            FileIO.writeToFileAsJson({'streaming_lsh': tf.generateStatsForStreamingLSHClustering(),
                                      'settings': Settings.getSerialzedObject(tf.stream_settings)},
                                     TweetsFile.default_stats_file)
def writeHashtagsFile():
    hashtags = []
    for hashtagObject in FileIO.iterateJsonFromFile('/mnt/chevron/kykamath/data/geo/hashtags/analysis/all_world/2_11/hashtagsWithoutEndingWindow'):
        print hashtagObject.keys()
        exit()
        hashtags.append(hashtagObject['h'])
    hashtags = sorted(hashtags)
    for h in hashtags: FileIO.writeToFile(unicode(h).encode('utf-8'), 'hashtags')
def writeCheckinSequenceGraphFile():
    userSet = set([userVector['user'] for userVector in filteredUserIterator(minLocationsTheUserHasCheckedin, minUniqueUsersCheckedInTheLocation, fullRecord=True)])
    count, total = 1, len(userSet)
    for user in userSet:
        print user, count, total
        checkins = [(c['_id'], c['lid'], time.mktime(c['t'].timetuple())) for c in checkinsCollection.find({'u': user})]
        for i in GeneralMethods.getElementsInWindow(checkins, 2): FileIO.writeToFileAsJson([user, i], checkinSequenceGraphFile)
        count += 1
def run_job_on_hashtags_in_dfs(mr_class, output_file):
    job_conf = {'mapred.reduce.tasks': 500, 'mapred.task.timeout': 86400000}
    print 'Running map reduce with the following params:'
    pprint(PARAMS_DICT)
    print 'Hadoop job conf:'
    pprint(job_conf)
    runMRJob(mr_class, output_file, [f_hdfs_hashtags], jobconf=job_conf)
    FileIO.writeToFileAsJson(PARAMS_DICT, output_file)
def generateStatsForQualityComparisonWithSSA():
#    for length in [i*j for i in (10**3, 10**4, 10**5) for j in range(1, 10)]:
    for length in [1000000]:
        print "Generating stats for: ", length
        tf = TweetsFile(length, **experts_twitter_stream_settings)
#        stats = {'ssa': tf.getStatsForSSA(), 'ssa_mr': tf.getStatsForSSAMR(), 'streaming_lsh': KMeansTweetsFile(length, **experts_twitter_stream_settings).generateStatsForStreamingLSHClustering(), 'settings': Settings.getSerialzedObject(tf.stream_settings)}
        stats = {"ssa_mr": tf.getStatsForSSAMR(), "settings": Settings.getSerialzedObject(tf.stream_settings)}
        FileIO.writeToFileAsJson(stats, TweetsFile.stats_file)
def writeClusters(hdStreamClusteringObject, currentMessageTime):
    print '\n\n\nEntering:', currentMessageTime, len(hdStreamClusteringObject.phraseTextAndDimensionMap), len(hdStreamClusteringObject.phraseTextToPhraseObjectMap), len(hdStreamClusteringObject.clusters)
    iterationData = {'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
                     'clusters': map(TwitterCrowdsSpecificMethods.getClusterInMapFormat,
                                     [cluster for cluster, _ in sorted(StreamCluster.iterateByAttribute(hdStreamClusteringObject.clusters.values(), 'length'), key=itemgetter(1), reverse=True)]),
                     'settings': Settings.getSerialzedObject(hdStreamClusteringObject.stream_settings)}
    FileIO.writeToFileAsJson(iterationData, hdStreamClusteringObject.stream_settings['lsh_clusters_folder']+FileIO.getFileByDay(currentMessageTime))
    print 'Leaving: ', currentMessageTime, len(hdStreamClusteringObject.phraseTextAndDimensionMap), len(hdStreamClusteringObject.phraseTextToPhraseObjectMap), len(hdStreamClusteringObject.clusters)
def run():
    for graphType, method in [
#                              (RandomGraphGenerator.fast_gnp_random_graph, RandomGraphGenerator.fastGnp),
#                              (RandomGraphGenerator.erdos_renyi_graph, RandomGraphGenerator.erdosRenyi),
#                              (RandomGraphGenerator.newman_watts_strogatz_graph, RandomGraphGenerator.nWS),
                              (RandomGraphGenerator.powerlaw_cluster_graph, RandomGraphGenerator.powerlawClusterGraph),
                              ]:
        for i in range(1, 11):
            FileIO.writeToFileAsJson({'n': 100*i, 'graphs': method(1000*i)}, randomGraphsFolder%graphType)
def generateLocationClusterData():
#    p = Pool()
#    totalLocations = len(list(locationClusterIterator()))
#    i = 1
    for location in locationClusterIterator():
        location = clusterLocation(location)
#        print '%s of %s'%(i, totalLocations)
        FileIO.writeToFileAsJson(location, locationClustersFile)
def writeClusterKML():
    kml = SpotsKML()
    outputKMLFile = '%s/clusters.kml'%placesAnalysisFolder%place['name']
    for data in FileIO.iterateJsonFromFile(placesUserClusterFeaturesFile%place['name']):
        clusterId, color, features = data
        kml.addLocationPointsWithTitles([(getLocationFromLid(f[0].replace('_', ' ')), f[2]) for f in features[:noOfFeatures]], color=color)
    FileIO.createDirectoryForFile(outputKMLFile)
    kml.write(outputKMLFile)
def writeUserClustersFile(place):
    print 'Generating clusters...'
    userVectors = GenerateDataFiles.getUserVectors(place)
    GeneralMethods.runCommand('rm -rf %s'%placesUserClustersFile%place['name'])
    clusterAssignments = Clustering.cluster(Clustering.EM, placesARFFFile%place['name'], userVectors, '-N -1')
#    clusterAssignments = Clustering.cluster(Clustering.KMeans, placesARFFFile%place['name'], userVectors, '-N 2')
    for userId, userVector in userVectors.iteritems(): userVectors[userId] = {'userVector': userVector, 'clusterId': clusterAssignments[userId]}
    for data in userVectors.iteritems(): FileIO.writeToFileAsJson(data, placesUserClustersFile%place['name'])
def run_job(mr_class, output_file, input_files_start_time, input_files_end_time):
    PARAMS_DICT['input_files_start_time'] = time.mktime(input_files_start_time.timetuple())
    PARAMS_DICT['input_files_end_time'] = time.mktime(input_files_end_time.timetuple())
    print 'Running map reduce with the following params:', pprint(PARAMS_DICT)
    runMRJob(mr_class, output_file, MRAnalysis.get_input_files_with_tweets(input_files_start_time, input_files_end_time), jobconf={'mapred.reduce.tasks': 500})
    FileIO.writeToFileAsJson(PARAMS_DICT, output_file)
def iterateTweetsFromHouston(houstonDataStartTime=datetime(2010, 11, 1), houstonDataEndTime=datetime(2011, 5, 30)):
    currentTime = houstonDataStartTime
    while currentTime <= houstonDataEndTime:
        for tweet in TwitterIterators.iterateFromFile(houston_twitter_stream_settings.twitter_users_tweets_folder + '%s.gz' % FileIO.getFileByDay(currentTime)):
            yield tweet
        currentTime += timedelta(days=1)
def getStatsForSSA():
    batchSize = 10000
    default_experts_twitter_stream_settings['ssa_threshold'] = 0.75
    for id in range(21, 50):
        fileName = time_to_process_points + '%s/%s' % (batchSize, id)
        ts = time.time()
        sstObject = SimilarStreamAggregation(dict(iterateUserDocuments(fileName)), default_experts_twitter_stream_settings['ssa_threshold'])
        sstObject.estimate()
#        documentClusters = list(sstObject.iterateClusters())
        iteration_data = {'iteration_time': time.time() - ts,
                          'type': 'ssa',
                          'number_of_messages': batchSize * (id + 1),
                          'batch_size': batchSize}
        FileIO.writeToFileAsJson(iteration_data, ssa_stats_file)
def loadExperimentsData(experimentsData, file):
    for data in FileIO.iterateJsonFromFile(file):
        if data['purity'] > 0 and data['purity'] < 1:
            experimentsData[data['iteration_parameters']['type']]['iteration_time'].append(data['iteration_time'])
            experimentsData[data['iteration_parameters']['type']]['quality'].append(data['purity'])
            experimentsData[data['iteration_parameters']['type']]['total_clusters'].append(data['iteration_parameters']['total_clusters'])
def plotDimensionsEstimation(self, returnAxisValuesOnly=True):
    def calculateDimensionsFor(params, percentageOfNewDimensions):
        '''
        numberOfTimeUnits=10*24*12
        Experts stream [  1.17707899e+03   1.03794580e+00] 76819
        Houston stream [  2.73913900e+03   1.02758516e+00] 195731
        '''
        print getSmallestPrimeNumberGreaterThan(int(CurveFit.inverseOfDecreasingExponentialFunction(params, percentageOfNewDimensions)))
    dataDistribution = defaultdict(list)
    for line in FileIO.iterateJsonFromFile(self.dimensionsEstimationFile):
        for k, v in line[ParameterEstimation.dimensionsEstimationId].iteritems():
            k = int(k)
            if k not in dataDistribution: dataDistribution[k] = [0., 0.]
            dataDistribution[k][0] += v
            dataDistribution[k][1] += 1
    x, y = [], []
    [(x.append(k), y.append((dataDistribution[k][0] / dataDistribution[k][1]) / k)) for k in sorted(dataDistribution) if k > 1000]
    x, y = x[:numberOfTimeUnits], y[:numberOfTimeUnits]
    exponentialCurveParams = CurveFit.getParamsAfterFittingData(x, y, CurveFit.decreasingExponentialFunction, [1., 1.])
    print self.stream_settings['plot_label'], exponentialCurveParams, calculateDimensionsFor(exponentialCurveParams, 0.01)
    plt.ylabel(getLatexForString('\% of decaying dimensions')), plt.xlabel(getLatexForString('\# of dimensions')), plt.title(getLatexForString('Dimension stability with increasing number of dimensions.'))
    plt.semilogy(x, y, 'o', color=self.stream_settings['plot_color'], label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{-%0.2f})') % (exponentialCurveParams[0], exponentialCurveParams[1]), lw=2)
    plt.semilogy(x, CurveFit.getYValues(CurveFit.decreasingExponentialFunction, exponentialCurveParams, x), color=self.stream_settings['plot_color'], lw=2)
    plt.legend()
    if returnAxisValuesOnly: plt.show()
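# Illustrative sketch (assumption about CurveFit's functional form, suggested by the plot label
# '%0.2fx^{-%0.2f}' above): the fraction of decaying dimensions is modeled as y = a * x**(-b), and
# calculateDimensionsFor() inverts that fit at a target fraction (1% above) to pick the number of
# dimensions before rounding up to the next prime. The parameters below are the Experts-stream
# values quoted in the inner docstring, used purely as an example.
def _sketch_decreasing_power_law(params, x):
    a, b = params
    return a * (x ** -b)

def _sketch_inverse_of_decreasing_power_law(params, y):
    a, b = params
    return (a / y) ** (1. / b)

# Example: with params [1.17707899e+03, 1.03794580e+00] and a 1% target, the inverse comes out near
# the 76819-dimension figure quoted in the docstring above.
# print _sketch_inverse_of_decreasing_power_law([1.17707899e+03, 1.03794580e+00], 0.01)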
def plotCDFClustersLagDistribution(self, returnAxisValuesOnly=True):
    '''
    This determines the time after which a cluster can be considered decayed and hence removed.

    Experts stream [ 0.66002386  0.07035227] 0.1 82
    Houston stream [ 0.73800037  0.05890473] 0.1 29

    458 (# of time units) Experts stream [ 0.66002386  0.07035227] 0.2 15
    71 (# of time units) Houston stream [ 0.73756656  0.05883258] 0.2 3
    '''
    def calculateInActivityTimeFor(params, probabilityOfInactivity):
        return int(CurveFit.inverseOfIncreasingExponentialFunction(params, 1 - probabilityOfInactivity))
    data = list(FileIO.iterateJsonFromFile(self.hdsClustering.stream_settings['%s_file' % ClusteringParametersEstimation.clusterLagDistributionId]))[-1]
    total = float(sum(data['lag_between_streams_added_to_cluster'].values()))
    x = sorted(map(int, data['lag_between_streams_added_to_cluster'].keys()))
    y = getCumulativeDistribution([data['lag_between_streams_added_to_cluster'][str(i)] / total for i in x])
    exponentialCurveParams = CurveFit.getParamsAfterFittingData(x, y, CurveFit.increasingExponentialFunction, [1., 1.])
    print self.stream_settings['plot_label'], exponentialCurveParams, calculateInActivityTimeFor(exponentialCurveParams, 0.2)
    plt.plot(x, y, 'o', label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{%0.2f})') % (exponentialCurveParams[0], exponentialCurveParams[1]), color=self.stream_settings['plot_color'])
    plt.plot(x, CurveFit.getYValues(CurveFit.increasingExponentialFunction, exponentialCurveParams, x), color=self.stream_settings['plot_color'], lw=2)
    plt.ylabel(r'$P\ (\ lag\ \leq\ TU\ )$'), plt.xlabel(getLatexForString(xlabelTimeUnits)), plt.title(getLatexForString('CDF for clusters lag distribution.'))
    plt.ylim((0, 1.2))
    plt.legend(loc=4)
    if returnAxisValuesOnly: plt.show()
def thresholdForDocumentToBeInCluterEstimation(stats_file, **stream_settings):
    '''
    Estimate the threshold for the clusters by varying the threshold_for_document_to_be_in_cluster value.
    Run this on a document set of size 100K.
    '''
    for length in [i * j for i in (10**3, 10**4, 10**5) for j in range(1, 10)]:
#        for t in range(1, 16):
        for t in range(16, 21):
            stream_settings['threshold_for_document_to_be_in_cluster'] = t * 0.05
            print length, stream_settings['threshold_for_document_to_be_in_cluster']
            stats = {'streaming_lsh': KMeansTweetsFile(length, **stream_settings).generateStatsForStreamingLSHClustering(),
                     'settings': Settings.getSerialzedObject(stream_settings)}
            FileIO.writeToFileAsJson(stats, stats_file)
def writeARFFForClustering(data, relationName):
    keyToIdMap = {}
    fileName = '/tmp/' + relationName + '.arff'
    os.system('rm -rf %s' % fileName)
    for docId in sorted(data):
        docVector = data[docId]
        for k, v in docVector.iteritems():
            if k not in keyToIdMap: keyToIdMap[k] = len(keyToIdMap)
    FileIO.writeToFile(ARFF.getRelationLine(relationName), fileName)
    for attributeName in keyToIdMap: FileIO.writeToFile(ARFF.getAttributeLine(attributeName), fileName)
    FileIO.writeToFile('@data', fileName)
    for d in data.iteritems(): FileIO.writeToFile(ARFF.getDataLine(d, keyToIdMap), fileName)
    return fileName
def plotClusteringSpeed(saveFig=True):
    plotSettings = {'k_means': {'label': 'Iterative k-means', 'color': '#FD0006'},
                    'mr_k_means': {'label': 'MR k-means', 'color': '#5AF522'},
                    'streaming_lsh': {'label': 'Stream CDA', 'color': '#7109AA'}}
    dataToPlot = {'k_means': {'x': [], 'y': []},
                  'mr_k_means': {'x': [], 'y': []},
                  'streaming_lsh': {'x': [], 'y': []}}
    for data in FileIO.iterateJsonFromFile(TweetsFile.combined_stats_file):
        for k in plotSettings:
            dataToPlot[k]['x'].append(data[k]['no_of_documents'])
            dataToPlot[k]['y'].append(data[k]['iteration_time'])
    for k in plotSettings:
        plt.loglog(dataToPlot[k]['x'], dataToPlot[k]['y'], label=plotSettings[k]['label'], color=plotSettings[k]['color'], lw=2)
    plt.legend(loc=4)
    if saveFig:
        plt.xlabel(getLatexForString('\# of documents'))
        plt.ylabel(getLatexForString('Running time (s)'))
        plt.title(getLatexForString('Running time comparison for Streaming LSH with k-Means'))
    plt.xlim(xmin=800, xmax=100000)
    plt.xticks([])
#    plt.show()
    if saveFig: plt.savefig('speedComparisonWithKMeans.pdf')
def dimensionInActivityTimeEstimation(estimationObject, currentMessageTime):
    phrasesLagDistribution = defaultdict(int)
    for phraseObject in estimationObject.phraseTextToPhraseObjectMap.itervalues():
        lag = DateTimeAirthematic.getDifferenceInTimeUnits(currentMessageTime, phraseObject.latestOccuranceTime, estimationObject.stream_settings['time_unit_in_seconds'].seconds)
        phrasesLagDistribution[str(lag)] += 1
    print currentMessageTime
    iterationData = {
                     'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
                     'settings': pprint.pformat(estimationObject.stream_settings),
                     ParameterEstimation.dimensionInActivityTimeId: estimationObject.lagBetweenMessagesDistribution,
                     'phrases_lag_distribution': phrasesLagDistribution
                     }
    FileIO.writeToFileAsJson(iterationData, estimationObject.dimensionInActivityTimeFile)
def iterateTweetsFromExperts(expertsDataStartTime=datetime(2011, 3, 19), expertsDataEndTime=datetime(2011, 4, 12)):
    experts = getExperts()
    currentTime = expertsDataStartTime
    while currentTime <= expertsDataEndTime:
        for tweet in TwitterIterators.iterateFromFile(experts_twitter_stream_settings.twitter_users_tweets_folder + '%s.gz' % FileIO.getFileByDay(currentTime)):
            if tweet['user']['id_str'] in experts:
                if getDateTimeObjectFromTweetTimestamp(tweet['created_at']) <= expertsDataEndTime: yield tweet
                else: return
        currentTime += timedelta(days=1)
def modifiedClusterAnalysisMethod(hdStreamClusteringObject, currentMessageTime):
    global evaluation, previousTime
    currentTime = time.time()
    documentClusters = [cluster.documentsInCluster.keys()
                        for k, cluster in hdStreamClusteringObject.clusters.iteritems()
                        if len(cluster.documentsInCluster.keys()) >= experts_twitter_stream_settings['cluster_filter_threshold']]
    iteration_data = evaluation.getEvaluationMetrics(documentClusters,
                                                     currentTime - previousTime,
                                                     {'type': experts_twitter_stream_settings['trie_type'],
                                                      'total_clusters': len(hdStreamClusteringObject.clusters),
                                                      'current_time': getStringRepresentationForTweetTimestamp(currentMessageTime)})
    previousTime = time.time()
    FileIO.writeToFileAsJson(iteration_data, JustifyTrie.stats_file)
    del iteration_data['clusters']
    print getStringRepresentationForTweetTimestamp(currentMessageTime), iteration_data
def iterateUserDocuments(fileName):
    dataForAggregation = defaultdict(Vector)
    textToIdMap = defaultdict(int)
    for tweet in FileIO.iterateJsonFromFile(fileName):
        textVector = TwitterCrowdsSpecificMethods.convertTweetJSONToMessage(tweet, **default_experts_twitter_stream_settings).vector
        textIdVector = Vector()
        for phrase in textVector:
            if phrase not in textToIdMap: textToIdMap[phrase] = str(len(textToIdMap))
            textIdVector[textToIdMap[phrase]] = textVector[phrase]
        dataForAggregation[tweet['user']['screen_name'].lower()] += textIdVector
    for k, v in dataForAggregation.iteritems(): yield k, v
def streamingLSHClusteringDemo():
    clustering_settings = {'dimensions': 53,
                           'signature_length': 13,
                           'number_of_permutations': 5,
                           'threshold_for_document_to_be_in_cluster': 0.2}
    clustering = StreamingLSHClustering(**clustering_settings)
    docId = 0
    docsToOriginalClusterMap = {}
    for line in FileIO.iterateLinesFromFile('../data/streaming.dat'):
        document = createDocumentFromLine(docId, line)
        docsToOriginalClusterMap[docId] = document.clusterId
        docId += 1
        clustering.getClusterAndUpdateExistingClusters(document)
    clusterLabels = []
    for k, cluster in clustering.clusters.iteritems():
        clusterLabels.append([docsToOriginalClusterMap[doc.docId] for doc in cluster.iterateDocumentsInCluster()])
    return EvaluationMetrics.getValueForClusters(clusterLabels, EvaluationMetrics.purity)
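# Illustrative sketch (assumption): EvaluationMetrics.purity above is taken here to be the standard
# cluster purity, where each cluster votes for its majority ground-truth label and purity is the
# fraction of documents covered by those majorities. The label lists below are hypothetical; this is
# only a stand-in for the metric the demo returns, not the library's actual implementation.
def _sketch_purity(clusterLabels):
    total = float(sum(len(labels) for labels in clusterLabels))
    return sum(max(labels.count(label) for label in set(labels)) for labels in clusterLabels) / total

# Example: [['a', 'a', 'b'], ['b', 'b']] gives (2 + 2) / 5 = 0.8.
# print _sketch_purity([['a', 'a', 'b'], ['b', 'b']])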
def plotICDFDimensionsInactivityThreshold(self, returnAxisValuesOnly=True):
    '''
    Plot P(inactivity > threshold time unit). Find the time unit at which the probability is low.
    Experts stream 0.25 129
    Houston stream 0.25 144
    '''
    dataX, dataY, total = set(), defaultdict(list), []
    for line in list(FileIO.iterateJsonFromFile(self.dimensionInActivityTimeFile)):
        data = dict((int(k), v) for k, v in line[ParameterEstimation.dimensionInActivityTimeId].iteritems())
        total.append(sum(data.values()))
        for i in data: dataY[i].append(data[i]); dataX.add(i)
    totalInstancesObserved = float(sum(total))
    x = sorted(dataX)
    y = getInverseCumulativeDistribution([sum(dataY[k]) / totalInstancesObserved for k in x])
    plt.plot(x, y, label=getLatexForString(self.stream_settings['plot_label']), color=self.stream_settings['plot_color'], lw=2)
    plt.ylabel(r'$P\ (\ inactivity\ duration\ \geq\ \ inactivity\ duration\ threshold )$'), plt.xlabel(getLatexForString('Inactivity duration threshold')), plt.title(getLatexForString('Inactivity analysis for dimensions.'))
    plt.legend()
    if returnAxisValuesOnly: plt.show()
def plotClusteringQuality():
    del plotSettings['ssa_mr']
    speedStats = dict([(k, {'f1': [], 'nmi': [], 'purity': []}) for k in plotSettings])
    for data in FileIO.iterateJsonFromFile(TweetsFile.stats_file):
        for k in speedStats:
            for metric in speedStats['ssa']: speedStats[k][metric].append(data[k][metric])
    dataForPlot = dict([(k, []) for k in plotSettings])
    for k, v in speedStats.iteritems():
        print k
        for k1, v1 in v.iteritems():
            if type(v1[0]) != type([]):
                print k1, '(%0.2f %0.2f)' % (np.mean(v1), np.var(v1))
                dataForPlot[k] += [np.mean(v1)]
            else:
                print k1, ['(%0.2f %0.2f)' % (np.mean(z), np.var(z)) for z in zip(*v1)]
                dataForPlot[k] += [np.mean(z) for z in zip(*v1)]
    ind, width = np.arange(5), 0.1
    rects, i = [], 0
    for k in dataForPlot:
        rects.append(plt.bar(ind + i * width, dataForPlot[k], width, color=plotSettings[k]['color']))
        i += 1
    plt.ylabel(getLatexForString('Score'))
    plt.title(getLatexForString('Clustering quality comparison for Streaming LSH with SSA'))
    plt.xticks(ind + width, ('$F$', '$Precision$', '$Recall$', '$Purity$', '$NMI$'))
    plt.legend([r[0] for r in rects], [plotSettings[k]['label'] for k in plotSettings], loc=4)
#    plt.show()
    plt.savefig('qualityComparisonWithSSA.pdf')
def plotPercentageOfDimensionsWithinALag(self, returnAxisValuesOnly=True):
    '''
    This gives us the percentage of phrases we can lose every time we prune phrases.

    Measures the percentage of dimensions having lag less than TU.

    So at the end of the 10th day, almost y% of phrases can be removed, with some probability
    that they will not occur again.

    numberOfTimeUnits=10*24*12
    With 75% probability.
    Experts stream [ 0.0097055   0.81888514] 107 0.554497397565
    Houston stream [ 0.00943499  0.825918  ] 126 0.487757815615
    With 90% probability.
    Experts stream [ 0.0097055   0.81888514] 223 0.187150798756
    Houston stream [ 0.00943499  0.825918  ] 228 0.164007589276
    '''
    def calculatePercentageOfDecayedPhrasesFor(params, timeUnit):
        return 1 - CurveFit.increasingExponentialFunction(params, timeUnit)
    dataDistribution = {}
    currentTimeUnit = 0
    for data in list(FileIO.iterateJsonFromFile(self.dimensionInActivityTimeFile))[:numberOfTimeUnits]:
        totalDimensions = float(sum(data['phrases_lag_distribution'].values()))
        tempArray = []
        for k, v in data['phrases_lag_distribution'].iteritems():
            k = int(k)
            if k not in dataDistribution: dataDistribution[k] = [0] * numberOfTimeUnits
            dataDistribution[k][currentTimeUnit] = v / totalDimensions
            tempArray.append(v / totalDimensions)
        currentTimeUnit += 1
    x = sorted(dataDistribution)
    y = getCumulativeDistribution([np.mean(dataDistribution[k]) for k in x])
    params = CurveFit.getParamsAfterFittingData(x, y, CurveFit.increasingExponentialFunction, [1., 1.])
    print self.stream_settings['plot_label'], params,
    def subPlot(id, timeUnit):
        plt.subplot(id)
        print timeUnit, calculatePercentageOfDecayedPhrasesFor(params, timeUnit)
        plt.plot(x, y, 'o', label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{%0.2f})') % (params[0], params[1]), color=self.stream_settings['plot_color'])
        plt.plot(x, CurveFit.getYValues(CurveFit.increasingExponentialFunction, params, x), color=self.stream_settings['plot_color'], lw=2)
    if self.stream_settings['stream_id'] == 'experts_twitter_stream':
        subPlot(111, 107)
        plt.title(getLatexForString('Percentage of phrases within a lag'))
    else:
        subPlot(111, 126)
        plt.xlabel(getLatexForString(xlabelTimeUnits))
    plt.ylabel(r'$\%\ of\ phrases\ with\ lag\ \leq\ TU$')
    plt.legend(loc=4)
    if returnAxisValuesOnly: plt.show()
def plotICDFClustersLagDistribution(self, returnAxisValuesOnly=True):
    '''
    Experts stream 0.25 199
    Houston stream 0.25 152
    '''
    self.stream_settings['%s_file' % ClusteringParametersEstimation.clusterLagDistributionId] = self.stream_settings['parameter_estimation_folder'] + ClusteringParametersEstimation.clusterLagDistributionId
    dataX, dataY, total = set(), defaultdict(list), []
    for line in list(FileIO.iterateJsonFromFile(self.stream_settings['%s_file' % ClusteringParametersEstimation.clusterLagDistributionId])):
        print line.keys()
        data = dict((int(k), v) for k, v in line[ClusteringParametersEstimation.clusterLagDistributionId].iteritems())
        total.append(sum(data.values()))
        for i in data: dataY[i].append(data[i]); dataX.add(i)
    totalInstancesObserved = float(sum(total))
    x = sorted(dataX)
    y = getInverseCumulativeDistribution([sum(dataY[k]) / totalInstancesObserved for k in x])
    plt.plot(x, y, label=getLatexForString(self.stream_settings['plot_label']), color=self.stream_settings['plot_color'], lw=2)
    if self.stream_settings['plot_label'] == 'Houston stream': plt.plot([0, x[-1]], [1, 0], '--', color='#5AF522', lw=2)
    plt.ylabel(r'$P\ (\ inactivity\ duration\ \geq\ \ inactivity\ duration\ threshold )$'), plt.xlabel(getLatexForString('Inactivity duration threshold')), plt.title(getLatexForString('Inactivity analysis for crowds.'))
    plt.legend()
    if returnAxisValuesOnly: plt.show()
def plotThresholdForDocumentToBeInCluster(self, statsFile):
    dataToPlot = dict(('%0.2f' % (t * 0.05), {'iteration_time': [], 'purity': [], 'nmi': []}) for t in range(1, 21))
    for data in FileIO.iterateJsonFromFile(statsFile):
        threshold = '%0.2f' % data['settings']['threshold_for_document_to_be_in_cluster']
        for k in dataToPlot[threshold]: dataToPlot[threshold][k] += [data['streaming_lsh'][k]]
    for t in dataToPlot:
        for k in dataToPlot[t]: dataToPlot[t][k] = np.mean(dataToPlot[t][k])
    dataX = sorted([float(i) for i in dataToPlot])[:-1]
    print dataX
    # Plot iteration time.
    plt.subplot(211)
    plt.plot(dataX, [dataToPlot['%0.2f' % x]['iteration_time'] for x in dataX], lw=2, color='k')
    plt.ylabel(getLatexForString('Time (s)'))
    plt.title(getLatexForString('Estimation of \epsilon^\prime for Stream SSA'))
    plt.subplot(212)
    for metric, label, color in [('nmi', 'NMI', '#F60018'), ('purity', 'Purity', '#25D500')]:
        plt.plot(dataX, [dataToPlot['%0.2f' % x][metric] for x in dataX], label=label, color=color, lw=2)
    plt.ylabel(getLatexForString('Score'))
    plt.xlabel(getLatexForString('Similarity threshold (\epsilon^\prime)'))
    plt.legend(loc=4)
    plt.show()
def plotJustifyDimensionsEstimation(self):
    runningTimeData, purityData = defaultdict(list), defaultdict(list)
    for data in FileIO.iterateJsonFromFile(JustifyDimensionsEstimation.stats_file):
        if data['iteration_parameters']['dimensions'] < data['no_of_observed_dimensions']:
            no_of_dimensions = data['iteration_parameters']['dimensions']
            runningTimeData[no_of_dimensions].append(data['iteration_time']), purityData[no_of_dimensions].append(data['purity'])
    plt.subplot(111)
    dataX, dataY = [], []
    del purityData[169991]; del purityData[39989]
    plt.title('Impact of dimension estimation')
    for k in sorted(purityData): dataX.append(k), dataY.append(np.mean(purityData[k]))
    plt.semilogx(dataX, [0.96]*len(dataX), '--', label='Top n dimensions', color='#7109AA', lw=2)
    plt.semilogx(dataX, [np.mean(dataY)]*len(dataX), '--', color='#5AF522', lw=2)
    plt.semilogx(dataX, dataY, '-x', label='Fixed dimensions', color='#5AF522', lw=2)
    plt.ylim(0.8, 1.0)
    plt.xlim(7000, 203000)
    plt.xlabel('# of dimensions')
    plt.ylabel('Purity')
    plt.legend(loc=3)
    plt.savefig('justifyDimensionsEstimation.pdf')
def plotCDFDimensionsLagDistribution(self, returnAxisValuesOnly=True):
    '''
    Inactivity time is the time after which there is a high probability that a dimension will not
    appear. Find the time_unit that gives this probability.

    Cumulative distribution function (http://en.wikipedia.org/wiki/Cumulative_distribution_function)
    lag = time between occurrences of two dimensions (similar to inactivity_time)

    F(time_unit) = P(lag <= time_unit)
    time_unit = F_inv(P(lag <= time_unit))

    Given P(inactivity_time > time_unit), determine time_unit as shown:
    P(inactivity_time <= time_unit) = 1 - P(inactivity_time > time_unit)
    inactivity_time = F_inv(P(inactivity_time <= time_unit))

    numberOfTimeUnits=10*24*12

    Experts stream [ 0.23250341  0.250209  ] 0.25 107
    Houston stream [ 0.16948096  0.30751358] 0.25 126

    Experts stream [ 0.23250341  0.250209  ] 0.1, 223
    Houston stream [ 0.16948096  0.30751358] 0.1, 228

    Compared to other values, these values are pretty close to each other. This is expected.
    Irrespective of the size of the streams, the phrases have the same lifetime and hence decay
    close to each other.
    '''
    def calculateInActivityTimeFor(params, probabilityOfInactivity):
        return int(CurveFit.inverseOfIncreasingExponentialFunction(params, 1 - probabilityOfInactivity))
    data = list(FileIO.iterateJsonFromFile(self.dimensionInActivityTimeFile))[numberOfTimeUnits]
    total = float(sum(data[ParameterEstimation.dimensionInActivityTimeId].values()))
    x = sorted(map(int, data[ParameterEstimation.dimensionInActivityTimeId].keys()))
    y = getCumulativeDistribution([data[ParameterEstimation.dimensionInActivityTimeId][str(i)] / total for i in x])
    print len(x)
    exponentialCurveParams = CurveFit.getParamsAfterFittingData(x, y, CurveFit.increasingExponentialFunction, [1., 1.])
    print self.stream_settings['plot_label'], exponentialCurveParams, calculateInActivityTimeFor(exponentialCurveParams, 0.1)
    plt.plot(x, y, 'o', label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{%0.2f})') % (exponentialCurveParams[0], exponentialCurveParams[1]), color=self.stream_settings['plot_color'])
    plt.plot(x, CurveFit.getYValues(CurveFit.increasingExponentialFunction, exponentialCurveParams, x), color=self.stream_settings['plot_color'], lw=2)
    plt.ylabel(r'$P\ (\ lag\ \leq\ TU\ )$'), plt.xlabel(getLatexForString(xlabelTimeUnits)), plt.title(getLatexForString('CDF for dimension lag distribution.'))
    plt.ylim((0, 1.2))
    plt.legend(loc=4)
    if returnAxisValuesOnly: plt.show()
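# Illustrative sketch (assumption, hypothetical inputs): the docstring above relates the lag CDF to
# the inactivity threshold via F(t) = P(lag <= t) and t = F_inv(1 - P(inactivity)). Instead of the
# fitted-curve inversion used above, this sketch builds the empirical CDF from a lag histogram and
# reads off the smallest t whose cumulative mass reaches the target probability.
def _sketch_inactivity_threshold(mf_lag_to_count, probability_of_inactivity):
    total = float(sum(mf_lag_to_count.values()))
    cumulative = 0.
    for lag in sorted(mf_lag_to_count):
        cumulative += mf_lag_to_count[lag] / total
        if cumulative >= 1 - probability_of_inactivity:
            return lag

# Example: with lags {1: 50, 2: 30, 3: 15, 4: 5} and P(inactivity) = 0.2, the threshold is 2,
# since P(lag <= 2) = 0.8.
# print _sketch_inactivity_threshold({1: 50, 2: 30, 3: 15, 4: 5}, 0.2)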
def plotJustifyDimensionsEstimation2(self):
    pltInfo = {JustifyDimensionsEstimation.top_n_dimension: {'label': 'Temporally significant', 'color': '#7109AA', 'type': '-', 'marker': 'x'},
               JustifyDimensionsEstimation.first_n_dimension: {'label': 'By occurrence', 'color': '#5AF522', 'type': '-', 'marker': 'o'}}
#    experimentsData = {JustifyMemoryPruning.with_memory_pruning: {'iteration_time': [], 'quality': [], 'total_clusters': []}, JustifyMemoryPruning.without_memory_pruning: {'iteration_time': [], 'quality': [], 'total_clusters': []}}
    experimentsData = {JustifyDimensionsEstimation.top_n_dimension: defaultdict(dict), JustifyDimensionsEstimation.first_n_dimension: defaultdict(dict)}
    for data in FileIO.iterateJsonFromFile(JustifyDimensionsEstimation.stats_file_2):
#    for data in FileIO.iterateJsonFromFile('temp/dimensions_need_analysis_2'):
#        if 'dimensions' in data['iteration_parameters']:
        dimension = data['iteration_parameters']['dimensions']
        type = data['iteration_parameters']['type']
        if dimension not in experimentsData[type]: experimentsData[type][dimension] = {'iteration_time': [], 'quality': [], 'total_clusters': []}
        experimentsData[type][dimension]['iteration_time'].append(data['iteration_time']), experimentsData[type][dimension]['quality'].append(data['purity']), experimentsData[type][dimension]['total_clusters'].append(data['no_of_clusters'])
    lshData = dict([(k, np.mean(experimentsData[JustifyDimensionsEstimation.top_n_dimension][76819][k])) for k in experimentsData[JustifyDimensionsEstimation.top_n_dimension][76819]])
    del experimentsData[JustifyDimensionsEstimation.top_n_dimension][76819]
    print lshData
    plotData = {JustifyDimensionsEstimation.top_n_dimension: defaultdict(list), JustifyDimensionsEstimation.first_n_dimension: defaultdict(list)}
    for type in experimentsData:
        for dimension in sorted(experimentsData[type]):
            plotData[type]['dataX'].append(dimension)
            [plotData[type][k].append(np.mean(experimentsData[type][dimension][k])) for k in experimentsData[type][dimension]]
    plt.subplot(311)
    for type in experimentsData:
        plt.semilogy([x/10**3 for x in plotData[type]['dataX']], movingAverage(plotData[type]['total_clusters'], 4), color=pltInfo[type]['color'], label=pltInfo[type]['label'], lw=2)
    plt.semilogy([x/10**3 for x in plotData[JustifyDimensionsEstimation.top_n_dimension]['dataX']], [lshData['total_clusters']]*len(plotData[JustifyDimensionsEstimation.top_n_dimension]['dataX']), '--', color='#FF1300', label=getLatexForString('Top-76819 dimensions'), lw=2)
    plt.ylim(ymin=1)
    plt.subplot(312)
    for type in experimentsData:
        plt.semilogy([x/10**3 for x in plotData[type]['dataX']], movingAverage(plotData[type]['iteration_time'], 4), color=pltInfo[type]['color'], label=pltInfo[type]['label'], lw=2)
    plt.semilogy([x/10**3 for x in plotData[JustifyDimensionsEstimation.top_n_dimension]['dataX']], [lshData['iteration_time']]*len(plotData[JustifyDimensionsEstimation.top_n_dimension]['dataX']), '--', color='#FF1300', label=getLatexForString('Top-76819'), lw=2)
    plt.ylim(ymin=1, ymax=1500)
    plt.legend(loc=2, ncol=2)
    plt.subplot(313)
    for type in experimentsData:
        plt.plot([x/10**3 for x in plotData[type]['dataX']], movingAverage(plotData[type]['quality'], 4), color=pltInfo[type]['color'], label=pltInfo[type]['label'], lw=2, marker=pltInfo[type]['marker'])
    plt.ylabel('Mean purity per iteration', fontsize=20)
#    plt.title(getLatexForString('Impact of dimension ranking'))
    plt.xlabel('# number of dimensions $(10^3)$', fontsize=20)
#    plt.plot([x/10**3 for x in plotData[JustifyDimensionsEstimation.top_n_dimension]['dataX']], [lshData['quality']]*len(plotData[JustifyDimensionsEstimation.top_n_dimension]['dataX']), '--', color='#FF1300', label=getLatexForString('Top-76819 dimensions'), lw=2)
    plt.ylim(ymin=0.80, ymax=1.0)
    plt.legend()
    plt.savefig('justifyDimensionsEstimation2.png')
def plotPercentageOfClustersWithinALag(self, returnAxisValuesOnly=True):
    '''
    458 Experts stream [ 0.01860266  0.70639136] 15 0.874004297177
    80 Houston stream [ 0.0793181   0.47644004] 3 0.866127308876
    '''
    def calculatePercentageOfDecayedPhrasesFor(params, timeUnit):
        return 1 - CurveFit.increasingExponentialFunction(params, timeUnit)
    dataDistribution = {}
    currentTimeUnit = 0
#    file = '/mnt/chevron/kykamath/data/twitter/lsh_crowds/houston_stream/parameter_estimation/cluster_lag_distribution'
    file = self.hdsClustering.stream_settings['%s_file' % ClusteringParametersEstimation.clusterLagDistributionId]
    lines = list(FileIO.iterateJsonFromFile(file))
    numberOfTimeUnits = len(lines)
    for data in lines:
        totalClusters = float(sum(data[ClusteringParametersEstimation.clusterLagDistributionId].values()))
        tempArray = []
        for k, v in data[ClusteringParametersEstimation.clusterLagDistributionId].iteritems():
            k = int(k)
            if k not in dataDistribution: dataDistribution[k] = [0] * numberOfTimeUnits
            dataDistribution[k][currentTimeUnit] = v / totalClusters
            tempArray.append(v / totalClusters)
        currentTimeUnit += 1
    x = sorted(dataDistribution)
    print numberOfTimeUnits,
    y = getCumulativeDistribution([np.mean(dataDistribution[k]) for k in x])
    params = CurveFit.getParamsAfterFittingData(x, y, CurveFit.increasingExponentialFunction, [1., 1.])
    print self.stream_settings['plot_label'], params,
    def subPlot(id, timeUnit):
        plt.subplot(id)
        print timeUnit, calculatePercentageOfDecayedPhrasesFor(params, timeUnit)
        plt.plot(x, y, 'o', label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{%0.2f})') % (params[0], params[1]), color=self.stream_settings['plot_color'])
        plt.plot(x, CurveFit.getYValues(CurveFit.increasingExponentialFunction, params, x), color=self.stream_settings['plot_color'], lw=2)
    if self.stream_settings['stream_id'] == 'experts_twitter_stream':
        subPlot(111, 15)
        plt.title(getLatexForString('Percentage of clusters within a lag'))
    else:
        subPlot(111, 3)
        plt.xlabel(getLatexForString(xlabelTimeUnits))
    plt.ylabel(r'$\%\ of\ clusters\ with\ lag\ \leq\ TU$')
    plt.legend(loc=4)
    if returnAxisValuesOnly: plt.show()
def plotClusteringSpeed(saveFig=True):
    dataToPlot = dict([(k, {'x': [], 'y': []}) for k in plotSettings])
    for data in FileIO.iterateJsonFromFile(TweetsFile.stats_file):
        for k in plotSettings:
            dataToPlot[k]['x'].append(data[k]['no_of_documents'])
            dataToPlot[k]['y'].append(data[k]['iteration_time'])
    for k in plotSettings:
        plt.loglog(dataToPlot[k]['x'], movingAverage(dataToPlot[k]['y'], 1), label=plotSettings[k]['label'], color=plotSettings[k]['color'], lw=2)
    print dataToPlot['streaming_lsh']['x'][10]
    print dataToPlot['streaming_lsh']['y'][10]
    plt.legend(loc=4)
    if saveFig:
        plt.xlabel(getLatexForString('\# of documents'))
        plt.ylabel(getLatexForString('Running time (s)'))
        plt.title(getLatexForString('Running time comparison for Streaming LSH with SSA'))
    plt.xlim(xmin=500, xmax=600000)
#    plt.show()
    if saveFig: plt.savefig('speedComparisonWithSSA.pdf')
def getIterator(id):
    for line in FileIO.iterateJsonFromFile(time_to_process_points + 'stats/%s' % id):
        yield line
def fileIterator():
    for id in xrange(20):
        yield FileIO.iterateJsonFromFile(time_to_process_points + '%s' % id)
def kmeans():
    for data in FileIO.iterateJsonFromFile(clustering_quality_experts_folder + 'combined_stats_file'):
        yield data['k_means']
def kmeansmr():
    for data in FileIO.iterateJsonFromFile(clustering_quality_experts_folder + 'mr_quality_stats'):
        yield data['mr_k_means']
def cdait():
    for data in FileIO.iterateJsonFromFile(clustering_quality_experts_ssa_folder + 'quality_stats'):
        if 'ssa' in data: yield data['ssa']
def cdamr():
    for data in FileIO.iterateJsonFromFile(clustering_quality_experts_ssa_folder + 'quality_stats'):
        yield data['ssa_mr']
def unoptimized():
    for data in FileIO.iterateJsonFromFile(hd_clustering_performance_folder + 'cda_unopt'):
        yield data['streaming_lsh']