Example #1
 def generateCombinedStatsFile():
     for normalData, mrData in zip(
             FileIO.iterateJsonFromFile(TweetsFile.stats_file),
             FileIO.iterateJsonFromFile(TweetsFile.mr_stats_file)):
         normalData['mr_k_means'] = mrData['mr_k_means']
         FileIO.writeToFileAsJson(normalData,
                                  TweetsFile.combined_stats_file)
 def plotQualityWithKMeansAndSSA():
     del plotSettings["ssa_mr"]
     speedStats = dict([(k, {"f1": [], "nmi": [], "purity": []}) for k in plotSettings])
     for data in FileIO.iterateJsonFromFile(TweetsFile.stats_file):
         for k in speedStats:
             for metric in speedStats["ssa"]:
                 speedStats[k][metric].append(data[k][metric])
     for k in speedStats:
         del speedStats[k]["f1"]
     speedStats.update(dict([(k, {"f1": [], "nmi": [], "purity": []}) for k in kMeansPlotSettings]))
     k = "k_means"
     for data in FileIO.iterateJsonFromFile(TweetsFile.combined_stats_file):
         for metric in speedStats["k_means"]:
             speedStats[k][metric].append(data[k][metric])
     for k in speedStats:
         if "f1" in speedStats[k]:
             del speedStats[k]["f1"]
     dataForPlot = dict([(k, []) for k in speedStats])
     for k in speedStats:
         for k1 in speedStats[k]:
             dataForPlot[k] += [np.mean(speedStats[k][k1])]
     #        del dataForPlot['k_means']
     print dataForPlot
     ind, width = np.arange(2), 0.1
     rects, i = [], 1
     plotSettings.update(kMeansPlotSettings)
     for k in dataForPlot:
         rects.append(plt.bar(ind + i * width, dataForPlot[k], width, color=plotSettings[k]["color"]))
         i += 1
     plt.ylabel(getLatexForString("Score"))
     plt.title(getLatexForString("Clustering quality comparison for Streaming LSH with SSA"))
     plt.xticks(ind + 2 * width, ("$Purity$", "$NMI$"))
     plt.legend([r[0] for r in rects], [plotSettings[k]["label"] for k in plotSettings], loc=4)
     #        plt.show()
     plt.savefig("qualityComparisonAll.pdf")
Example #3
    def generate_data_for_significant_nei_utm_ids():
        output_file = GeneralMethods.get_method_id()+'.json'
        so_hashtags, mf_utm_id_to_valid_nei_utm_ids = set(), {}
        for utm_object in \
                FileIO.iterateJsonFromFile(f_hashtags_by_utm_id, True):
            for hashtag, count in utm_object['mf_hashtag_to_count'].iteritems():
                if hashtag!='total_num_of_occurrences': so_hashtags.add(hashtag)
            mf_utm_id_to_valid_nei_utm_ids[utm_object['utm_id']] =\
                                                            utm_object['mf_nei_utm_id_to_common_h_count'].keys()
        hashtags = sorted(list(so_hashtags))
        mf_utm_id_to_vector = {}
        for utm_object in FileIO.iterateJsonFromFile(f_hashtags_by_utm_id, True):
#                print i, utm_object['utm_id']
            utm_id_vector =  map(lambda hashtag: utm_object['mf_hashtag_to_count'].get(hashtag, 0.0),
                                 hashtags)
            mf_utm_id_to_vector[utm_object['utm_id']] = robjects.FloatVector(utm_id_vector)
        for i, (utm_id, vector) in enumerate(mf_utm_id_to_vector.iteritems()):
            print '%s of %s'%(i+1, len(mf_utm_id_to_vector))
            ltuo_utm_id_and_vector = [(utm_id, vector)]
            for valid_nei_utm_id in mf_utm_id_to_valid_nei_utm_ids[utm_id]:
                if valid_nei_utm_id in mf_utm_id_to_vector and valid_nei_utm_id!=utm_id:
                    ltuo_utm_id_and_vector.append((valid_nei_utm_id, mf_utm_id_to_vector[valid_nei_utm_id]))
            od = rlc.OrdDict(sorted(ltuo_utm_id_and_vector, key=itemgetter(0)))
            df_utm_vectors = robjects.DataFrame(od)
            df_utm_vectors_json = R_Helper.get_json_for_data_frame(df_utm_vectors)
            dfm_dict = cjson.decode(df_utm_vectors_json)
            mf_utm_ids_to_utm_colnames = dict(zip(zip(*ltuo_utm_id_and_vector)[0], df_utm_vectors.colnames))
            utm_id_colname = mf_utm_ids_to_utm_colnames[utm_id]
            dfm_dict['prediction_variable'] = utm_id_colname
            dfm_dict['predictor_variables'] = filter(lambda colname: colname!=utm_id_colname,
                                                     df_utm_vectors.colnames)
            dfm_dict['mf_utm_colnames_to_utm_ids'] = dict(zip(df_utm_vectors.colnames, zip(*ltuo_utm_id_and_vector)[0]))
            FileIO.writeToFileAsJson(dfm_dict, output_file)
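
This example additionally depends on cjson, the operator module, and rpy2 for building the R data frame; the imports it presumably assumes (R_Helper, FileIO, and the f_* paths come from the author's own modules and are not reproduced here) would be roughly:

from operator import itemgetter
import cjson                        # third-party C JSON codec (cjson.encode / cjson.decode)
import rpy2.robjects as robjects    # robjects.FloatVector, robjects.DataFrame
import rpy2.rlike.container as rlc  # rlc.OrdDict
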
def build(numberOfTimeUnits=24):
    validLattices = set()
    for data in FileIO.iterateJsonFromFile(hashtagsLatticeGraphFile%('world','%s_%s'%(2,11))): validLattices.add(data['id'])
    documents, lattices = [], set()
    for h in FileIO.iterateJsonFromFile(hashtagsFile%('training_world','%s_%s'%(2,11))): 
        hashtag, document = Hashtag(h), []
        if hashtag.isValidObject():
            for timeUnit, occs in enumerate(hashtag.getOccrancesEveryTimeWindowIterator(HashtagsClassifier.CLASSIFIER_TIME_UNIT_IN_SECONDS)):
                occs = filter(lambda t: t[0] in validLattices, occs)
                occs = sorted(occs, key=itemgetter(0))
                if occs: 
                    for lattice in zip(*occs)[0]: lattices.add(lattice)
                document.append([timeUnit, [(k, len(list(i))) for k, i in groupby(occs, key=itemgetter(0))]])
            if document: documents.append(document)
    lattices = sorted(list(lattices))
    print len(lattices)
    documents = [(d, TargetSelectionRegressionClassifier.getPercentageDistributionInLattice(d)) for d in documents]
    documents = documents[:int(len(documents)*0.80)]
    for decisionTimeUnit in range(1, numberOfTimeUnits+1):
        for latticeCount, predictingLattice in enumerate(lattices):
            print decisionTimeUnit, latticeCount,
            inputVectors, outputValues = [], []
            for rawDocument, processedDocument in documents:
                documentForTimeUnit = TargetSelectionRegressionClassifier.getPercentageDistributionInLattice(rawDocument[:decisionTimeUnit])
                if documentForTimeUnit and processedDocument:
                    vector =  [documentForTimeUnit.get(l, 0) for l in lattices]
                    inputVectors.append(vector), outputValues.append(float(processedDocument.get(predictingLattice, 0)))
#            TargetSelectionRegressionClassifier(decisionTimeUnit=decisionTimeUnit, predictingLattice=predictingLattice).build(zip(inputVectors, outputValues))
            TargetSelectionRegressionSVMRBFClassifier(decisionTimeUnit=decisionTimeUnit, predictingLattice=predictingLattice).build(zip(inputVectors, outputValues))
    def plotQualityWithKMeansAndSSA():
        del plotSettings['ssa_mr']
        speedStats = dict([(k, {
            'f1': [],
            'nmi': [],
            'purity': []
        }) for k in plotSettings])
        for data in FileIO.iterateJsonFromFile(TweetsFile.stats_file):
            for k in speedStats:
                for metric in speedStats['ssa']:
                    speedStats[k][metric].append(data[k][metric])
        for k in speedStats:
            del speedStats[k]['f1']
        speedStats.update(
            dict([(k, {
                'f1': [],
                'nmi': [],
                'purity': []
            }) for k in kMeansPlotSettings]))
        k = 'k_means'
        for data in FileIO.iterateJsonFromFile(TweetsFile.combined_stats_file):
            for metric in speedStats['k_means']:
                speedStats[k][metric].append(data[k][metric])
        for k in speedStats:
            if 'f1' in speedStats[k]: del speedStats[k]['f1']
        dataForPlot = dict([(k, []) for k in speedStats])
        for k in speedStats:
            for k1 in speedStats[k]:
                dataForPlot[k] += [np.mean(speedStats[k][k1])]


#        del dataForPlot['k_means']
        print dataForPlot
        ind, width = np.arange(2), 0.1
        rects, i = [], 1
        plotSettings.update(kMeansPlotSettings)
        for k in dataForPlot:
            rects.append(
                plt.bar(ind + i * width,
                        dataForPlot[k],
                        width,
                        color=plotSettings[k]['color']))
            i += 1
        plt.ylabel(getLatexForString('Score'))
        plt.title(
            getLatexForString(
                'Clustering quality comparison for Streaming LSH with SSA'))
        plt.xticks(ind + 2 * width, ('$Purity$', '$NMI$'))
        plt.legend([r[0] for r in rects],
                   [plotSettings[k]['label'] for k in plotSettings],
                   loc=4)
        #        plt.show()
        plt.savefig('qualityComparisonAll.pdf')
Example #6
    def coverageIndication():
        MINUTES = 5
        for timeUnit, color, shape in [(1, 'r', 'x'), (3, 'g', 'd'), (6, 'b', 's')]:
            print timeUnit
            data = defaultdict(int)
            for hashtagObject in FileIO.iterateJsonFromFile(hashtagsFile%('training_world','%s_%s'%(2,11))):
                try:
                    occsDistributionInTimeUnits = getOccurranceDistributionInEpochs(getOccuranesInHighestActiveRegion(hashtagObject), timeUnit=MINUTES*60, fillInGaps=True, occurancesCount=False)
                    occurances = list(zip(*sorted(occsDistributionInTimeUnits.iteritems(), key=itemgetter(0)))[1])
                    occsInTimeunit =  zip(*reduce(lambda aggList, l: aggList+l, occurances[:timeUnit], []))[0]
                    if len(occsInTimeunit)>10:
                        allOccurances = zip(*reduce(lambda aggList, l: aggList+l, occurances, []))[0]
                        timeUnitRadius, allRadius = getRadius(occsInTimeunit), getRadius(allOccurances)
                        data[int(abs(timeUnitRadius-allRadius))/50*50+50]+=1
#                        data[round(abs(timeUnitRadius-allRadius)/allRadius, 2)]+=1
                except IndexError as e: pass
            for k in data.keys()[:]: 
                if data[k]<3: del data[k]
            dataX, dataY = zip(*sorted(data.iteritems(), key=itemgetter(0)))
            plt.loglog(dataX, dataY, lw=2, label=str(timeUnit*MINUTES) + ' minutes', marker=shape)
#        plt.loglog([1],[1])
#        plt.title('Early indication of coverage'), 
        plt.xlabel('Coverage difference (miles)', fontsize=20), plt.ylabel('Number of hashtags', fontsize=20)
        plt.legend()
#        plt.show()
        plt.savefig('../images/coverageIndication.png')
Example #7
    def temporalLocalityTemporalDistanceExample(lattice=NEW_YORK):
        distances = defaultdict(dict)
        for latticeObject in FileIO.iterateJsonFromFile(hashtagsLatticeGraphFile%('training_world','%s_%s'%(2,11))):
            if latticeObject['id']==lattice:
                latticeHashtagsSet = set(latticeObject['hashtags'])
                for neighborLattice, neighborHashtags in latticeObject['links'].iteritems():
                    distances[neighborLattice] = {}
                    neighborHashtags = filterOutNeighborHashtagsOutside1_5IQROfTemporalDistance(latticeObject['hashtags'], neighborHashtags, findLag=False)
                    neighborHashtagsSet = set(neighborHashtags)
                    distances[neighborLattice]['similarity']=len(latticeHashtagsSet.intersection(neighborHashtagsSet))/float(len(latticeHashtagsSet.union(neighborHashtagsSet)))
                    distances[neighborLattice]['temporalDistance']=np.mean([abs(latticeObject['hashtags'][k][0]-neighborHashtags[k][0]) for k in neighborHashtags if k in latticeObject['hashtags']])/(60.*60.)
                    distances[neighborLattice]['geoDistance']=getHaversineDistanceForLids(latticeObject['id'].replace('_', ' '), neighborLattice.replace('_', ' '))
                break
        dataPoints = []
        ax = plt.subplot(111)
        for k, data in distances.iteritems(): dataPoints.append((getLocationFromLid(k.replace('_', ' ')), data['temporalDistance']))
        points, colors = zip(*sorted(dataPoints, key=itemgetter(1)))
        sc = plotPointsOnWorldMap(points, blueMarble=False, bkcolor='#CFCFCF', cmap='RdPu', c=colors, lw = 0, alpha=1.0)
        plotPointsOnWorldMap([getLocationFromLid(lattice.replace('_', ' '))], blueMarble=False, bkcolor='#CFCFCF', c='#64FF1C', lw = 0)
        divider = make_axes_locatable(ax)
        plt.title('Average time difference from New York')
        cax = divider.append_axes("right", size="5%", pad=0.05)
        plt.colorbar(sc, cax=cax)
#        plt.show()
        plt.savefig('../images/temporalDistanceExample.png')
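
getLocationFromLid and getHaversineDistanceForLids come from the same codebase; assuming a lid is a 'lat lng' string (the snippets above build one by replacing '_' with ' '), a plausible sketch of the distance helper is:

import math

def get_location_from_lid(lid):
    # Assumed lid format: 'lat lng' as a space-separated string.
    lat, lng = lid.split()
    return float(lat), float(lng)

def haversine_distance_for_lids(lid1, lid2, radius_in_miles=3958.8):
    # Great-circle (haversine) distance between the two points, here in miles.
    (lat1, lng1), (lat2, lng2) = get_location_from_lid(lid1), get_location_from_lid(lid2)
    lat1, lng1, lat2, lng2 = map(math.radians, (lat1, lng1, lat2, lng2))
    a = math.sin((lat2 - lat1) / 2) ** 2 + \
        math.cos(lat1) * math.cos(lat2) * math.sin((lng2 - lng1) / 2) ** 2
    return 2 * radius_in_miles * math.asin(math.sqrt(a))
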
 def plotClusteringSpeed(saveFig=True):
     plotSettings = {
         "k_means": {"label": "Iterative k-means", "color": "#FD0006"},
         "mr_k_means": {"label": "MR k-means", "color": "#5AF522"},
         "streaming_lsh": {"label": "Stream CDA", "color": "#7109AA"},
     }
     dataToPlot = {
         "k_means": {"x": [], "y": []},
         "mr_k_means": {"x": [], "y": []},
         "streaming_lsh": {"x": [], "y": []},
     }
     for data in FileIO.iterateJsonFromFile(TweetsFile.combined_stats_file):
         for k in plotSettings:
             dataToPlot[k]["x"].append(data[k]["no_of_documents"])
             dataToPlot[k]["y"].append(data[k]["iteration_time"])
     for k in plotSettings:
         plt.loglog(
             dataToPlot[k]["x"],
             dataToPlot[k]["y"],
             label=plotSettings[k]["label"],
             color=plotSettings[k]["color"],
             lw=2,
         )
     plt.legend(loc=4)
     if saveFig:
         plt.xlabel(getLatexForString("\# of documents"))
         plt.ylabel(getLatexForString("Running time (s)"))
         plt.title(getLatexForString("Running time comparsion for Streaing LSH with k-Means"))
     plt.xlim(xmin=800, xmax=100000)
     plt.xticks([])
     #        plt.show()
     if saveFig:
         plt.savefig("speedComparisonWithKMeans.pdf")
 def plotClusteringQuality():
     del plotSettings["ssa_mr"]
     speedStats = dict([(k, {"f1": [], "nmi": [], "purity": []}) for k in plotSettings])
     for data in FileIO.iterateJsonFromFile(TweetsFile.stats_file):
         for k in speedStats:
             for metric in speedStats["ssa"]:
                 speedStats[k][metric].append(data[k][metric])
     dataForPlot = dict([(k, []) for k in plotSettings])
     for k, v in speedStats.iteritems():
         print k
         for k1, v1 in v.iteritems():
             if type(v1[0]) != type([]):
                 print k1, "(%0.2f %0.2f)" % (np.mean(v1), np.var(v1))
                 dataForPlot[k] += [np.mean(v1)]
             else:
                 print k1, ["(%0.2f %0.2f)" % (np.mean(z), np.var(z)) for z in zip(*v1)]
                 dataForPlot[k] += [np.mean(z) for z in zip(*v1)]
     ind, width = np.arange(5), 0.1
     rects, i = [], 0
     for k in dataForPlot:
         rects.append(plt.bar(ind + i * width, dataForPlot[k], width, color=plotSettings[k]["color"]))
         i += 1
     plt.ylabel(getLatexForString("Score"))
     plt.title(getLatexForString("Clustering quality comparison for Streaming LSH with SSA"))
     plt.xticks(ind + width, ("$F$", "$Precision$", "$Recall$", "$Purity$", "$NMI$"))
     plt.legend([r[0] for r in rects], [plotSettings[k]["label"] for k in plotSettings], loc=4)
     #        plt.show()
     plt.savefig("qualityComparisonWithSSA.pdf")
 def plotClusteringSpeed(saveFig=True):
     dataToPlot = dict([(k, {"x": [], "y": []}) for k in plotSettings])
     for data in FileIO.iterateJsonFromFile(TweetsFile.stats_file):
         for k in plotSettings:
             dataToPlot[k]["x"].append(data[k]["no_of_documents"])
             dataToPlot[k]["y"].append(data[k]["iteration_time"])
     for k in plotSettings:
         plt.loglog(
             dataToPlot[k]["x"],
             movingAverage(dataToPlot[k]["y"], 1),
             label=plotSettings[k]["label"],
             color=plotSettings[k]["color"],
             lw=2,
         )
     print dataToPlot["streaming_lsh"]["x"][10]
     print dataToPlot["streaming_lsh"]["y"][10]
     plt.legend(loc=4)
     if saveFig:
         plt.xlabel(getLatexForString("\# of documents"))
         plt.ylabel(getLatexForString("Running time (s)"))
         plt.title(getLatexForString("Running time comparsion for Streaing LSH with SSA"))
     plt.xlim(xmin=500, xmax=600000)
     #        plt.show()
     if saveFig:
         plt.savefig("speedComparisonWithSSA.pdf")
Example #11
    def measureCorrelations(timeRange, outputFolder):
        '''
        ['haversine_distance', 'temporal_distance_in_hours', 0.20147108648121248]
        ['haversine_distance', 'sharing_probability', -0.19587239643328627]
        '''
        measures = [
                    (LatticeGraph.typeHaversineDistance, LatticeGraph.typeTemporalDistanceInHours),
                    (LatticeGraph.typeHaversineDistance, LatticeGraph.typeSharingProbability),
                    ]
        runData = []
        for xMeasure, yMeasure in measures:
            i, xdata, ydata = 1, [], []
            for latticeObject in FileIO.iterateJsonFromFile(hashtagsLatticeGraphFile%(outputFolder,'%s_%s'%timeRange)):
                print i, latticeObject['id']; i+=1
                xdata+=zip(*xMeasure['method'](latticeObject)['links'].iteritems())[1]
                ydata+=zip(*yMeasure['method'](latticeObject)['links'].iteritems())[1]
#                if i==200: break
            pearsonsCorrelation, _ = stats.pearsonr(xdata, ydata)
#            plt.scatter(xdata[:5000], ydata[:5000])
#            plt.title('Pearson\'s coefficient %0.3f'%pearsonsCorrelation)
#            plt.xlabel(xMeasure['title']), plt.ylabel(yMeasure['title'])
#            plt.show()
            runData.append([xMeasure['id'], yMeasure['id'], pearsonsCorrelation])
        for i in runData:
            print i
 def plotCDFClustersLagDistribution(self, returnAxisValuesOnly=True):
     '''
     This determines the time after which a cluster can be considered 
     decayed and hence removed.
     
     Experts stream [ 0.66002386  0.07035227] 0.1 82
     Houston stream [ 0.73800037  0.05890473] 0.1 29
     
     458 (# of time units) Experts stream [ 0.66002386  0.07035227] 0.2 15
     71 (# of time units) Houston stream [ 0.73756656  0.05883258] 0.2 3
     
     '''
     def calculateInActivityTimeFor(params, probabilityOfInactivity): return int(CurveFit.inverseOfIncreasingExponentialFunction(params, 1 - probabilityOfInactivity))
     data = list(FileIO.iterateJsonFromFile(self.hdsClustering.stream_settings['%s_file' % ClusteringParametersEstimation.clusterLagDistributionId]))[-1]
     total = float(sum(data['lag_between_streams_added_to_cluster'].values()))
     x = sorted(map(int, data['lag_between_streams_added_to_cluster'].keys()))
     y = getCumulativeDistribution([data['lag_between_streams_added_to_cluster'][str(i)] / total for i in x])
     exponentialCurveParams = CurveFit.getParamsAfterFittingData(x, y, CurveFit.increasingExponentialFunction, [1., 1.])
     print self.stream_settings['plot_label'], exponentialCurveParams, calculateInActivityTimeFor(exponentialCurveParams, 0.2) 
     plt.plot(x, y, 'o', label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{%0.2f})') % (exponentialCurveParams[0], exponentialCurveParams[1]), color=self.stream_settings['plot_color'])
     plt.plot(x, CurveFit.getYValues(CurveFit.increasingExponentialFunction, exponentialCurveParams, x), color=self.stream_settings['plot_color'], lw=2)
     plt.ylabel(r'$P\ (\ lag\ \leq\ TU\ )$'), plt.xlabel(getLatexForString(xlabelTimeUnits)), plt.title(getLatexForString('CDF for clusters lag distribution.'))
     plt.ylim((0, 1.2))
     plt.legend(loc=4)
     if returnAxisValuesOnly: plt.show()
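
getCumulativeDistribution (and the getInverseCumulativeDistribution used in a later example) are assumed to turn a list of probabilities over a sorted support into P(X <= x) and P(X >= x) respectively; a minimal sketch under that assumption:

def get_cumulative_distribution(probabilities):
    # Running sum: P(X <= x) for each point of an already-sorted support.
    cdf, running_total = [], 0.0
    for p in probabilities:
        running_total += p
        cdf.append(running_total)
    return cdf

def get_inverse_cumulative_distribution(probabilities):
    # Complementary form: P(X >= x), including the probability mass at x itself.
    return [sum(probabilities[i:]) for i in range(len(probabilities))]
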
Example #13
    def probabilisticCoverageModelExample(hashtag, type):
        MINUTES, timeUnit = 5, 1
        print len(CoverageBasedLatticeSelectionModel.lattices)
        for hashtagObject in FileIO.iterateJsonFromFile('/mnt/chevron/kykamath/data/geo/hashtags/analysis/all_world/2_11/hashtagsWithoutEndingWindow'):
            if hashtagObject['h']==hashtag:
                occsDistributionInTimeUnits = getOccurranceDistributionInEpochs(getOccuranesInHighestActiveRegion(hashtagObject), timeUnit=MINUTES*60, fillInGaps=True, occurancesCount=False)
                occurances = list(zip(*sorted(occsDistributionInTimeUnits.iteritems(), key=itemgetter(0)))[1])
                occsInTimeunit =  zip(*reduce(lambda aggList, l: aggList+l, occurances[:timeUnit], []))[0]
                allOccurances = zip(*reduce(lambda aggList, l: aggList+l, occurances, []))[0]
                if type=='5m': probabilityDistributionForObservedLattices = CoverageBasedLatticeSelectionModel.probabilityDistributionForLattices(occsInTimeunit)
                else: 
                    print getRadius(allOccurances)
                    probabilityDistributionForObservedLattices = CoverageBasedLatticeSelectionModel.probabilityDistributionForLattices(allOccurances)
                latticeScores = CoverageBasedLatticeSelectionModel.spreadProbability(CoverageBasedLatticeSelectionModel.lattices, probabilityDistributionForObservedLattices)
                points, colors = zip(*map(lambda t: (getLocationFromLid(t[0].replace('_', ' ')), t[1]), sorted(latticeScores.iteritems(), key=itemgetter(1))))
#                print points[0], colors[0]
                ax = plt.subplot(111)
                sc = plotPointsOnWorldMap(points, blueMarble=False, bkcolor='#CFCFCF', c=colors, cmap='cool', lw = 0)
                divider = make_axes_locatable(ax)
#                plt.title('Jaccard similarity with New York')
                cax = divider.append_axes("right", size="5%", pad=0.05)
                plt.colorbar(sc, cax=cax)
                plt.show()
#                plt.savefig('../images/coverage_examples/%s_%s.png'%(hashtag, type))
                plt.clf()
                break
Example #14
 def significant_nei_utm_ids():
     output_folder = fld_google_drive_data_analysis%GeneralMethods.get_method_id()+'/%s.png'
     for i, data in enumerate(FileIO.iterateJsonFromFile(f_significant_nei_utm_ids, remove_params_dict=True)):
         utm_lat_long = UTMConverter.getLatLongUTMIdInLatLongForm(data['utm_id'])
         nei_utm_lat_longs = map(
                           lambda nei_utm_id: UTMConverter.getLatLongUTMIdInLatLongForm(nei_utm_id),
                           data['nei_utm_ids']
                         )
         if nei_utm_lat_longs:
             output_file = output_folder%('%s_%s'%(utm_lat_long))
             plotPointsOnWorldMap(nei_utm_lat_longs,
                                  blueMarble=False,
                                  bkcolor='#CFCFCF',
                                  lw = 0,
                                  color = '#EA00FF',
                                  alpha=1.)
             _, m = plotPointsOnWorldMap([utm_lat_long],
                                  blueMarble=False,
                                  bkcolor='#CFCFCF',
                                  lw = 0,
                                  color = '#2BFF00',
                                  s = 40,
                                  returnBaseMapObject=True,
                                  alpha=1.)
             for nei_utm_lat_long in nei_utm_lat_longs:
                 m.drawgreatcircle(utm_lat_long[1],
                                   utm_lat_long[0],
                                   nei_utm_lat_long[1],
                                   nei_utm_lat_long[0],
                                   color='#FFA600',
                                   lw=1.5,
                                   alpha=1.0)
             print 'Saving %s'%(i+1)
             savefig(output_file)
 def plotDimensionsUpdateFrequencyEstimation(self, returnAxisValuesOnly=True):
     '''
     numberOfTimeUnits=10*24*12
     Experts stream 12
     Houston stream 2
     '''
     dataDistribution = defaultdict(list)
     for line in FileIO.iterateJsonFromFile(self.dimensionsUpdateFrequencyFile):
         for k, v in line[ParameterEstimation.dimensionsUpdateFrequencyId].iteritems():
             k = int(k) / self.timeUnitInSeconds.seconds
             if k not in dataDistribution: dataDistribution[k] = [0., 0.]
             dataDistribution[k][0] += v; dataDistribution[k][1] += 1
     x, y = [], []; [(x.append(k), y.append((dataDistribution[k][0] / dataDistribution[k][1]))) for k in sorted(dataDistribution)]
     x1, y1 = [], []; [(x1.append(k), y1.append((dataDistribution[k][0] / dataDistribution[k][1]) / k)) for k in sorted(dataDistribution)]
     x = x[:numberOfTimeUnits]; y = y[:numberOfTimeUnits]; x1 = x1[:numberOfTimeUnits]; y1 = y1[:numberOfTimeUnits]
     def subPlot(id):
         plt.subplot(id)
         inactivityCorordinates = max(zip(x1, y1), key=itemgetter(1))
         plt.semilogx(x1, y1, '-', color=self.stream_settings['plot_color'], label=getLatexForString(self.stream_settings['plot_label'] + ' (Update frequency=%d TU)' % inactivityCorordinates[0]), lw=2)
         plt.subplot(id).yaxis.set_major_formatter(FuncFormatter(lambda x, i: '%0.1f' % (x / 10. ** 3)))
         plt.semilogx([inactivityCorordinates[0]], [inactivityCorordinates[1]], 'o', alpha=0.7, color='r')
         plt.subplot(id).yaxis.set_major_formatter(FuncFormatter(lambda x, i: '%0.1f' % (x / 10. ** 3)))
         plt.yticks((min(y1), max(y1)))
         print self.stream_settings['plot_label'], inactivityCorordinates[0]
     plt.subplot(311)
     plt.title(getLatexForString('Dimensions update frequency estimation'))
     plt.semilogx(x, y, '-', color=self.stream_settings['plot_color'], label=getLatexForString(self.stream_settings['plot_label']), lw=2)
     plt.subplot(311).yaxis.set_major_formatter(FuncFormatter(lambda x, i: '%0.1f' % (x / 10. ** 5)))
     plt.text(0.0, 1.01, getLatexForString('10^5'), transform=plt.gca().transAxes)
     plt.ylabel(getLatexForString('\# of decayed dimensions'))
     if self.stream_settings['stream_id'] == 'experts_twitter_stream': subPlot(312)
     else: subPlot(313); plt.xlabel(getLatexForString(xlabelTimeUnits))
     plt.ylabel(getLatexForString('Rate of DD (10^3)'))
     plt.legend(loc=3)
     if returnAxisValuesOnly: plt.show()
Example #16
 def plotGrowthOfPhrasesInTime(self, returnAxisValuesOnly=True):
     '''
     This plot tells us the time when the number of phrases in the stream stabilizes.
     Consider the time after we have seen the maximum number of phrases to determine dimensions.
     But, if these phrases increase linearly with time, it shows that we have infinite
     dimensions, and hence this motivates us to have a way to determine the number of
     dimensions.
     
     numberOfTimeUnits=10*24*12
     '''
     x, y = [], []
     [(x.append(getDateTimeObjectFromTweetTimestamp(line['time_stamp'])),
       y.append(line['total_number_of_phrases']))
      for line in FileIO.iterateJsonFromFile(self.dimensionsEstimationFile)]
     x = x[:numberOfTimeUnits]
     y = y[:numberOfTimeUnits]
     plt.subplot(111).yaxis.set_major_formatter(
         FuncFormatter(lambda x, i: '%0.1f' % (x / 10.**6)))
     plt.text(0.0,
              1.01,
              getLatexForString('10^6'),
              transform=plt.gca().transAxes)
     plt.ylabel(getLatexForString('\# of dimensions')), plt.xlabel(
         getLatexForString(xlabelTimeUnits)), plt.title(
             getLatexForString(
                 'Growth in dimensions with increasing time.'))
     plt.plot(y,
              color=self.stream_settings['plot_color'],
              label=getLatexForString(self.stream_settings['plot_label']),
              lw=2)
     plt.legend(loc=4)
     if returnAxisValuesOnly: plt.show()
Example #18
    def trendCurves(iterationData=None, experimentFileName=None):
        if iterationData: 
            currentTimeStep, _, currentTopics, _, finalCall, conf = iterationData
            experimentFileName = conf['experimentFileName']
            if not finalCall:
                topicDistribution = dict((str(topic.id), {'total': topic.totalCount, 'timeStep': topic.countDistribution[currentTimeStep]}) for topic in currentTopics)
#                print currentTimeStep
                FileIO.writeToFileAsJson({'t':currentTimeStep, 'topics':topicDistribution}, experimentFileName)
            else:
                iterationInfo  = {'trending_topics': [topic.id for topic in currentTopics if topic.stickiness>=stickinessLowerThreshold],
                      'topic_colors': dict((str(topic.id), topic.color) for topic in currentTopics),
                      'conf': conf}
                del iterationInfo['conf']['spamDectectionMethod']
                FileIO.writeToFileAsJson(iterationInfo, experimentFileName)
        else:
            topicsDataX = defaultdict(list)
            topicsDataY = defaultdict(list)
            for data in FileIO.iterateJsonFromFile(experimentFileName):
                if 'conf' not in data:
                    for topic in data['topics']: topicsDataX[topic].append(data['t']), topicsDataY[topic].append(data['topics'][topic]['timeStep'])
                else: topicColorMap=data['topic_colors']; trendingTopics=data['trending_topics']
            for topic in topicsDataX: plt.fill_between(topicsDataX[topic], topicsDataY[topic], color=topicColorMap[str(topic)], alpha=1.0)
            plt.figure()
            for topic in trendingTopics: plt.fill_between(topicsDataX[str(topic)], topicsDataY[str(topic)], color=topicColorMap[str(topic)], alpha=1.0)
            plt.ylabel('Number of Contents', fontsize=16, fontweight='bold')
            plt.show()
Example #19
 def plotClusteringSpeed(saveFig=True):
     dataToPlot = {
         'k_means': {
             'x': [],
             'y': []
         },
         'mr_k_means': {
             'x': [],
             'y': []
         },
         'streaming_lsh': {
             'x': [],
             'y': []
         }
     }
     for data in FileIO.iterateJsonFromFile(TweetsFile.combined_stats_file):
         for k in plotSettings:
             dataToPlot[k]['x'].append(data[k]['no_of_documents'])
             dataToPlot[k]['y'].append(data[k]['iteration_time'])
     for k in plotSettings:
         plt.loglog(dataToPlot[k]['x'],
                    dataToPlot[k]['y'],
                    label=plotSettings[k]['label'],
                    color=plotSettings[k]['color'],
                    lw=2)
     plt.legend(loc=4)
     plt.xlabel(getLatexForString('\# of documents'))
     plt.ylabel(getLatexForString('Running time (s)'))
     plt.title(
         getLatexForString(
             'Running time comparison for Streaming LSH with k-Means'))
     plt.xlim(xmax=500000)
     #        plt.show()
     if saveFig: plt.savefig('speedComparisonWithKMeans.pdf')
 def ef_plot():
     output_file = fld_data_analysis_results%GeneralMethods.get_method_id()+'.png'
     data = [d for d in FileIO.iterateJsonFromFile(f_hashtag_spatial_metrics, remove_params_dict=True)]
     ltuo_hashtag_and_entropy_and_focus = map(itemgetter('hashtag', 'entropy', 'focus'), data)
     mf_norm_focus_to_entropies = defaultdict(list)
     for _, entropy, (_, focus) in ltuo_hashtag_and_entropy_and_focus:
         mf_norm_focus_to_entropies[round(focus, 2)].append(entropy)
     plt.figure(num=None, figsize=(6,3))
     x_focus, y_entropy = zip(*[(norm_focus, np.mean(entropies))
                                 for norm_focus, entropies in mf_norm_focus_to_entropies.iteritems()
                                 if len(entropies)>0])
     plt.subplots_adjust(bottom=0.2, top=0.9, wspace=0, hspace=0)
     plt.scatter(x_focus, y_entropy, s=50, lw=0, c='k')
     plt.xlim(xmin=-0.1, xmax=1.1)
     plt.ylim(ymin=-1, ymax=9)
     plt.xlabel('Mean hashtag focus')
     plt.ylabel('Mean hashtag entropy')
     plt.grid(True)
     savefig(output_file)
     ltuo_hashtag_and_r_entropy_and_focus =\
                                         sorted(ltuo_hashtag_and_entropy_and_focus, key=itemgetter(1), reverse=True)
     ltuo_hashtag_and_r_entropy_and_s_focus = sorted(ltuo_hashtag_and_r_entropy_and_focus, key=itemgetter(2))
     hashtags = zip(*ltuo_hashtag_and_r_entropy_and_s_focus)[0]
     print list(hashtags[:20])
     print list(reversed(hashtags))[:20]
Example #21
 def getStats(spotsFile, userToLocationVector):
     lidToSpotIdMap, userToSpotIdMap, spotMap, spotsWithUsersFile = {}, {}, defaultdict(dict), spotsFile + "_users"
     for spot in FileIO.iterateJsonFromFile(spotsWithUsersFile):
         for location, _ in spot["lids"]:
             lidToSpotIdMap[getLidFromLocation(location)] = spot["id"]
         for user in spot["users"]:
             userToSpotIdMap[user] = spot["id"]
     observedUsers = set()
     for userVector in userToLocationVector:
         user = userVector["user"]
         assert user not in observedUsers
         if user in userToSpotIdMap:
             assignment = [
                 [lidToSpotIdMap[lid]] * userVector["locations"][lid]
                 for lid in userVector["locations"]
                 if lid in lidToSpotIdMap
             ]
             spotMap[userToSpotIdMap[user]][user] = [item for t in assignment for item in t]
             observedUsers.add(user)
     accuracyList = []
     for spotId, userMap in spotMap.iteritems():
         totalAssignments, wrongAssignments = 0.0, 0.0
         for user in userMap:
             for a in userMap[user]:
                 if a != spotId:
                     wrongAssignments += 1
                 totalAssignments += 1
         accuracyList.append(wrongAssignments / totalAssignments)
     return {
         "accuracy": np.mean(accuracyList),
         "total_locations": len(lidToSpotIdMap),
         "total_users": len(userToSpotIdMap),
     }
Example #22
 def plotICDFDimensionsInactivityThreshold(self, returnAxisValuesOnly=True):
     ''' Plot P(inactivity > threshold time unit)
         Find time unit at which probability is low.
         Experts stream 0.25 129
         Houston stream 0.25 144
     '''
     dataX, dataY, total = set(), defaultdict(list), []
     for line in list(
             FileIO.iterateJsonFromFile(self.dimensionInActivityTimeFile)):
         data = dict((int(k), v) for k, v in line[
             ParameterEstimation.dimensionInActivityTimeId].iteritems())
         total.append(sum(data.values()))
         for i in data:
             dataY[i].append(data[i])
             dataX.add(i)
     totalInstancesObserved = float(sum(total))
     x = sorted(dataX)
     y = getInverseCumulativeDistribution(
         [sum(dataY[k]) / totalInstancesObserved for k in x])
     plt.plot(x,
              y,
              label=getLatexForString(self.stream_settings['plot_label']),
              color=self.stream_settings['plot_color'],
              lw=2)
     plt.ylabel(
         r'$P\ (\ inactivity\ duration\ \geq\ \ inactivity\ duration\ threshold )$'
     ), plt.xlabel(
         getLatexForString('Inactivity duration threshold')), plt.title(
             getLatexForString('Inactivity analysis for dimensions.'))
     plt.legend()
     if returnAxisValuesOnly: plt.show()
    def iid_vs_cumulative_distribution_and_peak_distribution():
        TIME_UNIT_IN_SECONDS = 10.*60.
        output_file_format = fld_data_analysis_results%GeneralMethods.get_method_id()+'/%s.png'
        ltuo_iid_and_interval_stats = [data for data in 
                                        FileIO.iterateJsonFromFile(f_iid_spatial_metrics, remove_params_dict=True)]
        ltuo_s_iid_and_interval_stats = sorted(ltuo_iid_and_interval_stats, key=itemgetter(0))
        ltuo_s_iid_and_tuo_is_peak_and_cumulative_percentage_of_occurrences = [(data[0], (data[1][0], data[1][2])) for data in ltuo_s_iid_and_interval_stats]
        total_peaks = sum([data[1][0] for data in ltuo_s_iid_and_tuo_is_peak_and_cumulative_percentage_of_occurrences])+0.0
        x_iids = []
        y_is_peaks = []
        z_cumulative_percentage_of_occurrencess = []
        for (iid, (is_peak, cumulative_percentage_of_occurrences)) in ltuo_s_iid_and_tuo_is_peak_and_cumulative_percentage_of_occurrences[:100]: 
            print (iid, (is_peak, cumulative_percentage_of_occurrences)) 
            x_iids.append((iid+1)*TIME_UNIT_IN_SECONDS/60)
            y_is_peaks.append(is_peak/total_peaks)
            z_cumulative_percentage_of_occurrencess.append(cumulative_percentage_of_occurrences)
        plt.figure(num=None, figsize=(4.3,3))
        plt.subplots_adjust(bottom=0.2, top=0.9, wspace=0, hspace=0)
        plt.plot(x_iids, y_is_peaks, marker='o', c='k')
        plt.ylabel('Distribution of hashtags')
        plt.xlabel('Hashtag peak (minutes)')
        plt.grid(True)
        plt.xlim(xmax=600)
        savefig(output_file_format%'peaks')
        plt.clf()
        plt.figure(num=None, figsize=(6,3))
        plt.subplots_adjust(bottom=0.2, top=0.9, wspace=0, hspace=0)
        plt.plot(x_iids, z_cumulative_percentage_of_occurrencess, lw=0, marker='o', c='k')
#        plt.xlabel('Minutes')
        plt.ylabel('CDF of occurrences')
        plt.xlabel('Time (Minutes)')
        plt.grid(True)
        plt.xlim(xmax=600)
        savefig(output_file_format%'cdf_occurrences_peak')
def drawKMLsForUserCooccurenceGraph(minEdgeWeight=30):
    kml = SpotsKML()
    i=1
    for edge in FileIO.iterateJsonFromFile(locationGraph):
        if edge['w']>=minEdgeWeight: kml.addLine(getLocationPairs(edge['e']), description=str(edge['w'])); i+=1
        if i==10000: break
    kml.write(userBasedSpotsKmlsFolder+'%s.kml'%minEdgeWeight)
    def iterateExpertClusters(startingDay=datetime(2011,3,19), endingDay=datetime(2011,3, 30)):
#    def iterateExpertClusters(startingDay=datetime(2011,3,19), endingDay=datetime(2011,4,7)):
        while startingDay<=endingDay:
            for line in FileIO.iterateJsonFromFile(experts_twitter_stream_settings.lsh_clusters_folder+FileIO.getFileByDay(startingDay)): 
                currentTime = getDateTimeObjectFromTweetTimestamp(line['time_stamp'])
                for clusterMap in line['clusters']: yield (currentTime, TwitterCrowdsSpecificMethods.getClusterFromMapFormat(clusterMap))
            startingDay+=timedelta(days=1)
 def getClusteringQuality():
     '''
     no_of_documents: 300000
     k_means
         f1, p, r ['(0.95 0.04)', '(0.95 0.04)', '(0.95 0.04)']
         purity (0.95 0.04)
         nmi (0.94 0.04)
     streaming_lsh
         f1, p, r ['(0.67 0.01)', '(0.71 0.01)', '(0.64 0.02)']
         purity (0.96 0.00)
         nmi (0.87 0.00)
     '''
     del plotSettings['mr_k_means']; del plotSettings['default_streaming_lsh']
     speedStats = dict([(k, {'f1': [], 'nmi': [], 'purity': []}) for k in plotSettings])
     for data in FileIO.iterateJsonFromFile(TweetsFile.combined_stats_file):
         for k in speedStats:
             for metric in speedStats['k_means']: speedStats[k][metric].append(data[k][metric])
     # Added because the final value of f1 is 0 instead of a tuple at 300K documents.
     speedStats['k_means']['f1'][-1]=[0.,0.,0.]
     dataForPlot = dict([(k, []) for k in plotSettings])
     for k, v in speedStats.iteritems(): 
         print k
         for k1,v1 in v.iteritems(): 
             if type(v1[0])!=type([]): print k1, '(%0.2f %0.2f)'%(np.mean(v1), np.var(v1)); dataForPlot[k]+=[np.mean(v1)]
             else: print k1, ['(%0.2f %0.2f)'%(np.mean(z),np.var(z)) for z in zip(*v1)]; dataForPlot[k]+=[np.mean(z) for z in zip(*v1)]
def sortFile(fileName):
    dataToWrite = sorted(FileIO.iterateJsonFromFile(fileName), key=lambda l: dateutil.parser.parse(l['CreationDate']))
    print 'Sorting file', fileName
    f = open('%s_sorted'%fileName, 'w')
    for line in dataToWrite: f.write(cjson.encode(line)+'\n')
    os.system('mv %s_sorted %s'%(fileName, fileName))
    f.close()
Example #29
 def print_dense_utm_ids():
     ''' Prints list of dense utm_ids.
     '''
     print [utm_object['utm_id'] 
            for utm_object in FileIO.iterateJsonFromFile(
                                                 f_hashtags_by_utm_id,
                                                 remove_params_dict=True)]
    def analyzeJustifyExponentialDecay(self):
        global evaluation
        experimentsData = {JustifyExponentialDecay.with_decay: {}, JustifyExponentialDecay.without_decay: {}}
        for data in FileIO.iterateJsonFromFile(JustifyExponentialDecay.stats_file): experimentsData[data['iteration_parameters']['type']][getDateTimeObjectFromTweetTimestamp(data['iteration_parameters']['current_time'])]=data['clusters']
        qualityData = []
        for k1, k2 in zip(sorted(experimentsData[JustifyExponentialDecay.with_decay]), sorted(experimentsData[JustifyExponentialDecay.without_decay])):
            qualityData.append((k1, evaluation.getEvaluationMetrics(experimentsData[JustifyExponentialDecay.with_decay][k1], None, None)['purity']-evaluation.getEvaluationMetrics(experimentsData[JustifyExponentialDecay.without_decay][k1], None, None)['purity']))
        keyTime = sorted(qualityData, key=itemgetter(1))[-1][0]
        clusterWithDecay = [i for i in experimentsData[JustifyExponentialDecay.with_decay][keyTime] if len(i)>=3]
        clusterWithOutDecay = [i for i in experimentsData[JustifyExponentialDecay.without_decay][keyTime] if len(i)>=3]
#        for c in clusterWithDecay:
#            print c, [evaluation.expertsToClassMap[i.lower()] for i in c]

        interestedCluster = set(['Zap2it', 'ESPNAndyKatz', 'comingsoonnet', '950KJR', 'ginasmith888', 'UKCoachCalipari', 'SportsFanz', 'David_Henrie'])
        for c in clusterWithOutDecay:
            if len(set(c).intersection(interestedCluster))>0: 
#                print c, [evaluation.expertsToClassMap[i.lower()] for i in c]
                setString = ', '.join(['%s (%s)'%(i, evaluation.expertsToClassMap[i.lower()]) for i in sorted(c)]).replace(' ', '\\ ').replace('_', '\\_')
                print keyTime, '&', setString, '\\\\'
            
        clustersDiscoveredEarlierByDecay = {}
        for kt in sorted(experimentsData[JustifyExponentialDecay.with_decay]):
            for c in experimentsData[JustifyExponentialDecay.with_decay][kt]:
                c=sorted(c)
                if len(set(c).intersection(interestedCluster))>0: 
                    classes = [evaluation.expertsToClassMap[i.lower()] for i in c if i.lower() in evaluation.expertsToClassMap]
                    if sorted([(k, len(list(g))/float(len(classes))) for k,g in groupby(sorted(classes))], key=itemgetter(1))[-1][1]>0.7:
                        if kt>datetime(2011,3,19) and kt<=keyTime: clustersDiscoveredEarlierByDecay[kt]=c
        observedStrings = set()
        for k in sorted(clustersDiscoveredEarlierByDecay): 
            setString = ', '.join(['%s (%s)'%(i, evaluation.expertsToClassMap[i.lower()]) for i in sorted(clustersDiscoveredEarlierByDecay[k])]).replace(' ', '\\ ').replace('_', '\\_')
            if setString not in observedStrings: print k, '&', setString, '\\\\'; observedStrings.add(setString)
    def load_mf_hashtag_to_ltuo_point_and_occurrence_time(WINDOW_IN_MINUTES):
        def is_unicode(hashtag):
            try:
                hashtag.decode('ascii')
            except Exception: return False
            return True
        mf_hashtag_to_ltuo_point_and_occurrence_time = defaultdict(list)
        # Subtracting because stream appears to be delayed by an hour
#        dt_current_time = datetime.fromtimestamp(time.mktime(time.gmtime(time.time()))) - timedelta(hours=1)
        dt_current_time = datetime.fromtimestamp(time.mktime(time.gmtime(time.time())))
        td_interval = timedelta(seconds=INTERVAL_IN_MINUTES * 60)
        td_window = timedelta(seconds= WINDOW_IN_MINUTES * 60)
        dt_next_time = dt_current_time - td_window
        while dt_next_time < dt_current_time:
            f_input = GetOutputFile(dt_next_time)
            if os.path.exists(f_input):
                print 'Processing:', f_input
                for checkin in FileIO.iterateJsonFromFile(f_input):
                    for hashtag, point_and_occurrence_time in \
                            TweetStreamDataProcessing._ParseHashtagObjects(checkin):
                        if hashtag not in BLOCKED_HASHTAGS and is_unicode(hashtag):
                            mf_hashtag_to_ltuo_point_and_occurrence_time[hashtag].append(point_and_occurrence_time)
            dt_next_time += td_interval
#        return mf_hashtag_to_ltuo_point_and_occurrence_time
        # Modifying this code to normalize lattice and time for all analysis
        mf_hashtag_to_normalized_ltuo_point_and_occurrence_time = {}
        for hashtag, ltuo_point_and_occurrence_time in \
                mf_hashtag_to_ltuo_point_and_occurrence_time.iteritems():
            ltuo_point__lattice__normalized_occurrence_time = \
                SpatialAnalysisAlgorithms._get_ltuo_point_and_lattice_and_normalized_occurrence_time(ltuo_point_and_occurrence_time)
            ltuo_point__lattice__normalized_occurrence_time = SpatialAnalysisAlgorithms._get_valid_occurrences(ltuo_point__lattice__normalized_occurrence_time)
            
            mf_hashtag_to_normalized_ltuo_point_and_occurrence_time[hashtag] \
                = [(lattice, normalized_occurrence_time) for _, lattice, normalized_occurrence_time in ltuo_point__lattice__normalized_occurrence_time]
        return mf_hashtag_to_normalized_ltuo_point_and_occurrence_time
Example #33
 def writeClusterKML():
     kml = SpotsKML()
     outputKMLFile='%s/clusters.kml'%placesAnalysisFolder%place['name']
     for data in FileIO.iterateJsonFromFile(placesUserClusterFeaturesFile%place['name']):
         clusterId, color, features = data
         kml.addLocationPointsWithTitles([(getLocationFromLid(f[0].replace('_', ' ')), f[2]) for f in features[:noOfFeatures]], color=color)
         FileIO.createDirectoryForFile(outputKMLFile)
         kml.write(outputKMLFile)
def loadExperimentsData(experimentsData, file):
    for data in FileIO.iterateJsonFromFile(file):
        if data["purity"] > 0 and data["purity"] < 1:
            experimentsData[data["iteration_parameters"]["type"]]["iteration_time"].append(data["iteration_time"])
            experimentsData[data["iteration_parameters"]["type"]]["quality"].append(data["purity"])
            experimentsData[data["iteration_parameters"]["type"]]["total_clusters"].append(
                data["iteration_parameters"]["total_clusters"]
            )
Example #35
    def temp(hashtag='blackparentsquotes'):
        for hashtagObject in FileIO.iterateJsonFromFile('/mnt/chevron/kykamath/data/geo/hashtags/analysis/all_world/2_11/hashtagsWithoutEndingWindow'):
#            print hashtagObject['h']
            if hashtagObject['h']==hashtag:
                print hashtagObject['h']
                occsDistributionInTimeUnits = getOccurranceDistributionInEpochs(hashtagObject['oc'], timeUnit=5, fillInGaps=True, occurancesCount=False)
#                plt.plot_date()
                exit()
Example #36
 def writeHashtagsFile():
     hashtags = []
     for hashtagObject in FileIO.iterateJsonFromFile('/mnt/chevron/kykamath/data/geo/hashtags/analysis/all_world/2_11/hashtagsWithoutEndingWindow'):
         print hashtagObject.keys()
         exit()
         hashtags.append(hashtagObject['h'])
     hashtags=sorted(hashtags)
     for h in hashtags: FileIO.writeToFile(unicode(h).encode('utf-8'), 'hashtags')
Example #37
 def getClusteringQuality():
     '''
     no_of_documents: 300000
     k_means
         f1, p, r ['(0.95 0.04)', '(0.95 0.04)', '(0.95 0.04)']
         purity (0.95 0.04)
         nmi (0.94 0.04)
     streaming_lsh
         f1, p, r ['(0.67 0.01)', '(0.71 0.01)', '(0.64 0.02)']
         purity (0.96 0.00)
         nmi (0.87 0.00)
     '''
     del plotSettings['mr_k_means']
     del plotSettings['default_streaming_lsh']
     speedStats = dict([(k, {
         'f1': [],
         'nmi': [],
         'purity': []
     }) for k in plotSettings])
     for data in FileIO.iterateJsonFromFile(TweetsFile.combined_stats_file):
         for k in speedStats:
             for metric in speedStats['k_means']:
                 speedStats[k][metric].append(data[k][metric])
     # Added because the final value of f1 is 0 instead of a tuple at 300K documents.
     speedStats['k_means']['f1'][-1] = [0., 0., 0.]
     dataForPlot = dict([(k, []) for k in plotSettings])
     for k, v in speedStats.iteritems():
         print k
         for k1, v1 in v.iteritems():
             if type(v1[0]) != type([]):
                 print k1, '(%0.2f %0.2f)' % (np.mean(v1), np.var(v1))
                 dataForPlot[k] += [np.mean(v1)]
             else:
                 print k1, [
                     '(%0.2f %0.2f)' % (np.mean(z), np.var(z))
                     for z in zip(*v1)
                 ]
                 dataForPlot[k] += [np.mean(z) for z in zip(*v1)]
     ind, width = np.arange(5), 0.1
     rects, i = [], 0
     for k in dataForPlot:
         rects.append(
             plt.bar(ind + i * width,
                     dataForPlot[k],
                     width,
                     color=plotSettings[k]['color']))
         i += 1
     plt.ylabel(getLatexForString('Score'))
     plt.title(
         getLatexForString(
             'Clustering quality comparison for Streaming LSH with k-Means')
     )
     plt.xticks(ind + width,
                ('$F$', '$Precision$', '$Recall$', '$Purity$', '$NMI$'))
     plt.legend([r[0] for r in rects],
                [plotSettings[k]['label'] for k in plotSettings],
                loc=4)
     plt.show()
def getRandomLocationNames(file, **conf):
    data = defaultdict(list)
    for d in FileIO.iterateJsonFromFile(file): data[d['location_db_mad']].append(d['location'])
    for k in sorted(data):
        print k,
        for i in range(5):
            venue = venuesCollection.find_one({'lid':random.choice(data[k])})
            if venue: print unicode(venue['n']).encode("utf-8")+', ',
        print
Example #39
def performanceWithSpamFilteringForLatestMessages(generateData):
    experimentData = defaultdict(dict)
    for iteration in range(10):
#        for spammerPercentage in range(1,21):
##            spammerPercentage = 20
#            spammerPercentage = spammerPercentage*0.05
#        for spammerPercentage in range(1,11):
#            spammerPercentage = spammerPercentage*0.02
#        for spammerPercentage in range(1,201):
#            spammerPercentage = spammerPercentage* 0.005
        l1 = [spammerPercentage* 0.001 for spammerPercentage in range(1,51)]
        l2 = [spammerPercentage* 0.05 for spammerPercentage in range(1,21)]
        l3 = [0.01]+l2
        for spammerPercentage in l1:
            experimentFileName = spamModelFolder+'performanceWithSpamFilteringForLatestMessages/%s/%0.3f'%(iteration,spammerPercentage)
            print experimentFileName
            if generateData:
                model = MixedUsersModel()
                conf = {'model': model, 'numberOfTimeSteps': 10, 'addUsersMethod': User.addUsersUsingRatio, 'analysisMethods': [(Analysis.measureRankingQuality, 1)], 'ratio': {'normal': 1-spammerPercentage, 'spammer': spammerPercentage},
                        'rankingMethods':[RankingModel.latestMessages, RankingModel.latestMessagesSpamFiltered],
                        'experimentFileName': experimentFileName,
#                        'noOfPayloadsPerSpammer': 1, 'noOfTopics': 10
                        }
                
#                conf = {'model': model, 'numberOfTimeSteps': 10, 'addUsersMethod': User.addUsersUsingRatio, 'analysisMethods': [(Analysis.measureRankingQuality, 1)], 'ratio': {'normal': 1-spammerPercentage, 'spammer': spammerPercentage},
#                        'rankingMethods':[RankingModel.latestMessages, RankingModel.latestMessagesDuplicatesRemoved, RankingModel.popularMessages],
#                        'experimentFileName': experimentFileName}
                
                GeneralMethods.runCommand('rm -rf %s'%experimentFileName);run(**conf)
            else:
                tempData = defaultdict(list)
                for data in FileIO.iterateJsonFromFile(experimentFileName):
                    for ranking_id in data['spammmess']:
                        tempData[ranking_id]+=data['spammmess'][ranking_id]
                experimentData[iteration][spammerPercentage]=tempData
    if not generateData:
        realDataY = defaultdict(dict)
        for iteration in experimentData:
            dataY = defaultdict(list)
            dataX = []
            for perct in sorted(experimentData[iteration]):
                dataX.append(perct)
                for ranking_id, values in experimentData[iteration][perct].iteritems(): dataY[ranking_id].append(np.mean(values))
            dataX=sorted(dataX)
            for ranking_id in dataY:
                for x, y in zip(dataX, dataY[ranking_id]): 
                    if x not in realDataY[ranking_id]: realDataY[ranking_id][x]=[] 
                    realDataY[ranking_id][x].append(y)
        # Plot mean spamness across iterations for each ranking method.
        for ranking_id in realDataY:
            dataX = sorted(realDataY[ranking_id])
            plt.plot(dataX, [np.mean(realDataY[ranking_id][x]) for x in dataX], label=labels[ranking_id], lw=1, marker=RankingModel.marker[ranking_id])
        plt.xlabel('Percentage of Spammers', fontsize=16, fontweight='bold')
        plt.ylabel('Spamness', fontsize=16, fontweight='bold')
#        plt.title('Performance with spam filtering')
        plt.legend(loc=2)
#        plt.show()
        plt.xlim(xmax=0.05)
        plt.savefig('performanceWithSpamFilteringForLatestMessages.png')
        plt.clf()
def loadExperimentsData(experimentsData, file):
    for data in FileIO.iterateJsonFromFile(file):
        if data['purity'] > 0 and data['purity'] < 1:
            experimentsData[data['iteration_parameters']
                            ['type']]['iteration_time'].append(
                                data['iteration_time'])
            experimentsData[data['iteration_parameters']
                            ['type']]['quality'].append(data['purity'])
            experimentsData[data['iteration_parameters']
                            ['type']]['total_clusters'].append(
                                data['iteration_parameters']['total_clusters'])
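loadExperimentsData only appends to lists that already exist, so the caller has to pre-build the nested dict keyed by iteration type. A minimal usage sketch; the type keys and the stats-file path below are hypothetical placeholders:
import numpy as np

# Hypothetical iteration types and stats file; substitute the real ones.
experimentsData = dict((t, {'iteration_time': [], 'quality': [], 'total_clusters': []})
                       for t in ['with_pruning', 'without_pruning'])
loadExperimentsData(experimentsData, '/path/to/stats_file')
for t, stats in experimentsData.iteritems():
    print t, np.mean(stats['quality']), np.mean(stats['iteration_time'])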
Example #41
    def plotDimensionsEstimation(self, returnAxisValuesOnly=True):
        def calculateDimensionsFor(params, percentageOfNewDimensions):
            '''
            numberOfTimeUnits=10*24*12
            Experts stream [  1.17707899e+03   1.03794580e+00] 76819
            Houston stream [  2.73913900e+03   1.02758516e+00] 195731
            '''
            # Return the value so the caller's print statement below shows it.
            return getSmallestPrimeNumberGreaterThan(
                int(
                    CurveFit.inverseOfDecreasingExponentialFunction(
                        params, percentageOfNewDimensions)))

        dataDistribution = defaultdict(list)
        for line in FileIO.iterateJsonFromFile(self.dimensionsEstimationFile):
            for k, v in line[
                    ParameterEstimation.dimensionsEstimationId].iteritems():
                k = int(k)
                if k not in dataDistribution: dataDistribution[k] = [0., 0.]
                dataDistribution[k][0] += v
                dataDistribution[k][1] += 1
        x, y = [], []
        for k in sorted(dataDistribution):
            if k > 1000:
                x.append(k)
                y.append((dataDistribution[k][0] / dataDistribution[k][1]) / k)
        x, y = x[:numberOfTimeUnits], y[:numberOfTimeUnits]
        exponentialCurveParams = CurveFit.getParamsAfterFittingData(
            x, y, CurveFit.decreasingExponentialFunction, [1., 1.])
        print self.stream_settings[
            'plot_label'], exponentialCurveParams, calculateDimensionsFor(
                exponentialCurveParams, 0.01)
        plt.ylabel(getLatexForString('\% of decaying dimensions')), plt.xlabel(
            getLatexForString('\# of dimensions')
        ), plt.title(
            getLatexForString(
                'Dimension stability with increasing number of dimensions.'))
        plt.semilogy(
            x,
            y,
            'o',
            color=self.stream_settings['plot_color'],
            label=getLatexForString(self.stream_settings['plot_label']) +
            getLatexForString(' (%0.2fx^{-%0.2f})') %
            (exponentialCurveParams[0], exponentialCurveParams[1]),
            lw=2)
        plt.semilogy(x,
                     CurveFit.getYValues(
                         CurveFit.decreasingExponentialFunction,
                         exponentialCurveParams, x),
                     color=self.stream_settings['plot_color'],
                     lw=2)
        plt.legend()
        if returnAxisValuesOnly: plt.show()
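The plot label above implies the fitted decreasing exponential has the form f(x) = a·x^{-b}, so inverting it for a target fraction y of new dimensions gives x = (a/y)^{1/b}. A quick standalone check of the docstring numbers under that assumption (the closed-form inverse here is an assumption, not the actual CurveFit implementation):
# Assumes CurveFit.decreasingExponentialFunction(params, x) == params[0] * x ** -params[1],
# which is what the plot label '(%0.2fx^{-%0.2f})' suggests.
a, b = 1.17707899e+03, 1.03794580e+00   # Experts stream params from the docstring
target = 0.01                           # aim for <= 1% new dimensions per time unit
x = (a / target) ** (1.0 / b)
print int(x)                            # ~77K; the docstring reports the next prime, 76819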
Example #42
def iterateUserDocuments(fileName):
    dataForAggregation = defaultdict(Vector)
    textToIdMap = defaultdict(int)
    for tweet in FileIO.iterateJsonFromFile(fileName):
        textVector = TwitterCrowdsSpecificMethods.convertTweetJSONToMessage(
            tweet, **default_experts_twitter_stream_settings).vector
        textIdVector = Vector()
        for phrase in textVector:
            if phrase not in textToIdMap:
                textToIdMap[phrase] = str(len(textToIdMap))
            textIdVector[textToIdMap[phrase]] = textVector[phrase]
        dataForAggregation[tweet['user']
                           ['screen_name'].lower()] += textIdVector
    for k, v in dataForAggregation.iteritems():
        yield k, v
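The phrase-interning step above maps every phrase to a compact string id so that the per-user vectors aggregated across tweets share one dimension space. A standalone illustration of just that step:
# Standalone illustration of the phrase-to-id interning used in iterateUserDocuments.
textToIdMap = {}
def internPhrase(phrase):
    if phrase not in textToIdMap:
        textToIdMap[phrase] = str(len(textToIdMap))
    return textToIdMap[phrase]

print internPhrase('lsh'), internPhrase('stream'), internPhrase('lsh')   # '0' '1' '0'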
 def plotClusteringQuality():
     del plotSettings['ssa_mr']
     speedStats = dict([(k, {
         'f1': [],
         'nmi': [],
         'purity': []
     }) for k in plotSettings])
     for data in FileIO.iterateJsonFromFile(TweetsFile.stats_file):
         for k in speedStats:
             for metric in speedStats['ssa']:
                 speedStats[k][metric].append(data[k][metric])
     dataForPlot = dict([(k, []) for k in plotSettings])
     for k, v in speedStats.iteritems():
         print k
         for k1, v1 in v.iteritems():
             if not isinstance(v1[0], list):
                 print k1, '(%0.2f %0.2f)' % (np.mean(v1), np.var(v1))
                 dataForPlot[k] += [np.mean(v1)]
             else:
                 print k1, [
                     '(%0.2f %0.2f)' % (np.mean(z), np.var(z))
                     for z in zip(*v1)
                 ]
                 dataForPlot[k] += [np.mean(z) for z in zip(*v1)]
     ind, width = np.arange(5), 0.1
     rects, i = [], 0
     for k in dataForPlot:
         rects.append(
             plt.bar(ind + i * width,
                     dataForPlot[k],
                     width,
                     color=plotSettings[k]['color']))
         i += 1
     plt.ylabel(getLatexForString('Score'))
     plt.title(
         getLatexForString(
             'Clustering quality comparison for Streaming LSH with SSA'))
     plt.xticks(ind + width,
                ('$F$', '$Precision$', '$Recall$', '$Purity$', '$NMI$'))
     plt.legend([r[0] for r in rects],
                [plotSettings[k]['label'] for k in plotSettings],
                loc=4)
     #        plt.show()
     plt.savefig('qualityComparisonWithSSA.pdf')
 def plotPercentageOfDimensionsWithinALag(self, returnAxisValuesOnly=True):
     '''
     This gives us the percentage of phrases we can lose every time we prune phrases.

     Measures the percentage of dimensions having a lag less than TU.

     So at the end of the 10th day, almost y% of phrases can be removed, with some probability
     that they will not occur again.
     
     numberOfTimeUnits=10*24*12
     With 75% probability.
     Experts stream [ 0.0097055   0.81888514] 107 0.554497397565
     Houston stream [ 0.00943499  0.825918  ] 126 0.487757815615
     With 90% probability.
     Experts stream [ 0.0097055   0.81888514] 223 0.187150798756
     Houston stream [ 0.00943499  0.825918  ] 228 0.164007589276
     '''
     def calculatePercentageOfDecayedPhrasesFor(params, timeUnit): return 1 - CurveFit.increasingExponentialFunction(params, timeUnit)
     dataDistribution = {}
     currentTimeUnit = 0
     for data in list(FileIO.iterateJsonFromFile(self.dimensionInActivityTimeFile))[:numberOfTimeUnits]:
         totalDimensions = float(sum(data['phrases_lag_distribution'].values()))
         tempArray = []
         for k, v in data['phrases_lag_distribution'].iteritems():
             k = int(k)
             if k not in dataDistribution: dataDistribution[k] = [0] * numberOfTimeUnits
             dataDistribution[k][currentTimeUnit] = v / totalDimensions
             tempArray.append(v / totalDimensions)
         currentTimeUnit += 1
     x = sorted(dataDistribution)
     y = getCumulativeDistribution([np.mean(dataDistribution[k]) for k in x])
     params = CurveFit.getParamsAfterFittingData(x, y, CurveFit.increasingExponentialFunction, [1., 1.])
     print self.stream_settings['plot_label'], params,
     def subPlot(id, timeUnit):
         plt.subplot(id)
         print timeUnit, calculatePercentageOfDecayedPhrasesFor(params, timeUnit)
         plt.plot(x, y, 'o', label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{%0.2f})') % (params[0], params[1]), color=self.stream_settings['plot_color'])
         plt.plot(x, CurveFit.getYValues(CurveFit.increasingExponentialFunction, params, x), color=self.stream_settings['plot_color'], lw=2)
     if self.stream_settings['stream_id'] == 'experts_twitter_stream': subPlot(111, 107); plt.title(getLatexForString('Percentage of phrases within a lag'))
     else: subPlot(111, 126); plt.xlabel(getLatexForString(xlabelTimeUnits))
     plt.ylabel(r'$\%\ of\ phrases\ with\ lag\ \leq\ TU$')
     plt.legend(loc=4)
     if returnAxisValuesOnly: plt.show()
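The prune percentages quoted in the docstring follow directly from the fitted form a·t^b implied by the plot label: the fraction of phrases with lag greater than t, i.e. the fraction that can be dropped after waiting t time units, is 1 - a·t^b. A quick arithmetic check against the Experts-stream figures, assuming that functional form:
# Assumes CurveFit.increasingExponentialFunction(params, t) == params[0] * t ** params[1],
# as the plot label '(%0.2fx^{%0.2f})' suggests.
a, b = 0.0097055, 0.81888514           # Experts stream params from the docstring
for t in (107, 223):                   # lags quoted for the 75% and 90% cases
    print t, 1 - a * t ** b            # ~0.554 and ~0.187, matching the docstring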
 def plotICDFClustersLagDistribution(self, returnAxisValuesOnly=True):
     '''
     Experts stream 0.25 199
     Houston stream 0.25 152
     '''
     self.stream_settings['%s_file' % ClusteringParametersEstimation.clusterLagDistributionId] = self.stream_settings['parameter_estimation_folder'] + ClusteringParametersEstimation.clusterLagDistributionId
     dataX, dataY, total = set(), defaultdict(list), []
     for line in list(FileIO.iterateJsonFromFile(self.stream_settings['%s_file' % ClusteringParametersEstimation.clusterLagDistributionId])):
         print line.keys()
         data = dict((int(k), v) for k,v in line[ClusteringParametersEstimation.clusterLagDistributionId].iteritems())
         total.append(sum(data.values()))
         for i in data: dataY[i].append(data[i]); dataX.add(i)
     totalInstancesObserved=float(sum(total))
     x = sorted(dataX)
     y = getInverseCumulativeDistribution([sum(dataY[k])/totalInstancesObserved for k in x])
     plt.plot(x, y, label=getLatexForString(self.stream_settings['plot_label']), color=self.stream_settings['plot_color'], lw=2)
     if self.stream_settings['plot_label']=='Houston stream': plt.plot([0,x[-1]], [1, 0], '--', color='#5AF522', lw=2)
     plt.ylabel(r'$P\ (\ inactivity\ duration\ \geq\ \ inactivity\ duration\ threshold )$'), plt.xlabel(getLatexForString('Inactivity duration threshold')), plt.title(getLatexForString('Inactivity analysis for crowds.'))
     plt.legend()
     if returnAxisValuesOnly: plt.show()
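The curve above is a survival-style distribution over cluster lags: counts are pooled across all records, normalized by the total number of observations, and then flipped into P(inactivity duration >= threshold). A minimal sketch of that construction with a made-up lag histogram, assuming getInverseCumulativeDistribution amounts to one minus the running cumulative sum:
import numpy as np

# Made-up pooled lag histogram: lag value -> number of observations.
lag_counts = {1: 50, 2: 30, 3: 15, 4: 5}
total = float(sum(lag_counts.values()))
x = sorted(lag_counts)
y = 1 - np.cumsum([lag_counts[k] / total for k in x])   # complementary cumulative fraction
print zip(x, list(y))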
 def plotJustifyDimensionsEstimation(self):
     runningTimeData, purityData = defaultdict(list), defaultdict(list)
     for data in FileIO.iterateJsonFromFile(JustifyDimensionsEstimation.stats_file):
         if data['iteration_parameters']['dimensions']<data['no_of_observed_dimensions']:
             no_of_dimensions = data['iteration_parameters']['dimensions']
             runningTimeData[no_of_dimensions].append(data['iteration_time']), purityData[no_of_dimensions].append(data['purity'])
     plt.subplot(111)
     dataX, dataY = [], []
     del purityData[169991]; del purityData[39989]
     plt.title('Impact of dimension estimation')
     for k in sorted(purityData): dataX.append(k), dataY.append(np.mean(purityData[k])) 
     plt.semilogx(dataX, [0.96]*len(dataX), '--', label='Top n dimensions', color='#7109AA', lw=2)
     plt.semilogx(dataX, [np.mean(dataY)]*len(dataX), '--', color='#5AF522', lw=2)
     plt.semilogx(dataX, dataY, '-x', label='Fixed dimensions', color='#5AF522', lw=2)
     plt.ylim(0.8, 1.0)
     plt.xlim(7000, 203000)
     plt.xlabel('# of dimensions')
     plt.ylabel('Purity')
     plt.legend(loc=3)
     plt.savefig('justifyDimensionsEstimation.pdf')
 def plotThresholdForDocumentToBeInCluster(self, statsFile):
     dataToPlot = dict(('%0.2f' % (t * 0.05), {'iteration_time':[], 'purity': [], 'nmi': []}) for t in range(1, 21))
     for data in FileIO.iterateJsonFromFile(statsFile):
         threshold = '%0.2f' % data['settings']['threshold_for_document_to_be_in_cluster']
         for k in dataToPlot[threshold]: dataToPlot[threshold][k] += [data['streaming_lsh'][k]]
     for t in dataToPlot:
         for k in dataToPlot[t]: dataToPlot[t][k] = np.mean(dataToPlot[t][k]) 
     dataX = sorted([float(i) for i in dataToPlot])[:-1]
     print dataX
     # Plot iteration time.
     plt.subplot(211)
     plt.plot(dataX, [dataToPlot['%0.2f' % x]['iteration_time'] for x in dataX], lw=2, color='k')
     plt.ylabel(getLatexForString('Time (s)'))
     plt.title(getLatexForString('Estimation of \epsilon^\prime for Stream SSA'))
     plt.subplot(212)
     for metric, label, color in [('nmi', 'NMI', '#F60018'), ('purity', 'Purity', '#25D500')]: plt.plot(dataX, [dataToPlot['%0.2f' % x][metric] for x in dataX], label=label, color=color, lw=2)
     plt.ylabel(getLatexForString('Score'))
     plt.xlabel(getLatexForString('Similarity threshold (\epsilon^\prime)'))
     plt.legend(loc=4)
     plt.show()
 def plotCDFDimensionsLagDistribution(self, returnAxisValuesOnly=True):
     '''
     Inactivity time is the time after which there is a high probability that a
     dimension will not appear. Find time_unit that gives this probability. 
     
     Cumulative distribution function (http://en.wikipedia.org/wiki/Cumulative_distribution_function)
     lag = time between two consecutive occurrences of a dimension (similar to inactivity_time)

     F(time_unit) = P(lag <= time_unit)
     time_unit = F_inv(P(lag <= time_unit))

     Given P(inactivity_time > time_unit), determine time_unit as follows:
     P(inactivity_time <= time_unit) = 1 - P(inactivity_time > time_unit)
     time_unit = F_inv(P(inactivity_time <= time_unit))
     
     numberOfTimeUnits=10*24*12
     
     Experts stream [ 0.23250341  0.250209  ] 0.25 107
     Houston stream [ 0.16948096  0.30751358] 0.25 126
     
     Experts stream [ 0.23250341  0.250209  ] 0.1, 223
     Houston stream [ 0.16948096  0.30751358] 0.1, 228
     
     Compared to the other values, these are pretty close to each
     other. This is expected: irrespective of the size of the streams,
     the phrases have the same lifetime and hence decay at similar rates.
     '''
     def calculateInActivityTimeFor(params, probabilityOfInactivity): return int(CurveFit.inverseOfIncreasingExponentialFunction(params, 1 - probabilityOfInactivity))
     data = list(FileIO.iterateJsonFromFile(self.dimensionInActivityTimeFile))[numberOfTimeUnits]
     total = float(sum(data[ParameterEstimation.dimensionInActivityTimeId].values()))
     x = sorted(map(int, data[ParameterEstimation.dimensionInActivityTimeId].keys()))
     y = getCumulativeDistribution([data[ParameterEstimation.dimensionInActivityTimeId][str(i)] / total for i in x])
     print len(x)
     exponentialCurveParams = CurveFit.getParamsAfterFittingData(x, y, CurveFit.increasingExponentialFunction, [1., 1.])
     print self.stream_settings['plot_label'], exponentialCurveParams, calculateInActivityTimeFor(exponentialCurveParams, 0.1) 
     plt.plot(x, y, 'o', label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{%0.2f})') % (exponentialCurveParams[0], exponentialCurveParams[1]), color=self.stream_settings['plot_color'])
     plt.plot(x, CurveFit.getYValues(CurveFit.increasingExponentialFunction, exponentialCurveParams, x), color=self.stream_settings['plot_color'], lw=2)
     plt.ylabel(r'$P\ (\ lag\ \leq\ TU\ )$'), plt.xlabel(getLatexForString(xlabelTimeUnits)), plt.title(getLatexForString('CDF for dimension lag distribution.'))
     plt.ylim((0, 1.2))
     plt.legend(loc=4)
     if returnAxisValuesOnly: plt.show()
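Given the fitted CDF a·t^b suggested by the plot label, the inactivity threshold for a target probability p = P(inactivity_time > time_unit) comes from solving a·t^b = 1 - p, i.e. t = ((1 - p)/a)^{1/b}. A standalone check of the Experts-stream thresholds quoted above, assuming that closed form for the inverse:
# Assumes CurveFit.increasingExponentialFunction(params, t) == params[0] * t ** params[1]
# and that inverseOfIncreasingExponentialFunction is its algebraic inverse.
a, b = 0.23250341, 0.250209            # Experts stream params from the docstring
for p in (0.25, 0.1):                  # P(inactivity_time > time_unit)
    print p, int(((1 - p) / a) ** (1.0 / b))   # 107 and 223, matching the docstring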
    def plotJustifyDimensionsEstimation2(self):
        pltInfo =  {JustifyDimensionsEstimation.top_n_dimension: {'label': 'Temporally significant', 'color': '#7109AA', 'type': '-', 'marker': 'x'}, JustifyDimensionsEstimation.first_n_dimension: {'label': 'By occurrence', 'color': '#5AF522', 'type': '-', 'marker': 'o'}}
#        experimentsData = {JustifyMemoryPruning.with_memory_pruning: {'iteration_time': [], 'quality': [], 'total_clusters': []}, JustifyMemoryPruning.without_memory_pruning: {'iteration_time': [], 'quality': [], 'total_clusters': []}}
        experimentsData = {JustifyDimensionsEstimation.top_n_dimension: defaultdict(dict), JustifyDimensionsEstimation.first_n_dimension: defaultdict(dict)}
        for data in FileIO.iterateJsonFromFile(JustifyDimensionsEstimation.stats_file_2):
#        for data in FileIO.iterateJsonFromFile('temp/dimensions_need_analysis_2'):
#            if 'dimensions' in data['iteration_parameters']: 
            dimension = data['iteration_parameters']['dimensions']
            type = data['iteration_parameters']['type']
            if dimension not in experimentsData[type]: experimentsData[type][dimension] = {'iteration_time': [], 'quality': [], 'total_clusters': []}
            experimentsData[type][dimension]['iteration_time'].append(data['iteration_time']), experimentsData[type][dimension]['quality'].append(data['purity']), experimentsData[type][dimension]['total_clusters'].append(data['no_of_clusters'])
        lshData = dict([(k, np.mean(experimentsData[JustifyDimensionsEstimation.top_n_dimension][76819][k])) for k in experimentsData[JustifyDimensionsEstimation.top_n_dimension][76819]])
        del experimentsData[JustifyDimensionsEstimation.top_n_dimension][76819]
        print lshData
        plotData = {JustifyDimensionsEstimation.top_n_dimension: defaultdict(list), JustifyDimensionsEstimation.first_n_dimension: defaultdict(list)}
        for type in experimentsData:
            for dimension in sorted(experimentsData[type]): plotData[type]['dataX'].append(dimension); [plotData[type][k].append(np.mean(experimentsData[type][dimension][k])) for k in experimentsData[type][dimension]]
        plt.subplot(311); 
        for type in experimentsData:
            plt.semilogy([x/10**3 for x in plotData[type]['dataX']], movingAverage(plotData[type]['total_clusters'], 4), color=pltInfo[type]['color'], label=pltInfo[type]['label'], lw=2);
        plt.semilogy([x/10**3 for x in plotData[JustifyDimensionsEstimation.top_n_dimension]['dataX']], [lshData['total_clusters']]*len(plotData[JustifyDimensionsEstimation.top_n_dimension]['dataX']), '--', color='#FF1300', label=getLatexForString('Top-76819 dimensions'), lw=2);
        plt.ylim(ymin=1)
        
        plt.subplot(312); 
        for type in experimentsData:
            plt.semilogy([x/10**3 for x in plotData[type]['dataX']], movingAverage(plotData[type]['iteration_time'], 4), color=pltInfo[type]['color'], label=pltInfo[type]['label'], lw=2);
        plt.semilogy([x/10**3 for x in plotData[JustifyDimensionsEstimation.top_n_dimension]['dataX']], [lshData['iteration_time']]*len(plotData[JustifyDimensionsEstimation.top_n_dimension]['dataX']), '--', color='#FF1300', label=getLatexForString('Top-76819'), lw=2);
        plt.ylim(ymin=1, ymax=1500)
        plt.legend(loc=2, ncol=2)
        plt.subplot(313); 
        for type in experimentsData:
            plt.plot([x/10**3 for x in plotData[type]['dataX']], movingAverage(plotData[type]['quality'], 4), color=pltInfo[type]['color'], label=pltInfo[type]['label'], lw=2, marker=pltInfo[type]['marker']);
        plt.ylabel('Mean purity per iteration', fontsize=20); 
#        plt.title(getLatexForString('Impact of dimension ranking'))
        plt.xlabel('# of dimensions $(10^3)$', fontsize=20)
#        plt.plot([x/10**3 for x in plotData[JustifyDimensionsEstimation.top_n_dimension]['dataX']], [lshData['quality']]*len(plotData[JustifyDimensionsEstimation.top_n_dimension]['dataX']), '--', color='#FF1300', label=getLatexForString('Top-76819 dimensions'), lw=2);
        plt.ylim(ymin=0.80,ymax=1.0)
        plt.legend()
        plt.savefig('justifyDimensionsEstimation2.png')
    def plotPercentageOfClustersWithinALag(self, returnAxisValuesOnly=True):
        '''
        458 Experts stream [ 0.01860266  0.70639136] 15 0.874004297177
        80 Houston stream [ 0.0793181   0.47644004] 3 0.866127308876
        '''
        def calculatePercentageOfDecayedPhrasesFor(params, timeUnit): return 1 - CurveFit.increasingExponentialFunction(params, timeUnit)
        dataDistribution = {}
        currentTimeUnit = 0
#        file='/mnt/chevron/kykamath/data/twitter/lsh_crowds/houston_stream/parameter_estimation/cluster_lag_distribution'
        file = self.hdsClustering.stream_settings['%s_file' % ClusteringParametersEstimation.clusterLagDistributionId]
        lines = list(FileIO.iterateJsonFromFile(file))
        numberOfTimeUnits = len(lines)
        for data in lines:
            totalClusters = float(sum(data[ClusteringParametersEstimation.clusterLagDistributionId].values()))
            tempArray = []
            for k, v in data[ClusteringParametersEstimation.clusterLagDistributionId].iteritems():
                k = int(k)
                if k not in dataDistribution: dataDistribution[k] = [0] * numberOfTimeUnits
                dataDistribution[k][currentTimeUnit] = v / totalClusters
                tempArray.append(v / totalClusters)
            currentTimeUnit += 1
        x = sorted(dataDistribution)
        print numberOfTimeUnits,
        y = getCumulativeDistribution([np.mean(dataDistribution[k]) for k in x])
        params = CurveFit.getParamsAfterFittingData(x, y, CurveFit.increasingExponentialFunction, [1., 1.])
        print self.stream_settings['plot_label'], params,
        def subPlot(id, timeUnit):
            plt.subplot(id)
            print timeUnit, calculatePercentageOfDecayedPhrasesFor(params, timeUnit)
            plt.plot(x, y, 'o', label=getLatexForString(self.stream_settings['plot_label']) + getLatexForString(' (%0.2fx^{%0.2f})') % (params[0], params[1]), color=self.stream_settings['plot_color'])
            plt.plot(x, CurveFit.getYValues(CurveFit.increasingExponentialFunction, params, x), color=self.stream_settings['plot_color'], lw=2)
        if self.stream_settings['stream_id'] == 'experts_twitter_stream': subPlot(111, 15); plt.title(getLatexForString('Percentage of clusters within a lag'))
        else: subPlot(111, 3); plt.xlabel(getLatexForString(xlabelTimeUnits))
        plt.ylabel(r'$\%\ of\ clusters\ with\ lag\ \leq\ TU$')
        plt.legend(loc=4)
        if returnAxisValuesOnly: plt.show()
 def plotClusteringSpeed(saveFig=True):
     dataToPlot = dict([(k, {'x': [], 'y': []}) for k in plotSettings])
     for data in FileIO.iterateJsonFromFile(TweetsFile.stats_file):
         for k in plotSettings:
             dataToPlot[k]['x'].append(data[k]['no_of_documents'])
             dataToPlot[k]['y'].append(data[k]['iteration_time'])
     for k in plotSettings:
         plt.loglog(dataToPlot[k]['x'],
                    movingAverage(dataToPlot[k]['y'], 1),
                    label=plotSettings[k]['label'],
                    color=plotSettings[k]['color'],
                    lw=2)
     print dataToPlot['streaming_lsh']['x'][10]
     print dataToPlot['streaming_lsh']['y'][10]
     plt.legend(loc=4)
     if saveFig:
         plt.xlabel(getLatexForString('\# of documents'))
         plt.ylabel(getLatexForString('Running time (s)'))
         plt.title(
             getLatexForString(
                 'Running time comparison for Streaming LSH with SSA'))
     plt.xlim(xmin=500, xmax=600000)
     #        plt.show()
     if saveFig: plt.savefig('speedComparisonWithSSA.pdf')
Example #52
def iteratePerformanceFrom(id):
    for data in FileIO.iterateJsonFromFile(getPerformanceFile(id)):
        del data[id]['clusters']
        yield data[id]
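A minimal consumer for the generator above; the id passed in is hypothetical, and it assumes each yielded record still carries 'purity' and 'iteration_time' once the per-cluster details are removed:
import numpy as np

# 'streaming_lsh' is a hypothetical id; use whatever ids getPerformanceFile accepts.
records = list(iteratePerformanceFrom('streaming_lsh'))
print 'mean purity: %0.2f, mean iteration time: %0.2f' % (
    np.mean([r['purity'] for r in records]),
    np.mean([r['iteration_time'] for r in records]))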
Example #53
def fileIterator():
    for id in xrange(20):
        yield FileIO.iterateJsonFromFile(time_to_process_points + '%s' % id)
Example #54
def getIterator(id):
    for line in FileIO.iterateJsonFromFile(time_to_process_points +
                                           'stats/%s' % id):
        yield line
Example #55
 def kmeans():
     for data in FileIO.iterateJsonFromFile(
             clustering_quality_experts_folder + 'combined_stats_file'):
         yield data['k_means']
Example #56
 def kmeansmr():
     for data in FileIO.iterateJsonFromFile(
             clustering_quality_experts_folder + 'mr_quality_stats'):
         yield data['mr_k_means']
Example #57
 def unoptimized():
     for data in FileIO.iterateJsonFromFile(
             hd_clustering_performance_folder + 'cda_unopt'):
         yield data['streaming_lsh']
Example #58
 def cdamr():
     for data in FileIO.iterateJsonFromFile(
             clustering_quality_experts_ssa_folder + 'quality_stats'):
         yield data['ssa_mr']
Example #59
 def cdait():
     for data in FileIO.iterateJsonFromFile(
             clustering_quality_experts_ssa_folder + 'quality_stats'):
         if 'ssa' in data: yield data['ssa']
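The small generators above (kmeans, kmeansmr, unoptimized, cdamr, cdait) each yield per-iteration stat dicts for a single method, which makes side-by-side comparisons easy. A sketch of such a comparison, assuming every yielded record carries a 'purity' field:
import numpy as np

# The label/generator pairing is illustrative; add or drop methods as needed.
for label, iterator in [('k-means', kmeans), ('k-means (MR)', kmeansmr),
                        ('SSA (MR)', cdamr), ('SSA', cdait)]:
    purities = [record['purity'] for record in iterator()]
    print label, '%0.2f' % np.mean(purities)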