Beispiel #1
0
def _build_user_graph(name):
    if name.endswith("lang"):
        name = name[: -len("lang")]
        dist = stats.get_language_data(name)
        return charts.PieChart(dist.items(), max_options=8)

    elif name == "dropout":
        data = stats.get_dropout_figures()
        approx_data = stats.approximate(data)
        chart = charts.MultiLineChart(approx_data, data_name="histogram", x_axis=(0.4, 1.0, 0.1), y_axis=(0, 1.0, 0.1))
        chart.add_data("raw", data)
        return chart

    elif name == "prepostdiff":
        data = [r["pre_post_diff"] for r in stats.get_global_rater_stats() if r["n_tests"] > 2 and r["pre_post_diff"]]
        hist_data = stats.histogram(data, n_bins=11, normalize=False, x_min=-0.7, x_max=0.7)
        chart = charts.LineChart(hist_data, data_name="histogram", x_axis=(-0.8, 0.8, 0.2))
        chart.add_data("raw", data)
        return chart

    elif name == "abilityjlpt3":
        data = stats.get_user_scores("jlpt 3")
        hist_data = stats.histogram(data, x_min=0.0, x_max=1.0, normalize=False)
        chart = charts.LineChart(hist_data, data_name="histogram", x_axis=(0.0, 1.0, 0.1))
        chart.add_data("raw", data)
        return chart

    elif name == "abilityjlpt4":
        data = stats.get_user_scores("jlpt 4")
        hist_data = stats.histogram(data, x_min=0.0, x_max=1.0, normalize=False)
        chart = charts.LineChart(hist_data, data_name="histogram", x_axis=(0.0, 1.0, 0.1))
        chart.add_data("raw", data)
        return chart

    raise KeyError(name)
Beispiel #2
0
def _build_user_graph(name):
    if name.endswith('lang'):
        name = name[:-len('lang')]
        dist = stats.get_language_data(name)
        return charts.PieChart(dist.items(), max_options=8)

    elif name == 'dropout':
        data = stats.get_dropout_figures()
        approx_data = stats.approximate(data)
        chart = charts.MultiLineChart(approx_data,
                                      data_name='histogram',
                                      x_axis=(0.4, 1.0, 0.1),
                                      y_axis=(0, 1.0, 0.1))
        chart.add_data('raw', data)
        return chart

    elif name == 'prepostdiff':
        data = [
            r['pre_post_diff'] for r in stats.get_global_rater_stats()
            if r['n_tests'] > 2 and r['pre_post_diff']
        ]
        hist_data = stats.histogram(data,
                                    n_bins=11,
                                    normalize=False,
                                    x_min=-0.7,
                                    x_max=0.7)
        chart = charts.LineChart(hist_data,
                                 data_name='histogram',
                                 x_axis=(-0.8, 0.8, 0.2))
        chart.add_data('raw', data)
        return chart

    elif name == 'abilityjlpt3':
        data = stats.get_user_scores('jlpt 3')
        hist_data = stats.histogram(data,
                                    x_min=0.0,
                                    x_max=1.0,
                                    normalize=False)
        chart = charts.LineChart(hist_data,
                                 data_name='histogram',
                                 x_axis=(0.0, 1.0, 0.1))
        chart.add_data('raw', data)
        return chart

    elif name == 'abilityjlpt4':
        data = stats.get_user_scores('jlpt 4')
        hist_data = stats.histogram(data,
                                    x_min=0.0,
                                    x_max=1.0,
                                    normalize=False)
        chart = charts.LineChart(hist_data,
                                 data_name='histogram',
                                 x_axis=(0.0, 1.0, 0.1))
        chart.add_data('raw', data)
        return chart

    raise KeyError(name)
Beispiel #3
0
  def statsex(self, objects):
    """
    Compute summary statistics for a source list.

    Every entry of ``objects`` is read by position and converted with
    ``float``: index 5 (THETA_IMAGE), 6 (ELLIPTICITY), 7 (FWHM) and
    9 ('Stella-like' value).

    Returns 0 when ``objects`` is empty.  Otherwise returns a dictionary of
    strings: the object count under 'N', plus the median, mean and standard
    deviation of FWHM and ELLIPTICITY and the FWHM mode, each formatted
    with two decimals.
    """

    import stats, pstat

    # Nothing to summarise
    if not len(objects):
      return 0

    def column(index):
      # Float values of one positional field across all objects
      return [float(source[index]) for source in objects]

    def two_decimals(value):
      return "%.2f" % value

    def stdev_or_zero(values):
      # A ZeroDivisionError from the stdev computation is reported as '0.00'
      try:
        return two_decimals(stats.stdev(values))
      except ZeroDivisionError:
        return '0.00'

    # Result dictionary, starting with the number of objects
    summary = {'N': str(len(objects))}

    fwhm_values = column(7)
    ellipticities = column(6)
    # THETA_IMAGE and 'Stella-like' values are extracted, but only the
    # disabled histograms at the end would use them
    position_angles = column(5)
    stellarities = column(9)

    # Histogram counts of FWHM: 40 bins over [0, 40], i.e. 1 pixel per bin
    fwhm_counts = stats.histogram(fwhm_values, 40, [0, 40])[0]

    summary['medianFWHM'] = two_decimals(stats.median(fwhm_values))
    summary['meanFWHM'] = two_decimals(stats.mean(fwhm_values))
    # Mode = centre of the fullest 1-pixel bin
    fullest_bin = fwhm_counts.index(max(fwhm_counts))
    summary['modeFWHM'] = two_decimals(float(fullest_bin + 0.5))
    summary['stdevFWHM'] = stdev_or_zero(fwhm_values)

    summary['medianEL'] = two_decimals(stats.median(ellipticities))
    summary['meanEL'] = two_decimals(stats.mean(ellipticities))
    summary['stdevEL'] = stdev_or_zero(ellipticities)

    # Histogram of Ellipticity PA (-180 to 180, bins of 45 deg)
    #summary['histoTHETA'] = stats.histogram(position_angles,8,[-180,180])[0]

    # Histogram of Stellarity (0 to 1, bins of 0.05)
    #summary['histoStella']  = stats.histogram(stellarities,20,[0,1.01])[0]

    return summary
Beispiel #4
0
  def statsex(self, objects):
    """
    Do some statistics on a source list.

    Reads fields 5, 6, 7 and 9 of every object as floats (THETA_IMAGE,
    ELLIPTICITY, FWHM and the 'Stella-like' value).

    Returns 0 for an empty list, otherwise a dictionary whose values are
    strings: 'N' (object count) and two-decimal median/mean/mode/stdev
    figures for FWHM and median/mean/stdev figures for ELLIPTICITY.
    """

    import stats, pstat

    # Bail out early if there are no objects
    n_objects = len(objects)
    if n_objects == 0:
      return 0

    fmt = "%.2f"

    # Per-field value lists (float), extracted one field at a time
    fwhm = list(map(lambda obj: float(obj[7]), objects))
    el = list(map(lambda obj: float(obj[6]), objects))
    # The next two lists are built but not used by any active statistic
    pa = list(map(lambda obj: float(obj[5]), objects))
    stella = list(map(lambda obj: float(obj[9]), objects))

    # FWHM histogram counts with a bin size of 1 pixel
    fwhm_hist = stats.histogram(fwhm, 40, [0, 40])[0]

    median_fwhm = fmt % stats.median(fwhm)
    mean_fwhm = fmt % stats.mean(fwhm)
    # Centre of the most populated bin
    mode_fwhm = fmt % float(fwhm_hist.index(max(fwhm_hist)) + 0.5)

    try:
      stdev_fwhm = fmt % stats.stdev(fwhm)
    except ZeroDivisionError:
      stdev_fwhm = '0.00'

    median_el = fmt % stats.median(el)
    mean_el = fmt % stats.mean(el)

    try:
      stdev_el = fmt % stats.stdev(el)
    except ZeroDivisionError:
      stdev_el = '0.00'

    # Histogram of Ellipticity PA (-180 to 180, bins of 45 deg)
    #'histoTHETA': stats.histogram(pa,8,[-180,180])[0]

    # Histogram of Stellarity (0 to 1, bins of 0.05)
    #'histoStella': stats.histogram(stella,20,[0,1.01])[0]

    return {
      'N': str(n_objects),
      'medianFWHM': median_fwhm,
      'meanFWHM': mean_fwhm,
      'modeFWHM': mode_fwhm,
      'stdevFWHM': stdev_fwhm,
      'medianEL': median_el,
      'meanEL': mean_el,
      'stdevEL': stdev_el,
    }
Beispiel #5
0
def _build_test_graph(name):
    if name == 'mean':
        score_data = stats.get_mean_score_nth_test()
        data = stats.group_by_points(score_data, y_max=1.0, y_min=0.0)
        chart = charts.MultiLineChart(data,
                                      y_axis=(0, 1, 0.1),
                                      data_name='grouped')
        chart.add_data('raw', score_data)
        return chart

    elif name == 'volume':
        user_data = stats.get_users_by_n_tests()
        return charts.LineChart(user_data)

    elif name == 'length':
        return charts.PieChart(stats.get_test_length_volume())

    elif name == 'normtime':
        user_data = stats.get_score_over_norm_time()
        return charts.LineChart(user_data)

    elif name == 'time':
        base_data = stats.get_score_over_time()
        data = stats.approximate(base_data)
        chart = charts.MultiLineChart(data,
                                      y_axis=(0, 1.05, 0.1),
                                      data_name='approximate')
        chart.add_data('raw', base_data)
        two_colours = charts.color_desc(2).split(',')
        three_colours = ','.join(
            (two_colours[0], two_colours[1], two_colours[1]))
        chart['chco'] = three_colours
        return chart

    elif name == 'dropout':
        return charts.LineChart(stats.get_mean_score_by_n_tests())

    elif name == 'firstlast':
        data = stats.get_first_last_test()
        hist_data = stats.histogram(data,
                                    n_bins=11,
                                    normalize=False,
                                    x_min=-0.5,
                                    x_max=0.5)
        chart = charts.LineChart(hist_data,
                                 data_name='histogram',
                                 x_axis=(-0.5, 0.5, 0.1))
        chart.add_data('raw', data)
        return chart

    raise KeyError(name)
Beispiel #6
0
def hist(x, nbins=15):
    """Plot a simple histogram of a data set.

    **x** -- data set as numeric vector

    **nbins** -- number of histogram bins

    The bins come from ``stats.histogram`` and are drawn with the
    ``histoPlot`` method of the object returned by ``graceplot()``.

    matlab equiv: HIST
    """
    import stats
    counts, lower_edge, bin_width, _ = stats.histogram(x, nbins)
    draw_histogram = graceplot().histoPlot
    # Right edge of the last bin
    upper_edge = lower_edge + len(counts) * bin_width
    draw_histogram(counts,
                   x_min=lower_edge,
                   x_max=upper_edge,
                   fillcolor=2, edgecolor=1, labeled=0)
Beispiel #7
0
    def get_sentiword_score(self):
        """Score every tweet in ``self.user_vector`` and write the result.

        Loads the SentiWord scores from ``SentiWordNet/data/SentiWord.json``,
        calls ``read_user_vector`` and loads the negative and positive word
        lists, then stores two values on each tweet:

        * ``sentiword_score`` -- mean of the SentiWord scores of the tweet's
          words that have one (0.0 when none do);
        * ``pnword_score`` -- +1.0 for every word in the positive list and
          -1.0 for every other word in the negative list, afterwards shifted
          by ``avg()`` and divided by ``std()`` of the histogram holding the
          scores of all tweets.

        Finally each user's entry is dumped as one JSON document per line to
        ``user_vector/user_vector_new_2.json``.
        """
        # Context managers make sure the files are closed even on errors.
        with open("SentiWordNet/data/SentiWord.json", "r") as f:
            self.sentiwords = cjson.decode(f.readline())

        # presumably populates self.user_vector -- verify against the class
        self.read_user_vector("user_vector/user_vector_new.json")
        self.neg_words = self.get_word_list(
            "SentiWordNet/data/negative_words_list.txt")
        self.pos_words = self.get_word_list(
            "SentiWordNet/data/positive_words_list.txt")

        myht = histogram()

        for user in self.user_vector:
            for tweet in self.user_vector[user]["tweets"]:
                words = self.process_sentence(tweet["text"])
                pnword_score = 0.0
                sentiword_score = 0.0
                sentiword_count = 0
                for word in words:
                    # ``in`` replaces the Python-2-only ``dict.has_key``
                    if word in self.sentiwords:
                        sentiword_score += self.sentiwords[word]
                        sentiword_count += 1
                    if word in self.pos_words:
                        pnword_score += 1.0
                    elif word in self.neg_words:
                        pnword_score -= 1.0
                myht.add(pnword_score)
                tweet["pnword_score"] = pnword_score
                if sentiword_count != 0:
                    sentiword_score = sentiword_score / float(sentiword_count)
                tweet["sentiword_score"] = sentiword_score

        mean = myht.avg()
        std = myht.std()
        # normalize the positive_negative_word score
        # NOTE(review): if std() returns 0 this division raises
        # ZeroDivisionError -- confirm how that case should be handled.
        for user in self.user_vector:
            for tweet in self.user_vector[user]["tweets"]:
                tweet["pnword_score"] = (tweet["pnword_score"] - mean) / std

        with open("user_vector/user_vector_new_2.json", "w") as f:
            for user in self.user_vector:
                json.dump(self.user_vector[user], f)
                f.write("\n")
Beispiel #8
0
def hist(x, nbins=15):
    """Simple histogram plot.

    **x** -- data set as numeric vector

    **nbins** -- number of histogram bins

    Bins are computed by ``stats.histogram`` and handed to
    ``graceplot().histoPlot``.

    matlab equiv: HIST
    """
    import stats
    binned = stats.histogram(x, nbins)
    counts, smallest, binsize, _extras = binned
    appearance = dict(fillcolor=2, edgecolor=1, labeled=0)
    graceplot().histoPlot(
        counts,
        x_min=smallest,
        # the bins span len(counts) * binsize starting at ``smallest``
        x_max=smallest + len(counts) * binsize,
        **appearance
    )
 def get_sentiword_score(self):
     """Attach sentiment scores to every tweet and save the user vectors.

     Reads the SentiWord scores, the user vector and the negative/positive
     word lists, then sets on each tweet:

     * ``pnword_score`` -- +1.0 per word found in the positive list, -1.0
       per other word found in the negative list; after all tweets are
       scored it is shifted by ``avg()`` and divided by ``std()`` of the
       histogram of all tweet scores;
     * ``sentiword_score`` -- average SentiWord score over the words that
       have one, or 0.0 if no word has one.

     The result is written to ``user_vector/user_vector_new_2.json``, one
     JSON document per user and line.
     """
     def score_words(words):
         # Return (pnword_score, sentiword_score) for one tweet's words.
         pn_total = 0.0
         senti_total = 0.0
         senti_hits = 0
         for word in words:
             # ``in`` replaces the Python-2-only ``dict.has_key``
             if word in self.sentiwords:
                 senti_total += self.sentiwords[word]
                 senti_hits += 1
             if word in self.pos_words:
                 pn_total += 1.0
             elif word in self.neg_words:
                 pn_total -= 1.0
         if senti_hits != 0:
             senti_total = senti_total / float(senti_hits)
         return pn_total, senti_total

     # ``with`` closes the file even if decoding fails
     with open("SentiWordNet/data/SentiWord.json", "r") as f:
         self.sentiwords = cjson.decode(f.readline())

     # presumably fills self.user_vector -- verify against the class
     self.read_user_vector("user_vector/user_vector_new.json")
     self.neg_words = self.get_word_list("SentiWordNet/data/negative_words_list.txt")
     self.pos_words = self.get_word_list("SentiWordNet/data/positive_words_list.txt")

     myht = histogram()

     for user in self.user_vector:
         for tweet in self.user_vector[user]["tweets"]:
             words = self.process_sentence(tweet["text"])
             pnword_score, sentiword_score = score_words(words)
             myht.add(pnword_score)
             tweet["pnword_score"] = pnword_score
             tweet["sentiword_score"] = sentiword_score

     mean = myht.avg()
     std = myht.std()
     # normalize the positive_negative_word score
     # NOTE(review): a std() of 0 makes this division raise
     # ZeroDivisionError -- confirm the intended handling.
     for user in self.user_vector:
         for tweet in self.user_vector[user]["tweets"]:
             tweet["pnword_score"] = (tweet["pnword_score"] - mean) / std

     with open("user_vector/user_vector_new_2.json", "w") as f:
         for user in self.user_vector:
             json.dump(self.user_vector[user], f)
             f.write("\n")
Beispiel #10
0
def _build_test_graph(name):
    if name == "mean":
        score_data = stats.get_mean_score_nth_test()
        data = stats.group_by_points(score_data, y_max=1.0, y_min=0.0)
        chart = charts.MultiLineChart(data, y_axis=(0, 1, 0.1), data_name="grouped")
        chart.add_data("raw", score_data)
        return chart

    elif name == "volume":
        user_data = stats.get_users_by_n_tests()
        return charts.LineChart(user_data)

    elif name == "length":
        return charts.PieChart(stats.get_test_length_volume())

    elif name == "normtime":
        user_data = stats.get_score_over_norm_time()
        return charts.LineChart(user_data)

    elif name == "time":
        base_data = stats.get_score_over_time()
        data = stats.approximate(base_data)
        chart = charts.MultiLineChart(data, y_axis=(0, 1.05, 0.1), data_name="approximate")
        chart.add_data("raw", base_data)
        two_colours = charts.color_desc(2).split(",")
        three_colours = ",".join((two_colours[0], two_colours[1], two_colours[1]))
        chart["chco"] = three_colours
        return chart

    elif name == "dropout":
        return charts.LineChart(stats.get_mean_score_by_n_tests())

    elif name == "firstlast":
        data = stats.get_first_last_test()
        hist_data = stats.histogram(data, n_bins=11, normalize=False, x_min=-0.5, x_max=0.5)
        chart = charts.LineChart(hist_data, data_name="histogram", x_axis=(-0.5, 0.5, 0.1))
        chart.add_data("raw", data)
        return chart

    raise KeyError(name)
Beispiel #11
0
# Spread measures on the array inputs
for _label, _measure in (('var:', stats.var),
                         ('stdev:', stats.stdev),
                         ('sem:', stats.sem)):
    print(_label, _measure(a), _measure(af))

print('describe:')
for _data in (l, lf, a, af):
    print(stats.describe(_data))

print('\nFREQUENCY')
print('freqtable:')
print('itemfreq:')
for _data in (l, a):
    print(stats.itemfreq(_data))
print('scoreatpercentile:',
      *[stats.scoreatpercentile(_data, 40) for _data in (l, lf, a, af)])
print('percentileofscore:',
      *[stats.percentileofscore(_data, 12) for _data in (l, lf, a, af)])
print('histogram:', *[stats.histogram(_data) for _data in (l, a)])

print('cumfreq:')
for _data in (l, lf, a, af):
    print(stats.cumfreq(_data))

print('relfreq:')
for _data in (l, lf, a, af):
    print(stats.relfreq(_data))

print('\nVARIATION')
print('obrientransform:')
# Fresh inputs for the variation section
l = range(1, 21)
a = N.array(l)
ll = [l] * 5
Beispiel #12
0
    def draw_distributions(self):
        """
        Draw score distributions for all terms and save them as PNG files.

        Loads ``self.terms`` from ``topics/term_senti_scores.json`` and, for
        every term, plots the distribution of its ``sentiword_score``,
        ``sentiment_score`` and ``avg_sentiscore`` values.  The figures are
        saved as ``<term>.png`` in ``sentiword_folder``, ``sentiment_folder``
        and ``sentiscore_folder`` respectively.  The positive/negative word
        score (``pnword_score``) plot is currently disabled.
        """
        # ``with`` closes the file even if decoding fails
        with open("topics/term_senti_scores.json", "r") as f:
            self.terms = cjson.decode(f.readline())
        # Single-argument print(...) behaves the same on Python 2 and 3
        print(len(self.terms))

        def fill_histogram(scores):
            # Histogram object holding every value of ``scores``.
            hist = stats.histogram()
            for s in scores:
                hist.add(s)
            return hist

        def save_bar_chart(distribution, xlabel, title, path):
            # Draw one distribution as a bar chart and save it to ``path``.
            x_axis = range(len(distribution))
            rcParams['figure.figsize'] = 24, 5
            x_ = [-1.025 + i * 0.05 for i in range(42)]
            plt.bar(x_axis,
                    distribution,
                    width=0.8,
                    facecolor='blue',
                    alpha=0.5)
            # Label only the odd-indexed positions; the rest stay blank
            X_ticks = [
                "%.1f" % x if (i - 1) % 2 == 0 else ""
                for i, x in enumerate(x_)
            ]
            plt.xticks(x_axis, X_ticks)
            plt.xlim(0, len(x_axis))
            plt.grid(True)
            plt.xlabel(xlabel)
            plt.ylabel("Percentage (%)")
            plt.title(title)
            plt.savefig(path, dpi=50)
            plt.clf()

        for count, term in enumerate(self.terms, 1):
            scores = self.terms[term]
            hsw = fill_histogram(scores["sentiword_score"])
            hst = fill_histogram(scores["sentiment_score"])
            has = fill_histogram(scores["avg_sentiscore"])

            distribution_sw = hsw.histogram_2()
            distribution_st = hst.histogram_2()
            distribution_as = has.histogram_2()

            # sentiword score
            save_bar_chart(distribution_sw,
                           "Sentiword Score",
                           "Sentiword Score Distribution - %s" % term,
                           sentiword_folder + "%s.png" % term)
            # sentiment score
            save_bar_chart(distribution_st,
                           "Sentiment Score",
                           "Sentiment Score Distribution - %s" % term,
                           sentiment_folder + "%s.png" % term)
            # sentiscore score
            save_bar_chart(distribution_as,
                           "Avg Senti Score",
                           "Senti Score Distribution - %s" % term,
                           sentiscore_folder + "%s.png" % term)

            print("done %s: neutral_st %d neutral_sw %d neutral_as %d count %d" % (
                term, hst.count_zero(), hsw.count_zero(), has.count_zero(),
                count))
Beispiel #13
0
print 'tstdev:',stats.tstdev(a,(5,17)),stats.tstdev(af,(5,17))
print 'tsem:',stats.tsem(a,(5,17)),stats.tsem(af,(5,17))
print 'describe:'
print stats.describe(l)
print stats.describe(lf)
print stats.describe(a)
print stats.describe(af)

print '\nFREQUENCY'
print 'freqtable:'
print 'itemfreq:'
print stats.itemfreq(l)
print stats.itemfreq(a)
print 'scoreatpercentile:',stats.scoreatpercentile(l,40),stats.scoreatpercentile(lf,40),stats.scoreatpercentile(a,40),stats.scoreatpercentile(af,40)
print 'percentileofscore:',stats.percentileofscore(l,12),stats.percentileofscore(lf,12),stats.percentileofscore(a,12),stats.percentileofscore(af,12)
print 'histogram:',stats.histogram(l),stats.histogram(a)
print 'cumfreq:'
print stats.cumfreq(l)
print stats.cumfreq(lf)
print stats.cumfreq(a)
print stats.cumfreq(af)
print 'relfreq:'
print stats.relfreq(l)
print stats.relfreq(lf)
print stats.relfreq(a)
print stats.relfreq(af)

print '\nVARIATION'
print 'obrientransform:'

l = range(1,21)
 def draw_distributions(self):
     """
     Draw distributions for all terms in self.terms and save the figures.

     ``self.terms`` is loaded from ``topics/term_senti_scores.json``.  For
     each term three bar charts are written as ``<term>.png``: the
     ``sentiword_score`` values to ``sentiword_folder``, the
     ``sentiment_score`` values to ``sentiment_folder`` and the
     ``avg_sentiscore`` values to ``sentiscore_folder``.  The chart for
     ``pnword_score`` is currently disabled.
     """
     # ``with`` closes the file even if decoding fails
     with open("topics/term_senti_scores.json", "r") as f:
         self.terms = cjson.decode(f.readline())
     # Single-argument print(...) behaves the same on Python 2 and 3
     print(len(self.terms))

     # Tick positions and labels are identical for every figure: only the
     # odd-indexed positions keep their label.
     x_ = [-1.025 + i * 0.05 for i in range(42)]
     X_ticks = ["%.1f" % x if i % 2 == 1 else "" for i, x in enumerate(x_)]

     count = 0
     for term in self.terms:
         term_scores = self.terms[term]
         hsw = stats.histogram()
         hst = stats.histogram()
         has = stats.histogram()
         for hist, field in ((hsw, "sentiword_score"),
                             (hst, "sentiment_score"),
                             (has, "avg_sentiscore")):
             for s in term_scores[field]:
                 hist.add(s)

         # (distribution, x label, title template, output folder)
         figures = (
             (hsw.histogram_2(), "Sentiword Score",
              "Sentiword Score Distribution - %s", sentiword_folder),
             (hst.histogram_2(), "Sentiment Score",
              "Sentiment Score Distribution - %s", sentiment_folder),
             (has.histogram_2(), "Avg Senti Score",
              "Senti Score Distribution - %s", sentiscore_folder),
         )
         for distribution, xlabel, title, folder in figures:
             x_axis = range(len(distribution))
             rcParams['figure.figsize'] = 24, 5
             plt.bar(x_axis, distribution, width=0.8, facecolor='blue', alpha=0.5)
             plt.xticks(x_axis, X_ticks)
             plt.xlim(0, len(x_axis))
             plt.grid(True)
             plt.xlabel(xlabel)
             plt.ylabel("Percentage (%)")
             plt.title(title % term)
             plt.savefig(folder + "%s.png" % term, dpi=50)
             plt.clf()

         count += 1
         print("done %s: neutral_st %d neutral_sw %d neutral_as %d count %d" % (
             term, hst.count_zero(), hsw.count_zero(), has.count_zero(), count))
Beispiel #15
0
print stats.describe(lf)
print stats.describe(a)
print stats.describe(af)

print '\nFREQUENCY'
print 'freqtable:'
print 'itemfreq:'
print stats.itemfreq(l)
print stats.itemfreq(a)
print 'scoreatpercentile:', stats.scoreatpercentile(
    l, 40), stats.scoreatpercentile(lf, 40), stats.scoreatpercentile(
        a, 40), stats.scoreatpercentile(af, 40)
print 'percentileofscore:', stats.percentileofscore(
    l, 12), stats.percentileofscore(lf, 12), stats.percentileofscore(
        a, 12), stats.percentileofscore(af, 12)
print 'histogram:', stats.histogram(l), stats.histogram(a)
print 'cumfreq:'
print stats.cumfreq(l)
print stats.cumfreq(lf)
print stats.cumfreq(a)
print stats.cumfreq(af)
print 'relfreq:'
print stats.relfreq(l)
print stats.relfreq(lf)
print stats.relfreq(a)
print stats.relfreq(af)

print '\nVARIATION'
print 'obrientransform:'

l = range(1, 21)
def read_data_in_range(filename = "./", topschoolfile = "./", start_year = 2000, end_year = 2014, self_edge = True):
    """
    @description: read the edges whose year lies in [start_year, end_year]
    and merge repeated edges into weighted edges

    @type filename: string
    @param filename: input file path and name; comma-separated rows of
    "source,target" or "source,target,year", the first row is skipped

    @type topschoolfile: string
    @param topschoolfile: file with one school name per line; it is read,
    but the filter that used it is currently disabled

    @type start_year: integer
    @param start_year: the earliest year to be considered

    @type end_year: integer
    @param end_year: the latest year to be considered

    @type self_edge: Boolean
    @param self_edge: whether self edges are included or not; True-yes, False-not

    @return: list of nodes -- sorted names of every node seen in the file,
    regardless of the year filter
    @return: list of edges -- [source, target, weight] lists, where weight
    is the number (float) of in-range rows for that source/target pair
    """
    def parse_year(text):
        # Rows without a usable year carry "-" or other non-numeric text;
        # treat those as "no year" instead of failing in int().
        try:
            return int(text)
        except ValueError:
            return None

    # Kept for the disabled top-50 filter; reading it also validates the path.
    with open(topschoolfile, "r") as f:
        top_50 = [line.strip().lower() for line in f]

    node_counts = {}
    edge_list_all = []
    with open(filename, "r") as f:
        f.readline() # skip the first row
        for line in f:
            # lower-case and drop the trailing "\r\n" before splitting
            fields = line.lower().strip().split(",") ## subject to change
            if len(fields) not in (2, 3):
                continue
            edge = [fields[0].strip(), fields[1].strip()]
            for univ in edge:
                node_counts[univ] = node_counts.get(univ, 0) + 1
            if len(fields) == 2:
                edge.append("-") # placeholder: row has no year column
            elif len(fields[2]) > 0:
                edge.append(fields[2].strip())
            edge_list_all.append(edge)

    # ``hist`` and ``stat`` are only consumed by a statistics/CSV export
    # that is currently disabled; they are still filled as before.
    hist = stats.histogram()
    stat = {}
    # (source, target) -> number of in-range rows; tuple keys keep names
    # that contain "#" intact
    edge_weights = {}
    for edge in edge_list_all:
        source, target = edge[0], edge[1]
        year = parse_year(edge[2]) if len(edge) == 3 else None
        in_range = year is not None and start_year <= year <= end_year

        entry = stat.setdefault(source, {'total' : 0, 'wyear' : 0})
        entry['total'] += 1
        if not in_range:
            continue

        entry['wyear'] += 1
        hist.add(edge[2].strip())
        pair = (source, target)
        edge_weights[pair] = edge_weights.get(pair, 0.0) + 1.0

    # Same comparison as before, so non-bool arguments behave unchanged
    keep_self_edges = (self_edge == True)
    edge_list = [
        [source, target, weight]
        for (source, target), weight in edge_weights.items()
        if keep_self_edges or source != target
    ]

    node_list = sorted(node_counts, reverse = False)
    return node_list, edge_list
 def keyword_distribution(self, rm_en = True):
     """
     Write per-keyword sentiment statistics to a CSV file and save three
     score-distribution bar charts for every keyword.

     For each keyword in self.keywords, three stats.histogram accumulators are
     filled from the "sentiword_score", "sentiment_score" and "sentiscore"
     fields of the keyword's "scores" entries. One CSV row is written per
     keyword to results/statistics_hashtag_sentiscores_<n>.csv (n being
     len(self.keywords)), and one PNG per score type is saved as
     OUT_DIR + SUB_DIR_1/2/3 + "<keyword>.png". A progress line is printed
     after each keyword.

     rm_en is accepted but not used by this method.
     """
     # The tick labels are identical for every chart: 42 positions starting at
     # -1.025 in steps of 0.05, with only the odd positions labelled.
     tick_labels = ["%.1f" %(-1.025 + i*0.05) if i % 2 else "" for i in range(42)]

     # The context manager closes the CSV even when a lookup or a plot raises.
     with open("results/statistics_hashtag_sentiscores_%d.csv" %len(self.keywords),"w") as f:
         f.write("keyword,user_count,tag_count,count,sw_avg_neg,sw_avg_pos,sw_count_neg,sw_count_zero,sw_count_pos,sw_min,sw_max,"
                        +"st_avg_neg,st_avg_pos,st_count_neg,st_count_zero,st_count_pos,st_min,st_max,"
                        +"sc_avg_neg,sc_avg_pos,sc_count_neg,sc_count_zero,sc_count_pos,sc_min,sc_max\n")
         for word in self.keywords:
             entry = self.keywords[word]
             hsw = stats.histogram()
             hst = stats.histogram()
             has = stats.histogram()
             for score in entry["scores"]:
                 hsw.add(score["sentiword_score"])
                 hst.add(score["sentiment_score"])
                 has.add(score["sentiscore"])

             # One CSV row per keyword: counts first, then the same seven
             # summary fields for each of the three histograms.
             f.write("%s,%d,%d,%d,%.3f,%.3f,%d,%d,%d,%.3f,%.3f," %(word, entry["user_count"], entry["tag_count"], hsw._count,
                                                             hsw._mean_neg, hsw._mean_pos, hsw._count_neg, hsw._zero, hsw._count_pos, hsw._min, hsw._max))
             f.write("%.3f,%.3f,%d,%d,%d,%.3f,%.3f," %(hst._mean_neg, hst._mean_pos, hst._count_neg, hst._zero, hst._count_pos, hst._min, hst._max))
             f.write("%.3f,%.3f,%d,%d,%d,%.3f,%.3f\n" %(has._mean_neg, has._mean_pos, has._count_neg, has._zero, has._count_pos, has._min, has._max))

             # All three distributions are computed before any plotting.
             distribution_sw = hsw.histogram_2()
             distribution_st = hst.histogram_2()
             distribution_as = has.histogram_2()

             # (bar heights, output sub-directory, x label, title format)
             charts = (
                 (distribution_sw, SUB_DIR_1, "Sentiword Score", "Sentiword Score Distribution - %s"),
                 (distribution_st, SUB_DIR_2, "Sentiment Score", "Sentiment Score Distribution - %s"),
                 (distribution_as, SUB_DIR_3, "Avg Senti Score", "Senti Score Distribution - %s"),
             )
             for distribution, sub_dir, x_label, title_format in charts:
                 x_axis = range(len(distribution))
                 rcParams['figure.figsize'] = 24, 5
                 plt.bar(x_axis, distribution, width=0.8, facecolor='blue', alpha = 0.5)
                 plt.xticks(x_axis, tick_labels)
                 plt.xlim(0, len(x_axis))
                 plt.grid(True)
                 plt.xlabel(x_label)
                 plt.ylabel("Percentage (%)")
                 plt.title(title_format %word)
                 plt.savefig(OUT_DIR+sub_dir+"%s.png" %word, dpi=50)
                 # Clear the figure so the next chart starts empty.
                 plt.clf()

             # Single parenthesised argument: prints the same text on Python 2 and 3.
             print("done %s: neutral_st %d neutral_sw %d neutral_as %d" %(word, hst.count_zero(), hsw.count_zero(), has.count_zero()))
Beispiel #18
0
        'lensr_v1': (1.0, 3.0)
    }

    # Start the nestlefit with the hsiao-stretch or the custom_model
    # Save the parameters in 'nestfitparam.dat' and plot the model

    MI_mH = MI_model(mH, 4, f=1.7, useprior=True, samerv=True)
    model, res = nest_lc(phot_d, MI_mH, fit_param, fit_bounds)
    #MI_geu_source = MI_model(geu_source,4)
    #model,res = nest_lc(phot_d,MI_geu_source,fit_param_16geu,fit_bounds)

    # Draw the correlations between the parameters that are degenerated
    samp, params = res.samples, res.vparam_names
    samples(samp, params, fit_param[9:])

    # Draw the probability density function
    df, data = res.ndof, model.tot_amp
    fmin, minamp = min(model.chi), data[np.argmin(model.chi)]
    histogram(data)

    # Plot the lightcurves for the model and write the parameters into a file
    model.plot(phot_d)
    myfile = open('nestfitparam.dat', 'w')
    for name in fit_param:
        myfile.write(name + ' = ' + str(model.get(name)) + ' (' +
                     str(res.errors[name]) + ')\n')
    myfile.write("chiqsquare = " + str(fmin) + ' Dof = ' + str(df) + '\n')
    myfile.write('total amplification = ' + str(minamp) + '\n')
    myfile.write('Bayesian evidence z = ' + str(np.exp(res.logz)))
    myfile.close()
import json
import operator
import pylab
import sys

from stats import histogram
from pylabutils import setupPlot

# Not referenced anywhere in this script.
prev = None

# One accuracy value per JSON line on stdin, clamped to at most 999.
# Records whose 'accuracy' is missing or falsy are left out.
A = [
  min(999, record['accuracy'])
  for record in (json.loads(line) for line in sys.stdin)
  if record.get('accuracy')
]

# The second argument to histogram() is reused as the bar width below.
w = 10.0
XY = histogram(A, w)

# Split the histogram entries into their first and second components.
X = [entry[0] for entry in XY]
Y = [entry[1] for entry in XY]

pylab.bar(X, Y, width=w)
setupPlot('Accuracy (m)', 'Frequency', 'Accuracy Radius')
pylab.savefig('accuracy-histogram.png')
Beispiel #20
0
def train(notations, output):
    """Dump stats.histogram(read_notations(notations)) to output as YAML."""
    loaded = read_notations(notations)
    yaml.dump(stats.histogram(loaded), output)
Beispiel #21
0
# Parse the XML document from stdin and follow the last child at each level:
# root -> document -> placemark -> track.
root = etree.parse(sys.stdin).getroot()
document = root.getchildren()[-1]
placemark = document.getchildren()[-1]
track = placemark.getchildren()[-1]

# Integer timestamp for every child whose tag ends in 'when'.
# NOTE(review): time.mktime() interprets the time tuple as local time; any
# timezone carried by the parsed value is not applied here - confirm intended.
whens = [
    int(time.mktime(dateutil.parser.parse(child.text).timetuple()))
    for child in track.iterchildren()
    if child.tag.endswith('when')
]

# Differences between consecutive timestamps.
dts = [later - earlier for earlier, later in zip(whens, whens[1:])]

# Plot the value reported by rmv.mean() after each interval is fed in.
rmv = RollingMeanVar(0.001)
Y = []
for index, interval in enumerate(dts):
    rmv.update(interval, index)
    Y.append(rmv.mean())
sample_indices = range(len(Y))
pylab.plot(sample_indices, Y, color=RED)
setupPlot(
    'Sample Index',
    'Interval (s)',
    'Polling Interval - Rolling Average',
)
pylab.savefig('timings-frequency.png')
pylab.close()

# Bar chart of the second component of each histogram entry, built from the
# intervals taken modulo 60.
Y = [entry[1] for entry in histogram(dt % 60 for dt in dts)]
bar_positions = range(len(Y))
pylab.bar(bar_positions, Y, width=1.0)
setupPlot(
    'Interval (s, mod 60)',
    'Frequency',
    'Polling Interval - Seconds mod 60',
)
pylab.savefig('timings-second-histogram.png')
pylab.close()
Beispiel #22
0
# Plain-text report of account counts, play time and logins.
#
# Blank separator lines are written with print("") rather than a bare `print`:
# a bare `print` emits a newline only on Python 2 and is a silent no-op
# expression on Python 3, while print("") emits the blank line on both.

# --- Accounts ---
print("")
print("ACCOUNTS")
print("--------------------------------------------------")
print("Number of accounts: %s" % (len(accounts)))

# --- Play time ---
# Every figure is passed through getHourString before printing.
print("")
print("HOURS OF PLAY")
print("--------------------------------------------------")
print("Total play time: %s" % getHourString(stats.sum(table[TABLE_TIME])))
print("Largest play time for a single account: %s" %
      getHourString(max(table[TABLE_TIME])))
print("Median total play time for all accounts: %s" %
      getHourString(stats.median(table[TABLE_TIME])))
# Three histograms of the same column with different second/third arguments.
# NOTE(review): presumably (bin count, [lower, upper] range in seconds) -
# confirm against stats.histogram.
print(
    "Macro Histogram play time: \n%s" %
    getHistogramString(stats.histogram(table[TABLE_TIME], 10, [0, 100 * 3600]),
                       getHourString))
print(
    "Micro Histogram play time: \n%s" % getHistogramString(
        stats.histogram(table[TABLE_TIME], 10, [0, 10 * 3600]), getHourString))
print(
    "Pico Histogram play time: \n%s" % getHistogramString(
        stats.histogram(table[TABLE_TIME], 12, [0, 1 * 3600]), getHourString))

# --- Logins ---
print("")
print("LOGINS")
print("--------------------------------------------------")
print("Total logins: %s" % stats.sum(table[TABLE_LOGINS]))
print("Largest logins for a single account: %s" % max(table[TABLE_LOGINS]))
print("Median number of logins for all accounts: %s" %
      stats.median(table[TABLE_LOGINS]))
import json
import operator
import pylab
import sys

from stats import histogram
from pylabutils import setupPlot

# Not referenced anywhere in this script.
prev = None

# Gather one accuracy value per JSON line read from stdin.
A = []
for line in sys.stdin:
    acc = json.loads(line).get('accuracy')
    if not acc:
        # Missing or falsy accuracy: nothing to record for this line.
        continue
    # Values above 999 are clamped to 999.
    A.append(min(999, acc))

# The second argument to histogram() doubles as the bar width below.
w = 10.0
XY = histogram(A, w)

# First and second component of every histogram entry.
X = [entry[0] for entry in XY]
Y = [entry[1] for entry in XY]

pylab.bar(X, Y, width=w)
setupPlot(
    'Accuracy (m)',
    'Frequency',
    'Accuracy Radius',
)
pylab.savefig('accuracy-histogram.png')
def read_data(filename, topschoolfile, self_edge = True, extended = True):
    """
    Read a comma-separated edge file and return the node list together with
    the weighted edge list.

    Every line is lower-cased, stripped and split on ",". Rows with two fields
    (source, target) or three fields (source, target, year) are accepted; any
    other row is reported on stdout and skipped. Repeated (source, target)
    pairs are merged into one edge whose weight is the number of occurrences.

    @type filename: string
    @param filename: input file path and name

    @type topschoolfile: string
    @param topschoolfile: path of a file with one school name per line; it is
        always read, but only used for filtering when extended is False

    @type self_edge: Boolean
    @param self_edge: whether self edges are included or not; True-yes, False-not

    @type extended: Boolean
    @param extended: whether the graph is extended or restricted in top schools or not; default is True

    @return: list of nodes, sorted in ascending order
    @return: list of edges, each one a [source, target, weight] list with a
        float weight
    """
    # A set keeps the restricted-graph membership test O(1) per lookup; the
    # context manager closes the file even if reading fails.
    with open(topschoolfile, "r") as f:
        top_schools = set(line.strip().lower() for line in f)

    ## statistical analysis
    # hist and stat do not influence the return value; they are the inputs of
    # the per-school / per-year diagnostics and are collected exactly as before.
    hist = stats.histogram()
    stat = {}

    node_count = {}    # node name -> number of edge endpoints seen
    edge_weight = {}   # (source, target) -> number of occurrences (float)

    with open(filename, "r") as f:
        for line in f:
            lines = line.lower().strip().split(",") ## subject to change
            if len(lines) not in (2, 3):
                # Single formatted argument: same output on Python 2 and 3.
                print("invalid line! %s" % (lines,))
                continue

            source = lines[0].strip()
            target = lines[1].strip()
            # Restricted graph: keep the row only when both ends are top schools.
            if not extended and not (source in top_schools and target in top_schools):
                continue

            for name in (source, target):
                node_count[name] = node_count.get(name, 0) + 1

            # A row carries year data only when its third field is non-empty.
            has_year = len(lines) == 3 and len(lines[2]) > 0
            if has_year:
                hist.add(lines[2].strip())

            # Keyed on the raw first field, as in the original bookkeeping.
            counts = stat.setdefault(lines[0], {'total' : 0, 'wyear' : 0})
            counts['total'] += 1
            if has_year:
                counts['wyear'] += 1

            # Tuple keys instead of a "source#target" string: names that
            # contain "#" no longer get split into extra fields. Edge order
            # follows dict iteration order, as it did before.
            key = (source, target)
            edge_weight[key] = edge_weight.get(key, 0.0) + 1.0

    ## re-organize the edge with weights
    edge_list = [[source, target, weight]
                 for (source, target), weight in edge_weight.items()
                 if self_edge or source != target]

    node_list = sorted(node_count)

    return node_list, edge_list
  if child.tag.endswith('when'):
    ts = dateutil.parser.parse(child.text)
    when = int(time.mktime(ts.timetuple()))
    if prev_when is not None:
      dts.append(when - prev_when)
    prev_when = when

# Feed every interval into the rolling accumulator and plot the value that
# rmv.mean() reports after each update.
rmv = RollingMeanVar(0.001)
Y = []
for index, interval in enumerate(dts):
  rmv.update(interval, index)
  Y.append(rmv.mean())
sample_indices = range(len(Y))
pylab.plot(sample_indices, Y, color=RED)
setupPlot('Sample Index', 'Interval (s)', 'Polling Interval - Rolling Average')
pylab.savefig('timings-frequency.png')
pylab.close()

# Bar chart of the second component of each histogram entry, built from the
# intervals taken modulo 60.
Y = [entry[1] for entry in histogram(dt % 60 for dt in dts)]
bar_positions = range(len(Y))
pylab.bar(bar_positions, Y, width=1.0)
setupPlot('Interval (s, mod 60)', 'Frequency', 'Polling Interval - Seconds mod 60')
pylab.savefig('timings-second-histogram.png')
pylab.close()
Beispiel #26
0
    def keyword_distribution(self, rm_en=True):
        """
        Write per-keyword sentiment statistics to a CSV file and save three
        score-distribution bar charts for every keyword.

        For each keyword in self.keywords, three stats.histogram accumulators
        are filled from the "sentiword_score", "sentiment_score" and
        "sentiscore" fields of the keyword's "scores" entries. One CSV row is
        written per keyword to results/statistics_hashtag_sentiscores_<n>.csv
        (n being len(self.keywords)), and one PNG per score type is saved as
        OUT_DIR + SUB_DIR_1/2/3 + "<keyword>.png". A progress line is printed
        after each keyword.

        rm_en is accepted but not used by this method.
        """
        # The tick labels are identical for every chart: 42 positions starting
        # at -1.025 in steps of 0.05, with only the odd positions labelled.
        tick_labels = [
            "%.1f" % (-1.025 + i * 0.05) if i % 2 else ""
            for i in range(42)
        ]

        csv_path = ("results/statistics_hashtag_sentiscores_%d.csv" %
                    len(self.keywords))
        # The context manager closes the CSV even when a lookup or plot raises.
        with open(csv_path, "w") as f:
            f.write(
                "keyword,user_count,tag_count,count,sw_avg_neg,sw_avg_pos,sw_count_neg,sw_count_zero,sw_count_pos,sw_min,sw_max,"
                +
                "st_avg_neg,st_avg_pos,st_count_neg,st_count_zero,st_count_pos,st_min,st_max,"
                +
                "sc_avg_neg,sc_avg_pos,sc_count_neg,sc_count_zero,sc_count_pos,sc_min,sc_max\n"
            )
            for word in self.keywords:
                entry = self.keywords[word]
                hsw = stats.histogram()
                hst = stats.histogram()
                has = stats.histogram()
                for score in entry["scores"]:
                    hsw.add(score["sentiword_score"])
                    hst.add(score["sentiment_score"])
                    has.add(score["sentiscore"])

                # One CSV row per keyword: counts first, then the same seven
                # summary fields for each of the three histograms.
                f.write("%s,%d,%d,%d,%.3f,%.3f,%d,%d,%d,%.3f,%.3f," %
                        (word, entry["user_count"], entry["tag_count"],
                         hsw._count, hsw._mean_neg, hsw._mean_pos,
                         hsw._count_neg, hsw._zero, hsw._count_pos, hsw._min,
                         hsw._max))
                f.write("%.3f,%.3f,%d,%d,%d,%.3f,%.3f," %
                        (hst._mean_neg, hst._mean_pos, hst._count_neg,
                         hst._zero, hst._count_pos, hst._min, hst._max))
                f.write("%.3f,%.3f,%d,%d,%d,%.3f,%.3f\n" %
                        (has._mean_neg, has._mean_pos, has._count_neg,
                         has._zero, has._count_pos, has._min, has._max))

                # All three distributions are computed before any plotting.
                distribution_sw = hsw.histogram_2()
                distribution_st = hst.histogram_2()
                distribution_as = has.histogram_2()

                # (bar heights, output sub-directory, x label, title format)
                charts = (
                    (distribution_sw, SUB_DIR_1, "Sentiword Score",
                     "Sentiword Score Distribution - %s"),
                    (distribution_st, SUB_DIR_2, "Sentiment Score",
                     "Sentiment Score Distribution - %s"),
                    (distribution_as, SUB_DIR_3, "Avg Senti Score",
                     "Senti Score Distribution - %s"),
                )
                for distribution, sub_dir, x_label, title_format in charts:
                    x_axis = range(len(distribution))
                    rcParams['figure.figsize'] = 24, 5
                    plt.bar(x_axis,
                            distribution,
                            width=0.8,
                            facecolor='blue',
                            alpha=0.5)
                    plt.xticks(x_axis, tick_labels)
                    plt.xlim(0, len(x_axis))
                    plt.grid(True)
                    plt.xlabel(x_label)
                    plt.ylabel("Percentage (%)")
                    plt.title(title_format % word)
                    plt.savefig(OUT_DIR + sub_dir + "%s.png" % word, dpi=50)
                    # Clear the figure so the next chart starts empty.
                    plt.clf()

                # Single parenthesised argument: prints the same text on
                # Python 2 and Python 3.
                print("done %s: neutral_st %d neutral_sw %d neutral_as %d" % (
                    word, hst.count_zero(), hsw.count_zero(),
                    has.count_zero()))