Example No. 1
    def plot(self, what='cumulative_payouts', include_ci=True):
        import ggplot as gg #This is hacky ... need to DRY out the imports

        if what == 'cumulative_payouts':
            plt = self._plot_cumulative_payouts(include_ci=include_ci)
        elif what == 'avg_accuracy':
            plt = self._plot_avg_accuracy(include_ci=include_ci)
        elif what == 'all':
            summary = self.summary()
            p1 = self._plot_cumulative_payouts(include_ci=include_ci, summary=summary)
            p2 = self._plot_avg_accuracy(include_ci=include_ci, summary=summary)
            d1 = p1.data
            d2 = p2.data
            d1['Outcome'] = d1['AverageCumulativePayout']
            d2['Outcome'] = d2['AverageAccuracy']
            d1['Plot'] = 'Cumulative Payouts'
            d2['Plot'] = 'Average Accuracy'
            df = d1.append(d2, ignore_index=True)

            if include_ci:
                plt = gg.ggplot(gg.aes(x='Round', y='Outcome', ymin='ymin', ymax='ymax'), data=df) + \
                    gg.geom_area(alpha=0.5)
            else:
                plt = gg.ggplot(gg.aes(x='Round', y='Outcome'), data=df)

            plt += gg.facet_grid('Plot', scales='free')
        else:
            raise ValueError('%s is not a valid option' % what)

        return plt + gg.geom_line()
Example No. 2
    def _post_density_plot(self, func=None, x_name='', plot_title='', include_doses=None, boot_samps=1000):

        from ggplot import aes, ggplot, geom_density, ggtitle
        import numpy as np
        import pandas as pd

        if include_doses is None:
            include_doses = range(1, self.num_doses + 1)

        def my_func(x, samp):
            tox_probs = _pi_T(x, mu=samp[:, 0], beta=samp[:, 1])
            eff_probs = _pi_E(x, mu=samp[:, 2], beta1=samp[:, 3], beta2=samp[:, 4])
            u = self.metric(eff_probs, tox_probs)
            return u
        if func is None:
            func = my_func

        x_boot = []
        dose_indices = []
        samp = self.pds._samp
        p = self.pds._probs
        p /= p.sum()
        for i, x in enumerate(self.scaled_doses()):
            dose_index = i+1
            if dose_index in include_doses:
                x = func(x, samp)
                x_boot.extend(np.random.choice(x, size=boot_samps, replace=True, p=p))
                dose_indices.extend(np.repeat(dose_index, boot_samps))
        df = pd.DataFrame({x_name: x_boot, 'Dose': dose_indices})
        return ggplot(aes(x=x_name, fill='Dose'), data=df) + geom_density(alpha=0.6) + ggtitle(plot_title)
Example No. 3
def displacement_plot(centered, limits=None, style=None):
    u"""Draws nice displacement plots using ggplot2.

    params:
        centered (pd.DataFrame): needs cX, cY, Object, Frame columns, probably
            produced by calling center() above
        limits (real): Sets the limits of the scales to a square window showing
            ±limits on each axis.
        style (Iterable): Collection of strings. Recognized values are 'theme-bw'
            (which uses theme_bw instead of theme_seaborn) and 'no-terminal-dot'
            (which does not label the end of tracks which terminate early).

    Returns:
        g (gg.ggplot): Plot object
    """
    style = {} if style is None else style
    centered['Object'] = centered['Object'].map(str)
    centered = centered.sort_values(['Frame', 'Object'])
    g = (gg.ggplot(centered, gg.aes(x='cX', y='cY', color='Object')) +
         gg.geom_path(size=0.3))
    g += gg.theme_bw() if 'theme-bw' in style else gg.theme_seaborn()
    if limits:
        g = g + gg.ylim(-limits, limits) + gg.xlim(-limits, limits)
    if 'no-terminal-dot' not in style:
        max_frame = centered['Frame'].max()
        endframe = centered.groupby('Object')['Frame'].max()
        endframe = endframe[endframe != max_frame].reset_index()
        endframe = endframe.merge(centered, on=['Object', 'Frame'])
        # we should check if endframe is empty before adding it:
        # https://github.com/yhat/ggplot/issues/425
        if not endframe.empty:
            g += gg.geom_point(data=endframe, color='black', size=1)
    return g
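# Hypothetical usage sketch (not part of the original source): build a tiny
# tracking DataFrame with the cX, cY, Object and Frame columns described in the
# docstring above, then draw it with theme_bw and symmetric limits.
import pandas as pd
import ggplot as gg  # displacement_plot() expects this module-level alias

example_tracks = pd.DataFrame({
    'Object': [1, 1, 1, 2, 2],            # two tracked objects
    'Frame':  [0, 1, 2, 0, 1],            # object 2 terminates one frame early
    'cX':     [0.0, 0.5, 1.0, 0.0, -0.4],
    'cY':     [0.0, 0.2, 0.1, 0.0, -0.6],
})
example_plot = displacement_plot(example_tracks, limits=1.5, style={'theme-bw'})
print(example_plot)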
Example No. 4
def scatter(x, y, filename=""):
    df = pd.DataFrame({ 'x': pd.Series(x), 'y': pd.Series(y) })
    p = gg.ggplot(gg.aes(x='x', y='y'), data=df) + gg.geom_point()
    if filename == "":
        print p
    else:
        gg.ggsave(filename="graphs/scatter/"+filename+".png", plot=p)
def plot_update_frequency(result):    
    import pandas as pd
    import numpy
    
    #turns query results into a timeseries of changes
    d = []
    v = []
    for res in result:
        d.append(pd.Timestamp(res['_id']['timestamp']).to_datetime())
        v.append(res['count'])       
        
    ts = pd.DataFrame(v, index = d, columns = ['changes'])
    ts = ts.resample('W', how='sum')
    ts.index.names = ['date']

    import ggplot
    #plots timeseries of changes       
    p = ggplot.ggplot(ts, ggplot.aes(x = ts.index, y=ts['changes'])) +\
            ggplot.geom_point(color = 'blue') +\
            ggplot.xlab('Period') +\
            ggplot.ylab('Changes') +\
            ggplot.geom_smooth() +\
            ggplot.ylim(low = 0) +\
            ggplot.scale_x_date(breaks = ggplot.date_breaks("12 months"),  labels = ggplot.date_format('%Y-%m')) +\
            ggplot.ggtitle('OpenStreetMaps Denver-Boulder\nChanges per Week')
    return p
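# Hypothetical input sketch (not from the original source): `result` is assumed
# to be an iterable of MongoDB-style aggregation documents shaped like the ones
# the loop above reads, e.g.
#   result = [{'_id': {'timestamp': '2015-01-04'}, 'count': 42},
#             {'_id': {'timestamp': '2015-01-11'}, 'count': 57}]
#   p = plot_update_frequency(result)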
def plot_cost_history(alpha, cost_history):

   cost_df = pandas.DataFrame({
      'Cost_History': cost_history,
      'Iteration': range(len(cost_history))
   })
   return gp.ggplot(cost_df, gp.aes('Iteration', 'Cost_History')) +\
          gp.geom_point() + gp.geom_line() + gp.ggtitle('Cost History for alpha = %.3f' % alpha )
 def plot_roc(self, experiment_type, to_plot):
     # turn this to string for categorical colour scheme
     to_plot.loc[:, "parameter"] = [str(par) for par in to_plot.loc[:, "parameter"]]
     p = gg.ggplot(data = to_plot, aesthetics = gg.aes(x = "FPR", y = "TPR", colour = "parameter")) + \
         gg.geom_line(gg.aes(x = "FPR", y = "TPR", colour = "parameter")) + \
         gg.ggtitle(experiment_type) + gg.xlab("FPR") + gg.ylab("TPR")
     gg.ggsave(filename = self.results_path + experiment_type + "_" + self.mode + ".png", plot = p)
     return
Example No. 8
    def _plot_cumulative_payouts(self, include_ci=True, summary=None):
        import ggplot as gg
        if summary is None:
            summary = self.summary()

        df = pd.DataFrame({'AverageCumulativePayout': summary['CumulativePayout']['Avg'],
                           'Std': summary['CumulativePayout']['Std'],
                           'Round': range(self.n_rounds)})
        if include_ci:
            df['ymin'] = df.AverageCumulativePayout - 1.96 * df.Std
            df['ymax'] = df.AverageCumulativePayout + 1.96 * df.Std
            plt = gg.ggplot(gg.aes(x='Round', y='AverageCumulativePayout', ymin='ymin', ymax='ymax'), data=df) + \
                  gg.geom_area(alpha=0.5)
        else:
            plt = gg.ggplot(gg.aes(x='Round', y='AverageCumulativePayout'), data=df)

        return plt + gg.geom_line()
Example No. 9
def signature_data_plot(sd):
    import ggplot as gg

    aes = gg.aes(x='set_exp', y='not_exp', color='pearson_r')
    return gg.ggplot(aes, data=sd) \
        + gg.geom_point(size=15) \
        + gg.scale_color_gradient(low='yellow', high='red') \
        + gg.scale_x_log() + gg.scale_x_continuous(limits=(0.5, 10000)) \
        + gg.scale_y_log() + gg.scale_y_continuous(limits=(0.05, 10000))
Example No. 10
def plot_deg_distrib(G):
	(in_deg, out_deg, deg) = wa.degree_distribution(G)
	in_deg_series = pd.Series(in_deg)
	out_deg_series = pd.Series(out_deg)
	in_out = { 'in_deg': in_deg_series, 'out_deg': out_deg_series }
	df = pd.DataFrame(in_out)
	df = pd.melt(df)
	p = gg.ggplot(gg.aes(x='value', color='variable', fill='variable'), data=df) + gg.geom_histogram(alpha=0.6, binwidth=1)
	print p
Example No. 11
    def histogram(self, dataframe, bins=100, width=None, height=None, palette=None, title='Histogram', values=None,
                  groups=None, legend=True):
        palette = self.__default_options__.get('palette', None) if palette is None else palette

        return ggplot(dataframe, aes(x=values, fill=groups, color=groups)) + \
               geom_histogram(alpha=0.6, breaks=bins, position="fill") + \
               self._palette(palette) + \
               ggtitle(title) + \
               scale_y_continuous(name="Count (%s)" % values)
def plot_weather_data(turnstile_weather):
    '''
    You are passed in a dataframe called turnstile_weather. 
    Use turnstile_weather along with ggplot to make a data visualization
    focused on the MTA and weather data we used in assignment #3.  
    You should feel free to implement something that we discussed in class 
    (e.g., scatterplots, line plots, or histograms) or attempt to implement
    something more advanced if you'd like.  

    Here are some suggestions for things to investigate and illustrate:
     * Ridership by time of day or day of week
     * How ridership varies based on Subway station
     * Which stations have more exits or entries at different times of day

    If you'd like to learn more about ggplot and its capabilities, take
    a look at the documentation at:
    https://pypi.python.org/pypi/ggplot/
     
    You can check out:
    https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv
     
    To see all the columns and data points included in the turnstile_weather 
    dataframe. 
     
    However, due to the limitation of our Amazon EC2 server, we are giving you about 1/3
    of the actual data in the turnstile_weather dataframe
    '''

    #Ridership by day of week - Option 1 (Entries by Day of Week)
    #pd.options.mode.chained_assignment = None  # default='warn'
    #turnstile_weather['weekday'] = pd.to_datetime(turnstile_weather['DATEn']).apply(pd.datetime.weekday)

    #plot = gg.ggplot(turnstile_weather, aes('weekday','ENTRIESn_hourly')) + ggtitle('Entries by Day of Week') + xlab('Day of Week') + ylab('Number  of Entries') +gg.geom_histogram(stat = "bar", position = "stack")+ scale_x_discrete(limits=(-1, 7), breaks=range(0, 7, 1), labels=["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"])

    #Ridership by day of week - Option 2 (Avg number of Entries by Day of Week)
    pd.options.mode.chained_assignment = None  # default='warn'
    turnstile_weather['weekday'] = pd.to_datetime(turnstile_weather['DATEn']).apply(pd.datetime.weekday)
    averageentries_on_weekday = turnstile_weather.groupby('weekday', as_index=False).ENTRIESn_hourly.mean()
    averageentries_on_weekday.rename(columns={'ENTRIESn_hourly': 'avg_ENTRIESn_hourly'}, inplace=True)

    plot = gg.ggplot(averageentries_on_weekday, aes('weekday', 'avg_ENTRIESn_hourly')) + ggtitle('Avg number of Entries by Day of Week') + xlab('Day of Week') + ylab('avg number of Entries')  + gg.geom_histogram(stat = "bar", position = "stack") + scale_x_discrete(limits=(-1, 7), breaks=range(0, 7, 1), labels=["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"])

    #Ridership by Unit(Station) - Option 3 (Entries by UNIT)
    #pd.options.mode.chained_assignment = None  # default='warn'

    #plot = gg.ggplot(turnstile_weather, aes('UNIT','ENTRIESn_hourly')) + ggtitle('Entries by UNIT') + xlab('UNIT') + ylab('Number  of Entries') +gg.geom_histogram(stat = "bar", position = "stack") + scale_x_discrete(limits=(-1, 100), breaks=range(0, 100, 1))

    #Ridership by day of week - Option 4 (Avg number of Entries by UNIT)
    #pd.options.mode.chained_assignment = None  # default='warn'

    #averageentries_unit = turnstile_weather.groupby('UNIT', as_index=False).ENTRIESn_hourly.mean()
    #averageentries_unit.rename(columns={'ENTRIESn_hourly': 'avg_ENTRIESn_hourly'}, inplace=True)

    #plot = gg.ggplot(averageentries_unit, aes('UNIT','avg_ENTRIESn_hourly')) + ggtitle('Avg number of Entries by UNIT') + xlab('UNIT') + ylab('avg number of Entries')  + gg.geom_histogram(stat = "bar", position = "stack") + scale_x_discrete(limits=(-1, 50), breaks=range(0, 50, 1))

    return plot
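# Hypothetical usage sketch (not from the original source): load the
# turnstile_weather CSV referenced in the docstring and render the returned plot.
#   turnstile_weather = pd.read_csv('turnstile_data_master_with_weather.csv')
#   weekday_plot = plot_weather_data(turnstile_weather)
#   print(weekday_plot)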
Example No. 13
	def plotAverageLatency(self):
		averages = [d.averageLatency() for d in self.data]
		dat = { "device" : range(1, len(averages) + 1), "average" : averages }
		dataframe = pandas.DataFrame(dat)
		chart = ggplot.ggplot(ggplot.aes(x="device", weight="average"), dataframe) \
				+ ggplot.labs(title="Average Latency Per Device") + \
				ggplot.ylab("Average Latency (ms)") + \
				ggplot.xlab("Device Number")  + \
				ggplot.geom_bar(stat="identity")
		chart.show()
def plot_weather_data(df):
	df.DATEn = pd.to_datetime(df.DATEn)
	grouped = df.groupby('DATEn', as_index=False).sum()
	grouped.index.name = 'DATEn'
	plot = gp.ggplot(grouped, gp.aes(x='DATEn', y='EXITSn_hourly'))
	plot += gp.geom_line()
	plot += gp.ggtitle('Subway Ridership by Day')
	plot += gp.xlab('Date')
	plot += gp.ylab('Exits')
	return plot
def plot_weather_data(df):  # older version
	df.DATEn = pd.to_datetime(df.DATEn)
	grouped = df.groupby('DATEn', as_index=False).sum()
	grouped.index.name = 'DATEn'
	
	p_title = 'Subway Ridership by Hour vs Raining'
	p_xlab = 'Hour of the Day'
	p_ylab = 'Subway Entries'
	plot = gp.ggplot(grouped, gp.aes(x='DATEn', y='EXITSn_hourly')) + gp.geom_line() + gp.ggtitle(p_title) + gp.xlab(p_xlab) + gp.ylab(p_ylab)
	return plot
Example No. 16
 def plot(self):
     prob231g_plot_df = self.data.copy()
     for k in range(self.num_clusters):
         n = prob231g_plot_df.shape[0]
         prob231g_plot_df.loc[n] = self.cluster_centers[k]
     prob231g_plot_df["class_label"] = [label for label in self.class_label] + \
                                       self.num_clusters * ["center"]
     p = gg.ggplot(prob231g_plot_df, gg.aes(x= "x1", y="x2", colour="class_label")) + \
         gg.geom_point() + gg.ggtitle("EM cluster assignments")
     print p
     return
Example No. 17
    def heatmap(self, dataframe, y=None, x=None, values=None, width=None, height=None,
                max_color=None, min_color=None, mid_color=None, title='Heatmap'):
        max_color = self.__default_options__.get('max_color', None) if max_color is None else max_color
        min_color = self.__default_options__.get('min_color', None) if min_color is None else min_color
        mid_color = self.__default_options__.get('mid_color', None) if mid_color is None else mid_color
        width = self.__default_options__.get('width', None) if width is None else width

        palette = gradient(min_color, mid_color, max_color)
        return ggplot(dataframe, aes(x=x, y=y, fill=values)) + \
               geom_tile() + \
               self._palette(palette, "div")
Example No. 18
def lineplot(hr_year_csv):
    df = pandas.read_csv(hr_year_csv)
    gg = (
        gp.ggplot(df, gp.aes(x="yearID", y="HR"))
        + gp.geom_point(color="red")
        + gp.geom_line(color="red")
        + gp.ggtitle("Homeruns by Year")
        + gp.xlab("Homeruns")
        + gp.ylab("Year")
    )
    return gg
Example No. 19
def lineplot_compare(filename):
    df = pd.read_csv(filename)
    gg = (
        gp.ggplot(df, gp.aes(x="yearID", y="HR", color="teamID"))
        + gp.geom_point()
        + gp.geom_line()
        + gp.ggtitle("Homeruns by Year by Team")
        + gp.xlab("Homeruns")
        + gp.ylab("Year")
    )
    return gg
Example No. 20
def prob231cd_recover(initialization):
    filename = "results/prob231cd" + initialization
    tuple_in = pkl.load(open(filename + ".pkl", "rb"))
    prob231c_plot_df = tuple_in[0]
    kmcalls = tuple_in[1]
    num_trials = tuple_in[2]
    p = gg.ggplot(prob231c_plot_df, gg.aes(x= "x1", y="x2", colour="data")) + \
        gg.geom_point() + gg.ggtitle(initialization + " initialization")
    gg.ggsave(filename + ".png", plot = p)
    obj = [kmcalls[i].obj for i in range(num_trials)]
    obj_stats = {"mean":np.mean(obj), "sd":np.std(obj), "min":np.min(obj)}
    return obj_stats
Example No. 21
    def _plot_avg_accuracy(self, include_ci=True, summary=None):
        import ggplot as gg
        if summary is None:
            summary = self.summary()

        df = pd.DataFrame({'AverageAccuracy': summary['Accuracy']['Avg'], 'Round': range(self.n_rounds)})

        if include_ci:
            from scipy import stats
            succ = df.AverageAccuracy * self.n_sim
            fail = self.n_sim - succ
            interval = stats.beta(succ + 1, fail + 1).interval(0.95)

            df['ymin'] = interval[0]
            df['ymax'] = interval[1]
            plt = gg.ggplot(gg.aes(x='Round', y='AverageAccuracy', ymin='ymin', ymax='ymax'), data=df) + \
                gg.geom_area(alpha=0.5)
        else:
            plt = gg.ggplot(gg.aes(x='Round', y='AverageAccuracy'), data=df)

        return plt + gg.geom_line()
Example No. 22
    def scatter(self, dataframe, x=None, y=None, width=None, height=None, color=None, title='Scatter', xaxis_label=None,
                yaxis_label=None):
        color = self.__default_options__.get('palette', None) if color is None else color
        width = self.__default_options__.get('width', None) if width is None else width

        gg = ggplot(dataframe, aes(x, y)) + geom_point(color=color, alpha=0.6) + ggtitle(title)
        if xaxis_label:
            gg += scale_x_continuous(name=xaxis_label)
        if yaxis_label:
            gg += scale_y_continuous(name=yaxis_label)

        return gg
Example No. 23
def plotHistogramMeans(hist,fileName):
  num_clust = hist.shape[0]
  IDS = np.mat(range(0,num_clust))
  IDS = IDS.reshape(num_clust,1)

  histD = np.concatenate((IDS,hist),axis=1)

  Data = pd.DataFrame(histD,columns = ['ID']+range(0,hist.shape[1]))
  Melted = pd.melt(Data,id_vars=['ID'])
  pv =  ggplot.ggplot( ggplot.aes(x='variable',y='value'),data=Melted) +  ggplot.geom_line()  + ggplot.facet_wrap("ID")
  print "Saving mean histograms"
  ggplot.ggsave(pv,'./IMG/'+fileName)
Example No. 24
def _ggplot(df, out_file):
    """Plot faceted items with ggplot wrapper on top of matplotlib.
    XXX Not yet functional
    """
    import ggplot as gg
    df["variant.type"] = [vtype_labels[x] for x in df["variant.type"]]
    df["category"] = [cat_labels[x] for x in df["category"]]
    df["caller"] = [caller_labels.get(x, None) for x in df["caller"]]
    p = (gg.ggplot(df, gg.aes(x="caller", y="value.floor")) + gg.geom_bar()
         + gg.facet_wrap("variant.type", "category")
         + gg.theme_seaborn())
    gg.ggsave(p, out_file)
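# Hypothetical input sketch (not from the original source): _ggplot() assumes a
# DataFrame with 'variant.type', 'category', 'caller' and 'value.floor' columns,
# plus module-level vtype_labels / cat_labels / caller_labels dicts mapping raw
# values to display labels, e.g.
#   df = pd.DataFrame({"variant.type": ["snp"], "category": ["concordant"],
#                      "caller": ["gatk"], "value.floor": [10]})
#   _ggplot(df, "validation.png")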
Example No. 25
def prob231b(initialization = "regular"):
    cluster_counts = [2,3,5,10,15,20]
    kmcalls = [0 for i in cluster_counts]
    for i, num_clusters in enumerate(cluster_counts):
        kmcalls[i] = KmeansCall(features_only, num_clusters, initialization)
        kmcalls[i].run_kmeans(verbose = False)

        df_to_plot = kmcalls[i].data.copy()
        df_to_plot["class_label"] = [label for label in kmcalls[i].class_label]
        p = gg.ggplot(df_to_plot, gg.aes(x= "x1", y="x2", colour="class_label")) + \
        gg.geom_point() + gg.ggtitle("Synth. data, k=" + str(num_clusters))
        metadata = "k=" + str(num_clusters) + "_" + datestring
        gg.ggsave(filename = "results/" + metadata +".png", plot = p)
Example No. 26
    def plot_outcomes(self, chart_title=None, use_ggplot=False):
        """ Plot the outcomes of patients observed.

        :param chart_title: optional chart title. Default is fairly verbose
        :type chart_title: str
        :param use_ggplot: True to use ggplot, else matplotlib
        :type use_ggplot: bool
        :return: a plot of patient outcomes

        """

        if not chart_title:
            chart_title="Each point represents a patient\nA circle indicates no toxicity, a cross toxicity"
            chart_title = chart_title + "\n"

        if use_ggplot:
            if self.size() > 0:
                from ggplot import (ggplot, ggtitle, geom_text, aes, ylim)
                import numpy as np
                import pandas as pd
                patient_number = range(1, self.size()+1)
                symbol = np.where(self.toxicities(), 'X', 'O')
                data = pd.DataFrame({'Patient number': patient_number,
                                     'Dose level': self.doses(),
                                     'DLT': self.toxicities(),
                                     'Symbol': symbol})

                p = ggplot(data, aes(x='Patient number', y='Dose level', label='Symbol')) \
                    + ggtitle(chart_title) + geom_text(aes(size=20, vjust=-0.07)) + ylim(1, 5)
                return p
        else:
            if self.size() > 0:
                import matplotlib.pyplot as plt
                import numpy as np
                patient_number = np.arange(1, self.size()+1)
                doses_given = np.array(self.doses())
                tox_loc = np.array(self.toxicities()).astype('bool')
                if sum(tox_loc):
                    plt.scatter(patient_number[tox_loc], doses_given[tox_loc], marker='x', s=300,
                                facecolors='none', edgecolors='k')
                if sum(~tox_loc):
                    plt.scatter(patient_number[~tox_loc], doses_given[~tox_loc], marker='o', s=300,
                                facecolors='none', edgecolors='k')

                plt.title(chart_title)
                plt.ylabel('Dose level')
                plt.xlabel('Patient number')
                plt.yticks(self.dose_levels())
                p = plt.gcf()
                phi = (np.sqrt(5)+1)/2.
                p.set_size_inches(12, 12/phi)
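    # Hypothetical call sketch (not from the original source): assuming `patients`
    # is an instance of the enclosing class with doses() and toxicities() already
    # recorded, the ggplot branch above would be exercised as:
    #   p = patients.plot_outcomes(use_ggplot=True)
    #   print(p)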
Example No. 27
def lineplot_compare(filename):  # Cleaner version with string vars
    df = pd.read_csv(filename)
    p_title = "Homeruns by Year by Team"
    p_xlab = "Homeruns"
    p_ylab = "Year"
    gg = (
        gp.ggplot(df, gp.aes(x="yearID", y="HR", color="teamID"))
        + gp.geom_point()
        + gp.geom_line()
        + gp.ggtitle(p_title)
        + gp.xlab(p_xlab)
        + gp.ylab(p_ylab)
    )
    return gg
Example No. 28
    def plot(self, inputs):
        """Plot the given X and Y axes on a scatter plot"""
        if inputs.year not in self.dat.Year.values:
            return

        if inputs.xvar not in self.dat or inputs.yvar not in self.dat:
            return

        subdat = self.dat[self.dat.Year == inputs.year]
        p = ggplot(subdat, aes(x=inputs.xvar, y=inputs.yvar))

        p = p + geom_point()
        if inputs.shownames:
            p = p + geom_text(aes(label=self.ID_col), vjust=1, hjust=1)
        if inputs.linear:
            p = p + stat_smooth(color="red", method="lm")
        return p
Example No. 29
def main(log):

    log.debug('initializing app')
    p = pyaudio.PyAudio()

    # Open audio input stream
    stream = p.open(format = FORMAT,
        channels = CHANNELS,
        rate = SAMPLE_RATE,
        input = True,
        frames_per_buffer = CHUNK_SIZE)

    log.debug('opened stream <{}>'.format(stream))
    log.debug('reading audio input at rate <{}>'.format(SAMPLE_RATE))

    recorded = []

    # Start mainloop
    loops = 0
    while True:
        loops += 1
        if loops % 25 == 0: log.debug('recorded <{}> loops'.format(loops))

        # Decode chunks of audio data from the stream
        try:
            data = stream.read(CHUNK_SIZE)
            decoded = np.fromstring(data, 'Float32');
            mx = max(decoded)
            recorded.append(mx)

        # On <C-c>, plot max of recorded data
        except KeyboardInterrupt as ee:
            log.debug('closing stream and ending PyAudio')
            stream.close()
            p.terminate()
            df = pd.DataFrame(columns = ['mx', 'time'])
            df['mx'] = recorded
            df['time'] = range(len(recorded))
            plt = ggplot.ggplot(ggplot.aes(x='time', y='mx'), data=df) +\
                        ggplot.geom_line()
            pdb.set_trace()
            log.debug('quitting')
            sys.exit(1)
Example No. 30
def prob231g():
    filename = "results/prob231g"

    num_clusters_231g = 3
    emcall = EMCall(features_only, labels_only, num_clusters_231g)
    emcall.run_em()

    plt.plot(emcall.log_likelihood_record)
    plt.title("Likelihood over EM iterations")
    plt.savefig(filename + "_loglike.png")

    prob231g_plot_df = emcall.data.copy()
    prob231g_plot_df["class_label"] = [label for label in emcall.class_label]
    p = gg.ggplot(prob231g_plot_df, gg.aes(x= "x1", y="x2", colour="class_label")) + \
        gg.geom_point() + gg.ggtitle("EM cluster assignments")
    gg.ggsave(filename + "_clusters.png", plot = p)

    pkl.dump(obj = emcall, file = open(filename + "_a.pkl", "wb"))
    print("Done with 231g.")
    return
Example No. 31
    for t_float in time:
        tp_FS, tp_PK = get_weibull(t=t_float,
                                   coverage=input_par['uptake'],
                                   duration=input_par['duration'],
                                   shape=s)
        plot_prob.loc[row_idx, 'Monthly transition probability'] = tp_FS
        plot_prob.loc[row_idx + 1, 'Monthly transition probability'] = tp_PK
        plot_prob.loc[row_idx, 'time'] = t_float
        plot_prob.loc[row_idx + 1, 'time'] = t_float
        plot_prob.loc[row_idx, 'Formula'] = 'FS'
        plot_prob.loc[row_idx + 1, 'Formula'] = 'PK'

        row_idx += 2

    # collect
    collect_prob['FS ' + str(s)] = plot_prob.loc[
        plot_prob.loc[:, 'Formula'] == 'FS',
        'Monthly transition probability'].values
    collect_prob['PK ' + str(s)] = plot_prob.loc[
        plot_prob.loc[:, 'Formula'] == 'PK',
        'Monthly transition probability'].values

    # plot
    x = ggplot(aes(
        x='time', y='Monthly transition probability', color='Formula'),
               data=plot_prob) + geom_line()
    #name = r'Shape: ' + str(s)# + r', Coverage/Uptake = ' + str(input_par['uptake']*100) + r', Coverage time = ' + str(input_par['duration']) + '.jpg'
    x.save('Weibull' + str(plot_num))

    plot_num += 1
Example No. 32
def second(dataframe):
    plot = ggplot.ggplot(
        ggplot.aes(x='Speed'),
        data=dataframe) + ggplot.geom_bar(color='lightblue') + ggplot.ggtitle(
            "Frequencies of Speeds Among Interfaces") + ggplot.theme_xkcd()
    plot.show()
Example No. 33
# Text annotations
plt.plot([1, 2, 3])
plt.text(1, 2, r'$\mu=100, \sigma=15$')
plt.show()

plt.plot([1, 2, 3])
plt.annotate('test',
             xy=(1, 2),
             xytext=(1.5, 2.2),
             arrowprops=dict(facecolor='black', shrink=0.05))
plt.show()

# ggplot example
import ggplot as gg
p = gg.ggplot(df2, gg.aes(x='MSales', y='Comments'))
p = p + gg.geom_point()
print(p)

# 2. Visualization and data analysis
# Simple data analysis
import numpy as np
df3 = df1[df1['Place'] == '广东 广州'].append(df1[df1['Place'] == '浙江 杭州'])

np.mean(df3['MSales'])  # mean of MSales
np.std(df3['MSales'])  # standard deviation of MSales
np.median(df3['MSales'])  # median of MSales
np.percentile(df3['MSales'], 25)  # 25th percentile of MSales

df3.groupby(by='Place').count()  # group by Place and count frequencies
df3.groupby(by='Place').mean()  # group by Place and compute means
Example No. 34
    def plot(self, inp1, inp2, inp3=None):
        p = gg.ggplot(gg.aes(x=inp1, y=inp2, color=inp3), data=self.data) + \
        gg.geom_point()

        print p
Example No. 35
                           m=1,
                           alpha=alpha))

    #save results to results folder, with plot and printing to screen.
    metadata = datetime.datetime.now().strftime(
        "%Y-%m-%d %H:%M:%S") + "test_mode==" + str(test_mode)
    f = open("results/LSH_vs_KDT_%s.pkl" % metadata, mode='w')
    pkl.dump(obj=results, file=f)

    logtimes = [math.log(r.avg_time, 2) for r in results]
    distances = [r.avg_distance for r in results]
    methods = [r.method[0:3] for r in results]
    alpha = [r.alpha for r in results]
    m = [r.m for r in results]
    results_df = pd.DataFrame(
        data={
            "logtimes": logtimes,
            "distances": distances,
            "methods": methods,
            "m": m,
            "alpha": alpha
        })
    print results_df
    p = gg.ggplot(data = results_df, aesthetics = gg.aes(x = "logtimes",
                                                         y = "distances",
                                                         label = "methods")) + \
        gg.geom_text() + \
        gg.ggtitle("LSH and KD trees: tradeoffs") + \
        gg.xlab("Log2 average query time  ") + gg.ylab("Average L2 distance from query point)")
    gg.ggsave(filename="results/LSH_vs_KDT_%s.png" % metadata, plot=p)
Example No. 36
def plot_components(df, title, file_loc, experiment_number, dataset_name):
    chart = ggplot(df, aes(x='comp-one', y='comp-two', color='label')) \
            + geom_point(size=75, alpha=0.8) \
            + ggtitle(title)
    chart.save("images/experiment_" + str(experiment_number) + "/" +
               dataset_name + "/" + file_loc + "/" + title + ".png")
Example No. 37
f = '\t{:18s} = {:5.2f}'
print('\nErgodic Means')
print(f.format('Profit Contribution', data['profit'].mean()))
print(f.format('Activity', (data['i'] == 'active').mean()))
print('\nErgodic Standard Deviations\n')
print(f.format('Profit Contribution', data['profit'].std()))
print(f.format('Activity', (data['i'] == 'active').std()))

# Plot Simulated and Expected Continuous State Path
data2 = data[['time', 'profit']].groupby('time').mean()
data2['time'] = data2.index

print(data2)
print(data2.columns)

ppp = ggplot(aes('time','profit'),
             data=data2) + \
      geom_line()

print(ppp)

ppp = ggplot(aes('time','profit','_rep'),
             data=data[data._rep <3]) + \
      geom_point() + \
      geom_line(aes('time','profit'), data=data2)

print(ppp)

print(
    demo.qplot('time', 'profit', '_rep', data=data[data._rep < 3], geom='line')
    + geom_line(aes('time', 'profit'), data=data2))
Example No. 38
import ggplot as gg
import ultrasignup as us
import numpy as np

d = us.event_results(299)

p1 = gg.ggplot(
    gg.aes(x='time_hour',fill='gender'),d[(d.distance=='50K')&(d.time_hour>1.0)]) + \
  gg.facet_grid(x='gender') + \
  gg.geom_bar(stat="bin",binwidth=.5,position="dodge",colour="black") + \
  gg.xlab("Time (hours)") + gg.ylab("Number of Finishers") + \
  gg.ggtitle("50K Finishing Times for All Years")

p2 = gg.ggplot(
    gg.aes(x='time_hour',fill='gender'),d[(d.distance=='11 Miler')&(d.time_hour>1.0)]) + \
  gg.facet_grid(x='gender') + \
  gg.geom_bar(stat="bin",binwidth=.5,position="dodge",colour="black") + \
  gg.xlab("Time (hours)") + gg.ylab("Number of Finishers") + \
  gg.ggtitle("11M Finishing Times for All Years")
Example No. 39
count_vect = CountVectorizer()
kk = count_vect.fit_transform(subjects_train)

analyze = count_vect.build_analyzer()

subjects_words_count = subjects_train.apply(lambda x: len(analyze(x)))

print(subjects_words_count.describe())

#%%
import ggplot as gg

df = pd.DataFrame(subjects_words_count, columns = ["count"])

hist =  gg.ggplot(df, gg.aes(x = "count"))
hist += gg.xlab("# of words") +\
        gg.ylab("Frequency") +\
        gg.ggtitle("Frequency of words")

hist += gg.geom_vline(x = df.mean(), color="red")
hist += gg.geom_vline(x = df.median(), color="blue")
hist += gg.geom_density(color="green")
hist += gg.geom_histogram(binwidth=1, color="grey")

hist

#%%

# 1st attempt to classify subjects per tag
Example No. 40
data = []
for method in methods:
    for model in models:
        for rtol in rtols:
            print('method: {} model: {} rtol: {}'.format(
                method.name, model.name, rtol),
                  end='')

            # Run
            tic = time.time()
            result = method(model, rtol)
            toc = time.time() - tic

            # Compare to gold standard
            standard = gold_standards[model.name]
            diff = result - standard.values
            max_rel_diff = np.max(diff / standard.max)

            # Append to table
            record = (method.name, model.name, rtol, max_rel_diff, toc)
            print(' err: {} toc: {}'.format(max_rel_diff, toc))
            data.append(record)

data = DataFrame(data, columns=['method', 'model', 'rtol', 'err', 'time'])

print(
    gg.ggplot(data, gg.aes(x='err', y='time', color='method')) +
    gg.geom_point(size=60.0) + gg.geom_line() + gg.scale_x_log() +
    gg.scale_y_log() + gg.xlim(1e-10, 1e-2))
def ggplot_img(xt):
    xt = pd.DataFrame({'n': range(len(xt)), 'xt': xt})
    p = gp.ggplot(gp.aes(x='n', y='xt'), data=xt) + gp.geom_line(color='black')
    print(p)
def plot_after_transmission_results(data, path_names):

    # import input data for transmission analysis
    var_and_val = pd.DataFrame(columns=['x', 'Variable'], index=range(0, 12))
    plot_lm = pd.DataFrame(
        columns=['x', 'Life Months', 'Scenario', 'Variable'],
        index=range(0, 24))
    data_in = pd.read_excel(
        os.path.join(path_names['transmission'], 'Input files',
                     'transmission_rate_multiplier_required_inputs.xlsx'))
    col = [
        'Yearly incidence in MSM',
        'Number of HIV uninfected individuals (HRG size)',
        'Number of HIV infected individuals in primary cohort at t=0'
    ]
    col_adj = ['Incidence', 'Uninfected', 'Infected']
    base_val = [0.009, 2960000, 136400]

    for i in range(len(col)):
        idx = data_in.loc[data_in.loc[:, col[i]] != base_val[i],
                          col[i]].index.values[0]
        var_and_val.loc[idx - 1:idx + 3 - 1, 'x'] = data_in.loc[idx:idx + 3,
                                                                col[i]].values
        var_and_val.loc[idx - 1:idx + 3 - 1, 'Variable'] = col_adj[i]

    row_idx = -2
    var_idx = [-1, -1, -1]
    for var in data:

        if 'HIV+' in var:
            var_idx[2] += 1
            plot_lm.loc[row_idx:row_idx + 1, 'x'] = var_and_val.loc[
                var_and_val['Variable'] == col_adj[2], 'x'].values[var_idx[2]]
            plot_lm.loc[row_idx:row_idx + 1, 'Variable'] = var_and_val.loc[
                var_and_val['Variable'] == col_adj[2],
                'Variable'].values[var_idx[2]]
            plot_lm.loc[
                row_idx:row_idx + 1,
                'Life Months'] = data[var]['popstats'].loc[:, 'LMs_'].values
            plot_lm.loc[
                row_idx:row_idx + 1,
                'Scenario'] = data[var]['popstats'].loc[:, 'RUN_NAME_'].values
        elif 'HIV-' in var:
            var_idx[1] += 1
            plot_lm.loc[row_idx:row_idx + 1, 'x'] = var_and_val.loc[
                var_and_val['Variable'] == col_adj[1], 'x'].values[var_idx[1]]
            plot_lm.loc[row_idx:row_idx + 1, 'Variable'] = var_and_val.loc[
                var_and_val['Variable'] == col_adj[1],
                'Variable'].values[var_idx[1]]
            plot_lm.loc[
                row_idx:row_idx + 1,
                'Life Months'] = data[var]['popstats'].loc[:, 'LMs_'].values
            plot_lm.loc[
                row_idx:row_idx + 1,
                'Scenario'] = data[var]['popstats'].loc[:, 'RUN_NAME_'].values
        elif 'Incidence' in var:
            var_idx[0] += 1
            plot_lm.loc[row_idx:row_idx + 1, 'x'] = var_and_val.loc[
                var_and_val['Variable'] == col_adj[0], 'x'].values[var_idx[0]]
            plot_lm.loc[row_idx:row_idx + 1, 'Variable'] = var_and_val.loc[
                var_and_val['Variable'] == col_adj[0],
                'Variable'].values[var_idx[0]]
            plot_lm.loc[
                row_idx:row_idx + 1,
                'Life Months'] = data[var]['popstats'].loc[:, 'LMs_'].values
            plot_lm.loc[
                row_idx:row_idx + 1,
                'Scenario'] = data[var]['popstats'].loc[:, 'RUN_NAME_'].values

        row_idx += 2

    # plot
    save_path = os.path.join(path_names['transmission'], r'Input files',
                             r'Plots for final runs')
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    (ggplot(aes(x='x', y='Life Months', color='Scenario'), plot_lm) +
     geom_line() + facet_wrap('Variable', scales='free')).save(
         os.path.join(save_path, 'Comparison of '))

    return
def plot_transmission_results(tx_results, percentage_decline, save_path,
                              path_names):

    #%% what are inputs?

    # transmission results
    # There'll be a folder called 'Runs prepared for ...'
    # all the folders inside that folder will have a CEPAC results folder.
    # tx_data is a dictionary and will have two keys, 'monthly' and 'popstats'
    # 'monthly' key will only have primary transmissions data
    tx_data = deepcopy(tx_results)
    t = 120
    total_var = 3
    total_val = 4
    # percentage decline
    # this is also dictionary of percentage decline values for each folder
    # having cepac results

    # save_path: exact folder where you want to save your images

    # path_names will have paths to transmissions and sensitivity directories
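    # Hypothetical shape sketch (not from the original source), inferred from how
    # the values are read below:
    #   tx_results = {'Incidence run A': {'monthly': {'RunA...': DataFrame with a
    #                 'transmissions' column, ...}, 'popstats': DataFrame}, ...}
    #   percentage_decline = {'Incidence run A': <percentage decline value>, ...}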

    #%% plot percentage decline

    # generate an environment object first
    # let's go for a line plot
    data_plot = pd.DataFrame(
        columns=['x', 'Percentage decline', 'Transmissions', 'Variable'],
        index=range(0, total_var * total_val))
    data_in = pd.read_excel(
        os.path.join(path_names['transmission'], 'Input files',
                     'transmission_rate_multiplier_required_inputs.xlsx'))
    col = [
        'Incidence rate per 100 PY specific to high-risk group 1',
        'HIV uninfected individuals in high-risk group 1',
        'HIV infected individuals in high-risk group 1'
    ]
    col_adj = ['Incidence', 'Uninfected', 'Infected']
    data_in[col[0]] = data_in[col[0]].round(1)
    base_val = [np.float64(0.9), 2960000, 136400]
    y1_values = {col[0]: [], col[1]: [], col[2]: []}
    for var in percentage_decline:
        if 'HIV+' in var:
            y1_values[col[2]].append(percentage_decline[var])
        elif 'HIV-' in var:
            y1_values[col[1]].append(percentage_decline[var])
        elif 'Incidence' in var:
            y1_values[col[0]].append(percentage_decline[var])

    for i in range(len(col)):
        idx = data_in.loc[data_in.loc[:, col[i]] != base_val[i],
                          col[i]].index.values[0]
        data_plot.loc[idx - 1:idx + 3 - 1, 'x'] = data_in.loc[idx:idx + 3,
                                                              col[i]].values
        data_plot.loc[idx - 1:idx + 3 - 1, 'Variable'] = col_adj[i]
        data_plot.loc[idx - 1:idx + 3 - 1,
                      'Percentage decline'] = y1_values[col[i]]

    # plot
    df_float = data_plot.loc[data_plot.loc[:, 'Percentage decline'] <= 200, :]
    (ggplot(aes(x='x', y='Percentage decline'), df_float) + geom_line() +
     facet_wrap('Variable', scales='free')).save(
         os.path.join(save_path, 'Percentage decline'))
    del df_float

    #%% visualizing transmissions
    # index = range(time * number of values for each variable * number of variables)
    def set_abc(run, var_idx, var_name, var_value_idx):

        # set variable names
        data_plot_tx.loc[(var_idx - 1) * t:((var_idx - 1) * t) + t - 1,
                         'Variable'] = var_name

        # set variable value
        data_plot_tx.loc[(var_idx - 1) * t:((var_idx - 1) * t) + t - 1,
                         'Value'] = data_plot.loc[
                             data_plot.loc[:, 'Variable'] == var_name,
                             'x'].values[var_value_idx]

        if 'RunA' in run:
            data_plot_tx.loc[(var_idx - 1) * t:((var_idx - 1) * t) + t - 1,
                             'RunA tx'] = tx_data[var]['monthly'][run][
                                 'transmissions'].iloc[0:t].values
        elif 'RunB' in run:
            data_plot_tx.loc[(var_idx - 1) * t:((var_idx - 1) * t) + t - 1,
                             'RunB tx'] = tx_data[var]['monthly'][run][
                                 'transmissions'].iloc[0:t].values
        elif 'RunC' in run:
            data_plot_tx.loc[(var_idx - 1) * t:((var_idx - 1) * t) + t - 1,
                             'RunC tx'] = tx_data[var]['monthly'][run][
                                 'transmissions'].iloc[0:t].values

    data_plot_tx = pd.DataFrame(
        index=range(t * total_var * total_val),
        columns=['Variable', 'Value', 'RunA tx', 'RunB tx', 'RunC tx'])
    var_idx = -1
    var_val_idx = [-1, -1, -1]
    for var in tx_data:
        var_idx += 1
        if 'HIV+' in var:
            var_val_idx[2] += 1
            var_name = col_adj[2]
            for run in tx_data[var]['monthly']:
                set_abc(run, var_idx, var_name, var_val_idx[2])
        elif 'HIV-' in var:
            var_val_idx[1] += 1
            var_name = col_adj[1]
            for run in tx_data[var]['monthly']:
                set_abc(run, var_idx, var_name, var_val_idx[1])
        elif 'Incidence' in var:
            var_val_idx[0] += 1
            var_name = col_adj[0]
            for run in tx_data[var]['monthly']:
                set_abc(run, var_idx, var_name, var_val_idx[0])
        else:
            continue

    data_plot_tx['t'] = 0
    t_float = -1
    for row in data_plot_tx.index:
        if t_float == t - 1:
            t_float = -1
        t_float += 1
        data_plot_tx.loc[row, 't'] = t_float

    #%% plots for individual runs
    run_col = ['RunA tx', 'RunB tx', 'RunC tx']
    inci = data_plot_tx.loc[data_plot_tx.loc[:, 'Variable'] == 'Incidence', :]
    inf = data_plot_tx.loc[data_plot_tx.loc[:, 'Variable'] == 'Infected', :]
    uninf = data_plot_tx.loc[data_plot_tx.loc[:,
                                              'Variable'] == 'Uninfected', :]
    for i in run_col:
        (ggplot(aes(x='t', y=i, color='Value'), data_plot_tx) + geom_line() +
         facet_wrap('Variable', scales='free')).save(
             os.path.join(
                 save_path,
                 str(i + r'_transmissions for all variable all values')))
        (ggplot(aes(x='t', y=i), inci) + geom_line() +
         facet_wrap('Variable', 'Value', scales='free')).save(
             os.path.join(
                 save_path,
                 str(i + r'_plots for individual values of incidence')))
        (ggplot(aes(x='t', y=i), inf) + geom_line() +
         facet_wrap('Variable', 'Value', scales='free')).save(
             os.path.join(
                 save_path,
                 str(i +
                     r'_plots for individual values of infected population')))
        (ggplot(aes(x='t', y=i), uninf) + geom_line() +
         facet_wrap('Variable', 'Value', scales='free')).save(
             os.path.join(
                 save_path,
                 str(i +
                     '_plots for individual values of uninfected population')))

    #%% compare runs ABC
    data_plot_abc = {}
    for var in col_adj:
        float_df = pd.DataFrame(index=range(0, t * total_var * total_val),
                                columns=['t', 'Value', 'Transmissions', 'Run'])
        insert_idx = -1
        for val in data_plot.loc[data_plot.loc[:, 'Variable'] == var, 'x']:
            var_df = data_plot_tx.loc[data_plot_tx.loc[:,
                                                       'Variable'] == var, :]
            var_df = var_df.reset_index(drop=True)
            var_val_df = var_df.loc[var_df.loc[:, 'Value'] == val, :]
            var_val_df = var_val_df.reset_index(drop=True)
            for c in ['RunA tx', 'RunB tx', 'RunC tx']:
                insert_idx += 1
                float_df.loc[insert_idx * t:(insert_idx * t) + t - 1,
                             'Run'] = c
                float_df.loc[insert_idx * t:(insert_idx * t) + t - 1,
                             'Transmissions'] = var_val_df.loc[:, c].values
                float_df.loc[insert_idx * t:(insert_idx * t) + t - 1,
                             'Run'] = c
                float_df.loc[insert_idx * t:(insert_idx * t) + t - 1,
                             'Value'] = val
                float_df.loc[insert_idx * t:(insert_idx * t) + t - 1,
                             't'] = np.arange(t)
        data_plot_abc[var] = float_df.dropna()
        (ggplot(aes(x='t', y='Transmissions', color='Run'), float_df) +
         geom_line() + facet_wrap('Value', scales='free') + ggtitle(var)).save(
             os.path.join(
                 save_path,
                 str(var + '_comparison of transmissions in runs ABC')))

    #%% compare runs BC
    for var in data_plot_abc:
        float_df = data_plot_abc[var].loc[
            data_plot_abc[var].loc[:, 'Run'] != 'RunA tx', :]
        (ggplot(aes(x='t', y='Transmissions', color='Run'), float_df) +
         geom_line(alpha=0.2) + facet_wrap('Value', scales='free') +
         stat_smooth(method='loess', se=False) + ggtitle(var)).save(
             os.path.join(save_path,
                          str(var +
                              '_comparison of transmissions in runs BC')))

    return
"""Plot target variable as time series."""

import get_data
from ggplot import aes, geom_line, facet_wrap, ggplot


if __name__ == "__main__":

    df = get_data.get_all_data()

    p = ggplot(df, aes('datetime', 'cap', group='date')) + \
        geom_line(alpha=0.2) + \
        facet_wrap('name')
    p.save('../output/time_series.pdf')
Example No. 45
from plot import plot_trace_points, plot_trace_path
from preprocess import smooth_spline, smooth_regress
from kalman import kalman_filter

import pandas as pd

from ggplot import ggplot, aes, geom_point, geom_line

if __name__ == '__main__':
    df = pd.DataFrame(
        data={
            't': [1, 2, 3, 4, 5],
            'lat': [10, 12, 10, 9, 8],
            'lon': [100, 99, 98, 95, 97]
        })
    df2 = smooth_spline(df, 0.1)
    df3 = smooth_regress(df, 0.1, 4)
    #df4 = smooth_regress(df, 0.1, 3)
    df4 = kalman_filter(df, 0.1)

    p = ggplot(aes(x='lat', y='lon'),
               data=pd.DataFrame(columns=('lat', 'lon'), data={}))

    p += plot_trace_points(df, color='black')
    p += plot_trace_path(df2, color='red')
    p += plot_trace_path(df3, color='green')
    p += plot_trace_path(df4, color='blue')

    p.save('test.png')
                                             (vcfdf['TestBias']=='Pass') &
                                             (vcfdf['CHROM']==reference) ]['Pi']))
    return testwindows



# Generate new dataframe with analyses performed per window
if options.graphics == True:
	print "Analysing by "+ str(windowsize) +"sliding windows and generating plots"
	windowed_df = pd.DataFrame({'window':sorted(list(set(vcfdf['window']))),
        	                   'MaxMinor':windowMax(sorted(list(set(vcfdf['window'])))),
                	           'Pi':windowPi(sorted(list(set(vcfdf['window']))))})


# Now try and plot graph
	p_MaxMinor = gg.ggplot(gg.aes('window', 'MaxMinor'),data=windowed_df) +gg.geom_point() +gg.theme_bw() +gg.labs(x="Genome Position (bp; windowsize="+ str(windowsize) +")", y="Minor Variant Frequency (%)") +gg.ggtitle(vcfoutput + "\n Valid Minor Variant Sites :" + str(len(minorvar))) 


# Plot Nucleotide Diversity (Pi) along genome 
	p_pi =gg.ggplot(gg.aes('window', 'Pi'),data=windowed_df) +gg.geom_point() +gg.theme_bw() +gg.labs(x="Genome Position (bp; windowsize="+ str(windowsize) +")", y="Mean nucleotide diversity (" + u"\u03c0" +")") +gg.scale_y_continuous(expand=(0,0),limits=(0, windowed_df['Pi'].max(axis=0)+0.001)) +gg.ggtitle(vcfoutput + "\n Genome-wide Mean Nucleotide Diversity (" +u"\u03c0"+ ") :" +str(round(gw_Pi,6))) 

#p_pi

# Facetted plot (still not sorted y axes labels yet)
	windowed_df_melt = pd.melt(windowed_df, id_vars=['window'])
	p_combi = gg.ggplot(gg.aes('window', 'value',colour='variable'),data=windowed_df_melt)
	p_combi = p_combi + gg.geom_point(colour='variable') + gg.facet_grid('variable',scales='free_y')+gg.theme_bw() +gg.labs(x="Genome Position (bp; windowsize="+ str(windowsize) +")")

# Print graphs to .png
	p_combi.save(vcfinput + ".MinorVar_combo.png")
	p_MaxMinor.save(vcfinput + ".MinorVar.png")
        (176, 208)).astype(float))
    plt.tight_layout()
plt.show()

# In[206]:

n_sne = 7000
tsne_results = tsne.fit_transform(df.loc[rndperm[:n_sne], feat_cols].values)

# In[207]:

df_tsne = df.loc[rndperm[:n_sne], :].copy()
df_tsne['x-tsne'] = tsne_results[:, 0]
df_tsne['y-tsne'] = tsne_results[:, 1]

view = ggplot.ggplot(df_tsne, aes(
    x='x-tsne', y='y-tsne', color='label')) + geom_point(
        size=70, alpha=0.2) + ggtitle("tSNE dimensions colored by digit")

# In[208]:

view

# In[209]:

# --logdir=/Users/glynisttheisen/Desktop/Final

# # PCA

# In[210]:

m_PCA = PCA(n_components=10)
Example No. 48
        index=range(t * len(count_tops),
                    t * len(count_tops) + len(count_tops)))
    probs_list.append(probs_t)
    # Calculate KL divergences
    kl_mle_list.append(stats.entropy(true_bins_t, mle_probs_vals))
    kl_nn_list.append(stats.entropy(true_bins_t, nn_probs_t))

probs = pd.concat(probs_list)

# In[44]:

probs_tail = probs[probs.Tenor > 360]

gg.ggplot(probs_tail, gg.aes(x='Count Top', weight='Probs True')
          ) + gg.facet_grid('Tenor') + gg.geom_bar() + gg.geom_step(
              gg.aes(y='Probs MLE', color='red')) + gg.geom_step(
                  gg.aes(y='Probs NN', color='blue')) + gg.scale_x_continuous(
                      limits=(0, len(count_tops)))

# In[57]:

# KL divergences

kl_df = pd.DataFrame({
    'Tenor': range(0, t_end + 1),
    'KL MLE': kl_mle_list,
    'KL NN': kl_nn_list
})

print kl_df.head()
print kl_df.tail()
Example No. 49
print("#######################################")
print("打印所挖掘的文本文件 text-movie.xls 前几行")
print(df.head())

#text = df.comments.iloc[0]   # single-review sentiment analysis experiment; the index in iloc selects which review, numbered from 0
#s = SnowNLP(text)
#
#print(s.sentiments)


def get_sentiment_cn(text):
    s = SnowNLP(text)
    return s.sentiments


df["sentiment"] = df.comments.apply(get_sentiment_cn)
print("#######################################")
print("打印所挖掘的文本文件 text-movie.xls 部分影评及其情感分析值")
print(df)

print("#######################################")
print("重要信息")
print("所有影评的平均值为:", df.sentiment.mean())
print("所有影评的中位数为:", df.sentiment.median())

ggplot.ggplot(ggplot.aes(x="date", y="sentiment"),
              data=df) + ggplot.geom_point() + ggplot.geom_line(
                  color='blue') + ggplot.scale_x_date(
                      labels=ggplot.date_format("%Y-%m-%d"))

df.sort_values(['sentiment'])[:5]
#%% PC Regression
lin_reg = LinearRegression()
scores = cross_val_score(lin_reg,
                         X95[:,:10],
                         Y) 
scores.mean()

#%% Partial Least Squares
from sklearn.cross_decomposition import PLSRegression
pls = PLSRegression(n_components=10)
Xpls, Ypls = pls.fit_transform(X,Y)

#%% Visualization with labeling
import ggplot as gg
df1['x1'], df1['x2'] = Xpca[:,0],Xpca[:,1]
chart = gg.ggplot( df1, gg.aes(x='x1', y='x2', color='has_cites') ) \
                  + gg.geom_point(size=10, alpha=.8) 
chart.show()
#%% PLS transformation
df1['x1'], df1['x2'] = Xpls[:,0],Xpls[:,1]
chart = gg.ggplot( df1, gg.aes(x='x1', y='x2', color='has_cites') ) \
                  + gg.geom_point(size=10, alpha=.8) 
chart.show()
#%% Feature Selection with Elastic Net
scaler = StandardScaler()
Xscale = scaler.fit_transform(X)
from sklearn.linear_model import ElasticNet
enet_reg = ElasticNet(alpha=.1, l1_ratio=.5)
enet_reg.fit(Xscale,Y)
nonzero = enet_reg.coef_ != 0
print(nonzero.sum(),'non-zero of',len(enet_reg.coef_),'coefficients.')
Example No. 51
# time_start = time.time()

# tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
# tsne_pca_results = tsne.fit_transform(pca_result)
# np.save('tsne_pca_results.npy', tsne_pca_results)

# print ('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))
tsne_pca_results = np.load('tsne_pca_results.npy')
print(tsne_pca_results.shape)
df_tsne = None
df_tsne = df.loc[:, :].copy()
df_tsne['x-tsne-pca'] = tsne_pca_results[:, 0]
df_tsne['y-tsne-pca'] = tsne_pca_results[:, 1]

df = df[df['pca-one'] <= 100]  # (df['pca-one'].mean() + df['pca-one'].std())
df = df[df['pca-two'] <= 50]  # (df['pca-two'].mean() + df['pca-two'].std())
df = df[df['pca-one'] >= -100]  # (df['pca-one'].mean() - df['pca-one'].std())
df = df[df['pca-two'] >= -50]  # (df['pca-two'].mean() - df['pca-two'].std())
print('Size of the dataframe after outlier removal: {}'.format(df.shape))

pca_chart = ggplot( df.loc[:,:], aes(x='pca-one', y='pca-two', color='label') ) \
        + geom_point(size=75,alpha=0.8) \
        + ggtitle("First and Second Principal Components colored by digit")

pca_chart.save('chart_pca_unet_adv_4.png', dpi=1080)

tsne_chart = ggplot( df_tsne, aes(x='x-tsne-pca', y='y-tsne-pca', color='name') ) \
        + geom_point(size=70,alpha=0.1) \
        + ggtitle("tSNE dimensions colored by Digit (PCA)")
tsne_chart.save('chart_tsne_unet_adv_4.png', dpi=1080)
Example No. 52
from ggplot import aes, diamonds, geom_density, ggplot
import matplotlib.pyplot as plt

from bokeh import mpl
from bokeh.plotting import output_file, show

g = ggplot(diamonds, aes(x='price', color='cut')) + geom_density()
g.draw()

plt.title("Density ggplot-based plot in Bokeh.")

output_file("ggplot_density.html", title="ggplot_density.py example")

show(mpl.to_bokeh())
Example No. 53
# -*- coding: utf-8 -*-
from ggplot import ggplot, aes, geom_point, geom_line, ggtitle, xlab, ylab

data = []  # placeholder -- the ggplot call below expects a pandas DataFrame with 'yearID' and 'HR' columns
xvar = 'X'
yvar = 'Y'

print ggplot(
    data,
    aes(x='yearID', y='HR')) + \
      geom_point(color='red') + \
      geom_line(color='red') + \
      ggtitle('Number of HR by year') + \
      xlab('Year') + \
      ylab('Number of HR')
            where ULEZ = true and no2_ppb <> -999
        )
    )
    group by pod_id_location
""")
qry_job = bqclient.query(qry_str, location='EU', job_config=job_config)
#save result as dataframe
df = qry_job.to_dataframe()
df_long = df.melt(id_vars=['pod_str', 'pod_idx'],
                  value_vars=['p05', 'p25', 'med', 'p75', 'p95'],
                  var_name='yparam',
                  value_name='value')
#plots
#plt1 = gg.ggplot(df, gg.aes(x='date_UTC',y='no2_ppb'))+gg.geom_line()+gg.xlab('Time')+gg.ylab('NO2 (ppb)')+gg.theme_bw()+gg.facet_wrap('pod_id_location',scales='free_y')
#plt1.save(filename = r'.\charts\ulezpodts.png', width=None, height=None, dpi=200)
plt2 = gg.ggplot(df_long, gg.aes(
    x='pod_str', y='value', color='yparam')) + gg.geom_point() + gg.xlab(
        'pod') + gg.ylab('NO2 (as % of median)') + gg.theme_bw() + gg.theme(
            figure_size=(12, 6)) + gg.scale_x_discrete()
plt2.save(filename=r'.\charts\ulezpodvar.png', width=10, height=6, dpi=200)

#repeat for mobile data using segid instead of podid where N = 10 and N = 40
#repeat for stationary data at mobile times
qry_str = ("""
    with cte0 as (
    --all data, ULEZ pods with 6000 hrs
    select date_UTC, a.pod_id_location, no2_ppb
    from AQMesh.NO2_scaled_hightimeres_ppb_20180901_20190630 a
    join AQMesh.NO2_site_metadata_v2_1_20180901_20190630 b
    on a.pod_id_location=b.pod_id_location
    where ULEZ = true and no2_ppb <> -999
    and a.pod_id_location in 
Example No. 55
def graph(y):
    data = pd.DataFrame({'iteration': list(range(len(y))), 'RMSE': y})
    p = gg.ggplot(gg.aes(x='iteration', y='RMSE'),
                  data=data) + gg.geom_point() + gg.geom_line()
    return p
slope = 0.3
x = randn(num) * 50. + 150.0 
y = randn(num) * 5 + x * slope
plt.scatter(x, y, c='b')


# In[72]:

# plt.scatter(x[(y < 1) & (y > -1)], y[(y < 1) & (y > -1)], c='r')
# np.argsort, np.sort, complicated index slicing
dframe = pd.DataFrame({'x': x, 'y': y})
g = sns.jointplot('x', 'y', data=dframe, kind="reg")


# ## Grab Python version of ggplot http://ggplot.yhathq.com/

# In[73]:

from ggplot import ggplot, aes, geom_line, stat_smooth, geom_dotplot, geom_point


# In[74]:

ggplot(aes(x='x', y='y'), data=dframe) + geom_point() + stat_smooth(colour='blue', span=0.2)


# In[ ]:



Example No. 57
from ggplot import aes, geom_line, ggplot, meat
import matplotlib.pyplot as plt

from bokeh import mpl
from bokeh.plotting import output_file, show

g = ggplot(aes(x='date', y='beef'), data=meat) + geom_line()
g.make()

plt.title("Line ggplot-based plot in Bokeh.")

output_file("ggplot_line.html", title="ggplot_line.py example")

show(mpl.to_bokeh())
Example No. 58
    # create a new long-form dataframe for clean plotting purposes
    values_dict = {
        "significant": coefficients[feature]["significant"],
        "insignificant": coefficients[feature]["unsignificant"]
    }
    df = pd.DataFrame.from_dict(values_dict, orient='index')
    df = df.transpose()
    df = pd.melt(df)
    df['feature'] = feature
    dfs_to_concat.append(df)

master_df = pd.concat(dfs_to_concat)

# histogram
p = ggplot(aes(x='value', fill='variable', color='variable'), data=master_df)
p += geom_histogram(bins=25, alpha=0.5)
p += scale_x_continuous(limits=(-25, 25))
p += ggtitle("sarimax coefficient magnitude distribution")
p += facet_wrap("feature", ncol=3, scales="free")
p += labs(x=" ", y=" ")

# visuals
t = theme_gray()
t._rcParams['font.size'] = 10
t._rcParams['font.family'] = 'monospace'

p += t
p.save("arima_1/" + "histogram.png")

# boxplot
Example No. 59
min(vehicles.year)
max(vehicles.year)
pd.value_counts(vehicles.fuelType1)
pd.value_counts(vehicles.trany)
vehicles["trany2"] = vehicles.trany.str[0]
pd.value_counts(vehicles.trany2)

#%% step 1 ~ 4 on Page 202
from ggplot import ggplot, aes, geom_point, xlab, ylab, ggtitle

grouped = vehicles.groupby("year")
averaged = grouped['comb08', 'highway08', 'city08'].agg([np.mean])
averaged.columns = ['comb08_mean', 'highway08_mean', 'city08_mean']
averaged['year'] = averaged.index

print(ggplot(averaged, aes('year', 'comb08_mean')) +
      geom_point(color='steelblue') +
      xlab('Year') +
      ylab('Average MPG') +
      ggtitle('All cars'))

#%% step 5
criteria1 = vehicles.fuelType1.isin(['Regular Gasoline', 'Premium Gasoline', 'Midgrade Gasoline'])
criteria2 = vehicles.fuelType2.isnull()
criteria3 = vehicles.atvType != 'Hybrid'
vehicles_non_hybrid = vehicles[criteria1 & criteria2 & criteria3]
len(vehicles_non_hybrid)

#%% step 6
grouped = vehicles_non_hybrid.groupby(['year'])
averaged = grouped['comb08'].agg([np.mean])
Example No. 60
soup = BeautifulSoup(data.text,'html.parser')
weather_observations = soup.find('table',{'summary': "Daily Weather Observations for Brisbane, Queensland for November 2018"})
tbody = weather_observations.find('tbody')

daily_min = []
daily_max = []
for tr in tbody.find_all('tr'):
    daily_min.append(tofloat(tr.find_all('td')[1].text.strip()))
    daily_max.append(tofloat(tr.find_all('td')[2].text.strip()))

# data = [[a,b] for a,b in zip(daily_min,daily_max)]

# convert from list to DataFrame
daily_temperature = pd.DataFrame(data=[[a,b,c] for a,b,c in zip(range(1,len(daily_min)+1),daily_min,daily_max)], columns=['day','daily_min','daily_max'])  # underscore column names so gg.aes below can reference them

# print(daily_temperature)

# making plots
myplot = gg.ggplot(gg.aes(x='day',y='daily_max'), data=daily_temperature) +\
    gg.geom_point()

# different way of making data frame and plots
labels = ['daily_min' for a in range(len(daily_min))] + ['daily_max' for a in range(len(daily_max))]
weather_data = pd.DataFrame(data=[[a,b,c] for a,b,c in zip(itertools.chain(range(1,len(daily_min)+1),range(1,len(daily_max)+1)),daily_min+daily_max,labels)],columns = ['day','temp','min-max'])

print(weather_data)
myplot = gg.ggplot(gg.aes(x='day',y='temp',color='min-max'), data=weather_data) +\
    gg.geom_point()

myplot.show()