def main(file_path):
    """Load a pickled conversation dump and plot a per-participant message histogram.

    file_path -- path to a ``.pkl`` file containing a dict with keys
    'conversation_name' and 'messages'.
    """
    # Guard clauses: the raw-data file must exist and must be a pickle.
    if not os.path.exists(file_path):
        LOG_ERROR('Could not find file: {}'.format(file_path))
        return
    if not file_path.endswith('.pkl'):
        LOG_ERROR('File path must be a pickle file')
        return

    with open(file_path, 'rb') as f:
        LOG_INFO('Parsing pickle file: {}'.format(file_path))
        conversation = pickle.load(f)

    LOG_INFO('Found conversation: {}'.format(conversation['conversation_name']))

    df = pd.DataFrame(conversation['messages'])
    df.columns = ['Timestamp', 'Type', 'Participant']
    # Convert epoch-seconds timestamps to ordinal day numbers for the date axis.
    df['Datetime'] = df['Timestamp'].apply(
        lambda ts: datetime.datetime.fromtimestamp(float(ts)).toordinal())

    histogram = (ggplot.ggplot(df, ggplot.aes(x='Datetime', fill='Participant'))
                 + ggplot.geom_histogram(alpha=0.6, binwidth=2)
                 + ggplot.scale_x_date(labels='%b %Y')
                 + ggplot.ggtitle(conversation['conversation_name'])
                 + ggplot.ylab('Number of messages')
                 + ggplot.xlab('Date'))
    print(histogram)
def plot_deg_distrib(G):
    """Plot overlaid histograms of the in- and out-degree distributions of G.

    G -- graph object accepted by ``wa.degree_distribution``.
    """
    (in_deg, out_deg, deg) = wa.degree_distribution(G)
    in_deg_series = pd.Series(in_deg)
    out_deg_series = pd.Series(out_deg)
    in_out = {'in_deg': in_deg_series, 'out_deg': out_deg_series}
    df = pd.DataFrame(in_out)
    # Long form: one ('variable', 'value') row per degree observation, so the
    # two distributions can share one aesthetic mapping.
    df = pd.melt(df)
    # BUG FIX: the original passed data=df2, an undefined name (NameError);
    # the melted frame df is clearly what was intended.
    p = gg.ggplot(gg.aes(x='value', color='variable', fill='variable'), data=df) \
        + gg.geom_histogram(alpha=0.6, binwidth=1)
    print(p)
def histogram(self, dataframe, bins=100, width=None, height=None, palette=None,
              title='Histogram', values=None, groups=None, legend=True):
    """Build a position-filled (proportional) histogram of *values*, colored by *groups*."""
    # Fall back to the instance-wide default palette when none was supplied.
    if palette is None:
        palette = self.__default_options__.get('palette', None)
    plot = ggplot(dataframe, aes(x=values, fill=groups, color=groups))
    plot += geom_histogram(alpha=0.6, breaks=bins, position="fill")
    plot += self._palette(palette)
    plot += ggtitle(title)
    plot += scale_y_continuous(name="Count (%s)" % values)
    return plot
def plot_bin_dists(df, bin_def="distance_bin <= 500"):
    """Faceted histogram of R2 for the rows of *df* matching the query *bin_def*.

    df      -- DataFrame with 'R2' and 'distance_bin' columns.
    bin_def -- pandas query string selecting which distance bins to plot;
               also used as the plot title.
    """
    # Enlarge the default figure for the facet grid.
    plt.rcParams['figure.figsize'] = np.array([16, 12]) * 0.65
    selected = df.query(bin_def)
    plot = (gp.ggplot(gp.aes(x='R2'), data=selected)
            + gp.geom_histogram(fill='coral')
            + gp.facet_wrap("distance_bin")
            + gp.theme_seaborn(context='talk')
            + gp.ggtitle(bin_def))
    return plot
def plot_deg_distrib(G):
    """Plot overlaid histograms of the in- and out-degree distributions of G.

    G -- graph object accepted by ``wa.degree_distribution``.
    """
    (in_deg, out_deg, deg) = wa.degree_distribution(G)
    in_deg_series = pd.Series(in_deg)
    out_deg_series = pd.Series(out_deg)
    in_out = {'in_deg': in_deg_series, 'out_deg': out_deg_series}
    df = pd.DataFrame(in_out)
    # Melt to long form so both distributions map onto one x aesthetic.
    df = pd.melt(df)
    # BUG FIX: the original passed data=df2, an undefined name (NameError);
    # the melted frame df is what was intended.
    p = gg.ggplot(gg.aes(x='value', color='variable', fill='variable'), data=df) \
        + gg.geom_histogram(alpha=0.6, binwidth=1)
    print(p)
def hist_chart(self, conn, column, table_chosen, title):
    """Select one column from *table_chosen* via *conn* and print a histogram of it.

    conn         -- open database connection passed through to dfile.single_selector.
    column       -- column name to select; also the histogram's x aesthetic.
    table_chosen -- name of the table to query.
    title        -- title for the plot.
    """
    # Fetch the requested column as a DataFrame.
    data_df = dfile.single_selector(conn=conn, table=table_chosen, column=column)
    hist_plot = ggplot(aes(x=column), data=data_df) + geom_histogram() + theme_gray() + labs(title=title)
    # Ad-hoc timing printout around the query/plot construction.
    now = datetime.datetime.now()
    b = now
    print(b)
    # NOTE(review): `a` is not defined anywhere in this method — presumably a
    # module-level start timestamp set elsewhere; verify, otherwise this line
    # raises NameError.
    print(b - a)
    print(hist_plot)
def render(data, bin_width, plot_density=False):
    """Print a ggplot timeline of messages grouped by conversation partner.

    data         -- frame with 'datetime' and 'conversationWithName' columns.
    bin_width    -- histogram bin width (ignored when plot_density is True).
    plot_density -- when True draw per-conversation density curves instead of
                    a stacked histogram.
    """
    if plot_density:
        plot = (ggplot.ggplot(data, ggplot.aes(x='datetime', color='conversationWithName'))
                + ggplot.geom_density()
                + ggplot.scale_x_date(labels='%b %Y')
                + ggplot.ggtitle('Conversation Densities')
                + ggplot.ylab('Density')
                + ggplot.xlab('Date'))
    else:
        plot = (ggplot.ggplot(data, ggplot.aes(x='datetime', fill='conversationWithName'))
                + ggplot.geom_histogram(alpha=0.6, binwidth=bin_width)
                + ggplot.scale_x_date(labels='%b %Y', breaks='6 months')
                + ggplot.ggtitle('Message Breakdown')
                + ggplot.ylab('Number of Messages')
                + ggplot.xlab('Date'))
    print(plot)
def histogram(self, dataframe, bins=100, width=None, height=None, palette=None,
              title='Histogram', values=None, groups=None, legend=True):
    """Return a proportional (position="fill") histogram of *values* split by *groups*."""
    # Resolve the palette: explicit argument wins, otherwise the class default.
    chosen_palette = (self.__default_options__.get('palette', None)
                      if palette is None else palette)
    base = ggplot(dataframe, aes(x=values, fill=groups, color=groups))
    layers = (geom_histogram(alpha=0.6, breaks=bins, position="fill")
              + self._palette(chosen_palette)
              + ggtitle(title)
              + scale_y_continuous(name="Count (%s)" % values))
    return base + layers
import pandas as pd

# Demo of the (yhat) ggplot port using its bundled `meat` sample data set.
meat = gp.meat

# Scatter plot
p=gp.ggplot(gp.aes(x='date',y='beef'),data=meat)+gp.geom_point(color='red')+gp.ggtitle(u'散点图')
print (p)
# Line chart
p=gp.ggplot(gp.aes(x='date',y='beef'),data=meat)+gp.geom_line(color='blue')+gp.ggtitle(u'折线图')
print (p)
# Scatter + line combined
p=gp.ggplot(gp.aes(x='date',y='beef'),data=meat)+gp.geom_point(color='red')+gp.geom_line(color='blue')+gp.ggtitle(u'散点图+折线图')
print (p)

# Melt the variables of interest into a single column.
meat_lng = pd.melt(meat[['date','beef','pork','broilers']],id_vars='date')
# meat_lng now holds: date, value (the variable values), variable (the variable names).
p = gp.ggplot(gp.aes(x='date',y='value',colour='variable'),data=meat_lng)+\
    gp.geom_point()+gp.geom_line()
print (p)

# One facet per melted variable.
meat_lng = pd.melt(meat[['date','beef','pork','broilers']],id_vars='date')
p = gp.ggplot(gp.aes(x='date',y='value',colour='variable'),data=meat_lng)+gp.geom_point()+gp.facet_wrap('variable')
print (p)

# Histogram of a single variable.
p = gp.ggplot(gp.aes(x='beef'),data=meat)+gp.geom_histogram()
print (p)

# Faceted histograms of the melted variables.
meat_lng = pd.melt(meat[['date','beef','pork']],id_vars='date')
p = gp.ggplot(gp.aes(x='value'),data=meat_lng)+gp.facet_wrap('variable')+gp.geom_histogram()
print (p)
def plot_weather_data(turnstile_weather):
    '''
    You are passed in a dataframe called turnstile_weather.
    Use turnstile_weather along with ggplot to make a data visualization
    focused on the MTA and weather data we used in assignment #3.
    You should feel free to implement something that we discussed in class
    (e.g., scatterplots, line plots, or histograms) or attempt to implement
    something more advanced if you'd like.

    Here are some suggestions for things to investigate and illustrate:
     * Ridership by time of day or day of week
     * How ridership varies based on Subway station
     * Which stations have more exits or entries at different times of day

    If you'd like to learn more about ggplot and its capabilities, take
    a look at the documentation at:
    https://pypi.python.org/pypi/ggplot/

    You can check out:
    https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv
    To see all the columns and data points included in the turnstile_weather
    dataframe.

    However, due to the limitation of our Amazon EC2 server, we are giving you
    about 1/3 of the actual data in the turnstile_weather dataframe
    '''
    # NOTE(review): aes/ggtitle/xlab/ylab/scale_x_discrete are used unqualified
    # while ggplot itself is referenced via `gg.` — this relies on a star import
    # elsewhere in the file; verify it exists.

    #Ridership by day of week - Option 1 (Entries by Day of Week)
    #pd.options.mode.chained_assignment = None  # default='warn'
    #turnstile_weather['weekday'] = pd.to_datetime(turnstile_weather['DATEn']).apply(pd.datetime.weekday)
    #plot = gg.ggplot(turnstile_weather, aes('weekday','ENTRIESn_hourly')) + ggtitle('Entries by Day of Week') + xlab('Day of Week') + ylab('Number of Entries') +gg.geom_histogram(stat = "bar", position = "stack")+ scale_x_discrete(limits=(-1, 7), breaks=range(0, 7, 1), labels=["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"])

    #Ridership by day of week - Option 2 (Avg number of Entries by Day of Week)
    # Silence SettingWithCopyWarning for the column assignment below.
    pd.options.mode.chained_assignment = None  # default='warn'
    # Derive weekday index (0=Monday) from the DATEn string column.
    # NOTE(review): pd.datetime is deprecated/removed in modern pandas —
    # datetime.datetime.weekday is the underlying method; confirm the pandas
    # version this targets.
    turnstile_weather['weekday'] = pd.to_datetime(turnstile_weather['DATEn']).apply(pd.datetime.weekday)
    # Mean hourly entries per weekday.
    averageentries_on_weekday = turnstile_weather.groupby('weekday', as_index=False).ENTRIESn_hourly.mean()
    averageentries_on_weekday.rename(columns={'ENTRIESn_hourly': 'avg_ENTRIESn_hourly'}, inplace=True)
    plot = gg.ggplot(averageentries_on_weekday, aes('weekday', 'avg_ENTRIESn_hourly')) + ggtitle('Avg number of Entries by Day of Week') + xlab('Day of Week') + ylab('avg number of Entries') + gg.geom_histogram(stat = "bar", position = "stack") + scale_x_discrete(limits=(-1, 7), breaks=range(0, 7, 1), labels=["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"])

    #Ridership by Unit(Station) - Option 3 (Entries by UNIT)
    #pd.options.mode.chained_assignment = None  # default='warn'
    #plot = gg.ggplot(turnstile_weather, aes('UNIT','ENTRIESn_hourly')) + ggtitle('Entries by UNIT') + xlab('UNIT') + ylab('Number of Entries') +gg.geom_histogram(stat = "bar", position = "stack") + scale_x_discrete(limits=(-1, 100), breaks=range(0, 100, 1))

    #Ridership by day of week - Option 4 (Avg number of Entries by UNIT)
    #pd.options.mode.chained_assignment = None  # default='warn'
    #averageentries_unit = turnstile_weather.groupby('UNIT', as_index=False).ENTRIESn_hourly.mean()
    #averageentries_unit.rename(columns={'ENTRIESn_hourly': 'avg_ENTRIESn_hourly'}, inplace=True)
    #plot = gg.ggplot(averageentries_unit, aes('UNIT','avg_ENTRIESn_hourly')) + ggtitle('Avg number of Entries by UNIT') + xlab('UNIT') + ylab('avg number of Entries') + gg.geom_histogram(stat = "bar", position = "stack") + scale_x_discrete(limits=(-1, 50), breaks=range(0, 50, 1))

    return plot
# create a new long-form dataframe for clean plotting purposes values_dict = { "significant": coefficients[feature]["significant"], "insignificant": coefficients[feature]["unsignificant"] } df = pd.DataFrame.from_dict(values_dict, orient='index') df = df.transpose() df = pd.melt(df) df['feature'] = feature dfs_to_concat.append(df) master_df = pd.concat(dfs_to_concat) # histogram p = ggplot(aes(x='value', fill='variable', color='variable'), data=master_df) p += geom_histogram(bins=25, alpha=0.5) p += scale_x_continuous(limits=(-25, 25)) p += ggtitle("sarimax coefficient magnitude distribution") p += facet_wrap("feature", ncol=3, scales="free") p += labs(x=" ", y=" ") # visuals t = theme_gray() t._rcParams['font.size'] = 10 t._rcParams['font.family'] = 'monospace' p += t p.save("arima_1/" + "histogram.png") # boxplot p = ggplot(aes(x='variable', y='value'), data=master_df)
print(subjects_words_count.describe()) #%% import ggplot as gg df = pd.DataFrame(subjects_words_count, columns = ["count"]) hist = gg.ggplot(df, gg.aes(x = "count")) hist += gg.xlab("# of words") +\ gg.ylab("Frequency") +\ gg.ggtitle("Frequency of words") hist += gg.geom_vline(x = df.mean(), color="red") hist += gg.geom_vline(x = df.median(), color="blue") hist += gg.geom_density(color="green") hist += gg.geom_histogram(binwidth=1, color="grey") hist #%% # 1st attemtp to classify subjects per tag X_raw_train = subjects_train X_raw_test = subjects_test Y_train = raw_data_train.target Y_test = raw_data_test.target target_names = raw_data_train.target_names def get_target_name(index):
def test_ggtitle(self):
    """Adding ggtitle should set the plot's ``title`` attribute."""
    plot = gg.ggplot(gg.aes(x='mpg'), gg.mtcars)
    plot = plot + gg.geom_histogram() + gg.ggtitle("TEST")
    self.assertEqual(plot.title, "TEST")
def test_ylab(self):
    """Adding ylab should set the plot's ``ylab`` attribute."""
    plot = gg.ggplot(gg.aes(x='mpg'), gg.mtcars)
    plot = plot + gg.geom_histogram() + gg.ylab("TEST")
    self.assertEqual(plot.ylab, "TEST")
# 升级pip, 以免安装.whl失败。注意 .whl文件名不能修改,不要使用迅雷下载 # pip install --upgrade setuptools # 安装numpy,scipy,windows下需要编译,可以在http://www.lfd.uci.edu/~gohlke/pythonlibs/ 下载编译包.whl安装。 # pip install .whl # windows下需要安装VC++ 14.0,http://landinghub.visualstudio.com/visual-cpp-build-tools ,在该网站下载 Visual C++ Build Tools 2015 # 安装ggplot # pip install -i https://pypi.tuna.tsinghua.edu.cn/simple ggplot # 绘制散点图 import ggplot as gp meat = gp.meat # 使用ggplot自带的测试数据 p = gp.ggplot( gp.aes( x='date', # 指定x轴数据 y='beef', # 指定y轴数据 color='beef'), # 指定填充颜色 data=meat) # 指定数据集 p + gp.geom_line() # 绘制折线图 p + gp.geom_point() # 绘制散点图 # 绘制分面图 gp.ggplot(gp.aes(x='carat', y='price', color='color'), data=gp.diamonds) + gp.geom_point() + gp.facet_wrap('cut') # 绘制直方图 gp.ggplot(gp.aes(x='price'), data=gp.diamonds) + gp.geom_histogram()
# Summary statistics of the target variable.
print("Mean of DI: " + str(df_pre.DI.mean()))
print("MAD DI-DS: " + str((df_pre["DI"] - df_pre["DS"]).mean()))

# #### Get to know the data set

# In[11]:

# import additional dependencies for plotting
from ggplot import geom_histogram, geom_density
from ggplot import *

# In[12]:

# Distribution of target variable
ggplot(aes(x='DI', ), data=df_pre) + geom_histogram(
    binwidth=2, alpha=0.6, fill="#008080",
    color="#20b2aa") + xlab("DI") + ggtitle("Distribution of DI")

# # 2. Outlier detection and handling

# In[13]:

# possible negative values in distribution
# check for negative values of DI
df_pre[df_pre["DI"] < 0]

# In[14]:

# duration of transportation cannot be negative
# delete negative occurrences of DI
# BUG FIX: the original wrote `df_pre[~df_pre["DI"] < 0]`, which parses as
# `(~df_pre["DI"]) < 0` because unary ~ binds tighter than <. That raises
# TypeError for a float column and only accidentally matched the intent for
# ints (~x < 0  <=>  x >= 0). Parenthesize to express the intended filter.
df_pre = df_pre[~(df_pre["DI"] < 0)]
        tile(w_from_figure_wh_ratio, norm(data)),
        '%s-layer-acts-%s-%s-(i=%s)' % (img_desc, layer, show_tuple_tight(data.shape), batch_i),
    )

# NOTE(review): fragment — the call whose trailing arguments appear above is
# cut off at the start of this excerpt.
# NOTE(review): `lambda (layer, acts): ...` is Python 2 tuple-parameter syntax
# (a SyntaxError on Python 3), and filter() returning a list is also a Python 2
# behavior relied on below.
# Split net blobs into conv-like (4-D activations) and fully-connected (other).
conv_layers = filter(lambda (layer, acts): len(acts.data.shape) == 4, net.blobs.items())
fc_layers = filter(lambda (layer, acts): len(acts.data.shape) != 4, net.blobs.items())

# Plot conv acts
for layer, acts in conv_layers:
    plot_conv_acts(layer, acts)

# Plot fc acts
# One long frame: per-layer activation vectors for sample `batch_i`, with the
# unit index preserved via reset_index().
df = pd.concat([
    pd.DataFrame({'act': acts.data[batch_i], 'layer': layer}).reset_index()
    for layer, acts in fc_layers
])
# Scatter of activation vs. unit index, one facet per layer.
plot_gg(gg_layer(
    gg.ggplot(df, gg.aes(y='act', x='index')),
    gg.geom_point(alpha=.5),
    gg.facet_wrap(x='layer', scales='free'),
    gg.ggtitle('%s layer acts fc/prob points (i=%s)' % (img_desc, batch_i)),
))
# Histogram of activations on a log-count axis, one facet per layer.
plot_gg(gg_layer(
    gg.ggplot(df, gg.aes(x='act')),
    gg.geom_histogram(bins=25, size=0),
    gg.facet_wrap(x='layer', scales='free'),
    gg.scale_y_log(),
    gg.ylim(low=0.1),
    gg.ggtitle('%s layer acts fc/prob histo (i=%s)' % (img_desc, batch_i)),
))