def test_axis_shared(self):
    """Check which axes grouped histograms share for each sharex/sharey combo.

    Regression test for GH4089: ``DataFrame.hist(by=...)`` is called with
    ``sharex`` only, ``sharey`` only, and both, and the private matplotlib
    groupers ``_shared_x_axes`` / ``_shared_y_axes`` are inspected on the
    two returned axes.
    """
    # GH4089
    import matplotlib.pyplot as plt

    n = 100
    df = DataFrame(
        {
            "gender": np.array(["Male", "Female"])[random.randint(2, size=n)],
            "height": random.normal(66, 4, size=n),
            "weight": random.normal(161, 32, size=n),
        }
    )

    # share x only
    ax1, ax2 = df.hist(column="height", by=df.gender, sharex=True)
    self.assertTrue(ax1._shared_x_axes.joined(ax1, ax2))
    self.assertFalse(ax1._shared_y_axes.joined(ax1, ax2))
    self.assertTrue(ax2._shared_x_axes.joined(ax1, ax2))
    self.assertFalse(ax2._shared_y_axes.joined(ax1, ax2))
    plt.close("all")

    # share y only
    ax1, ax2 = df.hist(column="height", by=df.gender, sharey=True)
    self.assertFalse(ax1._shared_x_axes.joined(ax1, ax2))
    self.assertTrue(ax1._shared_y_axes.joined(ax1, ax2))
    self.assertFalse(ax2._shared_x_axes.joined(ax1, ax2))
    self.assertTrue(ax2._shared_y_axes.joined(ax1, ax2))
    plt.close("all")

    # share both x and y
    ax1, ax2 = df.hist(column="height", by=df.gender, sharex=True, sharey=True)
    self.assertTrue(ax1._shared_x_axes.joined(ax1, ax2))
    self.assertTrue(ax1._shared_y_axes.joined(ax1, ax2))
    self.assertTrue(ax2._shared_x_axes.joined(ax1, ax2))
    self.assertTrue(ax2._shared_y_axes.joined(ax1, ax2))
def test_axis_shared(self):
    """Check which axes grouped histograms share for each sharex/sharey combo.

    Regression test for GH4089: ``DataFrame.hist(by=...)`` is called with
    ``sharex`` only, ``sharey`` only, and both, and the private matplotlib
    groupers ``_shared_x_axes`` / ``_shared_y_axes`` are inspected on the
    two returned axes.
    """
    # GH4089
    import matplotlib.pyplot as plt

    n = 100
    df = DataFrame({'gender': np.array(['Male', 'Female'])[random.randint(2, size=n)],
                    'height': random.normal(66, 4, size=n),
                    'weight': random.normal(161, 32, size=n)})

    # share x only
    ax1, ax2 = df.hist(column='height', by=df.gender, sharex=True)
    self.assertTrue(ax1._shared_x_axes.joined(ax1, ax2))
    self.assertFalse(ax1._shared_y_axes.joined(ax1, ax2))
    self.assertTrue(ax2._shared_x_axes.joined(ax1, ax2))
    self.assertFalse(ax2._shared_y_axes.joined(ax1, ax2))
    plt.close('all')

    # share y only
    ax1, ax2 = df.hist(column='height', by=df.gender, sharey=True)
    self.assertFalse(ax1._shared_x_axes.joined(ax1, ax2))
    self.assertTrue(ax1._shared_y_axes.joined(ax1, ax2))
    self.assertFalse(ax2._shared_x_axes.joined(ax1, ax2))
    self.assertTrue(ax2._shared_y_axes.joined(ax1, ax2))
    plt.close('all')

    # share both x and y
    ax1, ax2 = df.hist(column='height', by=df.gender, sharex=True, sharey=True)
    self.assertTrue(ax1._shared_x_axes.joined(ax1, ax2))
    self.assertTrue(ax1._shared_y_axes.joined(ax1, ax2))
    self.assertTrue(ax2._shared_x_axes.joined(ax1, ax2))
    self.assertTrue(ax2._shared_y_axes.joined(ax1, ax2))
def test_grouped_hist_layout(self):
    """Grouped ``DataFrame.hist`` rejects bad layouts and shapes good ones."""
    import matplotlib.pyplot as plt

    n = 100
    frame = DataFrame({
        'gender': np.array(['Male', 'Female'])[random.randint(2, size=n)],
        'height': random.normal(66, 4, size=n),
        'weight': random.normal(161, 32, size=n),
        'category': random.randint(4, size=n),
    })

    # (column, grouping key, layout) combinations that must raise ValueError
    rejected = (
        ('weight', 'gender', (1, 1)),
        ('weight', 'gender', (1,)),
        ('height', 'category', (1, 3)),
        ('height', 'category', (2, 1)),
    )
    for column, key, layout in rejected:
        with self.assertRaises(ValueError):
            frame.hist(column=column, by=frame[key], layout=layout)

    axes = frame.hist(column='height', by=frame.gender, layout=(2, 1))
    self.assertEqual(axes.shape, (2,))
    plt.close('all')

    axes = frame.hist(column='height', by=frame.category, layout=(4, 1))
    self.assertEqual(axes.shape, (4,))
    plt.close('all')

    axes = frame.hist(column='height', by=frame.category, layout=(4, 2))
    self.assertEqual(axes.shape, (4, 2))
def test_hist(self):
    """Smoke-test ``DataFrame.hist`` / ``Series.hist`` and their options.

    Covers grid layout for different column counts, shared axes, tick
    label size/rotation, pass-through of matplotlib ``hist`` keyword
    arguments, and propagation of matplotlib's AttributeError for an
    unknown keyword.
    """
    import matplotlib.pyplot as plt

    df = DataFrame(np.random.randn(100, 4))
    _check_plot_works(df.hist)
    _check_plot_works(df.hist, grid=False)

    # make sure layout is handled
    df = DataFrame(np.random.randn(100, 3))
    _check_plot_works(df.hist)
    axes = df.hist(grid=False)
    self.assertFalse(axes[1, 1].get_visible())

    df = DataFrame(np.random.randn(100, 1))
    _check_plot_works(df.hist)

    # make sure layout is handled
    df = DataFrame(np.random.randn(100, 6))
    _check_plot_works(df.hist)

    # make sure sharex, sharey is handled
    _check_plot_works(df.hist, sharex=True, sharey=True)

    # make sure xlabelsize and xrot are handled
    ser = df[0]
    xf, yf = 20, 20
    xrot, yrot = 30, 30
    ax = ser.hist(xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot)
    ytick = ax.get_yticklabels()[0]
    xtick = ax.get_xticklabels()[0]
    self.assertAlmostEqual(ytick.get_fontsize(), yf)
    self.assertAlmostEqual(ytick.get_rotation(), yrot)
    self.assertAlmostEqual(xtick.get_fontsize(), xf)
    self.assertAlmostEqual(xtick.get_rotation(), xrot)

    axes = df.hist(xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot)
    for i, ax in enumerate(axes.ravel()):
        # only the first len(df.columns) axes hold a histogram
        if i < len(df.columns):
            ytick = ax.get_yticklabels()[0]
            xtick = ax.get_xticklabels()[0]
            self.assertAlmostEqual(ytick.get_fontsize(), yf)
            self.assertAlmostEqual(ytick.get_rotation(), yrot)
            self.assertAlmostEqual(xtick.get_fontsize(), xf)
            self.assertAlmostEqual(xtick.get_rotation(), xrot)

    plt.close('all')
    # make sure kwargs to hist are handled
    ax = ser.hist(normed=True, cumulative=True, bins=4)
    # height of last bin (index 5) must be 1.0
    self.assertAlmostEqual(ax.get_children()[5].get_height(), 1.0)

    plt.close('all')
    ax = ser.hist(log=True)
    # scale of y must be 'log'
    self.assertEqual(ax.get_yscale(), 'log')

    plt.close('all')
    # propagate attr exception from matplotlib.Axes.hist
    self.assertRaises(AttributeError, ser.hist, foo='bar')
def test_hist_layout(self):
    """``DataFrame.hist(layout=...)`` returns an axes grid of the given size."""
    import matplotlib.pyplot as plt

    frame = DataFrame(randn(100, 4))

    # (requested layout, expected (rows, columns)); None falls back to 2x2
    cases = (
        (None, (2, 2)),
        ((2, 2), (2, 2)),
        ((4, 1), (4, 1)),
        ((1, 4), (1, 4)),
        ((3, 3), (3, 3)),
    )
    for layout, (n_rows, n_cols) in cases:
        grid = frame.hist(layout=layout)
        self.assertEqual(len(grid), n_rows)
        self.assertEqual(len(grid[0]), n_cols)

    # (1, 1) is too small for all 4 plots; (1,) is not a valid layout format
    for bad_layout in ((1, 1), (1,)):
        with tm.assertRaises(ValueError):
            frame.hist(layout=bad_layout)
def test_grouped_hist_legacy(self):
    """Exercise ``grouped_hist`` and ``DataFrame.hist(by=...)`` end to end."""
    from matplotlib.patches import Rectangle

    frame = DataFrame(randn(500, 2), columns=['A', 'B'])
    frame['C'] = np.random.randint(0, 4, 500)
    frame['D'] = ['X'] * 500

    grid = grouped_hist(frame.A, by=frame.C)
    self._check_axes_shape(grid, axes_num=4, layout=(2, 2))
    tm.close()

    grid = frame.hist(by=frame.C)
    self._check_axes_shape(grid, axes_num=4, layout=(2, 2))
    tm.close()

    # grouping key with a single distinct value
    grid = frame.hist(by='D', rot=30)
    self._check_axes_shape(grid, axes_num=1, layout=(1, 1))
    self._check_ticks_props(grid, xrot=30)
    tm.close()

    # keyword arguments must be forwarded to matplotlib's hist
    tick_props = dict(xlabelsize=20, xrot=30, ylabelsize=18, yrot=40)
    norm_kwargs = {"density": True} if _mpl_ge_2_2_0() else {"normed": True}
    grid = grouped_hist(frame.A, by=frame.C, cumulative=True, bins=4,
                        **dict(tick_props, **norm_kwargs))
    # the last bar of a normalised cumulative histogram has height 1.0
    for ax in grid.ravel():
        bars = [child for child in ax.get_children()
                if isinstance(child, Rectangle)]
        tm.assert_almost_equal(bars[-1].get_height(), 1.0)
    self._check_ticks_props(grid, **tick_props)
    tm.close()

    grid = grouped_hist(frame.A, by=frame.C, log=True)
    # scale of y must be 'log'
    self._check_ax_scales(grid, yaxis='log')
    tm.close()

    # unknown keyword: matplotlib's AttributeError must propagate
    with pytest.raises(AttributeError):
        grouped_hist(frame.A, by=frame.C, foo='bar')

    with tm.assert_produces_warning(FutureWarning):
        frame.hist(by='C', figsize='default')
def test_grouped_hist(self):
    """Check ``plotting.grouped_hist`` and ``DataFrame.hist(by=...)``.

    Verifies the number and dimensionality of returned axes, that every
    axes contains bars, that matplotlib ``hist`` keyword arguments are
    forwarded, and that an unknown keyword raises AttributeError.
    """
    df = DataFrame(randn(500, 2), columns=["A", "B"])
    df["C"] = np.random.randint(0, 4, 500)

    axes = plotting.grouped_hist(df.A, by=df.C)
    self.assertEqual(len(axes.ravel()), 4)

    tm.close()
    axes = df.hist(by=df.C)
    self.assertEqual(axes.ndim, 2)
    self.assertEqual(len(axes.ravel()), 4)

    for ax in axes.ravel():
        self.assertGreater(len(ax.patches), 0)

    tm.close()
    # make sure kwargs to hist are handled
    axes = plotting.grouped_hist(df.A, by=df.C, normed=True,
                                 cumulative=True, bins=4)

    # height of last bin (index 5) must be 1.0
    for ax in axes.ravel():
        height = ax.get_children()[5].get_height()
        self.assertAlmostEqual(height, 1.0)

    tm.close()
    axes = plotting.grouped_hist(df.A, by=df.C, log=True)
    # scale of y must be 'log'
    for ax in axes.ravel():
        self.assertEqual(ax.get_yscale(), "log")

    tm.close()
    # propagate attr exception from matplotlib.Axes.hist
    with tm.assertRaises(AttributeError):
        plotting.grouped_hist(df.A, by=df.C, foo="bar")
def test_grouped_hist(self):
    """Check ``plotting.grouped_hist`` and ``DataFrame.hist(by=...)``.

    Verifies the number and dimensionality of returned axes, that every
    axes contains bars, that matplotlib ``hist`` keyword arguments are
    forwarded, and that an unknown keyword raises AttributeError.
    """
    import matplotlib.pyplot as plt

    df = DataFrame(np.random.randn(500, 2), columns=['A', 'B'])
    df['C'] = np.random.randint(0, 4, 500)

    axes = plotting.grouped_hist(df.A, by=df.C)
    self.assertEqual(len(axes.ravel()), 4)

    plt.close('all')
    axes = df.hist(by=df.C)
    self.assertEqual(axes.ndim, 2)
    self.assertEqual(len(axes.ravel()), 4)

    for ax in axes.ravel():
        self.assertGreater(len(ax.patches), 0)

    plt.close('all')
    # make sure kwargs to hist are handled
    axes = plotting.grouped_hist(df.A, by=df.C, normed=True,
                                 cumulative=True, bins=4)

    # height of last bin (index 5) must be 1.0
    for ax in axes.ravel():
        height = ax.get_children()[5].get_height()
        self.assertAlmostEqual(height, 1.0)

    plt.close('all')
    axes = plotting.grouped_hist(df.A, by=df.C, log=True)
    # scale of y must be 'log'
    for ax in axes.ravel():
        self.assertEqual(ax.get_yscale(), 'log')

    plt.close('all')
    # propagate attr exception from matplotlib.Axes.hist
    self.assertRaises(AttributeError, plotting.grouped_hist, df.A,
                      by=df.C, foo='bar')
def test_hist_df_legacy(self):
    """Exercise ``DataFrame.hist`` / ``Series.hist`` options end to end."""
    from matplotlib.patches import Rectangle

    _check_plot_works(self.hist_df.hist)

    # three columns land on a 2x2 grid whose last axes is hidden
    frame = DataFrame(randn(100, 3))
    grid = _check_plot_works(frame.hist, grid=False)
    self._check_axes_shape(grid, axes_num=3, layout=(2, 2))
    self.assertFalse(grid[1, 1].get_visible())

    frame = DataFrame(randn(100, 1))
    _check_plot_works(frame.hist)

    # explicit layout for six columns
    frame = DataFrame(randn(100, 6))
    grid = _check_plot_works(frame.hist, layout=(4, 2))
    self._check_axes_shape(grid, axes_num=6, layout=(4, 2))

    # shared axes, figsize and bins are each accepted
    for options in ({'sharex': True, 'sharey': True},
                    {'figsize': (8, 10)},
                    {'bins': 5}):
        _check_plot_works(frame.hist, **options)

    # tick label size and rotation, for a Series and then the whole frame
    ser = frame[0]
    tick_props = dict(xlabelsize=20, xrot=30, ylabelsize=18, yrot=40)
    grid = ser.hist(**tick_props)
    self._check_ticks_props(grid, **tick_props)

    grid = frame.hist(**tick_props)
    self._check_ticks_props(grid, **tick_props)

    tm.close()
    # keyword arguments must be forwarded to matplotlib's hist
    ax = ser.hist(normed=True, cumulative=True, bins=4)
    # the last bar of a normalised cumulative histogram has height 1.0
    bars = [child for child in ax.get_children()
            if isinstance(child, Rectangle)]
    self.assertAlmostEqual(bars[-1].get_height(), 1.0)

    tm.close()
    ax = ser.hist(log=True)
    # scale of y must be 'log'
    self._check_ax_scales(ax, yaxis='log')

    tm.close()
    # unknown keyword: matplotlib's AttributeError must propagate
    with tm.assertRaises(AttributeError):
        ser.hist(foo='bar')
def test_hist_layout(self):
    """``DataFrame.hist(layout=...)`` resolves layouts, including ``-1``."""
    frame = DataFrame(randn(100, 3))

    # (requested layout, expected layout); None falls back to 2x2
    cases = (
        (None, (2, 2)),
        ((2, 2), (2, 2)),
        ((4, 1), (4, 1)),
        ((1, 4), (1, 4)),
        ((3, 3), (3, 3)),
        ((-1, 4), (1, 4)),
        ((4, -1), (4, 1)),
        ((-1, 2), (2, 2)),
        ((2, -1), (2, 2)),
    )
    for requested, expected in cases:
        grid = frame.hist(layout=requested)
        self._check_axes_shape(grid, axes_num=3, layout=expected)

    # too small for the 3 plots, wrong tuple length, no positive dimension
    for bad_layout in ((1, 1), (1,), (-1, -1)):
        with tm.assertRaises(ValueError):
            frame.hist(layout=bad_layout)
def test_grouped_hist_legacy(self):
    """Exercise ``plotting.grouped_hist`` and ``DataFrame.hist(by=...)``."""
    frame = DataFrame(randn(500, 2), columns=['A', 'B'])
    frame['C'] = np.random.randint(0, 4, 500)
    frame['D'] = ['X'] * 500

    grid = plotting.grouped_hist(frame.A, by=frame.C)
    self._check_axes_shape(grid, axes_num=4, layout=(2, 2))
    tm.close()

    grid = frame.hist(by=frame.C)
    self._check_axes_shape(grid, axes_num=4, layout=(2, 2))
    tm.close()

    # grouping key with a single distinct value
    grid = frame.hist(by='D', rot=30)
    self._check_axes_shape(grid, axes_num=1, layout=(1, 1))
    self._check_ticks_props(grid, xrot=30)
    tm.close()

    # keyword arguments must be forwarded to matplotlib's hist
    tick_props = dict(xlabelsize=20, xrot=30, ylabelsize=18, yrot=40)
    grid = plotting.grouped_hist(frame.A, by=frame.C, normed=True,
                                 cumulative=True, bins=4, **tick_props)
    # height of last bin (index 5) must be 1.0
    for ax in grid.ravel():
        last_bar = ax.get_children()[5]
        self.assertAlmostEqual(last_bar.get_height(), 1.0)
    self._check_ticks_props(grid, **tick_props)
    tm.close()

    grid = plotting.grouped_hist(frame.A, by=frame.C, log=True)
    # scale of y must be 'log'
    self._check_ax_scales(grid, yaxis='log')
    tm.close()

    # unknown keyword: matplotlib's AttributeError must propagate
    with tm.assertRaises(AttributeError):
        plotting.grouped_hist(frame.A, by=frame.C, foo='bar')

    with tm.assert_produces_warning(FutureWarning):
        frame.hist(by='C', figsize='default')
def test_hist(self):
    """Smoke-test ``DataFrame.hist`` layout handling and tick label options.

    Covers grid layout for different column counts, shared axes, and the
    ``xlabelsize``/``xrot``/``ylabelsize``/``yrot`` keyword arguments for
    both ``Series.hist`` and ``DataFrame.hist``.
    """
    df = DataFrame(np.random.randn(100, 4))
    _check_plot_works(df.hist)
    _check_plot_works(df.hist, grid=False)

    # make sure layout is handled
    df = DataFrame(np.random.randn(100, 3))
    _check_plot_works(df.hist)
    axes = df.hist(grid=False)
    self.assertFalse(axes[1, 1].get_visible())

    df = DataFrame(np.random.randn(100, 1))
    _check_plot_works(df.hist)

    # make sure layout is handled
    df = DataFrame(np.random.randn(100, 6))
    _check_plot_works(df.hist)

    # make sure sharex, sharey is handled
    _check_plot_works(df.hist, sharex=True, sharey=True)

    # make sure kwargs are handled
    ser = df[0]
    xf, yf = 20, 20
    xrot, yrot = 30, 30
    ax = ser.hist(xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot)
    ytick = ax.get_yticklabels()[0]
    xtick = ax.get_xticklabels()[0]
    self.assertAlmostEqual(ytick.get_fontsize(), yf)
    self.assertAlmostEqual(ytick.get_rotation(), yrot)
    self.assertAlmostEqual(xtick.get_fontsize(), xf)
    self.assertAlmostEqual(xtick.get_rotation(), xrot)

    axes = df.hist(xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot)
    for i, ax in enumerate(axes.ravel()):
        # only the first len(df.columns) axes hold a histogram
        if i < len(df.columns):
            ytick = ax.get_yticklabels()[0]
            xtick = ax.get_xticklabels()[0]
            self.assertAlmostEqual(ytick.get_fontsize(), yf)
            self.assertAlmostEqual(ytick.get_rotation(), yrot)
            self.assertAlmostEqual(xtick.get_fontsize(), xf)
            self.assertAlmostEqual(xtick.get_rotation(), xrot)
def test_grouped_hist_layout(self):
    """Grouped ``DataFrame.hist`` rejects bad layouts and shapes good ones."""
    import matplotlib.pyplot as plt

    n = 100
    frame = DataFrame(
        {
            "gender": np.array(["Male", "Female"])[random.randint(2, size=n)],
            "height": random.normal(66, 4, size=n),
            "weight": random.normal(161, 32, size=n),
            "category": random.randint(4, size=n),
        }
    )

    # (column, grouping key, layout) combinations that must raise ValueError
    rejected = (
        ("weight", "gender", (1, 1)),
        ("weight", "gender", (1,)),
        ("height", "category", (1, 3)),
        ("height", "category", (2, 1)),
    )
    for column, key, layout in rejected:
        with self.assertRaises(ValueError):
            frame.hist(column=column, by=frame[key], layout=layout)

    axes = frame.hist(column="height", by=frame.gender, layout=(2, 1))
    self.assertEqual(axes.shape, (2,))
    plt.close("all")

    axes = frame.hist(column="height", by=frame.category, layout=(4, 1))
    self.assertEqual(axes.shape, (4,))
    plt.close("all")

    axes = frame.hist(column="height", by=frame.category, layout=(4, 2))
    self.assertEqual(axes.shape, (4, 2))
def test_axis_share_xy(self):
    """With ``sharex`` and ``sharey`` both set, the two axes share x and y."""
    n = 100
    frame = DataFrame({
        'gender': tm.choice(['Male', 'Female'], size=n),
        'height': random.normal(66, 4, size=n),
        'weight': random.normal(161, 32, size=n),
    })
    pair = frame.hist(column='height', by=frame.gender, sharex=True,
                      sharey=True)
    first, second = pair

    # share both x and y: check the x grouper on both axes, then the y grouper
    for grouper_name in ('_shared_x_axes', '_shared_y_axes'):
        for ax in (first, second):
            grouper = getattr(ax, grouper_name)
            self.assertTrue(grouper.joined(first, second))
def test_grouped_hist(self):
    """Check axes count, dimensionality and content of grouped histograms."""
    import matplotlib.pyplot as plt

    df = DataFrame(np.random.randn(500, 2), columns=['A', 'B'])
    df['C'] = np.random.randint(0, 4, 500)

    axes = plotting.grouped_hist(df.A, by=df.C)
    self.assertEqual(len(axes.ravel()), 4)

    plt.close('all')
    axes = df.hist(by=df.C)
    self.assertEqual(axes.ndim, 2)
    self.assertEqual(len(axes.ravel()), 4)

    # every axes must contain at least one bar
    for ax in axes.ravel():
        self.assertGreater(len(ax.patches), 0)
def gs(str, list):
    """Plot quick-look charts for selected columns of a CSV file.

    Parameter names shadow the ``str`` and ``list`` builtins; they are
    kept unchanged so existing keyword callers keep working.

    Args:
        str: Path (or buffer) handed to ``pd.read_csv``.
        list: Column names to load; passed as ``usecols``.

    Steps, each shown with ``plt.show()``:
        1. Scatter plot of the first column against the second.
        2. Histograms and a box plot of the loaded columns.
        3. For every column holding at least one value that ``np.isreal``
           rejects, a bar chart of its value counts.

    Steps 1 and 2 are best-effort: a failure there is skipped so the
    remaining steps still run.
    """
    s = list
    w = DataFrame(pd.read_csv(str, usecols=s))

    try:
        plt.scatter(w[s[0]], w[s[1]], color='red')
        plt.show()
    except Exception:
        # Best effort: e.g. fewer than two columns were requested.
        pass

    try:
        w.hist()
        plt.show()
        w.plot(kind='box', by=list)
        plt.show()
    except Exception:
        # Best effort: e.g. no numeric columns to draw.
        pass

    t = w.applymap(np.isreal)
    print(t)

    # The original indexed t[''.join(s)], which is only a valid column
    # label when exactly one column is requested; check each column.
    for b in s:
        if not t[b].all():
            w[b].value_counts().plot(kind='bar')
            plt.show()
def test_axis_share_xy(self):
    """With ``sharex`` and ``sharey`` both set, the two axes share x and y."""
    n = 100
    frame = DataFrame(
        {
            "gender": tm.choice(["Male", "Female"], size=n),
            "height": random.normal(66, 4, size=n),
            "weight": random.normal(161, 32, size=n),
        }
    )
    pair = frame.hist(column="height", by=frame.gender, sharex=True,
                      sharey=True)
    first, second = pair

    # share both x and y: check the x grouper on both axes, then the y grouper
    for grouper_name in ("_shared_x_axes", "_shared_y_axes"):
        for ax in (first, second):
            grouper = getattr(ax, grouper_name)
            self.assertTrue(grouper.joined(first, second))
def test_hist_layout(self):
    """``DataFrame.hist(layout=...)`` returns an axes grid of the given size.

    Also checks that a layout too small for the four columns, or one that
    is not a two-element tuple, raises ValueError.
    """
    import matplotlib.pyplot as plt
    plt.close('all')
    df = DataFrame(randn(100, 4))

    layout_to_expected_size = (
        {'layout': None, 'expected_size': (2, 2)},  # default is 2x2
        {'layout': (2, 2), 'expected_size': (2, 2)},
        {'layout': (4, 1), 'expected_size': (4, 1)},
        {'layout': (1, 4), 'expected_size': (1, 4)},
        {'layout': (3, 3), 'expected_size': (3, 3)},
    )

    for layout_test in layout_to_expected_size:
        ax = df.hist(layout=layout_test['layout'])
        self.assertEqual(len(ax), layout_test['expected_size'][0])
        self.assertEqual(len(ax[0]), layout_test['expected_size'][1])

    # layout too small for all 4 plots
    self.assertRaises(ValueError, df.hist, layout=(1, 1))

    # invalid format for layout
    self.assertRaises(ValueError, df.hist, layout=(1,))
def test_hist_layout(self):
    """``DataFrame.hist(layout=...)`` returns an axes grid of the given size.

    Also checks that a layout too small for the four columns, or one that
    is not a two-element tuple, raises ValueError.
    """
    import matplotlib.pyplot as plt

    plt.close("all")
    df = DataFrame(np.random.randn(100, 4))

    layout_to_expected_size = (
        {"layout": None, "expected_size": (2, 2)},  # default is 2x2
        {"layout": (2, 2), "expected_size": (2, 2)},
        {"layout": (4, 1), "expected_size": (4, 1)},
        {"layout": (1, 4), "expected_size": (1, 4)},
        {"layout": (3, 3), "expected_size": (3, 3)},
    )

    for layout_test in layout_to_expected_size:
        ax = df.hist(layout=layout_test["layout"])
        self.assertEqual(len(ax), layout_test["expected_size"][0])
        self.assertEqual(len(ax[0]), layout_test["expected_size"][1])

    # layout too small for all 4 plots
    self.assertRaises(ValueError, df.hist, layout=(1, 1))

    # invalid format for layout
    self.assertRaises(ValueError, df.hist, layout=(1,))
''' import pandas from pandas import Series, DataFrame import code import numpy as np import matplotlib.pyplot as plt import sys import csv if __name__ == '__main__': with open("stats.csv" if len(sys.argv) < 2 else sys.argv[1]) as f: reader = csv.reader(f) data = [(float(peak), float(iqr)) for peak, iqr in reader] md = Series(zip(*data)[0]) iqrd = Series(zip(*data)[1]) df = DataFrame(data=dict(max_deltas=md, iqr_deltas=iqrd)) #log_df = np.log(df) / np.log(2) #log_df.columns = ["lg {}".format(foo) for foo in log_df.columns] #log_df.hist(normed=True) df.hist(normed=True) plt.show() code.interact(local=locals())
print(df.corr())

### 8. Merge and Join ###
print(df)
other = DataFrame({'str_col': ['a', 'b'], 'some_val': [1, 2]})
print(other)
# Show the result of each join type on the shared 'str_col' key.
for join_kind in ('inner', 'outer', 'left', 'right'):
    print(pd.merge(df, other, on='str_col', how=join_kind))

### 9. Plot ###
plot_df = DataFrame(np.random.randn(1000, 2), columns=['x', 'y'])
# Shift the 'y' column up by one.
plot_df['y'] = plot_df['y'] + 1
plot_df.plot()  ### plot not working???? ###
plot_df.hist()  ### plot not working???? ###
# NOTE(review): nothing here asks matplotlib to display the figures
# (no show() call is visible) -- possibly why the plots do not appear.

### 10. Scikit-learn conversion ###
print(df)
print(df.values[:, :-1])
#print(df.values[:,:-1].astype(float32)) not working?
input()
} }]))), "users with tweets") # And finally we plot the lexical diversity: # In[38]: cursor = dbclient.db_restT.diversity.aggregate([{ '$project': { '_id': 0, 'name': '$name', 'lex_div': '$lexical_diversity' } }]) lex_div = DataFrame(list(cursor)) lex_div.hist('lex_div', bins=50) # 2.3: Track unfollows # -- # Write a python program to create a db called db_followers that stores all the followers for all the users that you find in task 2.1. Then, write a program to find the un-followed friends after a week for the top 10 users( users that have the highest number of followers in task 2.1) since the time that you extracted the tweets. In other words, you need to look for the people following the top 10 users at time X (the time that you extracted the tweets) and then look at the people following the same top 10 users at a later time Y (one-week after X) to see who stopped following the top 10 users. # First, make a db/table of top RT'ed users in which to store follower stats # In[39]: dbclient.drop_database('db_followers') rows = [] for row in dbclient.db_tweets.top_retweets.aggregate([{ '$group': { '_id': '$user.id', 'name': {
def test_hist_layout(self):
    """``DataFrame.hist(layout=...)`` resolves layouts, including ``-1``."""
    frame = DataFrame(np.random.randn(100, 2))
    # third column holds datetimes drawn from the fixture's int64 range
    stamps = np.random.randint(
        self.start_date_to_int64,
        self.end_date_to_int64,
        size=100,
        dtype=np.int64,
    )
    frame[2] = to_datetime(stamps)

    # (requested layout, expected layout); None falls back to 2x2
    cases = (
        (None, (2, 2)),
        ((2, 2), (2, 2)),
        ((4, 1), (4, 1)),
        ((1, 4), (1, 4)),
        ((3, 3), (3, 3)),
        ((-1, 4), (1, 4)),
        ((4, -1), (4, 1)),
        ((-1, 2), (2, 2)),
        ((2, -1), (2, 2)),
    )
    for requested, expected in cases:
        grid = frame.hist(layout=requested)
        self._check_axes_shape(grid, axes_num=3, layout=expected)

    # too small for the 3 plots, wrong tuple length, no positive dimension
    for bad_layout in ((1, 1), (1, ), (-1, -1)):
        with pytest.raises(ValueError):
            frame.hist(layout=bad_layout)
#plt.title("Principal Component Analysis") #plt.savefig("PCA_July24_2_components") #plt.show() DS = DataFrame({'good': X_pca[:, 0], 'bad': X_pca[:, 1]}) #DS.plot.scatter(DS['good'], DS['bad']) #plt.scatter(X_pca[:,0], X_pca[:,1], y_train) #plt.show() #print DS #from pandas.plotting import scatter_matrix ## df = pd.DataFrame(np.random.randn(1000, 4), columns=['a', 'b', 'c', 'd']) DS.hist(grid=True, bins=1000) plt.title(" Histogram of Good v. Bad ") plt.show() #s = cPickle.dumps(cls) #sw = open('decisionTree_pickle','w') #sw.write(s) #print 'completed saving pickle file of Decision Tree model ' ###now to PCA anaysis... ### X_train_pca = pca.transform(X_train) X_test_pca = pca.transform(X_test)
def plot(prefix, header, row):
    """Render time-series and histogram figures for one job record.

    ``header`` and ``row`` are zipped into a mapping. The ``jobid`` entry
    names the output files; every other entry is a colon-separated list
    of numbers, one value per segment. Metrics whose values sum to zero
    are skipped.

    Files written (``fileformat`` is a module-level suffix):
        * ``<prefix>timeseries<jobid><fileformat>``
        * ``<prefix>hist<jobid><fileformat>``
        * ``<prefix>timeseries<jobid>-30<fileformat>`` -- only when the
          last parsed metric has more than 50 segments.

    Args:
        prefix: Path prefix prepended to every output file name.
        header: Column names; must include ``"jobid"``.
        row: Values aligned with ``header``.

    Returns:
        None. Returns early, after printing a message, when no metric
        has data or a metric column cannot be built.
    """
    fields = dict(zip(header, row))
    jobid = fields.pop("jobid")

    result = []
    for metric, encoded in fields.items():
        # `timeseries` is deliberately left bound after the loop: its
        # length (for the last metric parsed) gates the zoomed plot below.
        timeseries = [float(value) for value in encoded.split(":")]
        if sum(timeseries) == 0:
            continue
        timeseries = [[metric, segment, value]
                      for segment, value in enumerate(timeseries)]
        result.extend(timeseries)

    if len(result) == 0:
        print("Empty job! Cannot plot!")
        return

    data = DataFrame(result, columns=["metrics", "segment", "value"])
    # Group by a scalar key: with a one-element list, pandas >= 2 yields
    # 1-tuples as group names, which would miss the style/colour maps.
    groups = data.groupby("metrics")
    metrics = DataFrame()
    labels = []
    colors = []
    style = []
    for name, group in groups:
        style.append(linestyleMap[name] + markerMap[name])
        colors.append(colorMap[name])
        # Shorten two metric names for display (after the map lookups,
        # which use the original names).
        if name == "md_file_delete":
            name = "file_delete"
        if name == "md_file_create":
            name = "file_create"
        try:
            metrics[name] = pd.Series(group["value"].tolist())
        except Exception:
            print("Error processing %s with" % jobid)
            print(group.values)
            return

        labels.append(name)

    fsize = (8, 1 + 1.1 * len(labels))
    fsizeFixed = (8, 2)
    fsizeHist = (8, 6.5)

    pyplot.close('all')

    # Few metrics: one combined plot. Otherwise: one subplot per metric.
    if len(labels) < 4:
        ax = metrics.plot(legend=True, sharex=True, grid=True, sharey=True,
                          markersize=10, figsize=fsizeFixed, color=colors,
                          style=style)
        ax.set_ylabel("Value")
    else:
        ax = metrics.plot(subplots=True, legend=False, sharex=True,
                          grid=True, sharey=True, markersize=10,
                          figsize=fsize, color=colors, style=style)
        for i, label in enumerate(labels):
            ax[i].set_ylabel(label)

    pyplot.xlabel("Segment number")
    pyplot.savefig(prefix + "timeseries" + jobid + fileformat,
                   bbox_inches='tight', dpi=150)

    # Create a facetted grid
    #g = sns.FacetGrid(tips, col="time", margin_titles=True)
    #bins = np.linspace(0, 60, 13)
    #g.map(plt.hist, "total_bill", color="steelblue", bins=bins)
    ax = metrics.hist(grid=True, sharey=True, figsize=fsizeHist, bins=15,
                      range=(0, 15))
    pyplot.xlim(0, 15)
    pyplot.savefig(prefix + "hist" + jobid + fileformat,
                   bbox_inches='tight', dpi=150)

    # Plot first 30 segments
    if len(timeseries) <= 50:
        return

    if len(labels) < 4:
        ax = metrics.plot(legend=True, xlim=(0, 30), sharex=True, grid=True,
                          sharey=True, markersize=10, figsize=fsizeFixed,
                          color=colors, style=style)
        ax.set_ylabel("Value")
    else:
        ax = metrics.plot(subplots=True, xlim=(0, 30), legend=False,
                          sharex=True, grid=True, sharey=True, markersize=10,
                          figsize=fsize, color=colors, style=style)
        for i, label in enumerate(labels):
            ax[i].set_ylabel(label)

    pyplot.xlabel("Segment number")
    pyplot.savefig(prefix + "timeseries" + jobid + "-30" + fileformat,
                   bbox_inches='tight', dpi=150)
# Add the Ibovespa series over rows ini:fim, passed through ret_acum(),
# to the current figure, then label that figure.
# NOTE(review): ret_acum is defined outside this view; by its name it
# presumably computes an accumulated return -- confirm.
plt.plot(N, ret_acum(ret_medio_ibov_sim.iloc[ini:fim].values), label="ibovespa")
plt.title("LSTM Deep Learning: Portfolio Short Ibovespa")
plt.xlabel("Dia")
plt.ylabel("Retorno Acumulado")
plt.legend(frameon=False)

# scatter plot prob vs. return
# A new figure is opened so the points do not land on the plot above.
line = plt.figure()
x = corr_prob_ret_sim.iloc[:, 0].values
y = corr_prob_ret_sim.iloc[:, 1].values
plt.plot(x, y, "o")

# distribution of returns and probabilities
corr_prob_ret_sim.hist(bins=100, grid=False, sharey=True)

#pd.set_option('display.width', 100)
#pd.set_option('precision', 6)
#correlations = ret_medio_port_sim.corr(method='pearson')
#correlations2 = corr_prob_ret_sim.corr(method='pearson')
#print(correlations)
#print(correlations2)

# statistics
# The describe() results below are not assigned or printed, so they are
# only visible in an interactive session that echoes expression values.
model.summary()
score_treino_sim.describe()
score_trade_sim.describe()
ret_medio_ibov_sim.iloc[ini:fim].describe()
ret_medio_port_sim.iloc[ini:fim, :].describe()
ret_medio_port_long_sim.iloc[ini:fim, :].describe()
dd_qtr_std=df_drawdowns.groupby(df_drawdowns.index.quarter).std() #Look at drawdowns on a monthly basis mth_mean=df_drawdowns.resample('M', how='mean',kind='period') dd_monthly_mean=df_drawdowns.groupby(df_drawdowns.index.month).mean() dd_monthly_std=df_drawdowns.groupby(df_drawdowns.index.month).std() #Look at one year-2014 dd_2014=df_drawdowns['2014-01-01':'2014-12-31'] dd_2014_ri=dd_2014.mean().reset_index(name='Average Drawdown in 2014') #Creates histograms based on drawdown magnitudes bins_dd = np.linspace(0,30,61) dd_hist=df_drawdowns dd_hist.hist(bins=bins_dd, alpha=0.75,color='green',normed=True) dd_hist.plot(kind='kde',style='k--') ''' Drawdown analysis-This code plots a histogram of a stocks drawdown length characteristics. Ensure that stock ticker used in this function has already been placed in the ticker list. ''' stock_dd_length=calc_drawdown_local('WAT',63) dd_len_hist=DataFrame(stock_dd_length) bins_len_dd=np.linspace(0,30,31) dd_len_hist.hist(bins=bins_len_dd, alpha=0.55, color='purple',normed=True) plt.title('Drawdown lengths - WAT') dd_len_hist.describe()
print(len(list( dbclient.db_restT.user_tweets.aggregate( [{'$group': {'_id': '$user'}}]))), "users with tweets") # And finally we plot the lexical diversity: # In[38]: cursor=dbclient.db_restT.diversity.aggregate([ {'$project': { '_id': 0, 'name': '$name', 'lex_div': '$lexical_diversity'}}]) lex_div=DataFrame(list(cursor)) lex_div.hist('lex_div', bins=50) # 2.3: Track unfollows # -- # Write a python program to create a db called db_followers that stores all the followers for all the users that you find in task 2.1. Then, write a program to find the un-followed friends after a week for the top 10 users( users that have the highest number of followers in task 2.1) since the time that you extracted the tweets. In other words, you need to look for the people following the top 10 users at time X (the time that you extracted the tweets) and then look at the people following the same top 10 users at a later time Y (one-week after X) to see who stopped following the top 10 users. # First, make a db/table of top RT'ed users in which to store follower stats # In[39]: dbclient.drop_database('db_followers') rows=[] for row in dbclient.db_tweets.top_retweets.aggregate([ {'$group': { '_id': '$user.id',
def test_hist_bins_legacy(self):
    """``bins=2`` must produce exactly two bars on the first axes."""
    frame = DataFrame(np.random.randn(10, 2))
    grid = frame.hist(bins=2)
    first_ax = grid[0][0]
    bar_count = len(first_ax.patches)
    self.assertEqual(bar_count, 2)
def test_hist(self):
    """Smoke-test ``DataFrame.hist`` / ``Series.hist`` and their options.

    Covers grid layout for different column counts, shared axes,
    ``figsize``, tick label size/rotation, pass-through of matplotlib
    ``hist`` keyword arguments, and propagation of matplotlib's
    AttributeError for an unknown keyword.
    """
    df = DataFrame(randn(100, 4))
    _check_plot_works(df.hist)
    _check_plot_works(df.hist, grid=False)

    # make sure layout is handled
    df = DataFrame(randn(100, 3))
    _check_plot_works(df.hist)
    axes = df.hist(grid=False)
    self.assertFalse(axes[1, 1].get_visible())

    df = DataFrame(randn(100, 1))
    _check_plot_works(df.hist)

    # make sure layout is handled
    df = DataFrame(randn(100, 6))
    _check_plot_works(df.hist)

    # make sure sharex, sharey is handled
    _check_plot_works(df.hist, sharex=True, sharey=True)

    # handle figsize arg
    _check_plot_works(df.hist, figsize=(8, 10))

    # make sure xlabelsize and xrot are handled
    ser = df[0]
    xf, yf = 20, 20
    xrot, yrot = 30, 30
    ax = ser.hist(xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot)
    ytick = ax.get_yticklabels()[0]
    xtick = ax.get_xticklabels()[0]
    self.assertAlmostEqual(ytick.get_fontsize(), yf)
    self.assertAlmostEqual(ytick.get_rotation(), yrot)
    self.assertAlmostEqual(xtick.get_fontsize(), xf)
    self.assertAlmostEqual(xtick.get_rotation(), xrot)

    axes = df.hist(xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot)
    for i, ax in enumerate(axes.ravel()):
        # only the first len(df.columns) axes hold a histogram
        if i < len(df.columns):
            ytick = ax.get_yticklabels()[0]
            xtick = ax.get_xticklabels()[0]
            self.assertAlmostEqual(ytick.get_fontsize(), yf)
            self.assertAlmostEqual(ytick.get_rotation(), yrot)
            self.assertAlmostEqual(xtick.get_fontsize(), xf)
            self.assertAlmostEqual(xtick.get_rotation(), xrot)

    tm.close()
    # make sure kwargs to hist are handled
    ax = ser.hist(normed=True, cumulative=True, bins=4)
    # height of last bin (index 5) must be 1.0
    self.assertAlmostEqual(ax.get_children()[5].get_height(), 1.0)

    tm.close()
    ax = ser.hist(log=True)
    # scale of y must be 'log'
    self.assertEqual(ax.get_yscale(), 'log')

    tm.close()
    # propagate attr exception from matplotlib.Axes.hist
    with tm.assertRaises(AttributeError):
        ser.hist(foo='bar')
# Split the series in half: first half for training, second for testing.
X = series.values
X = X.astype('float32')
train_size = int(len(X) * 0.50)
train, test = X[0:train_size], X[train_size:]

# walk-forward validation
history = list(train)
predictions = []
for i in range(len(test)):
    # difference data
    months_in_year = 12
    diff = difference(history, months_in_year)
    # predict
    model = ARIMA(diff, order=(0, 0, 1))
    model_fit = model.fit(trend='nc', disp=0)
    yhat = model_fit.forecast()[0]
    yhat = inverse_difference(history, yhat, months_in_year)
    predictions.append(yhat)
    # observation: the true value joins the history for the next step
    obs = test[i]
    history.append(obs)

# errors: observed minus predicted, one per test step
residuals = [actual - predicted for actual, predicted in zip(test, predictions)]
residuals = DataFrame(residuals)
print(residuals.describe())

# plot: histogram on the top panel, density estimate on the bottom one
pyplot.figure()
pyplot.subplot(211)
residuals.hist(ax=pyplot.gca())
pyplot.subplot(212)
residuals.plot(kind='kde', ax=pyplot.gca())
pyplot.show()
def test_hist_layout(self):
    """``DataFrame.hist(layout=...)`` resolves layouts and reports bad ones."""
    frame = DataFrame(np.random.randn(100, 2))
    # third column holds datetimes drawn from the fixture's int64 range
    stamps = np.random.randint(
        self.start_date_to_int64,
        self.end_date_to_int64,
        size=100,
        dtype=np.int64,
    )
    frame[2] = to_datetime(stamps)

    # (requested layout, expected layout); None falls back to 2x2
    cases = (
        (None, (2, 2)),
        ((2, 2), (2, 2)),
        ((4, 1), (4, 1)),
        ((1, 4), (1, 4)),
        ((3, 3), (3, 3)),
        ((-1, 4), (1, 4)),
        ((4, -1), (4, 1)),
        ((-1, 2), (2, 2)),
        ((2, -1), (2, 2)),
    )
    for requested, expected in cases:
        grid = frame.hist(layout=requested)
        self._check_axes_shape(grid, axes_num=3, layout=expected)

    # (invalid layout, pattern the ValueError message must match)
    failures = (
        # layout too small for the plots
        ((1, 1), "Layout of 1x1 must be larger than required size 3"),
        # invalid format for layout
        ((1, ), re.escape("Layout must be a tuple of (rows, columns)")),
        ((-1, -1), "At least one dimension of layout must be positive"),
    )
    for bad_layout, msg in failures:
        with pytest.raises(ValueError, match=msg):
            frame.hist(layout=bad_layout)
def legacy_func():
    """Compare two recorded result sets from the PAN18 experiments (legacy).

    Builds a two-column frame of recorded results, prints its descriptive
    statistics, shows a box plot and histograms, runs a normality test on
    each column and finally an independent two-sample t-test between them.
    Everything is printed or plotted; nothing is returned.
    """
    alpha = 0.05
    confidence_banner = "Confidence level = {}%".format((1 - alpha) * 100)

    # Earlier result sets, kept for reference:
    # experiment_names = ('N-grams', 'N-grams, LSA 300')
    # results[experiment_names[0]] = [0.8, 0.82222222, 0.80555556, 0.82222222, 0.82222222, 0.79444444, 0.87777778, 0.85, 0.81666667, 0.77777778]
    # results[experiment_names[1]] = [0.82777778, 0.84444444, 0.84444444, 0.83888889, 0.81111111, 0.8, 0.88888889, 0.82777778, 0.78333333, 0.81666667]
    #
    # %%% TEMP: Testing
    # temp_list = []
    # for item in results[experiment_names[0]]:
    #     temp_list.append(item - 0.02)
    # results[experiment_names[0]] = temp_list
    #
    # experiment_names = ('Word and char n-grams, No LSA (Arabic)', 'Char n-grams, No LSA (Arabic)')
    # results[experiment_names[0]] = [0.83333333, 0.77777778, 0.81111111, 0.76666667, 0.81111111, 0.87777778, 0.81111111, 0.77777778, 0.81111111, 0.75555556]
    # results[experiment_names[1]] = [0.77777778, 0.75555556, 0.83333333, 0.74444444, 0.78888889, 0.82222222, 0.82222222, 0.75555556, 0.77777778, 0.71111111]

    # Active result set: one column per experiment, one row per recorded run.
    experiment_names = (
        'Word and char n-grams, No LSA (English)',
        'Word n-grams, No LSA (English)',
    )
    recorded_runs = (
        [
            0.8, 0.79444444, 0.82222222, 0.87222222, 0.76111111, 0.85,
            0.82222222, 0.85555556, 0.81111111, 0.81111111
        ],
        [
            0.8, 0.78333333, 0.81111111, 0.85, 0.77222222, 0.81666667,
            0.80555556, 0.82777778, 0.8, 0.8
        ],
    )
    results = DataFrame()
    for column_name, runs in zip(experiment_names, recorded_runs):
        results[column_name] = runs

    # Descriptive stats
    print(results.describe())

    # Box and whisker plot
    results.boxplot()
    plt.show()

    # Histogram plot
    results.hist()
    plt.show()

    # Normality test
    # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.normaltest.html
    # Null hypothesis: the sample comes from a normal distribution.
    print(confidence_banner)
    for column_name in experiment_names:
        k2, p_value = normaltest(results[column_name])
        print(
            "{}: (skewtest z-score)^2 + (kurtosistest z-score)^2 = {}, p = {}".
            format(column_name, k2, p_value))
        # The null hypothesis is rejected only when p falls below alpha.
        print(
            "No: It is unlikely that the sample comes from a Gaussian (normal) distribution"
            if p_value < alpha else
            "Yes: It is likely that the sample comes from a Gaussian (normal) distribution"
        )

    # T-test: compare means for Gaussian samples
    # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html
    # Null hypothesis: the two samples have identical average (expected) values.
    print(confidence_banner)
    equal_variances = True
    first_name, second_name = experiment_names
    t_value, p_value = ttest_ind(results[first_name],
                                 results[second_name],
                                 equal_var=equal_variances)
    statistic_name = ("t-test statistic"
                      if equal_variances else "Welch’s t-test statistic")
    print("{} = {}, p = {}".format(statistic_name, t_value, p_value))
    print(
        "Significant difference between the means: Samples are likely drawn from different distributions"
        if p_value < alpha else
        "No significant difference between the means: Samples are likely drawn from the same distribution"
    )
def test_hist_df_legacy(self):
    """Exercise ``DataFrame.hist`` / ``Series.hist`` with layout, sharing, size, bins, tick and scale options."""
    from matplotlib.patches import Rectangle

    def random_datetimes(count):
        # datetime column built from random int64 values in the fixture's range
        return to_datetime(
            np.random.randint(
                self.start_date_to_int64,
                self.end_date_to_int64,
                size=count,
                dtype=np.int64,
            ))

    with tm.assert_produces_warning(UserWarning):
        _check_plot_works(self.hist_df.hist)

    # three columns end up on a 2x2 grid with the last cell hidden
    df = DataFrame(np.random.randn(100, 2))
    df[2] = random_datetimes(100)
    with tm.assert_produces_warning(UserWarning):
        axes = _check_plot_works(df.hist, grid=False)
    self._check_axes_shape(axes, axes_num=3, layout=(2, 2))
    assert not axes[1, 1].get_visible()

    # single-column frames: datetime only, then numeric only
    _check_plot_works(df[[2]].hist)
    df = DataFrame(np.random.randn(100, 1))
    _check_plot_works(df.hist)

    # six columns with an explicit 4x2 layout
    df = DataFrame(np.random.randn(100, 5))
    df[5] = random_datetimes(100)
    with tm.assert_produces_warning(UserWarning):
        axes = _check_plot_works(df.hist, layout=(4, 2))
    self._check_axes_shape(axes, axes_num=6, layout=(4, 2))

    # sharex/sharey, figsize and bins must each be accepted
    for hist_kwargs in (
        {"sharex": True, "sharey": True},
        {"figsize": (8, 10)},
        {"bins": 5},
    ):
        with tm.assert_produces_warning(UserWarning):
            _check_plot_works(df.hist, **hist_kwargs)

    # tick label size and rotation are applied for Series and DataFrame alike
    ser = df[0]
    tick_props = {"xlabelsize": 20, "xrot": 30, "ylabelsize": 18, "yrot": 40}
    axes = ser.hist(**tick_props)
    self._check_ticks_props(axes, **tick_props)

    axes = df.hist(**tick_props)
    self._check_ticks_props(axes, **tick_props)

    tm.close()
    ax = ser.hist(cumulative=True, bins=4, density=True)
    # the last Rectangle of the cumulative, density-normalised plot must have height 1.0
    bars = [child for child in ax.get_children() if isinstance(child, Rectangle)]
    tm.assert_almost_equal(bars[-1].get_height(), 1.0)

    tm.close()
    ax = ser.hist(log=True)
    # scale of y must be 'log'
    self._check_ax_scales(ax, yaxis="log")

    tm.close()

    # propagate attr exception from matplotlib.Axes.hist
    with tm.external_error_raised(AttributeError):
        ser.hist(foo="bar")
print(df.corr())

### 8. Merge and Join ###
print(df)
other = DataFrame({"str_col": ["a", "b"], "some_val": [1, 2]})
print(other)
# Same join on "str_col", once per join type.
for join_kind in ("inner", "outer", "left", "right"):
    print(pd.merge(df, other, on="str_col", how=join_kind))

### 9. Plot ###
plot_df = DataFrame(np.random.randn(1000, 2), columns=["x", "y"])
plot_df["y"] = plot_df["y"] + 1  # shift the "y" column up by one
# NOTE(review): the original author marked both plot calls as "not working".
# Presumably nothing is displayed because no matplotlib show() call follows
# in this chunk - confirm.
plot_df.plot()
plot_df.hist()

### 10. Scikit-learn conversion ###
print(df)
print(df.values[:, :-1])  # every column except the last, as a plain array
# print(df.values[:,:-1].astype(float32))
# NOTE(review): marked "not working?" by the author; presumably because
# float32 is not a defined name here (np.float32 would be) - confirm.
input()  # wait for Enter before the script exits
def _save(self, df: pd.DataFrame) -> None:
    """Plot histograms of ``df`` and write the current figure to ``self._filepath``.

    ``self._save_args`` supplies the keyword arguments for ``df.hist``. Its
    optional ``"savefig_args"`` entry is split off and forwarded to
    ``plt.savefig`` instead.

    Args:
        df: Frame whose ``hist`` method draws the figure to be saved.
    """
    # Pop from a shallow copy. Popping from ``self._save_args`` itself would
    # remove "savefig_args" from the instance, so every save after the first
    # would silently run without the configured savefig options.
    hist_args = dict(self._save_args)
    savefig_args = hist_args.pop("savefig_args", {})
    df.hist(**hist_args)
    plt.savefig(fname=self._filepath, **savefig_args)
# Python 2 script fragment (print statements): derives columns on an existing
# frame ``df``, prints its summary statistics, then builds and plots a second,
# random frame. The printed messages are Spanish runtime strings and are left as is.

# Show the frame before any derived column is added.
print df
print "añadimos columnas combinando las actuales"
# New columns computed from the existing "A" and "B" columns.
df["C"] = df["A"]+df["B"]
df["D"] = df["A"]*3
df["E"] = np.sqrt(df["A"])
print df
print "*"*15
print "Datos disponibles de un dataframe"
# Descriptive statistics, covariance matrix and correlation matrix.
print " descripcion del dataframe"
print df.describe()
print " covarianza "
print df.cov()
print " correlación "
print df.corr()
print "*"*15
print " Creamos otro dataframe con valores aleatorios (1000 filas y 2 columnas "
print " DataFrame(np.random.randn(1000,2),columns=['x','y'])"
# Second frame: 1000 rows, columns "x" and "y", filled from np.random.randn.
plot_df = DataFrame(np.random.randn(1000,2),columns=['x','y'])
print plot_df
print "Mostramos las graficas"
# Line plot followed by per-column histograms.
# NOTE(review): no show() call is visible after these two lines; whether the
# figures appear presumably depends on the backend or interactive mode - confirm.
plot_df.plot()
plot_df.hist()
df.loc[l[i], 'Growth'] = (df.loc[l[i-1], 'Growth'] + trade_value) * (1 + df.loc[l[i], 'IVWReturn']) df.loc[l[i], 'Value'] = (df.loc[l[i-1], 'Value'] - trade_value) * (1 + df.loc[l[i], 'IVEReturn']) df.loc[l[i], 'InvestmentTotal'] = df.loc[l[i], 'Value'] + df.loc[l[i], 'Growth'] df.loc[l[i], 'Growth%'] = df.loc[l[i],'Growth']/df.loc[l[i],'InvestmentTotal'] df.loc[l[i], 'Value%'] = df.loc[l[i],'Value']/df.loc[l[i], 'InvestmentTotal'] df.loc[l[i], 'Total%'] = df.loc[l[i], 'Growth%'] + df.loc[l[i], 'Value%'] final_value = df.loc[l[len(l)-1], 'InvestmentTotal'] - df.loc[l[len(l)-1],'SP500Total'] if math.isnan(final_value) == True: print("result removed it was nan") test = test + 1 else: results.append(final_value) print(results) test = test - 1 dg = pd.Series(results, name = 'Results') dff = DataFrame(dg) print(dff) dff.hist() plt.show() file = ExcelWriter('ValueGrowth.xlsx') df.to_excel(file, 'Data') file.close() os.startfile('ValueGrowth.xlsx') df.plot(y = ['SP500Total', 'InvestmentTotal']) plt.show()
# In[12]: Y=DataFrame(Y) # In[13]: Y.head(1) ### Univariate analysis # In[14]: X.hist() ##### These histograms depicts the distribution of the 4 independent variables ##### We can do this analysis using just 1 variable also # In[15]: X[0].hist() ##### we can also get the stats of that variable # In[16]:
# NOTE(review): the statements down to ``history.append(obs)`` read ``i``,
# ``diff``, ``history`` and ``test`` without defining them here; they look like
# the tail of a walk-forward loop body whose header lies outside this chunk.
# They are kept flat and otherwise untouched - re-indent under that loop when
# merging, and confirm.
model = ARIMA(diff, order=(6,1,2))
model_fit = model.fit(trend='nc', disp=0)
yhat = model_fit.forecast()[0]
yhat = inverse_difference(history, yhat, days_in_month)
predictions.append(yhat)
# observation
obs = test[i]
history.append(obs)
# errors: observed minus predicted, one row per test point
residuals = DataFrame([test[i]-predictions[i] for i in range(len(test))])
print(residuals.describe())
# plot: histogram on the top panel, kernel density estimate on the bottom panel
plt.figure(figsize=(36, 36))
plt.subplot(211)
residuals.hist(ax=plt.gca())
plt.subplot(212)
residuals.plot(kind='kde', ax=plt.gca())
plt.show()


def difference(dataset, interval=1):
    """Return the lagged differences of ``dataset`` as a list.

    Element ``k`` of the result is ``dataset[k + interval] - dataset[k]``, so
    the result has ``len(dataset) - interval`` entries (none when the data is
    not longer than ``interval``).
    """
    return [
        dataset[position] - dataset[position - interval]
        for position in range(interval, len(dataset))
    ]
def show_histogram(df: pd.DataFrame, column) -> None:
    """Draw a histogram for ``column`` of ``df`` and display it.

    Args:
        df: Frame holding the data.
        column: Forwarded unchanged as the ``column`` argument of ``df.hist``.
    """
    df.hist(column=column)
    # Display the figure that the hist call just drew.
    plt.show()
def test_histtype_argument(self, histtype, expected):
    """GH23992: check that ``histtype`` passed to a grouped ``DataFrame.hist`` decides whether patches are filled."""
    values = np.random.randint(1, 10, size=(100, 2))
    frame = DataFrame(values, columns=["a", "b"])
    axes = frame.hist(by="a", histtype=histtype)
    self._check_patches_all_filled(axes, filled=expected)
def decile_for_each(df: pd.DataFrame, columns_for_show: list, decile: int):
    """Draw a 50-bin histogram for each requested column on a 20x20 figure.

    Args:
        df: Frame holding the data.
        columns_for_show: Forwarded as the ``column`` argument of ``df.hist``.
        decile: Accepted for interface compatibility; the body does not use it.
    """
    bin_count = 50
    figure_size = (20, 20)
    df.hist(column=columns_for_show, bins=bin_count, figsize=figure_size)
def test_grouped_hist_legacy(self):
    """Exercise grouped histograms through ``_grouped_hist`` and ``DataFrame.hist(by=...)``."""
    from matplotlib.patches import Rectangle

    from pandas.plotting._matplotlib.hist import _grouped_hist

    n_rows = 500
    df = DataFrame(np.random.randn(n_rows, 1), columns=["A"])
    df["B"] = to_datetime(
        np.random.randint(
            self.start_date_to_int64,
            self.end_date_to_int64,
            size=n_rows,
            dtype=np.int64,
        ))
    df["C"] = np.random.randint(0, 4, n_rows)
    df["D"] = ["X"] * n_rows

    # grouping by "C" is expected to give four axes on a 2x2 grid
    axes = _grouped_hist(df.A, by=df.C)
    self._check_axes_shape(axes, axes_num=4, layout=(2, 2))

    tm.close()
    axes = df.hist(by=df.C)
    self._check_axes_shape(axes, axes_num=4, layout=(2, 2))

    tm.close()
    # group by a key with single value
    axes = df.hist(by="D", rot=30)
    self._check_axes_shape(axes, axes_num=1, layout=(1, 1))
    self._check_ticks_props(axes, xrot=30)

    tm.close()
    # make sure kwargs to hist are handled
    tick_props = {"xlabelsize": 20, "xrot": 30, "ylabelsize": 18, "yrot": 40}
    axes = _grouped_hist(
        df.A,
        by=df.C,
        cumulative=True,
        bins=4,
        density=True,
        **tick_props,
    )
    # in every group the last Rectangle of the cumulative, density-normalised
    # plot must have height 1.0
    for group_ax in axes.ravel():
        bars = [
            child for child in group_ax.get_children()
            if isinstance(child, Rectangle)
        ]
        tm.assert_almost_equal(bars[-1].get_height(), 1.0)
    self._check_ticks_props(axes, **tick_props)

    tm.close()
    axes = _grouped_hist(df.A, by=df.C, log=True)
    # scale of y must be 'log'
    self._check_ax_scales(axes, yaxis="log")

    tm.close()
    # propagate attr exception from matplotlib.Axes.hist
    with tm.external_error_raised(AttributeError):
        _grouped_hist(df.A, by=df.C, foo="bar")

    # a non-tuple figsize is rejected with a dedicated message
    with pytest.raises(ValueError, match="Specify figure size by tuple instead"):
        df.hist(by="C", figsize="default")
})

# Summary statistics.
# NOTE(review): the original remark says describe() "only shows numbers",
# i.e. presumably only the numeric columns - confirm.
df1.describe()

# Select two columns by label.
# The author notes that gh.ix[:,['float_col', 'int_col']] is a less elegant way to do this.
df1[['float_col', 'int_col']]
# NOTE(review): the result of fillna is not assigned, so df1 itself presumably
# still holds its missing values after this line - confirm intent.
df1.fillna(value="waiting")
# Derived column: element-wise ratio of the two columns.
df1['div_col'] = df1['float_col'] / df1['int_col']
# Store the mean of 'rev_col' in a new column (the same value on every row).
mean = df1['rev_col'].mean()
df1['mean_col'] = mean

# Outer merge of the two frames on 'str_col'.
new = pd.merge(df1, df2, how='outer', on='str_col')

# Quick plotting: 1000 rows, columns 'x' and 'y', filled from np.random.randn.
import numpy as np
plot_df = DataFrame(np.random.randn(1000, 2), columns=['x', 'y'])
plot_df.hist()
plot_df.plot()

# Series: a 1-dimensional labelled object.
days = ['mon', 'tues', 'weds', 'thurs', 'fri', 'sat', 'sun']
ratings = ['meh', 'erg', 'ugh', 'ok', 'alright', 'yauh', "d'oh"]
# Positional order is (data, index): ``days`` are the values, ``ratings`` the labels.
s1 = Series(days, ratings, name="what days are")

# date_range is a convenient way to build a date index for sample data.
s2 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20130102', periods=6))
# Residual Plot # # We would expect the plot to be random around the value of 0 and not show any trend or cyclic structure. # we are interested in the mean value of the residual errors. A value close to zero suggests no bias in the forecasts, whereas positive and negative values suggest a positive or negative bias in the forecasts made. # In[58]: # plot residual errors residuals = DataFrame(model_fit.resid) residuals.plot() plt.show() residuals.plot(kind='kde') plt.show() print(residuals.describe()) # histogram plot residuals.hist() plt.show() # Residual Statistics shows a mean error value close to zero(0.06), but perhaps not close enough. # In[59]: autocorrelation_plot(residuals) plt.show() # # AUTO ARIMAX # In[131]: split = len( ts_df_key) * 0.80 #split data in test and train 20% and 80% respectively.