Beispiel #1
0
    def test_axis_shared(self):
        """Check which axes a grouped histogram joins for ``sharex``/``sharey``.

        Regression test for GH4089. Three cases are covered: sharing only x,
        sharing only y, and sharing both.
        """
        import matplotlib.pyplot as plt

        n = 100
        df = DataFrame(
            {
                "gender": np.array(["Male", "Female"])[random.randint(2, size=n)],
                "height": random.normal(66, 4, size=n),
                "weight": random.normal(161, 32, size=n),
            }
        )

        # sharex only: x axes are joined, y axes stay independent
        ax1, ax2 = df.hist(column="height", by=df.gender, sharex=True)
        self.assertTrue(ax1._shared_x_axes.joined(ax1, ax2))
        self.assertFalse(ax1._shared_y_axes.joined(ax1, ax2))
        self.assertTrue(ax2._shared_x_axes.joined(ax1, ax2))
        self.assertFalse(ax2._shared_y_axes.joined(ax1, ax2))
        plt.close("all")

        # sharey only: y axes are joined, x axes stay independent
        ax1, ax2 = df.hist(column="height", by=df.gender, sharey=True)
        self.assertFalse(ax1._shared_x_axes.joined(ax1, ax2))
        self.assertTrue(ax1._shared_y_axes.joined(ax1, ax2))
        self.assertFalse(ax2._shared_x_axes.joined(ax1, ax2))
        self.assertTrue(ax2._shared_y_axes.joined(ax1, ax2))
        plt.close("all")

        # both flags: x and y axes are joined
        ax1, ax2 = df.hist(column="height", by=df.gender, sharex=True, sharey=True)
        self.assertTrue(ax1._shared_x_axes.joined(ax1, ax2))
        self.assertTrue(ax1._shared_y_axes.joined(ax1, ax2))
        self.assertTrue(ax2._shared_x_axes.joined(ax1, ax2))
        self.assertTrue(ax2._shared_y_axes.joined(ax1, ax2))
        # close the last figures too so they do not leak into later tests
        plt.close("all")
Beispiel #2
0
    def test_axis_shared(self):
        """Verify axis sharing of grouped histograms (GH4089).

        Plots ``height`` grouped by ``gender`` with ``sharex``, ``sharey`` and
        both, and checks which of the two resulting axes are joined.
        """
        import matplotlib.pyplot as plt

        n = 100
        df = DataFrame({'gender': np.array(['Male', 'Female'])[random.randint(2, size=n)],
                        'height': random.normal(66, 4, size=n),
                        'weight': random.normal(161, 32, size=n)})

        # only the x axis is shared
        ax1, ax2 = df.hist(column='height', by=df.gender, sharex=True)
        self.assertTrue(ax1._shared_x_axes.joined(ax1, ax2))
        self.assertFalse(ax1._shared_y_axes.joined(ax1, ax2))
        self.assertTrue(ax2._shared_x_axes.joined(ax1, ax2))
        self.assertFalse(ax2._shared_y_axes.joined(ax1, ax2))
        plt.close('all')

        # only the y axis is shared
        ax1, ax2 = df.hist(column='height', by=df.gender, sharey=True)
        self.assertFalse(ax1._shared_x_axes.joined(ax1, ax2))
        self.assertTrue(ax1._shared_y_axes.joined(ax1, ax2))
        self.assertFalse(ax2._shared_x_axes.joined(ax1, ax2))
        self.assertTrue(ax2._shared_y_axes.joined(ax1, ax2))
        plt.close('all')

        # both axes are shared
        ax1, ax2 = df.hist(column='height', by=df.gender, sharex=True,
                           sharey=True)
        self.assertTrue(ax1._shared_x_axes.joined(ax1, ax2))
        self.assertTrue(ax1._shared_y_axes.joined(ax1, ax2))
        self.assertTrue(ax2._shared_x_axes.joined(ax1, ax2))
        self.assertTrue(ax2._shared_y_axes.joined(ax1, ax2))
        # release the figures created by the last case as well
        plt.close('all')
Beispiel #3
0
 def test_grouped_hist_layout(self):
     """Grouped ``DataFrame.hist`` rejects bad layouts and honours valid ones."""
     import matplotlib.pyplot as plt

     n = 100
     genders = np.array(['Male', 'Female'])
     df = DataFrame({'gender': genders[random.randint(2, size=n)],
                     'height': random.normal(66, 4, size=n),
                     'weight': random.normal(161, 32, size=n),
                     'category': random.randint(4, size=n)})

     # (column, grouping key, layout) combinations that must raise ValueError
     rejected = (
         ('weight', 'gender', (1, 1)),
         ('weight', 'gender', (1,)),
         ('height', 'category', (1, 3)),
         ('height', 'category', (2, 1)),
     )
     for column, key, layout in rejected:
         with self.assertRaises(ValueError):
             df.hist(column=column, by=df[key], layout=layout)

     # (grouping key, layout, expected shape of the returned axes array)
     accepted = (
         ('gender', (2, 1), (2,)),
         ('category', (4, 1), (4,)),
         ('category', (4, 2), (4, 2)),
     )
     for position, (key, layout, shape) in enumerate(accepted):
         if position:
             # figures are closed between cases, not after the last one
             plt.close('all')
         axes = df.hist(column='height', by=df[key], layout=layout)
         self.assertEqual(axes.shape, shape)
Beispiel #4
0
    def test_hist(self):
        """Exercise ``DataFrame.hist``/``Series.hist``: layout, tick styling, kwargs."""
        import matplotlib.pyplot as plt

        def check_ticks(ax, xf, yf, xrot, yrot):
            # compare the first tick label on each axis with the requested style
            ytick = ax.get_yticklabels()[0]
            xtick = ax.get_xticklabels()[0]
            self.assertAlmostEqual(ytick.get_fontsize(), yf)
            self.assertAlmostEqual(ytick.get_rotation(), yrot)
            self.assertAlmostEqual(xtick.get_fontsize(), xf)
            self.assertAlmostEqual(xtick.get_rotation(), xrot)

        df = DataFrame(np.random.randn(100, 4))
        _check_plot_works(df.hist)
        _check_plot_works(df.hist, grid=False)

        # make sure layout is handled: with 3 columns the axes at [1, 1]
        # has nothing to show and must be hidden
        df = DataFrame(np.random.randn(100, 3))
        _check_plot_works(df.hist)
        axes = df.hist(grid=False)
        self.assertFalse(axes[1, 1].get_visible())

        df = DataFrame(np.random.randn(100, 1))
        _check_plot_works(df.hist)

        # make sure layout is handled
        df = DataFrame(np.random.randn(100, 6))
        _check_plot_works(df.hist)

        # make sure sharex, sharey is handled
        _check_plot_works(df.hist, sharex=True, sharey=True)

        # make sure xlabelsize and xrot are handled; the rotation variables
        # are passed through so call and expectation cannot drift apart
        ser = df[0]
        xf, yf = 20, 20
        xrot, yrot = 30, 30
        ax = ser.hist(xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot)
        check_ticks(ax, xf, yf, xrot, yrot)

        axes = df.hist(xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot)
        for i, ax in enumerate(axes.ravel()):
            # only the first len(df.columns) axes hold a histogram
            if i < len(df.columns):
                check_ticks(ax, xf, yf, xrot, yrot)

        plt.close('all')
        # make sure kwargs to hist are handled
        # NOTE(review): Matplotlib replaced `normed` with `density`; confirm
        # which versions this test has to support.
        ax = ser.hist(normed=True, cumulative=True, bins=4)
        # the last bar of a normalised cumulative histogram must reach 1.0;
        # read it from ax.patches instead of a fixed get_children() index,
        # whose ordering is a Matplotlib implementation detail
        self.assertAlmostEqual(ax.patches[-1].get_height(), 1.0)

        plt.close('all')
        ax = ser.hist(log=True)
        # scale of y must be 'log'
        self.assertEqual(ax.get_yscale(), 'log')

        plt.close('all')
        # propagate attr exception from matplotlib.Axes.hist
        self.assertRaises(AttributeError, ser.hist, foo='bar')
Beispiel #5
0
    def test_hist_layout(self):
        """``DataFrame.hist`` returns an axes grid matching the ``layout`` argument."""
        import matplotlib.pyplot as plt

        frame = DataFrame(randn(100, 4))

        # (requested layout, expected (rows, columns)); None falls back to 2x2
        cases = (
            (None, (2, 2)),
            ((2, 2), (2, 2)),
            ((4, 1), (4, 1)),
            ((1, 4), (1, 4)),
            ((3, 3), (3, 3)),
        )
        for layout, (n_rows, n_cols) in cases:
            grid = frame.hist(layout=layout)
            self.assertEqual(len(grid), n_rows)
            self.assertEqual(len(grid[0]), n_cols)

        # (1, 1) is too small for all 4 plots; (1,) is not a valid layout format
        for bad_layout in ((1, 1), (1,)):
            with tm.assertRaises(ValueError):
                frame.hist(layout=bad_layout)
    def test_grouped_hist_legacy(self):
        """Grouped histograms: axes layout, tick properties and kwarg forwarding."""
        from matplotlib.patches import Rectangle

        frame = DataFrame(randn(500, 2), columns=['A', 'B'])
        frame['C'] = np.random.randint(0, 4, 500)
        frame['D'] = ['X'] * 500

        grid = grouped_hist(frame.A, by=frame.C)
        self._check_axes_shape(grid, axes_num=4, layout=(2, 2))
        tm.close()

        grid = frame.hist(by=frame.C)
        self._check_axes_shape(grid, axes_num=4, layout=(2, 2))
        tm.close()

        # grouping by a key that holds a single value yields one axes
        grid = frame.hist(by='D', rot=30)
        self._check_axes_shape(grid, axes_num=1, layout=(1, 1))
        self._check_ticks_props(grid, xrot=30)
        tm.close()

        # kwargs must be forwarded to the underlying hist call
        x_size, y_size = 20, 18
        x_rot, y_rot = 30, 40
        # the normalisation keyword depends on the installed matplotlib
        norm_keyword = "density" if _mpl_ge_2_2_0() else "normed"

        grid = grouped_hist(frame.A, by=frame.C, cumulative=True,
                            bins=4, xlabelsize=x_size, xrot=x_rot,
                            ylabelsize=y_size, yrot=y_rot,
                            **{norm_keyword: True})
        # the last Rectangle child of every axes must have height 1.0
        for ax in grid.ravel():
            bars = [child for child in ax.get_children()
                    if isinstance(child, Rectangle)]
            tm.assert_almost_equal(bars[-1].get_height(), 1.0)
        self._check_ticks_props(grid, xlabelsize=x_size, xrot=x_rot,
                                ylabelsize=y_size, yrot=y_rot)
        tm.close()

        # log=True must switch the y axis to a log scale
        grid = grouped_hist(frame.A, by=frame.C, log=True)
        self._check_ax_scales(grid, yaxis='log')
        tm.close()

        # an unknown keyword surfaces as AttributeError
        with pytest.raises(AttributeError):
            grouped_hist(frame.A, by=frame.C, foo='bar')

        with tm.assert_produces_warning(FutureWarning):
            frame.hist(by='C', figsize='default')
Beispiel #7
0
    def test_grouped_hist(self):
        """``grouped_hist``/``DataFrame.hist(by=...)`` draw one histogram per group."""
        df = DataFrame(randn(500, 2), columns=["A", "B"])
        df["C"] = np.random.randint(0, 4, 500)
        axes = plotting.grouped_hist(df.A, by=df.C)
        self.assertEqual(len(axes.ravel()), 4)

        tm.close()
        axes = df.hist(by=df.C)
        self.assertEqual(axes.ndim, 2)
        self.assertEqual(len(axes.ravel()), 4)

        # every axes must contain at least one bar
        for ax in axes.ravel():
            self.assertGreater(len(ax.patches), 0)

        tm.close()
        # make sure kwargs to hist are handled
        # NOTE(review): Matplotlib replaced `normed` with `density`; confirm
        # which versions this test has to support.
        axes = plotting.grouped_hist(df.A, by=df.C, normed=True, cumulative=True, bins=4)

        # the last bar of each normalised cumulative histogram must reach 1.0;
        # use ax.patches rather than a fixed index into get_children(), whose
        # ordering is a Matplotlib implementation detail
        for ax in axes.ravel():
            height = ax.patches[-1].get_height()
            self.assertAlmostEqual(height, 1.0)

        tm.close()
        axes = plotting.grouped_hist(df.A, by=df.C, log=True)
        # scale of y must be 'log'
        for ax in axes.ravel():
            self.assertEqual(ax.get_yscale(), "log")

        tm.close()
        # propagate attr exception from matplotlib.Axes.hist
        with tm.assertRaises(AttributeError):
            plotting.grouped_hist(df.A, by=df.C, foo="bar")
Beispiel #8
0
    def test_grouped_hist(self):
        """Grouped histograms produce one populated axes per group and honour kwargs."""
        import matplotlib.pyplot as plt
        df = DataFrame(np.random.randn(500, 2), columns=['A', 'B'])
        df['C'] = np.random.randint(0, 4, 500)
        axes = plotting.grouped_hist(df.A, by=df.C)
        self.assertEqual(len(axes.ravel()), 4)

        plt.close('all')
        axes = df.hist(by=df.C)
        self.assertEqual(axes.ndim, 2)
        self.assertEqual(len(axes.ravel()), 4)

        # every axes must contain at least one bar
        for ax in axes.ravel():
            self.assertGreater(len(ax.patches), 0)

        plt.close('all')
        # make sure kwargs to hist are handled
        # NOTE(review): Matplotlib replaced `normed` with `density`; confirm
        # which versions this test has to support.
        axes = plotting.grouped_hist(df.A, by=df.C, normed=True,
                                     cumulative=True, bins=4)

        # the last bar of each normalised cumulative histogram must reach 1.0;
        # ax.patches is used because the ordering of get_children() is a
        # Matplotlib implementation detail
        for ax in axes.ravel():
            height = ax.patches[-1].get_height()
            self.assertAlmostEqual(height, 1.0)

        plt.close('all')
        axes = plotting.grouped_hist(df.A, by=df.C, log=True)
        # scale of y must be 'log'
        for ax in axes.ravel():
            self.assertEqual(ax.get_yscale(), 'log')

        plt.close('all')
        # propagate attr exception from matplotlib.Axes.hist
        self.assertRaises(AttributeError, plotting.grouped_hist, df.A,
                          by=df.C, foo='bar')
    def test_hist_df_legacy(self):
        """``DataFrame.hist``/``Series.hist``: layout, tick properties, kwargs."""
        from matplotlib.patches import Rectangle

        _check_plot_works(self.hist_df.hist)

        # three columns: the [1, 1] axes of the 2x2 grid must be hidden
        three_columns = DataFrame(randn(100, 3))
        grid = _check_plot_works(three_columns.hist, grid=False)
        self._check_axes_shape(grid, axes_num=3, layout=(2, 2))
        self.assertFalse(grid[1, 1].get_visible())

        single_column = DataFrame(randn(100, 1))
        _check_plot_works(single_column.hist)

        # six columns on an explicit 4x2 layout
        df = DataFrame(randn(100, 6))
        grid = _check_plot_works(df.hist, layout=(4, 2))
        self._check_axes_shape(grid, axes_num=6, layout=(4, 2))

        # shared axes, figsize and bins must each be accepted
        for extra in ({'sharex': True, 'sharey': True},
                      {'figsize': (8, 10)},
                      {'bins': 5}):
            _check_plot_works(df.hist, **extra)

        # label size and rotation, first on a Series, then on the frame
        ser = df[0]
        tick_props = dict(xlabelsize=20, xrot=30, ylabelsize=18, yrot=40)
        for draw in (ser.hist, df.hist):
            drawn = draw(**tick_props)
            self._check_ticks_props(drawn, **tick_props)

        tm.close()
        # kwargs must reach the underlying hist call
        ax = ser.hist(normed=True, cumulative=True, bins=4)
        # the last Rectangle child must have height 1.0
        bars = [child for child in ax.get_children()
                if isinstance(child, Rectangle)]
        self.assertAlmostEqual(bars[-1].get_height(), 1.0)

        tm.close()
        ax = ser.hist(log=True)
        # log=True must give a log-scaled y axis
        self._check_ax_scales(ax, yaxis='log')

        tm.close()

        # an unknown keyword surfaces as AttributeError
        with tm.assertRaises(AttributeError):
            ser.hist(foo='bar')
    def test_hist_layout(self):
        """``DataFrame.hist`` resolves ``layout`` (including -1 entries) to a grid shape."""
        frame = DataFrame(randn(100, 3))

        # (requested layout, expected grid); None falls back to 2x2
        cases = (
            (None, (2, 2)),
            ((2, 2), (2, 2)),
            ((4, 1), (4, 1)),
            ((1, 4), (1, 4)),
            ((3, 3), (3, 3)),
            ((-1, 4), (1, 4)),
            ((4, -1), (4, 1)),
            ((-1, 2), (2, 2)),
            ((2, -1), (2, 2)),
        )
        for requested, expected in cases:
            grid = frame.hist(layout=requested)
            self._check_axes_shape(grid, axes_num=3, layout=expected)

        # (1, 1) cannot hold the three plots; (1,) and (-1, -1) are invalid
        # layout specifications
        for rejected in ((1, 1), (1,), (-1, -1)):
            with tm.assertRaises(ValueError):
                frame.hist(layout=rejected)
    def test_grouped_hist_legacy(self):
        """Grouped histograms: axes layout, tick properties and kwarg forwarding."""
        df = DataFrame(randn(500, 2), columns=['A', 'B'])
        df['C'] = np.random.randint(0, 4, 500)
        df['D'] = ['X'] * 500

        axes = plotting.grouped_hist(df.A, by=df.C)
        self._check_axes_shape(axes, axes_num=4, layout=(2, 2))

        tm.close()
        axes = df.hist(by=df.C)
        self._check_axes_shape(axes, axes_num=4, layout=(2, 2))

        tm.close()
        # group by a key with single value
        axes = df.hist(by='D', rot=30)
        self._check_axes_shape(axes, axes_num=1, layout=(1, 1))
        self._check_ticks_props(axes, xrot=30)

        tm.close()
        # make sure kwargs to hist are handled
        xf, yf = 20, 18
        xrot, yrot = 30, 40
        # NOTE(review): Matplotlib replaced `normed` with `density`; confirm
        # which versions this test has to support.
        axes = plotting.grouped_hist(df.A, by=df.C, normed=True,
                                     cumulative=True, bins=4,
                                     xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot)
        # the last bar of each normalised cumulative histogram must reach 1.0;
        # ax.patches is used instead of get_children()[5] because the child
        # ordering is a Matplotlib implementation detail
        for ax in axes.ravel():
            height = ax.patches[-1].get_height()
            self.assertAlmostEqual(height, 1.0)
        self._check_ticks_props(axes, xlabelsize=xf, xrot=xrot,
                                ylabelsize=yf, yrot=yrot)

        tm.close()
        axes = plotting.grouped_hist(df.A, by=df.C, log=True)
        # scale of y must be 'log'
        self._check_ax_scales(axes, yaxis='log')

        tm.close()
        # propagate attr exception from matplotlib.Axes.hist
        with tm.assertRaises(AttributeError):
            plotting.grouped_hist(df.A, by=df.C, foo='bar')

        with tm.assert_produces_warning(FutureWarning):
            df.hist(by='C', figsize='default')
Beispiel #12
0
    def test_hist(self):
        """Exercise ``DataFrame.hist``/``Series.hist`` layout handling and tick styling."""

        def check_ticks(ax, xf, yf, xrot, yrot):
            # compare the first tick label on each axis with the requested style
            ytick = ax.get_yticklabels()[0]
            xtick = ax.get_xticklabels()[0]
            self.assertAlmostEqual(ytick.get_fontsize(), yf)
            self.assertAlmostEqual(ytick.get_rotation(), yrot)
            self.assertAlmostEqual(xtick.get_fontsize(), xf)
            self.assertAlmostEqual(xtick.get_rotation(), xrot)

        df = DataFrame(np.random.randn(100, 4))
        _check_plot_works(df.hist)
        _check_plot_works(df.hist, grid=False)

        # make sure layout is handled: with 3 columns the axes at [1, 1]
        # has nothing to show and must be hidden
        df = DataFrame(np.random.randn(100, 3))
        _check_plot_works(df.hist)
        axes = df.hist(grid=False)
        self.assertFalse(axes[1, 1].get_visible())

        df = DataFrame(np.random.randn(100, 1))
        _check_plot_works(df.hist)

        # make sure layout is handled
        df = DataFrame(np.random.randn(100, 6))
        _check_plot_works(df.hist)

        # make sure sharex, sharey is handled
        _check_plot_works(df.hist, sharex=True, sharey=True)

        # make sure kwargs are handled; the rotation variables are passed
        # through so call and expectation cannot drift apart
        ser = df[0]
        xf, yf = 20, 20
        xrot, yrot = 30, 30
        ax = ser.hist(xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot)
        check_ticks(ax, xf, yf, xrot, yrot)

        axes = df.hist(xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot)
        for i, ax in enumerate(axes.ravel()):
            # only the first len(df.columns) axes hold a histogram
            if i < len(df.columns):
                check_ticks(ax, xf, yf, xrot, yrot)
Beispiel #13
0
    def test_grouped_hist_layout(self):
        """Grouped ``DataFrame.hist`` rejects bad layouts and honours valid ones."""
        import matplotlib.pyplot as plt

        n = 100
        gender_labels = np.array(["Male", "Female"])
        df = DataFrame(
            {
                "gender": gender_labels[random.randint(2, size=n)],
                "height": random.normal(66, 4, size=n),
                "weight": random.normal(161, 32, size=n),
                "category": random.randint(4, size=n),
            }
        )

        # (column, grouping key, layout) combinations that must raise ValueError
        rejected = (
            ("weight", "gender", (1, 1)),
            ("weight", "gender", (1,)),
            ("height", "category", (1, 3)),
            ("height", "category", (2, 1)),
        )
        for column, key, layout in rejected:
            with self.assertRaises(ValueError):
                df.hist(column=column, by=df[key], layout=layout)

        # (grouping key, layout, expected shape of the returned axes array)
        accepted = (
            ("gender", (2, 1), (2,)),
            ("category", (4, 1), (4,)),
            ("category", (4, 2), (4, 2)),
        )
        for position, (key, layout, shape) in enumerate(accepted):
            if position:
                # figures are closed between cases, not after the last one
                plt.close("all")
            axes = df.hist(column="height", by=df[key], layout=layout)
            self.assertEqual(axes.shape, shape)
Beispiel #14
0
    def test_axis_share_xy(self):
        """With ``sharex`` and ``sharey`` both set, the two group axes are joined on x and y."""
        n = 100
        df = DataFrame({'gender': tm.choice(['Male', 'Female'], size=n),
                        'height': random.normal(66, 4, size=n),
                        'weight': random.normal(161, 32, size=n)})
        first, second = df.hist(column='height', by=df.gender, sharex=True,
                                sharey=True)

        # x sharing, seen from either axes
        for ax in (first, second):
            self.assertTrue(ax._shared_x_axes.joined(first, second))

        # y sharing, seen from either axes
        for ax in (first, second):
            self.assertTrue(ax._shared_y_axes.joined(first, second))
    def test_grouped_hist(self):
        """Grouping by a four-valued key yields four populated histogram axes."""
        import matplotlib.pyplot as plt
        df = DataFrame(np.random.randn(500, 2), columns=['A', 'B'])
        df['C'] = np.random.randint(0, 4, 500)
        axes = plotting.grouped_hist(df.A, by=df.C)
        self.assertEqual(len(axes.ravel()), 4)

        plt.close('all')
        axes = df.hist(by=df.C)
        self.assertEqual(axes.ndim, 2)
        self.assertEqual(len(axes.ravel()), 4)

        # every axes must contain at least one bar
        for ax in axes.ravel():
            self.assertGreater(len(ax.patches), 0)

        # close the last figures too so they do not leak into later tests
        plt.close('all')
Beispiel #16
0
def gs(str, list):
    """Load selected CSV columns and show a series of quick-look plots.

    The parameter names shadow builtins; they are kept so existing keyword
    callers continue to work.

    Parameters
    ----------
    str : path or buffer handed to ``pd.read_csv``.
    list : column selection handed to ``read_csv`` as ``usecols``.

    The scatter plot and the histogram/box plots are best effort: if one of
    them cannot be drawn the failure is reported and the function carries on.
    """
    columns = list
    frame = DataFrame(pd.read_csv(str, usecols=columns))

    # Scatter of the first selected column against the second.
    try:
        plt.scatter(frame[columns[0]], frame[columns[1]], color='red')
        plt.show()
    except Exception as exc:
        # previously a bare ``except: pass``; still non-fatal, but no longer
        # silent and no longer swallowing KeyboardInterrupt/SystemExit
        print("scatter plot skipped: %s" % exc)

    # Histogram of every column, followed by a box plot.
    try:
        frame.hist()
        plt.show()

        frame.plot(kind='box', by=columns)
        plt.show()
    except Exception as exc:
        print("histogram/box plot skipped: %s" % exc)

    # Element-wise flag: True where np.isreal holds for the cell.
    is_real = frame.applymap(np.isreal)
    print(is_real)

    # The lookup key is the concatenation of the selected names, which equals
    # a column name when a single column was requested.
    key = ''.join(columns)
    # If at least one value in that column is not real, show a bar chart of
    # its value counts (drawn once).
    if not is_real[key].all():
        frame[key].value_counts().plot(kind='bar')
        plt.show()
Beispiel #17
0
    def test_axis_share_xy(self):
        """With ``sharex`` and ``sharey`` both set, the two group axes are joined on x and y."""
        n = 100
        df = DataFrame(
            {
                "gender": tm.choice(["Male", "Female"], size=n),
                "height": random.normal(66, 4, size=n),
                "weight": random.normal(161, 32, size=n),
            }
        )
        pair = df.hist(column="height", by=df.gender, sharex=True, sharey=True)
        left, right = pair

        # x sharing, seen from either axes
        for ax in (left, right):
            self.assertTrue(ax._shared_x_axes.joined(left, right))

        # y sharing, seen from either axes
        for ax in (left, right):
            self.assertTrue(ax._shared_y_axes.joined(left, right))
Beispiel #18
0
    def test_hist_layout(self):
        """``DataFrame.hist`` returns an axes grid matching the ``layout`` argument."""
        import matplotlib.pyplot as plt
        plt.close('all')
        df = DataFrame(randn(100, 4))

        layout_to_expected_size = (
            {'layout': None, 'expected_size': (2, 2)},  # default is 2x2
            {'layout': (2, 2), 'expected_size': (2, 2)},
            {'layout': (4, 1), 'expected_size': (4, 1)},
            {'layout': (1, 4), 'expected_size': (1, 4)},
            {'layout': (3, 3), 'expected_size': (3, 3)},
        )

        for layout_test in layout_to_expected_size:
            ax = df.hist(layout=layout_test['layout'])
            # assertEqual replaces the removed `assert_` alias and reports
            # both values when the grid shape is wrong
            self.assertEqual(len(ax), layout_test['expected_size'][0])
            self.assertEqual(len(ax[0]), layout_test['expected_size'][1])

        # layout too small for all 4 plots
        self.assertRaises(ValueError, df.hist, layout=(1, 1))

        # invalid format for layout
        self.assertRaises(ValueError, df.hist, layout=(1,))
Beispiel #19
0
    def test_hist_layout(self):
        """``DataFrame.hist`` returns an axes grid matching the ``layout`` argument."""
        import matplotlib.pyplot as plt

        plt.close("all")
        df = DataFrame(np.random.randn(100, 4))

        layout_to_expected_size = (
            {"layout": None, "expected_size": (2, 2)},  # default is 2x2
            {"layout": (2, 2), "expected_size": (2, 2)},
            {"layout": (4, 1), "expected_size": (4, 1)},
            {"layout": (1, 4), "expected_size": (1, 4)},
            {"layout": (3, 3), "expected_size": (3, 3)},
        )

        for layout_test in layout_to_expected_size:
            ax = df.hist(layout=layout_test["layout"])
            # assertEqual replaces the removed `assert_` alias and reports
            # both values when the grid shape is wrong
            self.assertEqual(len(ax), layout_test["expected_size"][0])
            self.assertEqual(len(ax[0]), layout_test["expected_size"][1])

        # layout too small for all 4 plots
        self.assertRaises(ValueError, df.hist, layout=(1, 1))

        # invalid format for layout
        self.assertRaises(ValueError, df.hist, layout=(1,))
Beispiel #20
0
'''

import pandas
from pandas import Series, DataFrame
import code
import numpy as np
import matplotlib.pyplot as plt
import sys
import csv

if __name__ == '__main__':
	# The CSV path is the first command-line argument, defaulting to
	# "stats.csv". Every row must hold exactly two numeric fields.
	with open("stats.csv" if len(sys.argv) < 2 else sys.argv[1]) as f:
		reader = csv.reader(f)
		data = [(float(peak), float(iqr)) for peak, iqr in reader]

	# Split the (peak, iqr) pairs into one Series per column. Comprehensions
	# are used because indexing the result of zip() only works on Python 2,
	# where zip returns a list.
	md = Series([peak for peak, _ in data])
	iqrd = Series([iqr for _, iqr in data])

	df = DataFrame(data=dict(max_deltas=md, iqr_deltas=iqrd))

	#log_df = np.log(df) / np.log(2)
	#log_df.columns = ["lg {}".format(foo) for foo in log_df.columns]
	#log_df.hist(normed=True)

	# NOTE(review): Matplotlib replaced `normed` with `density`; switch the
	# keyword if this script has to run on a current Matplotlib.
	df.hist(normed=True)

	plt.show()

	# Drop into an interactive console with the script's variables available.
	code.interact(local=locals())

Beispiel #21
0
# Print the correlation matrix of df's columns.
print(df.corr())

### 8. Merge and Join ###

print(df)
# Second frame with a 'str_col' key column to merge against
# (presumably df has a 'str_col' column as well; defined outside this excerpt).
other = DataFrame({'str_col': ['a', 'b'], 'some_val': [1, 2]})
print(other)
# The same merge on 'str_col', once per join type: inner, outer, left, right.
print(pd.merge(df, other, on='str_col', how='inner'))
print(pd.merge(df, other, on='str_col', how='outer'))
print(pd.merge(df, other, on='str_col', how='left'))
print(pd.merge(df, other, on='str_col', how='right'))

### 9. Plot ###

# 1000 rows of random normal samples in columns 'x' and 'y'; 'y' is then
# shifted up by 1.
plot_df = DataFrame(np.random.randn(1000, 2), columns=['x', 'y'])
plot_df['y'] = plot_df['y'].map(lambda x: x + 1)

# NOTE(review): no Matplotlib show() call follows either plot in this excerpt,
# which is presumably why nothing is displayed when run as a script - confirm.
plot_df.plot()  ### plot not working???? ###

plot_df.hist()  ### plot not working???? ###

### 10. Scikit-learn conversion ###

print(df)

# All rows, every column except the last, as a NumPy array.
print(df.values[:, :-1])

# NOTE(review): the commented-out line below uses a bare `float32`; presumably
# it failed because that name is not defined (np.float32 would be) - confirm.
#print(df.values[:,:-1].astype(float32)) not working?

# Wait for a line of input before the script exits.
input()
Beispiel #22
0
                }
            }]))), "users with tweets")

# And finally we plot the lexical diversity:

# In[38]:

# Aggregate over db_restT.diversity, projecting each document down to
# 'name' and 'lex_div' (taken from 'lexical_diversity') and dropping '_id'.
cursor = dbclient.db_restT.diversity.aggregate([{
    '$project': {
        '_id': 0,
        'name': '$name',
        'lex_div': '$lexical_diversity'
    }
}])
# Materialise the cursor into a DataFrame and draw a 50-bin histogram of
# the 'lex_div' column.
lex_div = DataFrame(list(cursor))
lex_div.hist('lex_div', bins=50)

# 2.3: Track unfollows
# --
# Write a Python program to create a db called db_followers that stores all the followers of all the users found in task 2.1. Then write a program to find the un-followed friends after a week for the top 10 users (the users with the highest number of followers in task 2.1), counted from the time the tweets were extracted. In other words, look at the people following the top 10 users at time X (when the tweets were extracted), then at the people following the same top 10 users at a later time Y (one week after X), to see who stopped following the top 10 users.

# First, make a db/table of top RT'ed users in which to store follower stats

# In[39]:

# Start from a clean state: drop any existing 'db_followers' database.
dbclient.drop_database('db_followers')
# Accumulator list, filled by the loop that follows.
rows = []
for row in dbclient.db_tweets.top_retweets.aggregate([{
        '$group': {
            '_id': '$user.id',
            'name': {
Beispiel #23
0
    def test_hist_layout(self):
        """``DataFrame.hist`` resolves ``layout`` for a frame with a datetime column."""
        frame = DataFrame(np.random.randn(100, 2))
        # third column: datetimes drawn from the fixture's int64 date range
        stamps = np.random.randint(
            self.start_date_to_int64,
            self.end_date_to_int64,
            size=100,
            dtype=np.int64,
        )
        frame[2] = to_datetime(stamps)

        # (requested layout, expected grid); None falls back to 2x2
        cases = (
            (None, (2, 2)),
            ((2, 2), (2, 2)),
            ((4, 1), (4, 1)),
            ((1, 4), (1, 4)),
            ((3, 3), (3, 3)),
            ((-1, 4), (1, 4)),
            ((4, -1), (4, 1)),
            ((-1, 2), (2, 2)),
            ((2, -1), (2, 2)),
        )
        for requested, expected in cases:
            grid = frame.hist(layout=requested)
            self._check_axes_shape(grid, axes_num=3, layout=expected)

        # (1, 1) cannot hold the three plots; (1,) and (-1, -1) are invalid
        # layout specifications
        for rejected in ((1, 1), (1, ), (-1, -1)):
            with pytest.raises(ValueError):
                frame.hist(layout=rejected)
Beispiel #24
0
#plt.title("Principal Component Analysis")
#plt.savefig("PCA_July24_2_components")

#plt.show()

# Put the first two columns of X_pca into a frame, labelled 'good' and 'bad'.
# NOTE(review): the labels suggest each component maps to a class; confirm
# that this is what the columns of X_pca actually represent.
DS = DataFrame({'good': X_pca[:, 0], 'bad': X_pca[:, 1]})
#DS.plot.scatter(DS['good'], DS['bad'])
#plt.scatter(X_pca[:,0], X_pca[:,1], y_train)
#plt.show()
#print DS

#from pandas.plotting import scatter_matrix

## df = pd.DataFrame(np.random.randn(1000, 4), columns=['a', 'b', 'c', 'd'])

# Histogram of both columns with 1000 bins and grid lines.
# NOTE(review): plt.title acts on the current axes only, so presumably just
# one of the two subplots receives the title - confirm.
DS.hist(grid=True, bins=1000)
plt.title(" Histogram of Good v. Bad ")
plt.show()

#s = cPickle.dumps(cls)
#sw = open('decisionTree_pickle','w')
#sw.write(s)

#print 'completed saving pickle file of Decision Tree model  '

### now on to the PCA analysis...
###

# Apply the `pca` transform to the training and test features
# (presumably `pca` was fitted earlier, outside this excerpt).
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
def _plot_metric_timeseries(metrics, labels, colors, style,
                            figsize_single, figsize_subplots, xlim=None):
    """Draw the metric time series on one plot, or one subplot per metric.

    Fewer than four metrics share a single axes; otherwise each metric gets
    its own subplot labelled with the metric name. ``xlim`` is only passed on
    to pandas when given.
    """
    options = dict(sharex=True, grid=True, sharey=True, markersize=10,
                   color=colors, style=style)
    if xlim is not None:
        options["xlim"] = xlim

    if len(labels) < 4:
        ax = metrics.plot(legend=True, figsize=figsize_single, **options)
        ax.set_ylabel("Value")
    else:
        axes = metrics.plot(subplots=True, legend=False,
                            figsize=figsize_subplots, **options)
        for i, label in enumerate(labels):
            axes[i].set_ylabel(label)

    pyplot.xlabel("Segment number")


def plot(prefix, header, row):
    """Plot the per-segment metrics of one job and save the figures.

    ``header`` and ``row`` are zipped into a mapping. The ``jobid`` entry
    names the job; every other entry is a metric whose value is a
    ``:``-separated list of numbers, one per segment. Metrics that sum to
    zero are skipped.

    Three figures are written, each named
    ``prefix + <kind> + jobid + fileformat``: the full time series, a
    histogram, and (only when there are more than 50 segments) the time
    series limited to the first 30 segments.

    Raises ``KeyError`` if ``header`` has no ``jobid`` column or a metric is
    missing from the module-level style/colour maps.
    """
    fields = dict(zip(header, row))
    jobid = fields.pop("jobid")

    result = []
    segment_count = 0
    for metric, encoded in fields.items():
        values = [float(v) for v in encoded.split(":")]
        # as before, the count of the last metric seen decides whether the
        # 30-segment figure is drawn
        segment_count = len(values)
        if sum(values) == 0:
            continue
        result.extend([metric, segment, value]
                      for segment, value in enumerate(values))

    if not result:
        print("Empty job! Cannot plot!")
        return

    data = DataFrame(result, columns=["metrics", "segment", "value"])
    metrics = DataFrame()
    labels = []
    colors = []
    style = []
    # Group by the plain column name: a one-element list makes newer pandas
    # yield 1-tuples as group keys, which would miss the lookup maps below.
    for name, group in data.groupby("metrics"):
        style.append(linestyleMap[name] + markerMap[name])
        colors.append(colorMap[name])
        # shorter display names for the two metadata metrics
        if name == "md_file_delete":
            name = "file_delete"
        elif name == "md_file_create":
            name = "file_create"
        try:
            metrics[name] = pd.Series([record[2] for record in group.values])
        except Exception:
            # still report and give up on the job, but no longer intercept
            # KeyboardInterrupt/SystemExit as the bare ``except`` did
            print("Error processing %s with" % jobid)
            print(group.values)
            return

        labels.append(name)

    # the subplot figure grows with the number of metrics
    fsize = (8, 1 + 1.1 * len(labels))
    fsizeFixed = (8, 2)
    fsizeHist = (8, 6.5)

    pyplot.close('all')

    _plot_metric_timeseries(metrics, labels, colors, style, fsizeFixed, fsize)
    pyplot.savefig(prefix + "timeseries" + jobid + fileformat,
                   bbox_inches='tight',
                   dpi=150)

    # Create a facetted grid
    #g = sns.FacetGrid(tips, col="time", margin_titles=True)
    #bins = np.linspace(0, 60, 13)
    #g.map(plt.hist, "total_bill", color="steelblue", bins=bins)
    metrics.hist(grid=True,
                 sharey=True,
                 figsize=fsizeHist,
                 bins=15,
                 range=(0, 15))
    pyplot.xlim(0, 15)
    pyplot.savefig(prefix + "hist" + jobid + fileformat,
                   bbox_inches='tight',
                   dpi=150)

    # Plot first 30 segments, but only for jobs with more than 50 segments
    if segment_count <= 50:
        return

    _plot_metric_timeseries(metrics, labels, colors, style, fsizeFixed, fsize,
                            xlim=(0, 30))
    pyplot.savefig(prefix + "timeseries" + jobid + "-30" + fileformat,
                   bbox_inches='tight',
                   dpi=150)
Beispiel #26
0
# Add the Ibovespa series to the current axes; ret_acum presumably turns the
# mean returns in the [ini:fim] window into a cumulative return (matches the
# y-axis label below) — TODO confirm against its definition.
plt.plot(N,
         ret_acum(ret_medio_ibov_sim.iloc[ini:fim].values),
         label="ibovespa")
plt.title("LSTM Deep Learning: Portfolio Short Ibovespa")
plt.xlabel("Dia")
plt.ylabel("Retorno Acumulado")
plt.legend(frameon=False)

# Scatter plot of the first column against the second column of
# corr_prob_ret_sim (probability vs. return), drawn on a new figure.
line = plt.figure()
x = corr_prob_ret_sim.iloc[:, 0].values
y = corr_prob_ret_sim.iloc[:, 1].values
plt.plot(x, y, "o")

# Distribution of returns and probabilities: one 100-bin histogram per column,
# sharing the y axis.
corr_prob_ret_sim.hist(bins=100, grid=False, sharey=True)

# Disabled: Pearson correlation printouts.
#pd.set_option('display.width', 100)
#pd.set_option('precision', 6)
#correlations = ret_medio_port_sim.corr(method='pearson')
#correlations2 = corr_prob_ret_sim.corr(method='pearson')
#print(correlations)
#print(correlations2)

# Summary statistics. These are bare expressions, so their results are only
# displayed when run interactively (REPL / notebook).
model.summary()
score_treino_sim.describe()
score_trade_sim.describe()
ret_medio_ibov_sim.iloc[ini:fim].describe()
ret_medio_port_sim.iloc[ini:fim, :].describe()
ret_medio_port_long_sim.iloc[ini:fim, :].describe()
    dd_qtr_std=df_drawdowns.groupby(df_drawdowns.index.quarter).std()
     
    #Look at drawdowns on a monthly basis
    mth_mean=df_drawdowns.resample('M', how='mean',kind='period')
    dd_monthly_mean=df_drawdowns.groupby(df_drawdowns.index.month).mean()
    dd_monthly_std=df_drawdowns.groupby(df_drawdowns.index.month).std()
    
    #Look at one year-2014
    dd_2014=df_drawdowns['2014-01-01':'2014-12-31']
    dd_2014_ri=dd_2014.mean().reset_index(name='Average Drawdown in 2014')
    
    #Creates histograms based on drawdown magnitudes
    bins_dd = np.linspace(0,30,61)
    dd_hist=df_drawdowns
    dd_hist.hist(bins=bins_dd, alpha=0.75,color='green',normed=True)
    dd_hist.plot(kind='kde',style='k--')

    '''
    Drawdown analysis-This code plots a histogram of a stocks drawdown length
    characteristics.  Ensure that stock ticker used in this function has already
    been placed in the ticker list. 
    '''

    stock_dd_length=calc_drawdown_local('WAT',63)
    dd_len_hist=DataFrame(stock_dd_length)
    bins_len_dd=np.linspace(0,30,31)
    dd_len_hist.hist(bins=bins_len_dd, alpha=0.55, color='purple',normed=True)
    plt.title('Drawdown lengths - WAT') 
    dd_len_hist.describe()
    
    
Beispiel #28
0
print(len(list(
            dbclient.db_restT.user_tweets.aggregate(
                [{'$group': {'_id': '$user'}}]))), "users with tweets")


# And finally we plot the lexical diversity:

# In[38]:

cursor=dbclient.db_restT.diversity.aggregate([
    {'$project': {
            '_id': 0,
            'name': '$name',
            'lex_div': '$lexical_diversity'}}])
lex_div=DataFrame(list(cursor))
lex_div.hist('lex_div', bins=50)


# 2.3: Track unfollows
# --
# Write a Python program to create a db called db_followers that stores all the followers for all the users that you find in task 2.1. Then, write a program to find the un-followed friends after a week for the top 10 users (users that have the highest number of followers in task 2.1) since the time that you extracted the tweets. In other words, you need to look for the people following the top 10 users at time X (the time that you extracted the tweets) and then look at the people following the same top 10 users at a later time Y (one week after X) to see who stopped following the top 10 users.

# First, make a db/table of top RT'ed users in which to store follower stats

# In[39]:

dbclient.drop_database('db_followers')
rows=[]
for row in dbclient.db_tweets.top_retweets.aggregate([
        {'$group': {
                '_id': '$user.id',
Beispiel #29
0
 def test_hist_bins_legacy(self):
     """``bins=2`` must yield exactly two bars on the first histogram axis."""
     frame = DataFrame(np.random.randn(10, 2))
     axes_grid = frame.hist(bins=2)
     first_axis = axes_grid[0][0]
     bar_count = len(first_axis.patches)
     self.assertEqual(bar_count, 2)
Beispiel #30
0
    def test_hist(self):
        """Exercise ``DataFrame.hist`` and ``Series.hist`` across layouts and kwargs.

        Covers the automatic subplot layout (including the hidden spare axis
        when three columns are plotted), ``sharex``/``sharey``, ``figsize``,
        tick label size and rotation, pass-through of matplotlib ``hist``
        keywords, and propagation of the ``AttributeError`` raised for an
        unknown keyword.
        """
        # Kept from the original test; the name itself is not used below.
        import matplotlib.pyplot as plt
        df = DataFrame(randn(100, 4))
        _check_plot_works(df.hist)
        _check_plot_works(df.hist, grid=False)

        # make sure layout is handled: with three columns the axis at
        # [1, 1] has nothing to show and must be hidden
        df = DataFrame(randn(100, 3))
        _check_plot_works(df.hist)
        axes = df.hist(grid=False)
        # assertFalse instead of the deprecated ``assert_`` alias, which was
        # removed from unittest in Python 3.12
        self.assertFalse(axes[1, 1].get_visible())

        df = DataFrame(randn(100, 1))
        _check_plot_works(df.hist)

        # make sure layout is handled
        df = DataFrame(randn(100, 6))
        _check_plot_works(df.hist)

        # make sure sharex, sharey is handled
        _check_plot_works(df.hist, sharex=True, sharey=True)

        # handle figsize arg
        _check_plot_works(df.hist, figsize=(8, 10))

        # make sure xlabelsize and xrot are handled; the same variables feed
        # both the plotting call and the assertions so they cannot drift apart
        ser = df[0]
        xf, yf = 20, 20
        xrot, yrot = 30, 30
        ax = ser.hist(xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot)
        ytick = ax.get_yticklabels()[0]
        xtick = ax.get_xticklabels()[0]
        self.assertAlmostEqual(ytick.get_fontsize(), yf)
        self.assertAlmostEqual(ytick.get_rotation(), yrot)
        self.assertAlmostEqual(xtick.get_fontsize(), xf)
        self.assertAlmostEqual(xtick.get_rotation(), xrot)

        axes = df.hist(xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot)
        for i, ax in enumerate(axes.ravel()):
            # only the first len(df.columns) axes are checked
            if i < len(df.columns):
                ytick = ax.get_yticklabels()[0]
                xtick = ax.get_xticklabels()[0]
                self.assertAlmostEqual(ytick.get_fontsize(), yf)
                self.assertAlmostEqual(ytick.get_rotation(), yrot)
                self.assertAlmostEqual(xtick.get_fontsize(), xf)
                self.assertAlmostEqual(xtick.get_rotation(), xrot)

        tm.close()
        # make sure kwargs to hist are handled
        # NOTE(review): ``normed`` is the legacy matplotlib keyword; newer
        # matplotlib releases presumably require ``density`` — confirm the
        # matplotlib version this test targets before changing it.
        ax = ser.hist(normed=True, cumulative=True, bins=4)
        # height of last bin (index 5) must be 1.0
        self.assertAlmostEqual(ax.get_children()[5].get_height(), 1.0)

        tm.close()
        ax = ser.hist(log=True)
        # scale of y must be 'log'
        self.assertEqual(ax.get_yscale(), 'log')

        tm.close()

        # propagate attr exception from matplotlib.Axes.hist
        with tm.assertRaises(AttributeError):
            ser.hist(foo='bar')
    def test_hist_df_legacy(self):
        """Check ``DataFrame.hist``/``Series.hist`` layouts, tick styling and kwargs."""
        from matplotlib.patches import Rectangle
        _check_plot_works(self.hist_df.hist)

        # three columns: 2x2 grid whose fourth axis is hidden
        frame = DataFrame(randn(100, 3))
        grid = _check_plot_works(frame.hist, grid=False)
        self._check_axes_shape(grid, axes_num=3, layout=(2, 2))
        self.assertFalse(grid[1, 1].get_visible())

        # a single column
        frame = DataFrame(randn(100, 1))
        _check_plot_works(frame.hist)

        # six columns on an explicit 4x2 layout
        frame = DataFrame(randn(100, 6))
        grid = _check_plot_works(frame.hist, layout=(4, 2))
        self._check_axes_shape(grid, axes_num=6, layout=(4, 2))

        # shared axes, figure size and bin count must each be accepted
        for extra in ({'sharex': True, 'sharey': True},
                      {'figsize': (8, 10)},
                      {'bins': 5}):
            _check_plot_works(frame.hist, **extra)

        # label sizes and rotations: first for one Series, then the frame
        column = frame[0]
        tick_props = {'xlabelsize': 20, 'xrot': 30,
                      'ylabelsize': 18, 'yrot': 40}
        for plottable in (column, frame):
            drawn = plottable.hist(**tick_props)
            self._check_ticks_props(drawn, **tick_props)

        tm.close()
        # matplotlib keywords are forwarded: the last bar of a normalised
        # cumulative histogram must reach 1.0
        cumulative_ax = column.hist(normed=True, cumulative=True, bins=4)
        bars = [child for child in cumulative_ax.get_children()
                if isinstance(child, Rectangle)]
        self.assertAlmostEqual(bars[-1].get_height(), 1.0)

        tm.close()
        # log=True must switch the y axis to a log scale
        log_ax = column.hist(log=True)
        self._check_ax_scales(log_ax, yaxis='log')

        tm.close()

        # an unknown keyword surfaces matplotlib's AttributeError
        with tm.assertRaises(AttributeError):
            column.hist(foo='bar')
Beispiel #32
0
    def test_grouped_hist_legacy(self):
        """Check grouped histograms: layout, tick styling, kwargs and errors."""
        from matplotlib.patches import Rectangle

        frame = DataFrame(randn(500, 2), columns=['A', 'B'])
        frame['C'] = np.random.randint(0, 4, 500)
        frame['D'] = ['X'] * 500

        # the grouping key is drawn from four values -> 2x2 grid
        grid = grouped_hist(frame.A, by=frame.C)
        self._check_axes_shape(grid, axes_num=4, layout=(2, 2))

        tm.close()
        grid = frame.hist(by=frame.C)
        self._check_axes_shape(grid, axes_num=4, layout=(2, 2))

        tm.close()
        # grouping by a constant column gives a single axis
        grid = frame.hist(by='D', rot=30)
        self._check_axes_shape(grid, axes_num=1, layout=(1, 1))
        self._check_ticks_props(grid, xrot=30)

        tm.close()
        # keyword arguments must reach matplotlib's hist; the normalisation
        # keyword depends on the matplotlib version
        tick_props = {'xlabelsize': 20, 'xrot': 30,
                      'ylabelsize': 18, 'yrot': 40}
        hist_kwargs = dict(tick_props, cumulative=True, bins=4)
        hist_kwargs.update(
            {"density": True} if _mpl_ge_2_2_0() else {"normed": True})

        grid = grouped_hist(frame.A, by=frame.C, **hist_kwargs)
        # every normalised cumulative histogram must end at a height of 1.0
        for axis in grid.ravel():
            bars = [child for child in axis.get_children()
                    if isinstance(child, Rectangle)]
            tm.assert_almost_equal(bars[-1].get_height(), 1.0)
        self._check_ticks_props(grid, **tick_props)

        tm.close()
        # log=True must switch the y axes to a log scale
        grid = grouped_hist(frame.A, by=frame.C, log=True)
        self._check_ax_scales(grid, yaxis='log')

        tm.close()
        # an unknown keyword surfaces matplotlib's AttributeError
        with pytest.raises(AttributeError):
            grouped_hist(frame.A, by=frame.C, foo='bar')

        # figsize='default' is expected to emit a FutureWarning
        with tm.assert_produces_warning(FutureWarning):
            frame.hist(by='C', figsize='default')
Beispiel #33
0
# Split the series in half: first half for training, second half for testing.
X = series.values.astype('float32')
train_size = int(len(X) * 0.50)
train, test = X[:train_size], X[train_size:]

# Walk-forward validation: refit on the growing history before every forecast.
months_in_year = 12
history = list(train)
predictions = []
for obs in test:
    # difference the history at a 12-step lag
    diff = difference(history, months_in_year)
    # fit and forecast one step ahead
    model = ARIMA(diff, order=(0, 0, 1))
    model_fit = model.fit(trend='nc', disp=0)
    yhat = inverse_difference(history,
                              model_fit.forecast()[0],
                              months_in_year)
    predictions.append(yhat)
    # the true observation joins the history for the next iteration
    history.append(obs)

# Forecast errors: observed minus predicted, one row per test step.
residuals = DataFrame(
    [actual - predicted for actual, predicted in zip(test, predictions)])
print(residuals.describe())

# Histogram on top, kernel density estimate below, in one figure.
pyplot.figure()
for position, draw in ((211, residuals.hist),
                       (212, lambda ax: residuals.plot(kind='kde', ax=ax))):
    pyplot.subplot(position)
    draw(ax=pyplot.gca())
pyplot.show()
    def test_hist_layout(self):
        """Check how ``DataFrame.hist`` resolves and validates ``layout``."""
        frame = DataFrame(np.random.randn(100, 2))
        random_ints = np.random.randint(
            self.start_date_to_int64,
            self.end_date_to_int64,
            size=100,
            dtype=np.int64,
        )
        frame[2] = to_datetime(random_ints)

        # (requested layout, grid shape expected for the three columns)
        resolved_layouts = (
            (None, (2, 2)),  # default is 2x2
            ((2, 2), (2, 2)),
            ((4, 1), (4, 1)),
            ((1, 4), (1, 4)),
            ((3, 3), (3, 3)),
            ((-1, 4), (1, 4)),
            ((4, -1), (4, 1)),
            ((-1, 2), (2, 2)),
            ((2, -1), (2, 2)),
        )
        for requested, expected_shape in resolved_layouts:
            grid = frame.hist(layout=requested)
            self._check_axes_shape(grid, axes_num=3, layout=expected_shape)

        # (rejected layout, error message pattern): too small for the plots,
        # not a (rows, columns) pair, and no positive dimension at all
        rejected_layouts = (
            ((1, 1), "Layout of 1x1 must be larger than required size 3"),
            ((1, ), re.escape("Layout must be a tuple of (rows, columns)")),
            ((-1, -1), "At least one dimension of layout must be positive"),
        )
        for bad_layout, msg in rejected_layouts:
            with pytest.raises(ValueError, match=msg):
                frame.hist(layout=bad_layout)
Beispiel #35
0
def legacy_func():
    """Old function used for PAN18.

    Loads ten hard-coded result values for each of two experiments, prints
    descriptive statistics, shows a box plot and a histogram, then prints a
    normality test per experiment followed by a two-sample t-test.
    """
    # Alternative data sets, kept for reference:
    # experiment_names = ('N-grams', 'N-grams, LSA 300')
    # results[experiment_names[0]] = [0.8, 0.82222222, 0.80555556, 0.82222222, 0.82222222, 0.79444444, 0.87777778, 0.85, 0.81666667, 0.77777778]
    # results[experiment_names[1]] = [0.82777778, 0.84444444, 0.84444444, 0.83888889, 0.81111111, 0.8, 0.88888889, 0.82777778, 0.78333333, 0.81666667]
    #
    # # %%% TEMP: Testing
    # temp_list = []
    # for item in results[experiment_names[0]]:
    #     temp_list.append(item - 0.02)
    # results[experiment_names[0]] = temp_list
    #
    # experiment_names = ('Word and char n-grams, No LSA (Arabic)', 'Char n-grams, No LSA (Arabic)')
    # results[experiment_names[0]] = [0.83333333, 0.77777778, 0.81111111, 0.76666667, 0.81111111, 0.87777778, 0.81111111, 0.77777778, 0.81111111, 0.75555556]
    # results[experiment_names[1]] = [0.77777778, 0.75555556, 0.83333333, 0.74444444, 0.78888889, 0.82222222, 0.82222222, 0.75555556, 0.77777778, 0.71111111]
    experiment_names = ('Word and char n-grams, No LSA (English)',
                        'Word n-grams, No LSA (English)')
    experiment_values = (
        [0.8, 0.79444444, 0.82222222, 0.87222222, 0.76111111, 0.85,
         0.82222222, 0.85555556, 0.81111111, 0.81111111],
        [0.8, 0.78333333, 0.81111111, 0.85, 0.77222222, 0.81666667,
         0.80555556, 0.82777778, 0.8, 0.8],
    )
    results = DataFrame()
    for label, values in zip(experiment_names, experiment_values):
        results[label] = values

    # Descriptive statistics, then a box-and-whisker plot and a histogram,
    # each shown in turn.
    print(results.describe())
    for draw in (results.boxplot, results.hist):
        draw()
        plt.show()

    alpha = 0.05
    confidence_banner = "Confidence level = {}%".format((1 - alpha) * 100)

    # Normality test per experiment.
    # Null hypothesis: the sample comes from a normal distribution.
    # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.normaltest.html
    print(confidence_banner)
    normality_line = (
        "{}: (skewtest z-score)^2 + (kurtosistest z-score)^2 = {}, p = {}")
    for experiment_name in experiment_names:
        statistic, p = normaltest(results[experiment_name])
        print(normality_line.format(experiment_name, statistic, p))
        # p below alpha means the null hypothesis can be rejected
        print(
            "No: It is unlikely that the sample comes from a Gaussian (normal) distribution"
            if p < alpha else
            "Yes: It is likely that the sample comes from a Gaussian (normal) distribution"
        )

    # T-test comparing the means of the two samples.
    # Null hypothesis: the two samples have identical average (expected) values.
    # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html
    print(confidence_banner)
    equal_variances = True
    first_sample, second_sample = (results[name] for name in experiment_names)
    statistic, p = ttest_ind(first_sample,
                             second_sample,
                             equal_var=equal_variances)
    statistic_name = ("t-test statistic"
                      if equal_variances else "Welch’s t-test statistic")

    print("{} = {}, p = {}".format(statistic_name, statistic, p))
    # p below alpha means the null hypothesis can be rejected
    print(
        "Significant difference between the means: Samples are likely drawn from different distributions"
        if p < alpha else
        "No significant difference between the means: Samples are likely drawn from the same distribution"
    )
    def test_hist_df_legacy(self):
        """Check ``DataFrame.hist`` with datetime columns, layouts and kwargs."""
        from matplotlib.patches import Rectangle

        def random_dates(count):
            # datetimes built from random int64 values inside the fixture range
            return to_datetime(
                np.random.randint(
                    self.start_date_to_int64,
                    self.end_date_to_int64,
                    size=count,
                    dtype=np.int64,
                ))

        with tm.assert_produces_warning(UserWarning):
            _check_plot_works(self.hist_df.hist)

        # two float columns plus one datetime column: 2x2 grid, spare hidden
        frame = DataFrame(np.random.randn(100, 2))
        frame[2] = random_dates(100)
        with tm.assert_produces_warning(UserWarning):
            grid = _check_plot_works(frame.hist, grid=False)
        self._check_axes_shape(grid, axes_num=3, layout=(2, 2))
        assert not grid[1, 1].get_visible()

        # the datetime column alone, then a single float column
        _check_plot_works(frame[[2]].hist)
        frame = DataFrame(np.random.randn(100, 1))
        _check_plot_works(frame.hist)

        # five float columns plus one datetime column on an explicit 4x2 layout
        frame = DataFrame(np.random.randn(100, 5))
        frame[5] = random_dates(100)
        with tm.assert_produces_warning(UserWarning):
            grid = _check_plot_works(frame.hist, layout=(4, 2))
        self._check_axes_shape(grid, axes_num=6, layout=(4, 2))

        # shared axes, figure size and bin count must each be accepted
        for extra in (
                {"sharex": True, "sharey": True},
                {"figsize": (8, 10)},
                {"bins": 5},
        ):
            with tm.assert_produces_warning(UserWarning):
                _check_plot_works(frame.hist, **extra)

        # label sizes and rotations: first for one Series, then the frame
        column = frame[0]
        tick_props = {"xlabelsize": 20, "xrot": 30, "ylabelsize": 18, "yrot": 40}
        for plottable in (column, frame):
            drawn = plottable.hist(**tick_props)
            self._check_ticks_props(drawn, **tick_props)

        tm.close()

        # the last bar of a normalised cumulative histogram must reach 1.0
        cumulative_ax = column.hist(cumulative=True, bins=4, density=True)
        bars = [
            child for child in cumulative_ax.get_children()
            if isinstance(child, Rectangle)
        ]
        tm.assert_almost_equal(bars[-1].get_height(), 1.0)

        tm.close()
        # log=True must switch the y axis to a log scale
        log_ax = column.hist(log=True)
        self._check_ax_scales(log_ax, yaxis="log")

        tm.close()

        # an unknown keyword surfaces matplotlib's AttributeError
        with tm.external_error_raised(AttributeError):
            column.hist(foo="bar")
Beispiel #37
0
# Pairwise correlation between the columns of df.
print(df.corr())

### 8. Merge and Join ###

print(df)
# Small second table that is joined to df on the shared "str_col" column.
other = DataFrame({"str_col": ["a", "b"], "some_val": [1, 2]})
print(other)
# Same join key every time; only the join type (`how`) changes.
print(pd.merge(df, other, on="str_col", how="inner"))
print(pd.merge(df, other, on="str_col", how="outer"))
print(pd.merge(df, other, on="str_col", how="left"))
print(pd.merge(df, other, on="str_col", how="right"))

### 9. Plot ###

# 1000 rows of two random normal columns; "y" is then shifted up by 1.
plot_df = DataFrame(np.random.randn(1000, 2), columns=["x", "y"])
plot_df["y"] = plot_df["y"].map(lambda x: x + 1)

plot_df.plot()  # NOTE(review): reported as "not working" — presumably no figure appears because nothing here calls matplotlib's show(); confirm

plot_df.hist()  # NOTE(review): reported as "not working" — same presumed cause as the plot() call above; confirm

### 10. Scikit-learn conversion ###

print(df)

# All rows, every column except the last, as a plain array.
print(df.values[:, :-1])

# print(df.values[:,:-1].astype(float32)) not working?
# NOTE(review): presumably fails because the bare name float32 is undefined
# here (np.float32 or the string "float32" would be needed) — confirm.

# Reads one line from stdin, so the script blocks here until Enter is pressed.
input()
Beispiel #38
0
 def _save(self, df: pd.DataFrame) -> None:
     """Draw histograms of ``df`` and write the current figure to ``self._filepath``.

     ``self._save_args`` supplies the keyword arguments for ``df.hist``. Its
     optional ``"savefig_args"`` entry is split off and forwarded to
     ``plt.savefig`` instead.

     Args:
         df: Frame whose columns are plotted with ``DataFrame.hist``.
     """
     # Work on a shallow copy: popping from ``self._save_args`` itself would
     # remove "savefig_args" on the first call, so every later save would
     # silently fall back to savefig defaults.
     hist_args = dict(self._save_args)
     savefig_args = hist_args.pop("savefig_args", {})
     df.hist(**hist_args)
     plt.savefig(fname=self._filepath, **savefig_args)
# Every print below uses the parenthesised single-argument form, which
# produces the same output on Python 2 and is valid syntax on Python 3
# (the bare ``print x`` statement is a SyntaxError there).
print(df)
print("añadimos columnas combinando las actuales")
# Derive three new columns from the existing "A" and "B" columns.
df["C"] = df["A"] + df["B"]
df["D"] = df["A"] * 3
df["E"] = np.sqrt(df["A"])
print(df)
print("*" * 15)
print("Datos disponibles de un dataframe")
# Summary statistics, covariance matrix and correlation matrix of df.
print(" descripcion del dataframe")
print(df.describe())
print(" covarianza ")
print(df.cov())
print(" correlación ")
print(df.corr())
print("*" * 15)

print(" Creamos otro dataframe con valores aleatorios (1000 filas y 2 columnas ")
print(" DataFrame(np.random.randn(1000,2),columns=['x','y'])")
# Second frame: 1000 rows of two random normal columns.
plot_df = DataFrame(np.random.randn(1000, 2), columns=['x', 'y'])
print(plot_df)
print("Mostramos las graficas")
# Line plot of both columns, then one histogram per column.
plot_df.plot()
plot_df.hist()







    
                df.loc[l[i], 'Growth'] = (df.loc[l[i-1], 'Growth'] + trade_value) * (1 + df.loc[l[i], 'IVWReturn'])
                df.loc[l[i], 'Value'] = (df.loc[l[i-1], 'Value'] - trade_value) * (1 + df.loc[l[i], 'IVEReturn'])
                df.loc[l[i], 'InvestmentTotal'] = df.loc[l[i], 'Value'] + df.loc[l[i], 'Growth'] 
                df.loc[l[i], 'Growth%'] = df.loc[l[i],'Growth']/df.loc[l[i],'InvestmentTotal']
                df.loc[l[i], 'Value%'] = df.loc[l[i],'Value']/df.loc[l[i], 'InvestmentTotal']
                df.loc[l[i], 'Total%'] = df.loc[l[i], 'Growth%'] + df.loc[l[i], 'Value%']

    final_value = df.loc[l[len(l)-1], 'InvestmentTotal'] - df.loc[l[len(l)-1],'SP500Total']
    if math.isnan(final_value) == True:
        print("result removed it was nan")
        test = test + 1 
    else:
        results.append(final_value)
    print(results)
    test = test - 1 

# Collect the per-run results into a one-column frame, print it and show
# its histogram.
dg = pd.Series(results, name='Results')
dff = DataFrame(dg)
print(dff)
dff.hist()
plt.show()

# Write df to the "Data" sheet. The context manager closes (and thereby saves)
# the workbook even if to_excel raises; ``sheet_name`` is passed by keyword
# because newer pandas releases no longer accept it positionally.
with ExcelWriter('ValueGrowth.xlsx') as file:
    df.to_excel(file, sheet_name='Data')
# NOTE(review): os.startfile exists only on Windows — confirm that is the
# only platform this script is run on.
os.startfile('ValueGrowth.xlsx')

# Plot the two total columns of df against each other's index.
df.plot(y=['SP500Total', 'InvestmentTotal'])
plt.show()


# In[12]:

# Wrap Y in a DataFrame.
Y=DataFrame(Y)


# In[13]:

# Show the first row of Y.
Y.head(1)


### Univariate analysis

# In[14]:

# One histogram per column of X.
X.hist()


##### These histograms depict the distribution of the 4 independent variables

##### We can also do this analysis using just one variable

# In[15]:

# Histogram of the single column labelled 0.
X[0].hist()


##### we can also get the stats of that variable

# In[16]:
Beispiel #42
0
    model = ARIMA(diff, order=(6,1,2))
    model_fit = model.fit(trend='nc', disp=0)
    yhat = model_fit.forecast()[0]
    yhat = inverse_difference(history, yhat, days_in_month)
    predictions.append(yhat)
    # observation
    obs = test[i]
    history.append(obs)
# errors
residuals = [test[i]-predictions[i] for i in range(len(test))]
residuals = DataFrame(residuals)
print(residuals.describe())
# plot
plt.figure(figsize=(36, 36))
plt.subplot(211)
residuals.hist(ax=plt.gca())
plt.subplot(212)
residuals.plot(kind='kde', ax=plt.gca())
plt.show()




# create a differenced series
def difference(dataset, interval=1):
    """Return the lag-``interval`` differences of ``dataset`` as a list.

    Element ``k`` of the result is ``dataset[k + interval] - dataset[k]``,
    so the result has ``len(dataset) - interval`` entries (none when the
    data is not longer than the interval).
    """
    return [
        dataset[pos] - dataset[pos - interval]
        for pos in range(interval, len(dataset))
    ]
Beispiel #43
0
def show_histogram(df: pd.DataFrame, column):
    """Draw a histogram of ``column`` from ``df`` and display it.

    Args:
        df: Frame holding the data to plot.
        column: Passed unchanged as ``DataFrame.hist``'s ``column`` argument.
    """
    df.hist(column=column)
    # display the figure(s) created by the hist call
    plt.show()
 def test_hist_bins_legacy(self):
     """Requesting two bins must draw exactly two patches on the first axis."""
     random_frame = DataFrame(np.random.randn(10, 2))
     top_left_axis = random_frame.hist(bins=2)[0][0]
     patch_total = len(top_left_axis.patches)
     self.assertEqual(patch_total, 2)
 def test_histtype_argument(self, histtype, expected):
     """GH23992: ``histtype`` must control whether histogram patches are filled."""
     random_ints = np.random.randint(1, 10, size=(100, 2))
     frame = DataFrame(random_ints, columns=["a", "b"])
     grouped_axes = frame.hist(by="a", histtype=histtype)
     self._check_patches_all_filled(grouped_axes, filled=expected)
Beispiel #46
0
def decile_for_each(df: pd.DataFrame, columns_for_show: list, decile: int):
    """Draw 50-bin histograms of the selected columns on a 20x20 figure.

    Args:
        df: Frame holding the data to plot.
        columns_for_show: Columns handed to ``DataFrame.hist`` as ``column``.
        decile: NOTE(review): accepted but never used in the body — confirm
            whether it can be dropped or was meant to drive the plot.
    """
    df.hist(column=columns_for_show, bins=50, figsize=(20, 20))
    def test_grouped_hist_legacy(self):
        """Check grouped histograms: layout, tick styling, kwargs and errors."""
        from matplotlib.patches import Rectangle

        from pandas.plotting._matplotlib.hist import _grouped_hist

        frame = DataFrame(np.random.randn(500, 1), columns=["A"])
        random_ints = np.random.randint(
            self.start_date_to_int64,
            self.end_date_to_int64,
            size=500,
            dtype=np.int64,
        )
        frame["B"] = to_datetime(random_ints)
        frame["C"] = np.random.randint(0, 4, 500)
        frame["D"] = ["X"] * 500

        # the grouping key is drawn from four values -> 2x2 grid
        grid = _grouped_hist(frame.A, by=frame.C)
        self._check_axes_shape(grid, axes_num=4, layout=(2, 2))

        tm.close()
        grid = frame.hist(by=frame.C)
        self._check_axes_shape(grid, axes_num=4, layout=(2, 2))

        tm.close()
        # grouping by a constant column gives a single axis
        grid = frame.hist(by="D", rot=30)
        self._check_axes_shape(grid, axes_num=1, layout=(1, 1))
        self._check_ticks_props(grid, xrot=30)

        tm.close()
        # keyword arguments must reach matplotlib's hist
        tick_props = {"xlabelsize": 20, "xrot": 30, "ylabelsize": 18, "yrot": 40}
        grid = _grouped_hist(
            frame.A,
            by=frame.C,
            cumulative=True,
            bins=4,
            density=True,
            **tick_props,
        )
        # every normalised cumulative histogram must end at a height of 1.0
        for axis in grid.ravel():
            bars = [
                child for child in axis.get_children()
                if isinstance(child, Rectangle)
            ]
            tm.assert_almost_equal(bars[-1].get_height(), 1.0)
        self._check_ticks_props(grid, **tick_props)

        tm.close()
        # log=True must switch the y axes to a log scale
        grid = _grouped_hist(frame.A, by=frame.C, log=True)
        self._check_ax_scales(grid, yaxis="log")

        tm.close()
        # an unknown keyword surfaces matplotlib's AttributeError
        with tm.external_error_raised(AttributeError):
            _grouped_hist(frame.A, by=frame.C, foo="bar")

        # a string figsize is rejected with a ValueError
        msg = "Specify figure size by tuple instead"
        with pytest.raises(ValueError, match=msg):
            frame.hist(by="C", figsize="default")
Beispiel #48
0
})

#stats
df1.describe()  #only shows numbers

#gh.ix[:,['float_col', 'int_col']] less elegant
df1[['float_col', 'int_col']]

df1.fillna(value="waiting")

df1['div_col'] = df1['float_col'] / df1['int_col']

mean = df1['rev_col'].mean()
df1['mean_col'] = mean

new = pd.merge(df1, df2, how='outer', on='str_col')

#quick plotting
import numpy as np
plot_df = DataFrame(np.random.randn(1000, 2), columns=['x', 'y'])

plot_df.hist()
plot_df.plot()

#series object 1-dimensional
days = ['mon', 'tues', 'weds', 'thurs', 'fri', 'sat', 'sun']
ratings = ['meh', 'erg', 'ugh', 'ok', 'alright', 'yauh', "d'oh"]

s1 = Series(days, ratings, name="what days are")
#cool to make data with date_range
s2 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20130102', periods=6))
# Residual Plot
#
# We would expect the plot to be random around the value of 0 and not show any trend or cyclic structure.
# We are also interested in the mean value of the residual errors. A value close to zero suggests no bias in the forecasts, whereas a positive or negative value suggests a positive or negative bias in the forecasts made.

# In[58]:

# plot residual errors
residuals = DataFrame(model_fit.resid)
residuals.plot()
plt.show()
residuals.plot(kind='kde')
plt.show()
print(residuals.describe())
# histogram plot
residuals.hist()
plt.show()

# Residual Statistics shows a mean error value close to zero(0.06), but perhaps not close enough.

# In[59]:

autocorrelation_plot(residuals)
plt.show()

# # AUTO ARIMAX

# In[131]:

split = len(
    ts_df_key) * 0.80  #split data in test and train 20% and 80% respectively.