def test_boxplot_legacy2(self):
        df = DataFrame(np.random.rand(10, 2), columns=['Col1', 'Col2'])
        df['X'] = Series(['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B'])
        df['Y'] = Series(['A'] * 10)
        with tm.assert_produces_warning(UserWarning):
            _check_plot_works(df.boxplot, by='X')

        # When ax is supplied and required number of axes is 1,
        # passed ax should be used:
        fig, ax = self.plt.subplots()
        axes = df.boxplot('Col1', by='X', ax=ax)
        ax_axes = ax.axes
        assert ax_axes is axes

        fig, ax = self.plt.subplots()
        axes = df.groupby('Y').boxplot(ax=ax, return_type='axes')
        ax_axes = ax.axes
        assert ax_axes is axes['A']

        # Multiple columns with an ax argument should use same figure
        fig, ax = self.plt.subplots()
        with tm.assert_produces_warning(UserWarning):
            axes = df.boxplot(column=['Col1', 'Col2'],
                              by='X', ax=ax, return_type='axes')
        assert axes['Col1'].get_figure() is fig

        # When by is None, check that all relevant lines are present in the
        # dict
        fig, ax = self.plt.subplots()
        d = df.boxplot(ax=ax, return_type='dict')
        lines = list(itertools.chain.from_iterable(d.values()))
        assert len(ax.get_lines()) == len(lines)
Example #2
    def test_boxplot_legacy(self):
        df = DataFrame(randn(6, 4),
                       index=list(string.ascii_letters[:6]),
                       columns=['one', 'two', 'three', 'four'])
        df['indic'] = ['foo', 'bar'] * 3
        df['indic2'] = ['foo', 'bar', 'foo'] * 2

        _check_plot_works(df.boxplot, return_type='dict')
        _check_plot_works(df.boxplot, column=[
                          'one', 'two'], return_type='dict')
        # _check_plot_works adds an ax so catch warning. see GH #13188
        with tm.assert_produces_warning(UserWarning):
            _check_plot_works(df.boxplot, column=['one', 'two'],
                              by='indic')
        _check_plot_works(df.boxplot, column='one', by=['indic', 'indic2'])
        with tm.assert_produces_warning(UserWarning):
            _check_plot_works(df.boxplot, by='indic')
        with tm.assert_produces_warning(UserWarning):
            _check_plot_works(df.boxplot, by=['indic', 'indic2'])
        _check_plot_works(plotting._core.boxplot, data=df['one'],
                          return_type='dict')
        _check_plot_works(df.boxplot, notch=1, return_type='dict')
        with tm.assert_produces_warning(UserWarning):
            _check_plot_works(df.boxplot, by='indic', notch=1)

        df = DataFrame(np.random.rand(10, 2), columns=['Col1', 'Col2'])
        df['X'] = Series(['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B'])
        df['Y'] = Series(['A'] * 10)
        with tm.assert_produces_warning(UserWarning):
            _check_plot_works(df.boxplot, by='X')

        # When ax is supplied and required number of axes is 1,
        # passed ax should be used:
        fig, ax = self.plt.subplots()
        axes = df.boxplot('Col1', by='X', ax=ax)
        ax_axes = ax.axes if self.mpl_ge_1_5_0 else ax.get_axes()
        assert ax_axes is axes

        fig, ax = self.plt.subplots()
        axes = df.groupby('Y').boxplot(ax=ax, return_type='axes')
        ax_axes = ax.axes if self.mpl_ge_1_5_0 else ax.get_axes()
        assert ax_axes is axes['A']

        # Multiple columns with an ax argument should use same figure
        fig, ax = self.plt.subplots()
        with tm.assert_produces_warning(UserWarning):
            axes = df.boxplot(column=['Col1', 'Col2'],
                              by='X', ax=ax, return_type='axes')
        assert axes['Col1'].get_figure() is fig

        # When by is None, check that all relevant lines are present in the
        # dict
        fig, ax = self.plt.subplots()
        d = df.boxplot(ax=ax, return_type='dict')
        lines = list(itertools.chain.from_iterable(d.values()))
        assert len(ax.get_lines()) == len(lines)
    def test_grouped_box_return_type(self):
        df = self.hist_df

        # old style: return_type=None
        result = df.boxplot(by='gender')
        self.assertIsInstance(result, np.ndarray)
        self._check_box_return_type(result, None,
                                    expected_keys=['height', 'weight', 'category'])

        # now for groupby
        with tm.assert_produces_warning(FutureWarning):
            result = df.groupby('gender').boxplot()
        self._check_box_return_type(result, 'dict', expected_keys=['Male', 'Female'])

        columns2 = 'X B C D A G Y N Q O'.split()
        df2 = DataFrame(random.randn(50, 10), columns=columns2)
        categories2 = 'A B C D E F G H I J'.split()
        df2['category'] = categories2 * 5

        for t in ['dict', 'axes', 'both']:
            returned = df.groupby('classroom').boxplot(return_type=t)
            self._check_box_return_type(returned, t, expected_keys=['A', 'B', 'C'])

            returned = df.boxplot(by='classroom', return_type=t)
            self._check_box_return_type(returned, t,
                                        expected_keys=['height', 'weight', 'category'])

            returned = df2.groupby('category').boxplot(return_type=t)
            self._check_box_return_type(returned, t, expected_keys=categories2)

            returned = df2.boxplot(by='category', return_type=t)
            self._check_box_return_type(returned, t, expected_keys=columns2)
    def test_boxplot_legacy(self):
        df = DataFrame(randn(6, 4),
                       index=list(string.ascii_letters[:6]),
                       columns=['one', 'two', 'three', 'four'])
        df['indic'] = ['foo', 'bar'] * 3
        df['indic2'] = ['foo', 'bar', 'foo'] * 2

        _check_plot_works(df.boxplot, return_type='dict')
        _check_plot_works(df.boxplot, column=[
                          'one', 'two'], return_type='dict')
        _check_plot_works(df.boxplot, column=['one', 'two'], by='indic')
        _check_plot_works(df.boxplot, column='one', by=['indic', 'indic2'])
        _check_plot_works(df.boxplot, by='indic')
        _check_plot_works(df.boxplot, by=['indic', 'indic2'])
        _check_plot_works(plotting.boxplot, data=df['one'], return_type='dict')
        _check_plot_works(df.boxplot, notch=1, return_type='dict')
        _check_plot_works(df.boxplot, by='indic', notch=1)

        df = DataFrame(np.random.rand(10, 2), columns=['Col1', 'Col2'])
        df['X'] = Series(['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B'])
        df['Y'] = Series(['A'] * 10)
        _check_plot_works(df.boxplot, by='X')

        # When ax is supplied and required number of axes is 1,
        # passed ax should be used:
        fig, ax = self.plt.subplots()
        axes = df.boxplot('Col1', by='X', ax=ax)
        self.assertIs(ax.get_axes(), axes)

        fig, ax = self.plt.subplots()
        axes = df.groupby('Y').boxplot(ax=ax, return_type='axes')
        self.assertIs(ax.get_axes(), axes['A'])

        # Multiple columns with an ax argument should use same figure
        fig, ax = self.plt.subplots()
        axes = df.boxplot(column=['Col1', 'Col2'],
                          by='X', ax=ax, return_type='axes')
        self.assertIs(axes['Col1'].get_figure(), fig)

        # When by is None, check that all relevant lines are present in the
        # dict
        fig, ax = self.plt.subplots()
        d = df.boxplot(ax=ax, return_type='dict')
        lines = list(itertools.chain.from_iterable(d.values()))
        self.assertEqual(len(ax.get_lines()), len(lines))
    def test_boxplot_return_type_legacy(self):
        # API change in https://github.com/pydata/pandas/pull/7096
        import matplotlib as mpl  # noqa

        df = DataFrame(randn(6, 4),
                       index=list(string.ascii_letters[:6]),
                       columns=['one', 'two', 'three', 'four'])
        with tm.assertRaises(ValueError):
            df.boxplot(return_type='NOTATYPE')

        with tm.assert_produces_warning(FutureWarning):
            result = df.boxplot()
        # change to Axes in future
        self._check_box_return_type(result, 'dict')

        with tm.assert_produces_warning(False):
            result = df.boxplot(return_type='dict')
        self._check_box_return_type(result, 'dict')

        with tm.assert_produces_warning(False):
            result = df.boxplot(return_type='axes')
        self._check_box_return_type(result, 'axes')

        with tm.assert_produces_warning(False):
            result = df.boxplot(return_type='both')
        self._check_box_return_type(result, 'both')
def boxplot_path_by_outcome(trades, day):
    tf = trades.trade_frame(compacted = False, cumulative = False)
    # Get the daily returns from the day after the requested day onwards.
    # Remove any trades which are empty moving forward, as we know these would have been closed.
    forward = tf.loc[:, (day + 1):].dropna(how = 'all')
    forward = log(forward + 1)
    backward = tf.loc[forward.index, :day]
    backward = log(backward + 1)

    df = DataFrame(dtype = float)
    df['Final Return'] = qcut(forward.sum(axis = 1).round(2), 5)
    df['Current Return'] = backward.sum(axis = 1)

    bp = df.boxplot('Current Return', by = 'Final Return', return_type = 'dict')

    whisker_points = []
    for whisker in bp[0]['whiskers']:
        whisker_points.extend(whisker.get_ydata())
    y_min = min(whisker_points) * 1.1
    y_max = max(whisker_points) * 1.1
    plt.ylim((y_min, y_max))
    plt.xticks(fontsize = 'small', rotation = 30)
    plt.ylabel('Current Return')
    plt.title('Day {}'.format(day))
class Dataset:
    """
    This class holds an object that stores all the tables and the results
    of the analysis.

    To access them once the analysis is over do:
    data = Dataset()
    data.df: for the raw word frequency data
    data.cdb for the table holding clusters and dep. vars.
    data.top_words for the table of most used words.
    data.desc_stat for a table of descriptive statistics for each cluster
    data.reg.results for the regression results stored as statsmodels
                     regressionResults objects

    You can show plots and print reg. results by doing
    data.show_plots()
    data.regression_results()
    """

    def __init__(self):
        self.tf_idf = DataFrame()
        self.df = DataFrame()
        self.cdb = DataFrame()
        self.top_words = DataFrame()
        self.desc_stat = DataFrame()
        self.reg_results = []
        self.multi_results = DataFrame()

    def create(self, paths, country_names, save_file="", clean=True,
               stopwords_path="../data/stopwords.csv", display_progress=False):

        # Create progress bar; the Pbar class will handle the import and
        # whether or not to display.
        bar = Pbar(display_progress)
        bar.create("Generating csv dataset...", len(paths))

        # Init database with as many rows as there are countries
        self.df = DataFrame(country_names, columns=["country_id"])
        self.df['tot_terms'] = 0

        # A counter to keep track of which row we are on
        cnt = 0
        for p in paths:

            bar.update(cnt)

            c = load_constitution(p)
            frequencies = get_frequency(c)

            # Add number of words to each constitution
            self.df.loc[cnt, 'tot_terms'] = len(frequencies.keys())

            for word in frequencies.keys():
                # Initialize all words that have not appeared in other
                # constitutions to frequency 0
                if word not in self.df.columns:
                    self.df[word] = 0

                self.df[word][cnt] = frequencies[word]

            cnt += 1

        bar.finish()

        if save_file != "":
            print("Saving dataset to csv file...")
            self.df.to_csv(save_file, index=False)

        if clean:
            self.clean(stopwords_path, display_progress)

    def load(self, path, stopwords="../data/stopwords.csv", clean=True,
             display_progress=False):
        self.df = read_csv(path)
        if clean:
            self.clean(stopwords, display_progress)

    def clean(self, stopwords_path, display_progress=False):
        bar = Pbar("Cleaning dataset...", len(self.df.columns))

        with open(stopwordsPath, 'r+') as sw_file:
            stopwords = sw_file.read().split(',')

        numbers = [str(n) for n in range(10)]
        i = 0
        for c in self.df.columns:
            # Remove all words which meet the following conditions
            if c[0] in numbers or c in stopwords or not self.df[c].any():
                self.df = self.df.drop(c, axis=1)

            bar.update(i)
            i += 1

        bar.finish()

    def build_tfidf_table(self):
        self.tf_idf = DataFrame()

        # Exclude country name and total words from data
        tf = self.df.ix[:, 2:]

        # To create the tf term, divide each row by the number of words
        # that appear in that country's constitution.
        for r in range(len(self.df)):
            tf.loc[r, :] = tf.loc[r, :] / self.df.loc[r, 'tot_terms']

        # To create idf, divide the number of documents by the number
        # of documents containing each word.
        # The operation here is vectorized using numpy arrays.
        # The number of documents containing each word is obtained by summing
        # a vector of bools where the documents in which the word has freq. > 0
        # are labeled true.
        idf = np.log(len(self.df.index) /
                     (self.df.ix[:, 2:] > 0).sum(axis=0) + 1)

        self.tf_idf = tf*idf

        # Drop country and tot words columns from table in case they are still
        # there.
        if 'country_id' in self.tf_idf.columns:
            self.tf_idf = self.tf_idf.drop('country_id', axis=1)
        if 'tot_terms' in self.tf_idf.columns:
            self.tf_idf = self.tf_idf.drop('tot_terms', axis=1)

    def get_cluster(self, c_id, cluster_col='kmeans'):
        if c_id not in self.cdb[cluster_col].values:
            raise KeyError("Selected cluster not in dataset")

        return self.cdb[self.cdb[cluster_col] == c_id]

    def get_topwords(self, countries, thresh=10, tf_idf=False):
        tw = DataFrame()
        for r in range(len(self.df)):
            if self.df.loc[r, 'country_id'] in countries:
                if tf_idf:
                    tw = tw.append(self.tf_idf.loc[r, :])
                else:
                    tw = tw.append(self.df.loc[r, :])

        return tw.mean().order(ascending=False)[:thresh]

    def get_word_avg(self, countries, word, tf_idf=False):
        w = 0
        for r in range(len(self.df)):
            if self.df.loc[r, 'country_id'] in countries:
                if tf_idf:
                    w += self.tf_idf.loc[r, word]
                else:
                    w += self.df.loc[r, word]
        return w/len(countries)

    def build_topwords_table(self, cluster_col="kmeans", thresh=10, raw=True):
        if len(self.cdb) == 0:
            raise Exception("Cluster database not initialized")

        # get the names of all the clusters created
        labels = list(set(self.cdb[cluster_col]))

        self.top_words = DataFrame({'cluster': labels})
        for l in labels:
            countries = [c for c in self.get_cluster(l)['country']]
            tw = self.get_topwords(countries, thresh, tf_idf=(not raw))
            idx = self.top_words[self.top_words['cluster'] == l].index

            for w in tw.index:
                if w not in self.top_words.columns:
                    self.top_words[w] = 0
                self.top_words.loc[idx, w] = tw[w]

        for r in range(len(self.top_words)):
            countries = [c for c in self.get_cluster(self.top_words.loc[r,
                                                     'cluster'])['country']]
            for w in self.top_words.columns:
                if w != 'cluster' and self.top_words.loc[r, w] == 0:
                    self.top_words.loc[r, w] = self.get_word_avg(countries, w,
                                                            tf_idf=(not raw))

    def build_descstat_table(self, cluster_col="kmeans",
                             cols=['fh_score', 'LJI', 'fragility'],
                             na_cols=['fragility']):
        if len(self.cdb) == 0:
            raise Exception("Cluster database not initialized")

        labels = list(set(self.cdb[cluster_col]))
        # This weird list comprehension creates the labels for each column of
        # the descstat table by pasting strings.
        col_labels = sum([[c + '_mean', c + '_median', c + '_std']
                          for c in cols], [])
        self.desc_stat = DataFrame(columns=['cluster'] + col_labels)

        for l in labels:
            row = [l]
            cluster = self.get_cluster(l)

            for c in cols:
                if c in na_cols:
                    row.append(cluster[cluster[c] != 'NA'][c].mean())
                    row.append(cluster[cluster[c] != 'NA'][c].median())
                    row.append(cluster[cluster[c] != 'NA'][c].std())
                else:
                    row.append(cluster[c].mean())
                    row.append(cluster[c].median())
                    row.append(cluster[c].std())

            self.desc_stat.loc[l] = row

    def regression_results(self):
        if not self.reg_results:
            raise Error("Tried to access regression results before running\
                            regressions")

        for r in self.reg_results:
            print r.summary()

    def make_plots(self, save=False):
        if len(self.cdb) == 0:
            raise Exception("Tried to build plots with empty cluster table")

        plt.figure(1)
        self.cdb.boxplot(column="fh_score", by="kmeans")
        if save:
            plt.savefig("../output/img/FH.png")

        plt.figure(2)
        self.cdb.boxplot(column="LJI", by="kmeans")
        if save:
            plt.savefig("../output/img/LJ).png")

        plt.figure(3)
        self.cdb[self.cdb['fragility'] != 'NA'].boxplot(column="fragility",
                                                        by="kmeans")
        if save:
            plt.savefig("../output/img/SFI).png")

        if not save:
            plt.show()

    def show_plots(self):
        self.make_plots()
        plt.show()
    def test_fontsize(self):
        df = DataFrame({"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]})
        self._check_ticks_props(df.boxplot("a", by="b", fontsize=16),
                                xlabelsize=16, ylabelsize=16)
    def test_figsize(self):
        df = DataFrame(np.random.rand(10, 5),
                       columns=['A', 'B', 'C', 'D', 'E'])
        result = df.boxplot(return_type='axes', figsize=(12, 8))
        assert result.figure.bbox_inches.width == 12
        assert result.figure.bbox_inches.height == 8
# In[15]:

X[0].hist()


##### we can also get the stats of that variable

# In[16]:

X[0].describe()


# In[22]:

X.boxplot(column=[0,1,2,3])


##### This is what is called a boxplot, or a box-and-whisker diagram, which summarizes the distribution of each variable under study.

##### The red line in the boxplot marks the median of that variable. The length of the box shows the spread of the variable (the interquartile range). The whisker ends mark the most extreme values that are not treated as outliers, and the box edges are the lower and upper quartiles.
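
##### A quick added check (not from the original notebook; it assumes X is the same numeric DataFrame used above): the box edges should line up with the 25th and 75th percentiles, and the red line with the median.

X[0].quantile([0.25, 0.5, 0.75])  # lower quartile, median, upper quartile of column 0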

# In[38]:

X[4]=0
X.head(1)


# In[40]:

X[4]=Y
np.max(t_grouped)-np.min(t_grouped)
##186.398124


import matplotlib
matplotlib.style.use('ggplot')

t_grouped = frame.groupby(['District'])[['ResponseTime']].mean()
ax=t_grouped.plot(kind='bar',legend=False,title='The Response Times in Each Police District')
ax.set_ylabel('The Means of Response Times(secs)')
ax.set_xlabel('The Police District')
ax.axhline(np.mean(delta2), color='k',linestyle='dashed')
plt.savefig('C:/Users/tianyu/Desktop/0415/means.png', dpi=400, bbox_inches='tight')

frame=frame[frame['Response Time']>0]
bp = frame.boxplot(by='District',sym='', meanline=True,figsize=(6,6))
bp.set_ylim([-10, 2000])
bp.set_xlabel('Police District')
plt.savefig('C:/Users/tianyu/Desktop/0415/box.png', dpi=400, bbox_inches='tight')


##We can define surprising event types as those that occur more often in a district than 
##they do over the whole city. What is the largest ratio of the conditional probability
##of an event type given a district to the unconditional probably of that event type? 
##Consider only events types which have more than 100 events. Note that some events have 
##their locations anonymized and are reported as being in district "0". These should be ignored.
clean_data=data_sum[['Type_','PoliceDistrict']][data_sum['PoliceDistrict']!=0]
type_counts = clean_data['Type_'].value_counts()
types=type_counts[type_counts>100].index
maxfr=0.0
for t in types:
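##A hypothetical sketch of how this loop could be finished (added; the original loop body
##is not included in this listing): for every qualifying event type, take the ratio of the
##conditional probability of the type given each district to its unconditional probability,
##and keep the largest ratio seen. The variable names below are assumptions.
        district_counts = clean_data['PoliceDistrict'].value_counts().astype(float)
        p_cond = (clean_data[clean_data['Type_'] == t]['PoliceDistrict']
                  .value_counts() / district_counts)         # P(type | district)
        p_uncond = type_counts[t] / float(len(clean_data))   # P(type)
        maxfr = max(maxfr, (p_cond / p_uncond).max())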
        for host in hosts:
            data = {
                "os": osystem,
                "host": host,
                "kind": kind
            }
            data["name"] = "{os}_{kind}_{host}.csv".format(**data)
            if os.path.isfile(data["name"]):
                d = pd.read_csv("{name}".format(**data))
                for v in d.real:
                    data["value"] = v
                    values.append([data['value'], "{os}\n{host}".format(**data)])

    pprint(values)

    df = DataFrame(values)
    df.columns = ['Time in s', 'Host']
    print(df)

    df.boxplot(column='Time in s',
               by='Host',
               rot=0)

    plt.suptitle('Performance Comparison OpenFace: {}'.format(kind))

    pdf = "boxplot-{}.png".format(kind)
    plt.savefig(pdf)
    plt.close()

    os.system("open {}".format(pdf))