Exemple #1
0
def test_cython_api2():
    """Check grouped ``cumsum``/``cumprod`` results on a small frame."""
    # this takes the fast apply path

    # cumsum (GH5614)
    frame = DataFrame(
        [[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"]
    )
    want = DataFrame([[2, np.nan], [np.nan, 9], [4, 9]], columns=["B", "C"])
    tm.assert_frame_equal(frame.groupby("A").cumsum(), want)

    # GH 5755 - cumsum is a transformer and should ignore as_index
    tm.assert_frame_equal(frame.groupby("A", as_index=False).cumsum(), want)

    # GH 13994 - with axis=1 the grouped result must equal the plain
    # row-wise cumulative operation on the whole frame
    for op_name in ("cumsum", "cumprod"):
        got = getattr(frame.groupby("A"), op_name)(axis=1)
        want = getattr(frame, op_name)(axis=1)
        tm.assert_frame_equal(got, want)
Exemple #2
0
def pandas_df_demo06():
    """Print a short demo of index relabelling and cumulative sums."""
    # 1: build a 3x3 frame, then relabel its index in several ways
    grid = np.arange(9).reshape(3, 3)
    df = DataFrame(grid, index=['bj', 'sh', 'gz'], columns=['a', 'b', 'c'])
    print('dataframe:\n', df)

    # Replace the index labels wholesale, then upper-case each label
    df.index = Series(['beijing', 'shanghai', 'guangzhou'])
    print('\nupdate index df:\n', df)
    df.index = df.index.map(str.upper)
    print('\nupdate index with upper df:\n', df)

    # rename() returns a new frame; df itself keeps its upper-case index
    relabelled = df.rename(index=str.lower, columns=str.upper)
    print('\nupdate index and cols df:\n', relabelled)

    # 2: cumulative sums along each axis
    rows = [
        [2.0, 1.0, 3.0, 5],
        [3.0, 4.0, 5.0, 5],
        [3.0, 4.0, 5.0, 5],
        [1.0, 0.0, 6.0, 5],
    ]
    df2 = DataFrame(rows, columns=list('abcd'))
    down_rows = df2.cumsum(axis=0)  # default
    across_cols = df2.cumsum(axis=1)
    print('\nsum by rows df:\n', down_rows)
    print('\nsum by cols df:\n', across_cols)
def test_cython_api2():
    """Verify grouped cumulative transforms against hand-built expectations."""
    # this takes the fast apply path

    # cumsum (GH5614)
    rows = [[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]]
    df = DataFrame(rows, columns=list('ABC'))
    cumsum_by_group = DataFrame(
        [[2, np.nan], [np.nan, 9], [4, 9]], columns=list('BC'))
    tm.assert_frame_equal(df.groupby('A').cumsum(), cumsum_by_group)

    # GH 5755 - cumsum is a transformer and should ignore as_index
    unindexed = df.groupby('A', as_index=False)
    tm.assert_frame_equal(unindexed.cumsum(), cumsum_by_group)

    # GH 13994
    row_cumsum = df.groupby('A').cumsum(axis=1)
    tm.assert_frame_equal(row_cumsum, df.cumsum(axis=1))
    row_cumprod = df.groupby('A').cumprod(axis=1)
    tm.assert_frame_equal(row_cumprod, df.cumprod(axis=1))
Exemple #4
0
# 一个索引有多个值,那么该索引就会返回多个值。
obj['a']

## 汇总和计算描述统计
df = DataFrame([[1.4,np.nan],[7.1,-4.5],[np.nan,np.nan],[0.75,-1.3]],
	index=['a','b','c','d'], columns=['one','two'])
# 对列
df.sum()
# 对行
df.sum(axis=1)
# 默认会排除NA,但是可以通过skipna禁用该功能
df.mean(axis=1,skipna=False)
# 返回最大值的索引
df.idxmax()
# 累加
df.cumsum()
df.describe()
# 相关系数
returns.MSFT.corr(returns.IBM)
returns.corr()
returns.cov()
returns.corrwith(returns.IBM)

## 唯一值,值计数以及成员资格
obj = Series(['c','a','d','a','a','b','b','c','c'])
uniques = obj.unique()
# 统计个数
obj.value_counts()
# 统计个数后默认排序,也可以不排序
pd.value_counts(obj.values, sort=False)
# 判断是否存在
import numpy as np
import pandas as pd
from numpy.random import randn
from pandas import Series,DataFrame
import matplotlib.pyplot as plt
array1=np.array([[10,np.nan,20],[30,40,np.nan]])
print(array1)
df1=DataFrame(array1,index=[1,2],columns=list('ABC'))
print(df1)
print(df1.sum())
print(df1.sum(axis=1))
print(df1.min())
print(df1.max())
print(df1.idxmax())
print(df1.cumsum())
print(df1.describe())
df2=DataFrame(randn(9).reshape(3,3),index=[1,2,3],columns=list('ABC'))
print(df2)
plt.plot(df2)
plt.legend(df2.columns,loc="lower right")
plt.savefig("samplepic.png")
plt.show()

ser=Series(list('abcccaabd'))
print(ser)
print(ser.unique())
print(ser.value_counts())
Exemple #6
0
######################################################################

# Return the sum of each column; NaN values are ignored
dframe.sum()

# Return the sum of each row
dframe.sum(axis=1)

# Minimum value of each column
dframe.min()

# Index label of the minimum value in each column
dframe.idxmin()

# Cumulative sum down each column
dframe.cumsum()

# describe() builds summary statistics for each column
dframe.describe()  # count, mean, std, min, ...

# Covariance and Correlation
import pandas.io.data as pdweb
import datetime


# Getting the stock data from the internet and displaying the first 5 sets
prices = pdweb.get_data_yahoo(['CVX', 'XOM', 'BP'],start=datetime.datetime(2010,1,1),
	end=datetime.datetime(2013,1,1))['Adj Close']
prices.head()

Exemple #7
0
 def test_cumsum_corner(self):
     dm = DataFrame(np.arange(20).reshape(4, 5), index=range(4), columns=range(5))
     # TODO(wesm): do something with this?
     result = dm.cumsum()  # noqa
Exemple #8
0
# In[ ]:


df.idxmax()  #各列最大值的index


# In[ ]:


df


# In[ ]:


df.cumsum()   #每列按行累加


# In[ ]:


df.describe()   #给出常见的统计量


# In[ ]:


s = Series(['a', 'a', 'b', 'c'] * 4)
s

df
'''
   one  two
a  1.0  NaN
b  7.0  4.0
c  NaN  NaN
d  0.0  1.0
'''
print
df.idxmax()  # 计算每一列最大值的索引
'''
one    b
two    b
'''
print
df.cumsum()  # 每一列的累加和
'''
   one  two
a  1.0  NaN
b  8.0  4.0
c  NaN  NaN
d  8.0  5.0
'''
print
df.describe()  # 对DataFrame每列计算汇总统计
'''
            one      two
count  3.000000  2.00000
mean   2.666667  2.50000
std    3.785939  2.12132
min    0.000000  1.00000
# -*- coding:utf-8 -*-
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

import matplotlib.pyplot as plt

from numpy.random import randn

ts = pd.Series(randn(52), \
               index=pd.date_range('1/1/2016', periods=52, freq='W'))


df = DataFrame(randn(52,5), \
               index=ts.index, \
               columns=list('ABCDE'))
df.cumsum().plot()


# Sum method
dframe1.sum()  # ignores null values (treats them as 0s)
dframe1.sum(axis=1)  # sum across rows

# Min method
dframe1.min()  # finds the minimum value in each column
dframe1.min(axis=1)  # minimum value of each row

dframe1.idxmin()  # Find the index of minimum value column

# Max method
dframe1.max()
dframe1.idxmax()

# Cumulative sum
dframe1.cumsum()  # accumulates along each columns values

# Describe method
dframe1.describe()  # summary statistics of dataframe (by columns)

# correlation and covariance
import pandas.io.data as pdweb

# import pandas_datareader.data as pdweb
import datetime

prices = pdweb.get_data_yahoo(
    ["CVX", "XOM", "BP"], start=datetime.datetime(2010, 1, 1), end=datetime.datetime(2013, 1, 1)
)["Adj Close"]
prices.head()
Exemple #12
0
##### Summary
arr = np.array([[1,2,np.nan],[np.nan,3,4]])
arr
df1 = DataFrame(arr,index = ['a','b'],columns = ['one','two','three'])
df1

df1.sum() #default axis is 0, and Pandas will ignore the nan values
df1.sum(axis=1) #by row

df1
df1.min()
df1.min(axis=1)
df1.idxmin()
df1.idxmin(axis=1)

df1.cumsum() # accumulation sum

### unique() and value_count() methods for factor variables
ser1 = Series(['w','w','x','y','z','w','x','y','x','a'])
ser1
ser1.unique()
ser1.value_counts()

###describe method
df1.describe() # similar to the summary() method in R will provide summary stat.

###covariance matrices and some visualization
import pandas.io.data as pdweb
import datetime

prices = pdweb.get_data_yahoo(['CVX','XOM','BP'],start = datetime.datetime(2010,1,1),
Exemple #13
0
__author__ = 'Executor'

import numpy as np
import pandas as pa
from pandas import Series, DataFrame


arr = np.array([[1, 2, np.nan], [np.nan, 3, 4]])
dframe1 = DataFrame(arr, index=['A', 'B'], columns=['One', 'Two', 'Three'])
print(dframe1.sum())
print(dframe1.sum(axis=1))
print(dframe1.min())
print(dframe1)
print(dframe1.idxmin())

print(dframe1)
print(dframe1.cumsum())

print(dframe1.describe())

from IPython.display import YouTubeVideo
YouTubeVideo('xGbpuFNR1ME')
YouTubeVideo('4EXNedimDMs')

''' stupid thing doesn't work!'''
Exemple #14
0
# -*- coding: utf-8 -*- 

import numpy as np
from pandas import Series, DataFrame

# Sums
print('求和')
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
print(df)
print(df.sum())  # sum of each column
print(df.sum(axis=1))  # sum of each row
print()

# Means: with skipna=False a row containing NaN yields NaN
print('平均数')
print(df.mean(axis=1, skipna=False))
print(df.mean(axis=1))
print()

# Other summaries
print('其它')
print(df.idxmax())
print(df.cumsum())
print(df.describe())
obj = Series(['a', 'a', 'b', 'c'] * 4)
print(obj.describe())
               columns=['one','two']
               )

print(df)
print('\n')
print(df.sum())
print('\n')
print(df.sum(axis=1))
print('\n')
print(df.mean())
print('\n')
print(df.mean(axis=1,skipna=False))
print('\n')
print(df.idxmax())
print('\n')
print(df.cumsum())
print('\n')
print(df.cumsum(axis=1))
print('\n')
print(df.describe())
print('\n')

###############################################################

obj = Series(['a','a','b','c']*4)
print(obj)
print(obj.describe())
print('\n')

###############################################################
Exemple #16
0
import numpy as np
import pandas as pd
import IPython
from pandas import Series, DataFrame
arr = np.array([[1, 2, np.nan], [np.nan, 3, 4]])
dframe1 = DataFrame(arr, index=['A', 'B'], columns=['one', 'two', 'three'])
print(dframe1)
print(dframe1.sum())  # column totals (axis=0 is the default)
# Row totals: axis=1 sums across the columns of each row. The original call
# passed axis=0, which only repeated the column totals printed above.
print(dframe1.sum(axis=1))
print(dframe1.min())  # minimum value of each column
print(dframe1.max())  # maximum value of each column
print(dframe1.idxmin())  # index label of the minimum value in each column
print(dframe1.cumsum())  # cumulative sum down each column
# Summary statistics for the frame: count, mean, std, min, percentiles, max
print(dframe1.describe())
# from IPython.display import YouTubeVideo
# YouTubeVideo('xGbpuFNR1ME')
# YouTubeVideo('4EXNedimDMs')
from pandas_datareader import data  #allow us to get some information from the web
import datetime  # Library for date input
import matplotlib.pyplot as plt
import seaborn as sns
#%matplotlib inline
prices = data.get_data_yahoo(
    ['CVX', 'XOM', 'BP'],
    start=datetime.datetime(2010, 1, 1),
    end=datetime.datetime(2013, 1,
                          1))['Adj Close']  #get stock price at that time
Exemple #17
0
print '其它函数'
print df
'''
   one  two
a  1.0  NaN
b  7.0  4.0
c  NaN  NaN
d  0.0  1.0
'''
print df.idxmax() # 计算每一列最大值的索引
'''
one    b
two    b
'''
print df.cumsum() # 每一列的累加和
'''
   one  two
a  1.0  NaN
b  8.0  4.0
c  NaN  NaN
d  8.0  5.0
'''
print df.describe() # 对DataFrame每列计算汇总统计
'''
            one      two
count  3.000000  2.00000
mean   2.666667  2.50000
std    3.785939  2.12132
min    0.000000  1.00000
25%         NaN      NaN
Exemple #18
0
#Notice how it ignores NaN values
dframe1.sum(axis=1)

#Can also grab min and max values of dataframe
dframe1.min()

#As well as their index
dframe1.idxmin()

dframe1.idxmax()

dframe1.max()

dframe1
#Can also do an accumulation sum
dframe1.cumsum()

#A very useful feature is describe, which provides summary statistics
describe=dframe1.describe()

# We can also get information on correlation and covariance

#For more info on correlation and covariance, check out the videos below!
from IPython.display import YouTubeVideo
YouTubeVideo('xGbpuFNR1ME')

#Now lets check correlation and covariance on some stock prices!

#Pandas can get info off the web
import pandas_datareader as pdweb #workaround pandas ver 0.24.2
import datetime
Exemple #19
0
def plotter(plot_dir: Path,
            location_id: int,
            location_name: str,
            input_data: Dict,
            sero_data: pd.DataFrame,
            ratio_model_inputs: Dict,
            cross_variant_immunity: List[int],
            escape_variant_prevalence: pd.Series,
            output_data: Dict,
            smooth_infections: pd.Series,
            output_draws: pd.DataFrame,
            population: float,
            measures: List[str] = ['cases', 'hospitalizations', 'deaths']):
    """Draw the per-location diagnostic figure and save or show it.

    The figure is a 12-row by 3-column grid. For each entry of ``measures``
    the left column holds the daily data plot, and the middle column holds
    the cumulative data plot above the ratio plot. The right column holds
    three model plots: daily infections, cumulative infections (%) and
    cumulative infected (%).

    Parameters
    ----------
    plot_dir
        Directory that receives ``<location_id>.pdf``. When ``None`` the
        figure is shown with ``plt.show()`` instead of being saved.
    location_id, location_name
        Used for the figure title; ``location_id`` also names the PDF.
    input_data
        Mapping keyed by measure. Each entry is read for ``'daily'``,
        ``'cumul'``, ``'lags'``, ``'scalar'`` and ``'ratio'``.
    sero_data
        Passed through to the cumulative-infections model plot.
    ratio_model_inputs
        Mapping keyed by measure; ``'ratio_mean'`` is read here and the
        whole entry is passed to ``ratio_plot``.
    cross_variant_immunity
        Values passed to ``calc_infected`` (as their mean for point
        estimates, as an array for the draws).
    escape_variant_prevalence
        Date-indexed prevalence; padded here so that it covers every date
        from 2019-11-01 to the end date.
    output_data
        Mapping keyed by measure. Each entry is read for ``'daily'``,
        ``'cumul'``, ``'infections_daily'`` and ``'infections_cumul'``.
    smooth_infections, output_draws
        Point estimate and draws of daily infections.
    population
        Denominator used to express cumulative values as percentages.
    measures
        Measures to plot, in row order. The default list is never mutated.
    """
    start_date, end_date = get_dates(input_data, output_data, output_draws)

    n_cols = 3
    n_rows = 12
    widths = [2, 1, 2]
    heights = [1] * n_rows

    sns.set_style('whitegrid')
    fig = plt.figure(figsize=(16, 9), constrained_layout=True)
    gs = fig.add_gridspec(n_rows,
                          n_cols,
                          width_ratios=widths,
                          height_ratios=heights)

    # Left and middle columns: each measure owns a band of four grid rows.
    for i, measure in enumerate(measures):
        daily_ax = fig.add_subplot(gs[i * 4:i * 4 + 4, 0])
        cumul_ax = fig.add_subplot(gs[i * 4:i * 4 + 2, 1])
        if measure in input_data:
            # [1:] drops the first daily observation from both series.
            data_plot(daily_ax, measure.capitalize(), 'Daily',
                      input_data[measure]['daily'][1:],
                      output_data[measure]['daily'][1:],
                      MEASURE_COLORS[measure]['light'],
                      MEASURE_COLORS[measure]['dark'], start_date, end_date,
                      measure == measures[-1])

            data_plot(cumul_ax, None, 'Cumulative',
                      input_data[measure]['cumul'],
                      output_data[measure]['cumul'],
                      MEASURE_COLORS[measure]['light'],
                      MEASURE_COLORS[measure]['dark'], start_date, end_date)
        else:
            daily_ax.axis('off')
            cumul_ax.axis('off')

    # Middle column, lower half of each band: the measure's ratio plot.
    ratio_names = {'deaths': 'IFR', 'hospitalizations': 'IHR', 'cases': 'IDR'}
    for i, measure in enumerate(measures):
        ratio_ax = fig.add_subplot(gs[i * 4 + 2:i * 4 + 4, 1])
        if measure in input_data:
            # Shift infections forward by the mean lag so they line up with
            # the measure's dates before forming the ratio.
            adj_ratio = smooth_infections.copy()
            adj_ratio.index += pd.Timedelta(
                days=int(np.mean(input_data[measure]['lags'])))
            adj_ratio = (output_data[measure]['daily'] *
                         input_data[measure]['scalar'].mean()) / adj_ratio
            adj_ratio = adj_ratio.dropna()

            # Build the joined frame once; both ratio columns come from it.
            ratio_frame = pd.concat([
                input_data[measure]['ratio'].groupby(level=1).mean(),
                input_data[measure]['daily']
            ], axis=1).dropna()
            ratio_data = ratio_frame['ratio']
            ratio_data_fe = ratio_frame['ratio_fe']

            ratio_plot_range = pd.concat([
                ratio_data, ratio_data_fe,
                ratio_model_inputs[measure]['ratio_mean']
            ])
            ratio_plot_range = ratio_plot_range.replace((-np.inf, np.inf),
                                                        np.nan).dropna()
            ratio_plot_range_min = ratio_plot_range.min()
            ratio_plot_range_max = ratio_plot_range.max()
            # Pad by 20% of the maximum on both sides, clamped to [0, 1].
            ratio_plot_lims = (
                max(0, ratio_plot_range_min - ratio_plot_range_max * 0.2),
                min(1, ratio_plot_range_max + ratio_plot_range_max * 0.2),
            )
            ratio_plot(ratio_ax, ratio_plot_lims, ratio_names[measure],
                       ratio_data, ratio_data_fe, adj_ratio,
                       ratio_model_inputs[measure],
                       MEASURE_COLORS[measure]['light'],
                       MEASURE_COLORS[measure]['dark'], start_date, end_date,
                       measure == measures[-1])
        else:
            ratio_ax.axis('off')

    # Right column, rows 0-3: daily infections.
    model_measures = [m for m in measures if m in output_data]
    dailymodel_ax = fig.add_subplot(gs[0:4, 2])
    infection_daily_data = {
        mm: pd.concat(output_data[mm]['infections_daily'],
                      axis=1).dropna().mean(axis=1)[1:]
        for mm in model_measures
    }
    model_plot(dailymodel_ax, 'Infections', 'Daily infections',
               infection_daily_data, None,
               smooth_infections.dropna()[1:],
               output_draws.dropna()[1:], start_date, end_date, False)

    # Right column, rows 4-7: cumulative infections as a percentage.
    cumul_infections_measures = {
        mm: (pd.concat(output_data[mm]['infections_cumul'],
                       axis=1).dropna().mean(axis=1) / population) * 100
        for mm in model_measures
    }
    cumul_infections_draws = output_draws.cumsum().dropna()
    cumul_infections_point = smooth_infections.cumsum().dropna()

    cumulinfmodel_ax = fig.add_subplot(gs[4:8, 2])
    model_plot(cumulinfmodel_ax, None, 'Cumulative infections (%)',
               cumul_infections_measures, sero_data,
               (cumul_infections_point / population) * 100,
               (cumul_infections_draws / population) * 100, start_date,
               end_date, False)

    # Pad the prevalence series so every date from 2019-11-01 to end_date
    # has a value: missing dates are back-filled, then forward-filled.
    expand_dates = [
        date for date in pd.date_range('2019-11-01', end_date)
        if date not in escape_variant_prevalence.index
    ]
    if expand_dates:
        date_idx = pd.Index(expand_dates, name='date')
        if isinstance(escape_variant_prevalence, pd.Series):
            escape_variant_prevalence = pd.concat([
                pd.Series(np.nan,
                          index=date_idx,
                          name=escape_variant_prevalence.name),
                escape_variant_prevalence
            ])
            escape_variant_prevalence = (
                escape_variant_prevalence.bfill().ffill())
        elif escape_variant_prevalence.empty:
            escape_variant_prevalence = pd.Series(
                np.nan, index=date_idx, name='escape_variant_prevalence')

    # Right column, rows 8-11: cumulative infected as a percentage.
    cumul_infected_measures = {
        mm: pd.concat(output_data[mm]['infections_daily'],
                      axis=1).dropna().mean(axis=1)
        for mm in model_measures
    }
    cumul_infected_measures = {
        mm: (calc_infected(
            mm_data,
            escape_variant_prevalence.loc[mm_data.index].values,
            np.mean(cross_variant_immunity),
            population,
        ).dropna() / population) * 100
        for mm, mm_data in cumul_infected_measures.items()
    }
    cumul_infected_draws = calc_infected(
        output_draws,
        escape_variant_prevalence.loc[output_draws.index].to_frame().values,
        np.array(cross_variant_immunity),
        population,
    ).dropna()
    cumul_infected_point = calc_infected(
        smooth_infections,
        escape_variant_prevalence.loc[smooth_infections.index].values,
        np.mean(cross_variant_immunity),
        population,
    ).dropna()
    del output_draws, smooth_infections

    cumulpropmodel_ax = fig.add_subplot(gs[8:12, 2])
    model_plot(
        cumulpropmodel_ax,
        None,
        'Cumulative infected (%)',
        cumul_infected_measures,
        None,  # sero_data,
        (cumul_infected_point / population) * 100,
        (cumul_infected_draws / population) * 100,
        start_date,
        end_date,
        True)

    fig.suptitle(f'{location_name} ({location_id})', fontsize=20)
    # NOTE(review): the figure is created with constrained_layout=True;
    # confirm that calling tight_layout() on top of it is still intended.
    plt.tight_layout()
    if plot_dir is not None:
        plt.switch_backend('pdf')
        fig.savefig(plot_dir / f'{location_id}.pdf')
        plt.close(fig)
    else:
        plt.show()
Exemple #20
0
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

from numpy import nan as NA

### Descriptive statistics
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
df

df.sum()
df.sum(axis=1)  # NB for this one NaNs are treated at 0

df.cumsum()

df.mean(axis=1, skipna=False)

df.describe()  # also works on other objects

df.idxmax()  # returns the id of the index of the max

### Handling Missing Data
string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data
string_data.isnull()
string_data[0] = None
string_data.isnull()

data = Series([1, NA, 3.5, NA, 7])
def make_overview_chart(
    series: pd.Series, title: str, subtitle_base: str = "Log Returns"
) -> go.Figure:
    """Build a 2x2 overview figure for a return series.

    Panels: the raw series (top left), its binned distribution against a
    normal density (top right), its cumulative sum (bottom left) and a Q/Q
    scatter against the normal distribution (bottom right).

    Parameters
    ----------
    series
        One-dimensional return series. Its index supplies the x axis and,
        via ``strftime``, the date range shown in the title.
    title
        First line of the figure title.
    subtitle_base
        Label used to build the subplot titles.

    Returns
    -------
    go.Figure
        The assembled figure.
    """
    fig = make_subplots(
        rows=2,
        cols=2,
        subplot_titles=[
            subtitle_base,
            f"{subtitle_base} Distribution",
            f"Cumulative {subtitle_base}",
            "Q/Q Plot",
        ],
        vertical_spacing=0.09,
        horizontal_spacing=0.08,
    )

    # Computed once; used for the cumulative trace and its end-point label.
    cumulative = series.cumsum()

    # Returns Distribution
    series_cuts = pd.cut(series, 100).value_counts().sort_index()
    # NOTE(review): despite the name, this takes each bin's right edge while
    # the traces below are plotted at interval.mid; confirm which is intended.
    midpoints = series_cuts.index.map(lambda interval: interval.right).to_numpy()
    norm_dist = stats.norm.pdf(midpoints, loc=series.mean(), scale=series.std())
    bin_centres = [interval.mid for interval in series_cuts.index]

    fig.add_trace(
        go.Scatter(
            x=series.index,
            y=series,
            line=dict(width=1, color=COLORS[0]),
            name="return",
        ),
        row=1,
        col=1,
    )

    fig.add_trace(
        go.Scatter(
            x=series.index,
            y=cumulative,
            line=dict(width=1, color=COLORS[0]),
            name="cum. return",
        ),
        row=2,
        col=1,
    )

    fig.add_trace(
        go.Bar(
            x=bin_centres,
            y=series_cuts / series_cuts.sum(),
            name="pct. of returns",
            marker=dict(color=COLORS[0]),
        ),
        row=1,
        col=2,
    )

    fig.add_trace(
        go.Scatter(
            x=bin_centres,
            y=norm_dist / norm_dist.sum(),
            name="normal",
            line=dict(width=1, color=COLORS[1]),
        ),
        row=1,
        col=2,
    )

    # Q/Q Data: standardised, sorted returns against normal quantiles
    returns_norm = ((series - series.mean()) / series.std()).sort_values()
    norm_dist = pd.Series(
        stats.norm.ppf(np.linspace(0.001, 0.999, len(series))),
        name="normal",
    )

    fig.add_trace(
        go.Scatter(
            x=norm_dist,
            y=returns_norm,
            name="return norm.",
            mode="markers",
            marker=dict(color=COLORS[0], size=3),
        ),
        row=2,
        col=2,
    )

    # Reference diagonal for the Q/Q panel
    fig.add_trace(
        go.Scatter(
            x=norm_dist,
            y=norm_dist,
            name="norm.",
            line=dict(width=1, color=COLORS[1]),
        ),
        row=2,
        col=2,
    )

    # Label the end of the cumulative curve. .iloc[-1] selects the last value
    # by position; plain [-1] on a label-indexed Series is a label lookup.
    final_cumulative = cumulative.iloc[-1]
    fig.add_annotation(
        text=(f"{final_cumulative * 100:0.1f}%"),
        xref="paper",
        yref="y3",
        x=0.465,
        y=final_cumulative,
        xanchor="left",
        showarrow=False,
        align="left",
    )

    fig.add_annotation(
        get_moments_annotation(
            series.dropna(),
            xref="paper",
            yref="paper",
            x=0.55,
            y=0.45,
            xanchor="left",
            title="Returns",
            labels=IS_labels,
        ),
        font=dict(size=6, family="Courier New, monospace"),
    )

    fig.update_xaxes(showline=True, linewidth=1, linecolor="black", mirror=True)
    fig.update_yaxes(showline=True, linewidth=1, linecolor="black", mirror=True)

    fig.update_layout(
        title_text=(
            f"{title}<br>"
            f"{series.index.min().strftime('%Y-%m-%d %H:%M')}"
            f" - {series.index.max().strftime('%Y-%m-%d %H:%M')}"
        ),
        showlegend=False,
        height=600,
        font=dict(size=10),
        margin=dict(l=50, r=50, b=50, t=100),
        yaxis=dict(tickformat="0.3f"),
        yaxis3=dict(tickformat="0.3f"),
        yaxis2=dict(tickformat="0.3f"),
        yaxis4=dict(tickformat="0.1f"),
        xaxis2=dict(tickformat="0.3f"),
        xaxis4=dict(tickformat="0.1f"),
    )

    for i in fig["layout"]["annotations"]:
        i["font"]["size"] = 12

    fig.update_annotations(font=dict(size=10))

    return fig
Exemple #22
0
from pandas import Series, DataFrame
import pandas as pd
import pandas_datareader.data as web
import numpy as np

df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])

print(df)
print(df.sum())
print(df.sum(axis=1))
print(df.idxmax())
print(df.cumsum())
print(df.describe())

obj = Series(['a', 'a', 'b', 'c'] * 4)
print(obj.describe())

all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    print("get data:" + ticker)
    all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2010', '1/30/2010')
price = DataFrame({tic: data['Adj Close'] for tic, data in all_data.items()})
volume = DataFrame({tic: data['Volume'] for tic, data in all_data.items()})

returns = price.pct_change()
print(returns.tail())
print(returns.MSFT.corr(returns.IBM))
print(returns.MSFT.cov(returns.IBM))
print(returns.corr())
# Sum method
dframe1.sum()  # ignores null values (treats them as 0s)
dframe1.sum(axis=1)  # sum across rows

# Min method
dframe1.min()  # finds the minimum value in each column
dframe1.min(axis=1)  # minimum value of each row

dframe1.idxmin()  # Find the index of minimum value column

# Max method
dframe1.max()
dframe1.idxmax()

# Cumulative sum
dframe1.cumsum()  # accumulates along each columns values

# Describe method
dframe1.describe()  # summary statistics of dataframe (by columns)

# correlation and covariance
import pandas.io.data as pdweb
# import pandas_datareader.data as pdweb
import datetime

prices = pdweb.get_data_yahoo(['CVX', 'XOM', 'BP'],
                              start=datetime.datetime(2010, 1, 1),
                              end=datetime.datetime(2013, 1, 1))['Adj Close']
prices.head()

volume = pdweb.get_data_yahoo(['CVX', 'XOM', 'BP'],
from pandas import DataFrame

df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
print(df)

print(df.sum())  #by columns

print(df.sum(axis=1))  #by rows

print(df.mean(axis=1, skipna=False))

print(df.idxmax())  #index value where the max value

print(df.cumsum())  #accumulations

print(df.describe())

obj = pd.Series(['a', 'a', 'b', 'c'] * 4)
print(obj.describe())

#Correlation and Covariance

import pandas.io.data as web

all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010')

price = DataFrame(
#CORRECT
frame.apply(lambda x: x.max() - x.min() )


##Map elementwise transformation over a Series


### ApplyMap Elementwise transformation over a DF



# Summary functions

frame.mean(axis=1) # agg across the columns
frame.idxmax()
frame.cumsum()
frame.describe()

pct = frame.pct_change()

#format to 2dp
pct.applymap(lambda x: '%.2f' % x)

#newer python version
pct.applymap(lambda x: '{0:.2%}'.format(x))



#value counts - calculate across all columns simultaneously
# fillna replaces nans with 0
Exemple #26
0

df1.max()#maximum value along each column


# In[78]:


df1.idxmax() #maximum index


# In[79]:


#cumulative sum
df1.cumsum()


# In[80]:


df1.describe() #statistical description of dataset


# In[81]:


df2 = DataFrame(randn(9).reshape(3,3),index=[1,2,3],columns=list('ABC'))
df2

Exemple #27
0
def add_values(
    log: pd.DataFrame, changes: pd.DataFrame, cashes: pd.DataFrame
) -> pd.DataFrame:
    """Apply holding and cash transactions to the daily log.

    ``log`` is updated in place and also returned. For every date in its
    index, the matching rows of ``changes`` adjust that date's Quantity,
    Cost Basis, Profit and Cash cells, and the matching rows of ``cashes``
    adjust its Cash cells.

    Parameters
    ----------
    log : pd.DataFrame
        Frame indexed by date whose columns are addressed by two-part keys:
        ``("Quantity", ticker)``, ``("Cost Basis", ticker)``,
        ``("Profit", ticker)``, ``("Cash", "Cash")`` and ``("Cash", "User")``.
    changes : pd.DataFrame
        Transactions that changed holdings. Columns read: ``Date``,
        ``Name``, ``Quantity``, ``Price``, ``Fees`` and ``Side``.
    cashes : pd.DataFrame
        Cash changing transactions. Columns read: ``Date``, ``Price``,
        ``Quantity`` and ``Side``.

    Returns
    -------
    log : pd.DataFrame
        The same ``log`` object with the transactions applied.

    Raises
    ------
    ValueError
        If a cash row's ``Side`` is neither ``"deposit"`` nor
        ``"withdrawal"``.
    """
    for index in log.index:
        # Add stocks to dataframe
        values = changes[changes["Date"] == index]
        for _, sub_row in values.iterrows():
            ticker = sub_row["Name"]
            quantity = sub_row["Quantity"]
            price = sub_row["Price"]
            fees = sub_row["Fees"]
            if math.isnan(fees):
                fees = 0
            side = sub_row["Side"].lower()
            sign = -1 if side == "sell" else 1
            signed_qty = quantity * sign

            # Running totals as of this transaction. Nothing below writes to
            # log before these are read, so one cumulative pass per
            # transaction gives the same values as recomputing each time.
            running = log.cumsum()
            held = running.at[index, ("Quantity", ticker)]
            pos1 = held > 0
            pos2 = signed_qty > 0

            if side == "interest":
                log.at[index, ("Cost Basis", ticker)] = (
                    log.at[index, ("Cost Basis", ticker)] + quantity * price
                )
                log.at[index, ("Cash", "Cash")] = log.at[
                    index, ("Cash", "Cash")
                ] - (quantity * price)

            elif pos1 == pos2 or held == 0 or signed_qty == 0:
                # Trade is on the same side as the running position, or one
                # of the two is zero: extend quantity and cost basis.
                log.at[index, ("Quantity", ticker)] = (
                    log.at[index, ("Quantity", ticker)] + signed_qty
                )
                log.at[index, ("Cost Basis", ticker)] = (
                    log.at[index, ("Cost Basis", ticker)]
                    + fees
                    + signed_qty * price
                )
                log.at[index, ("Cash", "Cash")] = log.at[
                    index, ("Cash", "Cash")
                ] - (fees + signed_qty * price)
            else:
                # Trade opposes the running position: book profit against
                # the proportional share of the accumulated cost basis.
                rev = log.at[index, ("Profit", ticker)] + signed_qty * price * -1
                wa_cost = (quantity / held) * running.at[
                    index, ("Cost Basis", ticker)
                ]
                log.at[index, ("Profit", ticker)] = rev - wa_cost - fees
                log.at[index, ("Cash", "Cash")] = (
                    log.at[index, ("Cash", "Cash")] + rev - fees
                )
                log.at[index, ("Quantity", ticker)] = (
                    log.at[index, ("Quantity", ticker)] + signed_qty
                )
                log.at[index, ("Cost Basis", ticker)] = (
                    log.at[index, ("Cost Basis", ticker)] - wa_cost
                )

        cash_vals = cashes[cashes["Date"] == index]
        for _, sub_row in cash_vals.iterrows():
            amount = sub_row["Price"]
            quantity = sub_row["Quantity"]
            if sub_row["Side"] == "deposit":
                d = 1
            elif sub_row["Side"] == "withdrawal":
                d = -1
            else:
                raise ValueError("Cash type must be deposit or withdrawal")
            log.at[index, ("Cash", "Cash")] = (
                log.at[index, ("Cash", "Cash")] + d * amount * quantity
            )
            log.at[index, ("Cash", "User")] = (
                log.at[index, ("Cash", "User")] + d * amount * quantity
            )
    return log
 def _cumulative_returns(returns: pd.DataFrame, is_log: bool):
     return returns.cumsum() if is_log else returns.add(1).cumprod().sub(1)
# Sort by the column labels (axis=1)
df5.sort_index(axis=1)
df6 = DataFrame({'b':[4,7,-3,2],'a':[0,1,0,1]})
# Sort by the values of columns 'a' then 'b'.
# sort_values replaces the removed `sort_index(by=...)` spelling.
df6.sort_values(by=['a','b'])

# Basic descriptive statistics
df7 = DataFrame([[1.4,np.nan],[7.1,-4.5],[np.nan,np.nan],[0.75,-1.3]],index=['a','b','c','d'],columns=['one','two'])
# Column sums
df7.sum()
# Row sums
df7.sum(axis=1)
# With skipna=False a row containing NaN yields NaN instead of a mean
df7.mean(axis=1,skipna=False)
# Index label at which each column reaches its maximum
df7.idxmax()
# Cumulative sum
df7.cumsum()
# Several summary statistics at once
df7.describe()

# Unique values and value counts
obj = Series(['c','a','d','a','a','b','b','c','c'])
unique = obj.unique()
obj.value_counts()
pd.value_counts(obj.values,sort=True)
# Boolean mask selecting the entries equal to 'b' or 'c'
mask = obj.isin(['b','c'])
obj[mask]

# Detecting missing data
data8 = Series(['a','b',np.nan,'d'])
data8.isnull()
data8[2] = None
Exemple #30
0
# Element-wise missing-value masks (function form and method form)
pd.isnull(df)
df.isnull()

# Sums and means, per column (default) and per row (axis=1);
# skipna=False keeps NaN in the result instead of skipping missing values
df.sum()
df.sum(axis = 1)
df.mean()
df.mean(skipna = False)
df.mean(axis = 1)
df.mean(axis = 1, skipna = False)

# Same row-wise mean, computed through NumPy
np.mean(df, axis = 1)

df.idxmax()  # index of the maximum in each column: top scorer per subject
df.idxmin()  # index of the minimum in each column: lowest scorer per subject

df.cumsum()  # cumulative sum down the rows
df.cumsum(axis = 1)  # cumulative sum across the columns

# Statistics of a single column
df['영어'].sum()
df['영어'].mean()
df['영어'].var()
df['영어'].std()
df['영어'].max()
df['영어'].min()

# Statistics of a single row, selected by index label
df.loc['홍길동'].sum()
df.loc['박찬호'].mean()
df.describe()


'''
Exemple #31
0
from pandas import Series, DataFrame, date_range
from numpy.random import *
from datetime import datetime
import matplotlib.pyplot as plt

# Fix the random seed so the generated series are reproducible
seed(123456)

## basic 1: cumulative sum of a random daily series (a random walk)
ts = Series(randn(1000), index=date_range('1/1/2000', periods=1000))
ts = ts.cumsum()
ts.plot()
plt.show()

## basic 2: four random walks sharing the same date index
df = DataFrame(randn(1000, 4), index=ts.index, columns=list('ABCD'))
df = df.cumsum()
df.plot(use_index=True)
plt.show()

## basic 3: plot one column against another instead of against the index
df3 = DataFrame(randn(1000, 2), columns=['B', 'C']).cumsum()
df3['A'] = Series(list(range(len(df3))))
df3.plot(x='A', y='B')
plt.show()

## barplot1: bar chart of a single row (the sixth one, by position)
# `.ix` was removed from pandas; with a date index an integer key was
# positional, which is exactly what `.iloc` does.
df.iloc[5].plot(kind='bar')
plt.axhline(0, color='k')
plt.show()

## barplot2
Two      2.0
Three    4.0
dtype: float64
'''

# index label of the minimum value in each column
print(dframe1.idxmin())
'''
One      A
Two      A
Three    B
dtype: object
'''

# cumulative sum down each column
print(dframe1.cumsum())
'''
   One  Two  Three
A  1.0  2.0    NaN
B  NaN  5.0    4.0
'''

# show the frame itself, then its summary statistics
print(dframe1)
'''
   One  Two  Three
A  1.0  2.0    NaN
B  NaN  3.0    4.0
'''
print(dframe1.describe())
'''
Exemple #33
0
def plotter(df,
            title=False,
            kind='line',
            x_label=None,
            y_label=None,
            style='ggplot',
            figsize=(8, 4),
            save=False,
            legend_pos='best',
            reverse_legend='guess',
            num_to_plot=6,
            tex='try',
            colours='default',
            cumulative=False,
            pie_legend=True,
            partial_pie=False,
            show_totals=False,
            transparent=False,
            output_format='png',
            black_and_white=False,
            show_p_val=False,
            indices=False,
            transpose=False,
            rot=False,
            **kwargs):
    """Visualise corpus interrogations.

    Builds keyword arguments for ``DataFrame.plot`` (or ``seaborn.heatmap``
    when *kind* is ``'heatmap'``), draws the chart inside the requested
    matplotlib style, then post-processes legend, titles, grid, totals and
    optional saving to disk.

    :param df: Data to be plotted
    :type df: Pandas DataFrame or Series
    :param title: A title for the plot
    :type title: str
    :param kind: The kind of chart to make
    :type kind: str ('line'/'bar'/'barh'/'pie'/'area'/'heatmap')
    :param x_label: A label for the x axis
    :type x_label: str
    :param y_label: A label for the y axis
    :type y_label: str
    :param style: Visual theme of plot
    :type style: str ('ggplot'/'bmh'/'fivethirtyeight'/'seaborn-talk'/etc)
    :param figsize: Size of plot
    :type figsize: tuple (int, int)
    :param save: If bool, save with *title* as name; if str, use str as name
    :type save: bool/str
    :param legend_pos: Where to place legend
    :type legend_pos: str ('upper right'/'outside right'/etc) or int 0-10
    :param reverse_legend: Reverse the order of the legend; ``'guess'``
        reverses for 'barh' and 'area' charts only
    :type reverse_legend: bool/'guess'
    :param num_to_plot: How many columns to plot
    :type num_to_plot: int/'all'
    :param tex: Use TeX to draw plot text
    :type tex: bool/'try'
    :param colours: Colourmap for lines/bars/slices
    :type colours: str
    :param cumulative: Plot values cumulatively
    :type cumulative: bool
    :param pie_legend: Show a legend for pie chart
    :type pie_legend: bool
    :param partial_pie: Allow plotting of pie slices only
    :type partial_pie: bool
    :param show_totals: Print sums in plot where possible
    :type show_totals: str -- 'legend'/'plot'/'both'
    :param transparent: Transparent .png background
    :type transparent: bool
    :param output_format: File format for saved image
    :type output_format: str -- 'png'/'pdf'/etc
    :param black_and_white: Create black and white line styles
    :type black_and_white: bool
    :param show_p_val: Attempt to print p values in legend if contained in df
    :type show_p_val: bool
    :param indices: To use when plotting "distance from root"
    :type indices: bool/'guess'
    :param transpose: Transpose the data before plotting
    :type transpose: bool
    :param rot: Rotate x axis ticks by *rot* degrees
    :type rot: int

    Extra keyword arguments read by this function (anything not listed is
    passed through to the pandas plotting call):

    :param stacked: When making bar chart, stack bars on top of one another
    :type stacked: bool
    :param filled: For area and bar charts, make every column sum to 100
    :type filled: bool
    :param legend: Show a legend
    :type legend: bool
    :param subplots: Plot each column separately
    :type subplots: bool
    :param layout: Grid shape to use when *subplots* is True
    :type layout: tuple -- (int, int)
    :param xtickspan: Spacing of major x ticks (subplots only)
    :param grid: Show the grid (default True)
    :param draggable: Make the legend draggable
    :param legend_frame: Draw a frame around the legend (default True)
    :param legend_alpha: Legend frame alpha (default 0.8)
    :param savepath: Default directory for saved figures
    :param shape: Subplot grid for MultiIndexed data (default 'auto')
    :param truncate: Highest group number plotted for MultiIndexed data
    :param name_format: Format string applied to MultiIndex group names
    :returns: the ``matplotlib.pyplot`` module the chart was drawn with; for
        a non-subplot heatmap, the Axes returned by ``seaborn.heatmap``
    :raises ValueError: if *output_format* is not recognised, or the saved
        file cannot be found after saving
    :raises KeyError: if *legend_pos* is an unknown string
    """

    kwargs['rot'] = rot

    xtickspan = kwargs.pop('xtickspan', False)

    # if the data was multiindexed, the default is a little different!
    # each first-level group is drawn on its own axis via ``data.chart``
    if isinstance(df.index, MultiIndex):
        import matplotlib.pyplot as nplt
        shape = kwargs.get('shape', 'auto')
        truncate = kwargs.get('truncate', 8)
        if shape == 'auto':
            shape = (int(len(df.index.levels[0]) / 2), 2)
        f, axes = nplt.subplots(*shape)
        for i, ((name, data), ax) in enumerate(zip(df.groupby(level=0), axes.flatten())):
            data = data.loc[name]
            if isinstance(truncate, int) and i > truncate:
                continue
            if kwargs.get('name_format'):
                name = kwargs.get('name_format').format(name)
            data.chart(
            title=name,
            ax=ax,
            kind=kind,
            x_label=x_label,
            y_label=y_label,
            style=style,
            figsize=figsize,
            save=save,
            legend_pos=legend_pos,
            reverse_legend=reverse_legend,
            num_to_plot=num_to_plot,
            tex=tex,
            colours=colours,
            cumulative=cumulative,
            pie_legend=pie_legend,
            partial_pie=partial_pie,
            show_totals=show_totals,
            transparent=transparent,
            output_format=output_format,
            black_and_white=black_and_white,
            show_p_val=show_p_val,
            indices=indices,
            transpose=transpose,
            rot=rot)
        return nplt

    title = title or ""

    # get a few options from kwargs
    sbplt = kwargs.get('subplots', False)
    show_grid = kwargs.pop('grid', True)
    the_rotation = kwargs.get('rot', False)
    dragmode = kwargs.pop('draggable', False)
    leg_frame = kwargs.pop('legend_frame', True)
    leg_alpha = kwargs.pop('legend_alpha', 0.8)
    # auto set num to plot based on layout
    lo = kwargs.get('layout', None)
    if lo:
        num_to_plot = lo[0] * lo[1]

    # 'mpl-white' is plain matplotlib on top of seaborn's whitegrid, when
    # seaborn is available
    if style == 'mpl-white':
        try:
            sns.set_style("whitegrid")
        except Exception:
            pass
        style = 'matplotlib'

    if kwargs.get('savepath'):
        mpl.rcParams['savefig.directory'] = kwargs.get('savepath')
        kwargs.pop('savepath', None)

    mpl.rcParams['savefig.bbox'] = 'tight'
    mpl.rcParams.update({'figure.autolayout': True})

    # try to use tex
    # make some font kwargs here
    using_tex = False
    mpl.rcParams['font.family'] = 'sans-serif'

    if tex == 'try' or tex is True:
        try:
            rc('text', usetex=True)
            rc('font', **{'family': 'serif', 'serif': ['Computer Modern']})
            using_tex = True
        except Exception:
            matplotlib.rc('font', family='sans-serif')
            matplotlib.rc('font', serif='Helvetica Neue')
            matplotlib.rc('text', usetex='false')
            rc('text', usetex=False)
    else:
        rc('text', usetex=False)

    # normalise so the ``.endswith`` checks below work on a string
    if show_totals is False:
        show_totals = 'none'

    # find out what kind of plot we're making
    kwargs['kind'] = kind.lower()

    # find out if pie mode, add autopct format
    piemode = kind == "pie"
    if piemode:
        # always the best spot for pie
        #if legend_pos == 'best':
            #legend_pos = 'lower left'
        if show_totals.endswith('plot') or show_totals.endswith('both'):
            kwargs['pctdistance'] = 0.6
            if using_tex:
                # the percent sign has to be escaped for TeX
                kwargs['autopct'] = r'%1.1f\%%'
            else:
                kwargs['autopct'] = '%1.1f%%'

    # copy data, make series into df
    dataframe = df.copy()
    if kind == 'heatmap':
        try:
            dataframe = dataframe.T
        except Exception:
            pass
    was_series = isinstance(dataframe, Series)
    if was_series:
        if not cumulative:
            dataframe = DataFrame(dataframe)
        else:
            dataframe = DataFrame(dataframe.cumsum())
    else:
        # don't know if this is much good.
        if transpose:
            dataframe = dataframe.T
        if cumulative:
            dataframe = DataFrame(dataframe.cumsum())
        # a single-column frame is treated like a series from here on
        if len(list(dataframe.columns)) == 1:
            was_series = True

    # look at columns to see if all can be ints, in which case, set up figure
    # for depnumming
    if not was_series:
        if indices == 'guess':
            indices = all([isint(x) for x in list(dataframe.columns)])

        # if depnumming, plot all, transpose, and rename axes
        if indices is True:
            num_to_plot = 'all'
            dataframe = dataframe.T
            if y_label is None:
                y_label = 'Percentage of all matches'
            if x_label is None:
                x_label = ''

    # set backend?
    output_formats = ['svgz', 'ps', 'emf', 'rgba', 'raw', 'pdf', 'svg', 'eps', 'png', 'pgf']
    if output_format not in output_formats:
        raise ValueError('%s output format not recognised. Must be: %s' % (output_format, ', '.join(output_formats)))

    # don't know if these are necessary
    if 'pdf' in output_format:
        plt.switch_backend(output_format)
    if 'pgf' in output_format:
        plt.switch_backend(output_format)

    # resolve 'all' into a concrete count: rows for series-like or pie data,
    # columns otherwise
    if num_to_plot == 'all':
        if was_series or piemode:
            num_to_plot = len(dataframe)
        else:
            num_to_plot = len(list(dataframe.columns))

    # explode pie, or remove if not piemode
    if piemode and not sbplt and kwargs.get('explode'):
        kwargs['explode'] = auto_explode(dataframe,
                                        kwargs['explode'],
                                        was_series=was_series,
                                        num_to_plot=num_to_plot)
    else:
        kwargs.pop('explode', None)

    legend = kwargs.get('legend', True)

    # keep only the first num_to_plot rows (if transposed) or columns
    if not was_series:
        if transpose:
            dataframe = dataframe.head(num_to_plot)
        else:
            dataframe = dataframe.T.head(num_to_plot).T

    # remove stats fields, put p in entry text, etc.
    statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
    try:
        dataframe = dataframe.drop(statfields, axis=1, errors='ignore')
    except Exception:
        pass
    # a 'p' row means the frame carries p values; ``.loc`` replaces the
    # removed ``.ix`` indexer for this label lookup
    try:
        dataframe.loc['p']
        there_are_p_vals = True
    except Exception:
        there_are_p_vals = False

    if show_p_val and there_are_p_vals:
        newnames = []
        for col in list(dataframe.columns):
            pval = dataframe[col]['p']
            pstr = p_string_formatter(pval, using_tex)
            newname = '%s (%s)' % (col, pstr)
            newnames.append(newname)
        dataframe.columns = newnames
        dataframe.drop(statfields, axis=0, inplace=True, errors='ignore')

    elif there_are_p_vals:
        dataframe.drop(statfields, axis=0, inplace=True, errors='ignore')

    # decide whether the data are absolute counts: true only when every
    # value in the first row is a whole number
    absolutes = True
    if isinstance(dataframe, DataFrame):
        try:
            if not all([s.is_integer() for s in dataframe.iloc[0,:].values]):
                absolutes = False
        except Exception:
            pass
    else:
        if not all([s.is_integer() for s in dataframe.values]):
            absolutes = False

    ##########################################
    ################ COLOURS #################
    ##########################################

    # set defaults, with nothing for heatmap yet
    if colours is True or colours == 'default' or colours == 'Default':
        if kind != 'heatmap':
            colours = 'viridis'
        else:
            colours = 'default'

    # assume it's a single color, unless string denoting map
    cmap_or_c = 'color'
    if isinstance(colours, str):
        cmap_or_c = 'colormap'
    from matplotlib.colors import LinearSegmentedColormap
    if isinstance(colours, LinearSegmentedColormap):
        cmap_or_c = 'colormap'

    # for heatmaps, it's always a colormap
    if kind == 'heatmap':
        cmap_or_c = 'cmap'
        # if it's a defaulty string, set accordingly
        if isinstance(colours, str):
            if colours.lower().startswith('diverg'):
                colours = sns.diverging_palette(10, 133, as_cmap=True)

            # if default not set, do diverge for any df with a number < 0
            elif colours.lower() == 'default':
                mn = dataframe.min()
                if isinstance(mn, Series):
                    mn = mn.min()
                if mn < 0:
                    colours = sns.diverging_palette(10, 133, as_cmap=True)
                else:
                    colours = sns.light_palette("green", as_cmap=True)

    if 'seaborn' not in style:
        kwargs[cmap_or_c] = colours

    # reversing legend option: only an explicit True reverses at this point;
    # 'guess' is resolved just below from the plot kind
    rev_leg = reverse_legend is True

    # show legend or don't, guess whether to reverse based on kind
    if kind in ['bar', 'barh', 'area', 'line', 'pie']:
        if was_series:
            legend = False
        if kind == 'pie':
            legend = bool(pie_legend)
    if kind in ['barh', 'area'] and reverse_legend == 'guess':
        rev_leg = True

    # the default legend placement
    if legend_pos is True:
        legend_pos = 'best'

    # no title for subplots because ugly,
    if title and not sbplt:
        kwargs['title'] = title

    # not using pandas for labels or legend anymore.
    #kwargs['labels'] = None
    #kwargs['legend'] = False

    if legend:
        if num_to_plot > 6:
            if not kwargs.get('ncol'):
                kwargs['ncol'] = num_to_plot // 7
        # kwarg options go in leg_options
        leg_options = {'framealpha': leg_alpha,
                       'shadow': kwargs.get('shadow', False),
                       'ncol': kwargs.pop('ncol', 1)}

        # determine legend position based on this dict
        if legend_pos:
            possible = {'best': 0, 'upper right': 1, 'upper left': 2, 'lower left': 3, 'lower right': 4,
                        'right': 5, 'center left': 6, 'center right': 7, 'lower center': 8, 'upper center': 9,
                        'center': 10, 'o r': 2, 'outside right': 2, 'outside upper right': 2,
                        'outside center right': 'center left', 'outside lower right': 'lower left'}

            if isinstance(legend_pos, int):
                the_loc = legend_pos
            elif isinstance(legend_pos, str):
                try:
                    the_loc = possible[legend_pos]
                except KeyError:
                    raise KeyError('legend_pos value must be one of:\n%s\n or an int between 0-10.' %', '.join(list(possible.keys())))
            leg_options['loc'] = the_loc
            #weirdness needed for outside plot
            if legend_pos in ['o r', 'outside right', 'outside upper right']:
                leg_options['bbox_to_anchor'] = (1.02, 1)
            if legend_pos == 'outside center right':
                leg_options['bbox_to_anchor'] = (1.02, 0.5)
            if legend_pos == 'outside lower right':
                # NOTE(review): this branch used to contain the statement
                # ``leg_options['loc'] == 'upper right'``, a comparison with
                # no effect. 'loc' therefore stays 'lower left'; confirm
                # whether an assignment was intended before changing it.
                leg_options['bbox_to_anchor'] = (0.5, 0.5)

        # a bit of distance between legend and plot for outside legends
        if isinstance(legend_pos, str):
            if legend_pos.startswith('o'):
                leg_options['borderaxespad'] = 1

    # put totals into the entry names when they are wanted in the legend
    if not piemode:
        if show_totals.endswith('both') or show_totals.endswith('legend'):
            dataframe = rename_data_with_total(dataframe,
                                           was_series=was_series,
                                           using_tex=using_tex,
                                           absolutes=absolutes)
    else:
        if pie_legend:
            if show_totals.endswith('both') or show_totals.endswith('legend'):
                dataframe = rename_data_with_total(dataframe,
                                           was_series=was_series,
                                           using_tex=using_tex,
                                           absolutes=absolutes)

    if piemode and partial_pie:
        dataframe = dataframe / 100.0

    # some pie things
    if piemode and not sbplt:
        kwargs['y'] = list(dataframe.columns)[0]

    areamode = False
    if kind == 'area':
        areamode = True

    if legend is False:
        kwargs['legend'] = False

    if kwargs.get('filled'):
        if areamode or kind.startswith('bar'):
            dataframe = filler(dataframe)
        kwargs.pop('filled', None)

    # dash/marker combinations cycled through for black and white line plots
    MARKERSIZE = 4
    COLORMAP = {
            0: {'marker': None, 'dash': (None,None)},
            1: {'marker': None, 'dash': [5,5]},
            2: {'marker': "o", 'dash': (None,None)},
            3: {'marker': None, 'dash': [1,3]},
            4: {'marker': "s", 'dash': [5,2,5,2,5,10]},
            5: {'marker': None, 'dash': [5,3,1,2,1,10]},
            6: {'marker': 'o', 'dash': (None,None)},
            7: {'marker': None, 'dash': [5,3,1,3]},
            8: {'marker': "1", 'dash': [1,3]},
            9: {'marker': "*", 'dash': [5,5]},
            10: {'marker': "2", 'dash': [5,2,5,2,5,10]},
            11: {'marker': "s", 'dash': (None,None)}
    }

    if black_and_white:
        if kind == 'line':
            kwargs['linewidth'] = 1

        cmap = plt.get_cmap('Greys')
        new_cmap = truncate_colormap(cmap, 0.25, 0.95)
        if kind == 'bar':
            # darker if just one entry
            if len(dataframe.columns) == 1:
                new_cmap = truncate_colormap(cmap, 0.70, 0.90)
        kwargs[cmap_or_c] = new_cmap

    # remove things from kwargs if heatmap
    if kind == 'heatmap':
        number_format = ".2f"
        if all(dataframe[i].astype(str).str.isdigit().all() for i in list(dataframe.columns)):
            number_format = None
        hmargs = {'annot': kwargs.pop('annot', True),
              cmap_or_c: kwargs.pop(cmap_or_c, None),
              'cbar': kwargs.pop('cbar', False)}
        if number_format:
            hmargs['fmt'] = number_format

        # heatmap-only options are moved out of kwargs into hmargs
        for i in ['vmin', 'vmax', 'linewidths', 'linecolor',
                  'robust', 'center', 'cbar_kws', 'cbar_ax',
                  'square', 'mask', 'norm']:
            if i in kwargs.keys():
                hmargs[i] = kwargs.pop(i, None)

    class dummy_context_mgr():
        """a fake context for plotting without style
        perhaps made obsolete by 'classic' style in new mpl"""
        def __enter__(self):
            return None
        def __exit__(self, one, two, three):
            return False

    # stays None unless a legend is actually drawn below; the save step
    # checks it before handing the legend to savefig
    lgd = None

    with plt.style.context((style)) if style != 'matplotlib' else dummy_context_mgr():

        kwargs.pop('filled', None)

        if not sbplt:
            # check if negative values, no stacked if so
            if areamode:
                if not kwargs.get('ax'):
                    kwargs['legend'] = False
                if dataframe.applymap(lambda x: x < 0.0).any().any():
                    kwargs['stacked'] = False
                    rev_leg = False
            if kind != 'heatmap':
                # turn off pie labels at the last minute
                if kind == 'pie' and pie_legend:
                    kwargs['labels'] = None
                    kwargs['autopct'] = '%.2f'
                if kind == 'pie':
                    kwargs.pop('color', None)
                ax = dataframe.plot(figsize=figsize, **kwargs)
            else:
                plt.figure(figsize=figsize)
                if title:
                    plt.title(title)
                ax = kwargs.get('ax', plt.axes())
                tmp = sns.heatmap(dataframe, ax=ax, **hmargs)
                ax.set_title(title)
                for item in tmp.get_yticklabels():
                    item.set_rotation(0)
                return tmp  # not good, but otherwise it doesn't show up!

            if areamode and not kwargs.get('ax'):
                handles, labels = plt.gca().get_legend_handles_labels()
                del handles
                del labels

            if x_label:
                ax.set_xlabel(x_label)
            if y_label:
                ax.set_ylabel(y_label)

        else:
            if not kwargs.get('layout'):
                plt.gcf().set_tight_layout(False)

            if kind != 'heatmap':
                ax = dataframe.plot(figsize=figsize, **kwargs)
            else:
                plt.figure(figsize=figsize)
                if title:
                    plt.title(title)
                ax = plt.axes()
                sns.heatmap(dataframe, ax=ax, **hmargs)
                plt.xticks(rotation=0)
                plt.yticks(rotation=0)

        if sbplt:
            # with subplots ``ax`` is a collection of axes; flatten one
            # level further when a layout grid was requested
            if 'layout' not in kwargs:
                axes = [l for l in ax]
            else:
                axes = []
                cols = [l for l in ax]
                for col in cols:
                    for bit in col:
                        axes.append(bit)
            for index, a in enumerate(axes):
                if xtickspan is not False:
                    a.xaxis.set_major_locator(ticker.MultipleLocator(xtickspan))
                labels = [item.get_text() for item in a.get_xticklabels()]
                rotation = rotate_degrees(the_rotation, labels)
                # NOTE(review): this calls set_xticklabels on ``ax`` (the
                # collection), not on ``a``; the AttributeError is swallowed
                # so nothing is rotated here. Left as is -- confirm intent.
                try:
                    if the_rotation == 0:
                        ax.set_xticklabels(labels, rotation=rotation, ha='center')
                    else:
                        ax.set_xticklabels(labels, rotation=rotation, ha='right')
                except AttributeError:
                    pass
        else:
            if kind == 'heatmap':
                labels = [item.get_text() for item in ax.get_xticklabels()]
                rotation = rotate_degrees(the_rotation, labels)
                if the_rotation == 0:
                    ax.set_xticklabels(labels, rotation=rotation, ha='center')
                else:
                    ax.set_xticklabels(labels, rotation=rotation, ha='right')

        if transparent:
            plt.gcf().patch.set_facecolor('white')
            plt.gcf().patch.set_alpha(0)

        if black_and_white and kind == 'line':
            # white background
            # change everything to black and white with interesting dashes and markers
            c = 0
            for line in ax.get_lines():
                line.set_color('black')
                #line.set_width(1)
                line.set_dashes(COLORMAP[c]['dash'])
                line.set_marker(COLORMAP[c]['marker'])
                line.set_markersize(MARKERSIZE)
                c += 1
                if c == len(list(COLORMAP.keys())):
                    c = 0

        # draw legend with proper placement etc
        if legend and not piemode and not sbplt and kind != 'heatmap':
            handles, labels = plt.gca().get_legend_handles_labels()
            # area doubles the handles and labels. this removes half:
            #if areamode:
            #    handles = handles[-len(handles) / 2:]
            #    labels = labels[-len(labels) / 2:]
            if rev_leg:
                handles = handles[::-1]
                labels = labels[::-1]
            if kwargs.get('ax'):
                lgd = plt.gca().legend(handles, labels, **leg_options)
                ax.get_legend().draw_frame(leg_frame)
            else:
                lgd = plt.legend(handles, labels, **leg_options)
                lgd.draw_frame(leg_frame)

    if piemode:
        if not sbplt:
            plt.axis('equal')
            ax.get_xaxis().set_visible(False)
            ax.get_yaxis().set_visible(False)

    # add x label
    # this could be revised now!
    # if time series period, it's year for now
    # if isinstance(dataframe.index, pandas.tseries.period.PeriodIndex):
    #    x_label = 'Year'

    # hacky: turn legend into subplot titles :)
    if sbplt:
        # title the big plot
        #plt.gca().suptitle(title, fontsize = 16)
        #plt.subplots_adjust(top=0.9)
        # get all axes
        if 'layout' not in kwargs:
            axes = [l for index, l in enumerate(ax)]
        else:
            axes = []
            cols = [l for index, l in enumerate(ax)]
            for col in cols:
                for bit in col:
                    axes.append(bit)

        # set subplot titles; an axis without a matching column keeps the
        # previous title (empty if there was none yet)
        titletext = ''
        for index, a in enumerate(axes):
            try:
                titletext = list(dataframe.columns)[index]
            except Exception:
                pass
            a.set_title(titletext)
            try:
                a.legend_.remove()
            except Exception:
                pass
            #try:
            #    from matplotlib.ticker import MaxNLocator
            #    from corpkit.process import is_number
            #    indx = list(dataframe.index)
            #    if all([is_number(qq) for qq in indx]):
            #        ax.get_xaxis().set_major_locator(MaxNLocator(integer=True))
            #except:
            #    pass
            # remove axis labels for pie plots
            if piemode:
                a.axes.get_xaxis().set_visible(False)
                a.axes.get_yaxis().set_visible(False)
                a.axis('equal')

            # passed positionally: the keyword was renamed from ``b`` to
            # ``visible`` in newer matplotlib
            a.grid(show_grid)

    # add sums to bar graphs and pie graphs
    # doubled right now, no matter

    if not sbplt:
        # show grid (positional for the same reason as above)
        ax.grid(show_grid)

    if was_series:
        the_y_limit = plt.ylim()[1]
        if show_totals.endswith('plot') or show_totals.endswith('both'):
            # make plot a bit higher if putting these totals on it
            plt.ylim([0,the_y_limit * 1.05])
            for i, label in enumerate(list(dataframe.index)):
                # label-based row lookup; ``.loc`` replaces the removed ``.ix``
                row = dataframe.loc[label]
                if len(row) == 1:
                    score = row.iloc[0]
                else:
                    if absolutes:
                        score = row.sum()
                    else:
                        #import warnings
                        #warnings.warn("It's not possible to determine total percentage from individual percentages.")
                        continue
                if not absolutes:
                    plt.annotate('%.2f' % score, (i, score), ha = 'center', va = 'bottom')
                else:
                    plt.annotate(score, (i, score), ha = 'center', va = 'bottom')
    else:
        if show_totals.endswith('plot') or show_totals.endswith('both'):
            for i, label in enumerate(list(dataframe.columns)):
                if len(dataframe[label]) == 1:
                    score = dataframe[label][0]
                else:
                    if absolutes:
                        score = dataframe[label].sum()
                    else:
                        #import warnings
                        #warnings.warn("It's not possible to determine total percentage from individual percentages.")
                        continue
                if not absolutes:
                    plt.annotate('%.2f' % score, (i, score), ha='center', va='bottom')
                else:
                    plt.annotate(score, (i, score), ha='center', va='bottom')

    if not kwargs.get('layout') and not sbplt and not kwargs.get('ax'):
        plt.tight_layout()
    if kwargs.get('ax'):
        # best effort: drawing onto a caller-supplied axis, so leave the
        # figure layout alone
        try:
            plt.gcf().set_tight_layout(False)
        except Exception:
            pass

    if save:
        imagefolder = 'images'

        savename = get_savename(imagefolder, save=save, title=title, ext=output_format)

        if not os.path.isdir(imagefolder):
            os.makedirs(imagefolder)

        # save image and get on with our lives
        # an outside legend must be passed as an extra artist so the tight
        # bounding box includes it; only possible when legend_pos is a
        # string and a legend was really drawn
        outside_legend = isinstance(legend_pos, str) and legend_pos.startswith('o')
        if outside_legend and not sbplt and lgd is not None:
            plt.gcf().savefig(savename, dpi=150, bbox_extra_artists=(lgd,),
                              bbox_inches='tight', format=output_format)
        else:
            plt.gcf().savefig(savename, dpi=150, format=output_format)
        time = strftime("%H:%M:%S", localtime())
        if os.path.isfile(savename):
            print('\n' + time + ": " + savename + " created.")
        else:
            raise ValueError("Error making %s." % savename)

    if dragmode:
        plt.legend().draggable()

    if sbplt:
        plt.subplots_adjust(right=.8)
        plt.subplots_adjust(left=.1)

    return plt
def main():
    """
    Demonstrate summary statistics, correlation/covariance and
    unique-value helpers on small pandas objects.

    Everything is printed to stdout; nothing is returned. The code uses
    Python 2 print statements and ``dict.iteritems``.
    """

    # Summary statistics
    # (reductions such as sum/mean are printed as pandas objects, not ndarrays)
    df = DataFrame([[1.4, np.nan],
                    [7.1, -4.5],
                    [np.nan, np.nan],
                    [0.75, -1.3]],
                   index=list('abcd'),
                   columns=['one', 'two'])
    print df
    print df.sum()
    print df.sum(axis=1)
    print df.mean(axis=1) # NaN values are excluded here
    print df.mean(axis=1, skipna=False)
    print df.idxmin()
    print df.idxmax()
    print df.cumsum()
    print df.describe()
    # describe() on non-numeric values
    obj = Series(list('aabc') * 4)
    print obj.describe()


    # Names of DataFrame methods that are looked up with getattr and called
    # without arguments in the loop below.
    methods = ['count', 'min', 'max', # 'argmin', 'argmax' are left out
               'quantile', 'median', 'mad', 'var', 'std',
               'skew', 'kurt', 'cummin', 'cummax', 'cumprod',
               'diff', 'pct_change']

    for method in methods:
        print u'「{0}」'.format(method)
        print getattr(df, method)()
        print ''

    # Correlation and covariance
    all_data = {}
    # The ticker list is empty, so the download loop below never runs and
    # all_data stays empty. Disabled tickers: ['AAPL', 'IBM', 'MSFT', 'GOOG']
    lst = []
    for ticket in lst:
        # IOError: after 3 tries, Yahoo! did not return a 200
        # for url 'http://ichart.finance.yahoo.com/table.csv?s=GOOG&a=0&b=1&c=2000&d=0&e=1&f=2010&g=d&ignore=.csv'
        all_data[ticket] = pd.io.data.get_data_yahoo(ticket, '1/1/2000', '1/1/2010')
    # One column per ticker, taken from the downloaded frames.
    price = DataFrame({tic: data['Adj Close'] for tic, data in all_data.iteritems()})
    volume = DataFrame({tic: data['Volume'] for tic, data in all_data.iteritems()})
    # Only report correlations when something was actually downloaded.
    if all_data:
        returns = price.pct_change()
        print returns.tail()
        print ''
        print returns.MSFT.corr(returns.IBM)
        print returns.MSFT.cov(returns.IBM)
        print ''
        print returns.corr()
        print returns.cov()
        print ''
        print returns.corrwith(returns.IBM)
        print returns.corrwith(volume)

    # Unique values, value counts and membership
    print '',''
    obj = Series(list('cadaabbcc'))
    uniques = obj.unique()
    print uniques
    print obj.value_counts()
    print pd.value_counts(obj.values, sort=False)
    # Boolean mask marking the entries equal to 'b' or 'c'
    mask = obj.isin(['b', 'c'])
    print mask
    print obj[mask]

    data = DataFrame({
        'Qu1' : [1,3,4,3,4],
        'Qu2' : [2,3,1,2,3],
        'Qu3' : [1,5,2,4,4],
    })
    print data
    # Count occurrences of each value per column; values absent from a
    # column come back as NaN and are replaced by 0.
    print data.apply(pd.value_counts).fillna(0)
Exemple #35
0
 def cumsum(df: pd.DataFrame) -> pd.DataFrame:
     """Return the cumulative sum of *df* as computed by ``DataFrame.cumsum``."""
     running_total = df.cumsum()
     return running_total
Exemple #36
0
# Sorting and ranking
# NOTE(review): `frame` is not defined in this excerpt; it is presumably a
# DataFrame with a column 'b' created earlier in the script -- confirm.
frame_sort = frame.sort_index()
print(frame_sort)

frame_sort = frame.sort_values(by='b')
print(frame_sort)

# Summarizing and computing descriptive statistics
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])

print(df.sum(axis=0))
print(df.mean(axis=1))
print(df.cumsum(axis=0))

# Correlation and covariance between the columns of a random 100x4 frame
df = DataFrame(np.random.randn(100, 4),
               columns=['AAPL', 'GOOG', 'IBM', 'MSFT'])
df_corr = df.corr()
df_cov = df.cov()

print(df_corr)
print(df_cov)

# Unique values, value counts and membership
data = DataFrame(np.random.randint(0, 5, (5, 3)), columns=list('abc'))
print(data)
# Count occurrences of each value per column; values missing from a column
# come back as NaN and are replaced by 0.
result = data.apply(pd.value_counts).fillna(0)
print(result)
Exemple #37
0
class Portfolio(object):

    """Track trades, positions, fees and free capital for a set of symbols.

    All per-symbol state is held in DataFrames that share the index and
    columns of ``self.prices`` (one column per symbol):

    * ``trades``     -- quantity traded at each timestamp
    * ``fees``       -- fee charged for each trade
    * ``quantities`` -- cumulative position (running sum of ``trades``)
    * ``values``     -- ``quantities * prices``

    ``capital`` is the free capital at each timestamp. The derived fields
    are recomputed from ``trades`` by :meth:`refresh`.
    """

    def __repr__(self):
        return '<Portfolio {}>'.format(self.prices.shape)

    def __init__(self, ohlcs, starting_capital=100000, price_type='cprices',
                 transaction_fee_bps=15., transaction_fee_min=7):
        """Build an empty (no trades) portfolio from a list of ohlc objects.

        :param ohlcs: objects exposing ``symbol``, ``timestamps``, ``volumes``
            and the attribute named by *price_type*
        :param starting_capital: capital available before any trade
        :param price_type: name of the ohlc attribute used as the price
        :param transaction_fee_bps: fee in basis points of the traded value
        :param transaction_fee_min: minimum fee charged for a non-zero trade
        """
        self.price_type = price_type
        self.transaction_fee_bps = transaction_fee_bps
        self.transaction_fee_min = transaction_fee_min
        self.prices = self.from_ohlcs(ohlcs, price_type)
        self.volumes = self.from_ohlcs(ohlcs, 'volumes')
        self.trades = DataFrame(zeros(self.prices.shape), self.prices.index,
                                self.prices.columns)
        self.fees = DataFrame(zeros(self.prices.shape), self.prices.index,
                              self.prices.columns)
        self.starting_capital = starting_capital
        # Placeholders; refresh() replaces them with pandas objects.
        self.capital = []
        self.quantities = []
        self.values = []
        self.refresh()

    def from_ohlcs(self, ohlcs, price_type):
        """Return a frame with one column per ohlc, taken from *price_type*.

        The individual series are outer-joined on their timestamps and gaps
        are filled with the previous value of the same column.
        """
        dfs = [DataFrame(getattr(ohlc, price_type),
                         posix_as_dt(ohlc.timestamps))
               for ohlc in ohlcs]
        prices = concat(dfs, join='outer', axis=1)
        prices.columns = [ohlc.symbol for ohlc in ohlcs]
        # ffill() is the non-deprecated spelling of fillna(method='pad').
        return prices.ffill()

    def refresh(self):
        """Calculates positions, values, free capital and costs from trades.
        Fees of short positions (if any) are same as cost for long. This is not
        realistic, but the class is intended to represent long only portfolios.
        """
        self.fees = self.transaction_fee_bps * self.trades.abs() * \
            self.prices / 10000
        # Every actual trade pays at least the minimum fee; rows without a
        # trade keep a fee of zero.
        below_minimum = self.fees < self.transaction_fee_min
        traded = self.trades.abs() > 0
        self.fees[below_minimum & traded] = self.transaction_fee_min
        self.quantities = self.trades.cumsum()
        self.values = self.quantities * self.prices
        self.capital = self.starting_capital + self.total_trade_values - \
            self.total_fees

    def trade(self, timestring, symbol, quantity):
        """Convenience function to enter trades and refresh."""
        # A single .loc assignment writes into self.trades itself; chained
        # indexing (frame[col][row] = x) may write to a temporary copy.
        self.trades.loc[timestring, symbol] = quantity
        self.refresh()

    def trade_max(self):
        """Trade all capital on first day, equal sized positions."""
        first_day = dt_as_str(self.prices.index[0])
        n_symbols = len(self.prices.columns)
        per_symbol_capital = self.starting_capital / float(n_symbols)
        # Whole units affordable per symbol at the first available price.
        trade_sizes = trunc(per_symbol_capital / self.prices.iloc[0].values)
        for symbol, size in zip(self.prices.columns, trade_sizes):
            self.trade(first_day, symbol, size)
        self.refresh()

    @property
    def market_value(self):
        """Value of equity positions at each time."""
        return self.values.sum(axis=1)

    @property
    def total_value(self):
        """Total value of portfolio at each time."""
        return self.market_value + self.capital

    @property
    def trade_values(self):
        """Trade values for each trade."""
        tvals = -self.trades * self.prices
        # Normalise "no trade" cells (negative zero or NaN) to plain 0.
        to_0 = (tvals == 0) | isnull(tvals)
        tvals[to_0] = 0
        return tvals

    @property
    def total_trade_values(self):
        """Cumulative sum of all trades."""
        return self.trade_values.sum(axis=1).cumsum()

    @property
    def total_fees(self):
        """Cumulative sum of fees."""
        return self.fees.sum(axis=1).cumsum()
Exemple #38
0
columns=['one', 'two'])
df
# Interactive-session history: bare expressions such as `df` were typed to
# display a value and have no effect when run as a script.
df = DataFrame([[1.4, np.nan], [7.1, -4.5],
[np.nan, np.nan], [0.75, -1.3]],
index=['a', 'b', 'c', 'd'],
columns=['one', 'two'])
df
df.sum() # column sums
df.sum(axis=1) # sum row by row
df
(7.10 - 4.5)/2
df.mean(axis=1, skipna=False)
df
df.idxmax()
df
df.cumsum() # accumulation
df.describe() # multiple summary statistics in one shot.
obj = Series(['a', 'a', 'b', 'c'] * 4)
obj
obj.describe()
## Correlation and Covariance
import pandas.io.data as web
all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010')
    
# NOTE(review): the next statement is a syntax error (stray comma before
# `for`); the statement after it is the same expression without the comma.
price = DataFrame({tic: data['Adj Close'],
for tic, data in all_data.iteritems()})
price = DataFrame({tic: data['Adj Close'] 
for tic, data in all_data.iteritems()})
price
Exemple #39
0
# NOTE(review): `team_df` is created outside this excerpt; the comments below
# suggest one column per team and one row per date -- confirm.
print(team_df.sum())
print("----")
print(team_df.mean())
print("----")

# ### Show the summary values for each team.

# In[33]:

team_df.describe()

# In[34]:

## Cumulative statistics by date
team_df.cumsum()

# ### Totals by date

# In[35]:

## Totals by date
print(team_df.sum(axis=1))

# In[36]:

# The row-wise sum is stored and its type printed.
rowsum = team_df.sum(axis=1)
print(type(rowsum))

# In[37]:
import matplotlib.pyplot as plt

# Python 2 script (print statements): summary statistics on a small frame
# that contains NaN values.
array1 = np.array([[10, np.nan, 20], [30, 40, np.nan]])
print array1
df1 = DataFrame(array1, index=[1, 2], columns=list('ABC'))
print df1

# sum()
print "Sum of cols", df1.sum()  # sums along each column
print df1.sum(axis=1)  # sums along each row

print "Min", df1.min()
print "Max", df1.max()

print df1.idxmax()
print df1.cumsum()
print df1.describe()

# 3x3 frame of random values used for the plot below
df2 = DataFrame(randn(9).reshape(3, 3), index=[1, 2, 3], columns=list('ABC'))
print df2

# Plot the frame, label the lines with the column names, write the figure
# to 'samplepic.png' and then display it.
plt.plot(df2)
plt.legend(df2.columns, loc="lower right")
plt.savefig('samplepic.png')
plt.show()

# Unique values and their frequencies
series1 = Series(list('abcccaabd'))
print series1.unique()

print series1.value_counts()
Exemple #41
0
# Index-on-index merges.
# NOTE(review): `left` and `right` are defined outside this excerpt.
# Each assignment to `res` overwrites the previous result.
res = pd.merge(left, right, left_index=True, right_index=True, how='outer')
res = pd.merge(left, right, left_index=True, right_index=True, how='inner')

# Handle overlapping column names: both frames have an 'age' column, so the
# merged columns are told apart with the given suffixes.
boys = DataFrame({'k': ['k0', 'k1', 'k2'], 'age': [1, 2, 3]})
girls = DataFrame({'k': ['k0', 'k0', 'k3'], 'age': [4, 5, 6]})

res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='inner')
res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='outer')

# pandas plot

# Series: cumulative sum of 1000 random values
data = Series(np.random.randn(1000), index=np.arange(1000))
data = data.cumsum()
data.plot()
plt.show()

# DataFrame: the same for four columns
data = DataFrame(np.random.randn(1000, 4),
                 index=np.arange(1000),
                 columns=list('abcd'))

data = data.cumsum()
print(data.head())
data.plot()
plt.show()

# plot methods: bar, hist, box, kde, area, scatter, hexbin, pie
ax = data.plot.scatter(x='a', y='b', color='DarkBlue', label='Class 1')
Exemple #42
0
def plotter(title,
            df,
            kind = 'line',
            x_label = None,
            y_label = None,
            style = 'ggplot',
            figsize = (8, 4),
            save = False,
            legend_pos = 'best',
            reverse_legend = 'guess',
            num_to_plot = 7,
            tex = 'try',
            colours = 'Accent',
            cumulative = False,
            pie_legend = True,
            partial_pie = False,
            show_totals = False,
            transparent = False,
            output_format = 'png',
            interactive = False,
            black_and_white = False,
            show_p_val = False,
            indices = False,
            **kwargs):
    """Visualise corpus interrogations.

    :param title: A title for the plot
    :type title: str
    :param df: Data to be plotted
    :type df: pandas.core.frame.DataFrame
    :param x_label: A label for the x axis
    :type x_label: str
    :param y_label: A label for the y axis
    :type y_label: str
    :param kind: The kind of chart to make
    :type kind: str ('line'/'bar'/'barh'/'pie'/'area')
    :param style: Visual theme of plot
    :type style: str ('ggplot'/'bmh'/'fivethirtyeight'/'seaborn-talk'/etc)
    :param figsize: Size of plot
    :type figsize: tuple (int, int)
    :param save: If bool, save with *title* as name; if str, use str as name
    :type save: bool/str
    :param legend_pos: Where to place legend
    :type legend_pos: str ('upper right'/'outside right'/etc)
    :param reverse_legend: Reverse the order of the legend
    :type reverse_legend: bool
    :param num_to_plot: How many columns to plot
    :type num_to_plot: int/'all'
    :param tex: Use TeX to draw plot text
    :type tex: bool
    :param colours: Colourmap for lines/bars/slices
    :type colours: str
    :param cumulative: Plot values cumulatively
    :type cumulative: bool
    :param pie_legend: Show a legend for pie chart
    :type pie_legend: bool
    :param partial_pie: Allow plotting of pie slices only
    :type partial_pie: bool
    :param show_totals: Print sums in plot where possible
    :type show_totals: str -- 'legend'/'plot'/'both'
    :param transparent: Transparent .png background
    :type transparent: bool
    :param output_format: File format for saved image
    :type output_format: str -- 'png'/'pdf'
    :param black_and_white: Create black and white line styles
    :type black_and_white: bool
    :param show_p_val: Attempt to print p values in legend if contained in df
    :type show_p_val: bool
    :param indices: To use when plotting "distance from root"
    :type indices: bool
    :param stacked: When making bar chart, stack bars on top of one another
    :type stacked: str
    :param filled: For area and bar charts, make every column sum to 100
    :type filled: str
    :param legend: Show a legend
    :type legend: bool
    :param rot: Rotate x axis ticks by *rot* degrees
    :type rot: int
    :param subplots: Plot each column separately
    :type subplots: bool
    :param layout: Grid shape to use when *subplots* is True
    :type layout: tuple -- (int, int)
    :param interactive: Experimental interactive options
    :type interactive: list -- [1, 2, 3]
    :returns: matplotlib figure
    """
    import corpkit
    import os

    try:
        from IPython.utils.shimmodule import ShimWarning
        import warnings
        warnings.simplefilter('ignore', ShimWarning)
    except:
        pass

    import matplotlib as mpl
    from matplotlib import rc

    # prefer seaborn plotting
    try:
        import seaborn as sns
    except:
        pass   
    
    if interactive:
        import matplotlib.pyplot as plt, mpld3
    else:
        import matplotlib.pyplot as plt
    
    import pandas
    from pandas import DataFrame

    import numpy
    from time import localtime, strftime
    from tests import check_pytex, check_spider, check_t_kinter

    if interactive:
        import mpld3
        import collections
        from mpld3 import plugins, utils
        from plugins import InteractiveLegendPlugin, HighlightLines

    # check what environment we're in
    tk = check_t_kinter()
    running_python_tex = check_pytex()
    running_spider = check_spider()

    def truncate_colormap(cmap, minval=0.0, maxval=1.0, n=100):
        """Build a colourmap from *n* samples of *cmap* between *minval* and
        *maxval*, so the extreme ends (e.g. pure white) can be left out."""
        import matplotlib.colors as colors
        import numpy as np
        truncated_name = 'trunc({n},{a:.2f},{b:.2f})'.format(
            n=cmap.name, a=minval, b=maxval)
        sampled_colours = cmap(np.linspace(minval, maxval, n))
        return colors.LinearSegmentedColormap.from_list(truncated_name,
                                                        sampled_colours)

    def get_savename(imagefolder, save = False, title = False, ext = 'png'):
        """Come up with the savename for the image.

        :param imagefolder: directory the file name is joined onto
        :param save: if a str, it is used as the base name; otherwise
            *title* is used
        :param title: fallback base name when *save* is not a str
        :param ext: file extension, with or without the leading dot
        :returns: path of the image file inside *imagefolder*
        :raises ValueError: if *save* is not a str and *title* is empty
        """
        import os
        import re

        def urlify(s):
            "Turn title into filename"
            s = s.lower()
            # drop everything but word characters, whitespace and hyphens
            s = re.sub(r"[^\w\s-]", '', s)
            s = re.sub(r"\s+", '-', s)
            # strip the names of TeX markup commands left over from the title
            s = re.sub(r"-(textbf|emph|textsc|textit)", '-', s)
            return s

        if not ext.startswith('.'):
            ext = '.' + ext

        if isinstance(save, str):
            basename = save
        elif title:
            basename = title
        else:
            # Without this check the function failed further down with an
            # UnboundLocalError that did not say what was missing.
            raise ValueError('Cannot name the image: pass a string as '
                             '"save" or provide a title.')
        savename = os.path.join(imagefolder, urlify(basename) + ext)

        # remove duplicated ext
        if savename.endswith('%s%s' % (ext, ext)):
            savename = savename.replace('%s%s' % (ext, ext), ext, 1)
        return savename

    def rename_data_with_total(dataframe, was_series = False, using_tex = False, absolutes = True):
        """Append each entry's total (or percentage) to its label.

        :param dataframe: data whose labels are rewritten; when *was_series*
            is False its columns are renamed in place
        :param was_series: if True the entries are the index labels,
            otherwise they are the column labels
        :param using_tex: escape the percent sign for TeX rendering
        :param absolutes: if True append ``(n=<sum>)``; if False append the
            percentage for series data and leave column labels unchanged
        :returns: the relabelled frame; for series data this is a new
            single-column frame named ``Total``
        """
        if was_series:
            where_the_words_are = dataframe.index
            # transpose once so that each entry can be looked up as a column
            by_entry = dataframe.T
        else:
            where_the_words_are = dataframe.columns
            by_entry = dataframe
        the_labs = []
        for w in list(where_the_words_are):
            if absolutes:
                # the label is the same with and without TeX
                the_labs.append('%s (n=%d)' % (w, by_entry[w].sum()))
            elif was_series:
                # positional access: the index of this Series holds the
                # column labels, so a plain [0] would be a label lookup
                perc = by_entry[w].iloc[0]
                if using_tex:
                    the_labs.append('%s (%.2f\\%%)' % (w, perc))
                else:
                    the_labs.append('%s (%.2f %%)' % (w, perc))
            else:
                # a total percentage cannot be derived from percentages
                the_labs.append(w)
        if not was_series:
            dataframe.columns = the_labs
        else:
            vals = list(dataframe[list(dataframe.columns)[0]].values)
            dataframe = pandas.DataFrame(vals, index = the_labs)
            dataframe.columns = ['Total']
        return dataframe

    def auto_explode(dataframe, input, was_series = False, num_to_plot = 7):
        """Turn entry names and/or positions into a pie ``explode`` list.

        Returns a list of *num_to_plot* offsets: 0.1 for every requested
        slice and 0 for the rest. *input* may be one name, one position or
        a list mixing both; any other value leaves all offsets at 0.
        """
        labels = list(dataframe.index) if was_series else list(dataframe.columns)
        offsets = [0] * num_to_plot

        # a lone name or position is handled as a one-element list
        wanted = [input] if type(input) in (str, int) else input
        if type(wanted) == list:
            for item in wanted:
                position = labels.index(item) if type(item) == str else item
                offsets[position] = 0.1
        return offsets

    # check if we're doing subplots
    sbplt = False
    if 'subplots' in kwargs:
        if kwargs['subplots'] is True:
            sbplt = True
    kwargs['subplots'] = sbplt

    if colours is True:
        colours = 'Paired'

    # todo: get this dynamically instead.
    styles = ['dark_background', 'bmh', 'grayscale', 'ggplot', 'fivethirtyeight', 'matplotlib', False, 'mpl-white']
    #if style not in styles:
        #raise ValueError('Style %s not found. Use %s' % (str(style), ', '.join(styles)))

    if style == 'mpl-white':
        try:
            sns.set_style("whitegrid")
        except:
            pass
        style = 'matplotlib'

    if style is not False and style.startswith('seaborn'):
        colours = False

    # use 'draggable = True' to make a draggable legend
    dragmode = kwargs.get('draggable', False)
    kwargs.pop('draggable', None)

    if kwargs.get('savepath'):
        mpl.rcParams['savefig.directory'] = kwargs.get('savepath')
        kwargs.pop('savepath', None)

    mpl.rcParams['savefig.bbox'] = 'tight'
    mpl.rcParams.update({'figure.autolayout': True})

    # try to use tex
    # TO DO:
    # make some font kwargs here
    using_tex = False
    mpl.rcParams['font.family'] = 'sans-serif'
    mpl.rcParams['text.latex.unicode'] = True
    
    if tex == 'try' or tex is True:
        try:
            rc('text', usetex=True)
            rc('font', **{'family': 'serif', 'serif': ['Computer Modern']})
            using_tex = True
        except:
            matplotlib.rc('font', family='sans-serif') 
            matplotlib.rc('font', serif='Helvetica Neue') 
            matplotlib.rc('text', usetex='false') 
            rc('text', usetex=False)
    else:
        rc('text', usetex=False)  

    if interactive:
        using_tex = False 

    if show_totals is False:
        show_totals = 'none'

    # find out what kind of plot we're making, and enable
    # or disable interactive values if need be
    kwargs['kind'] = kind.lower()

    if interactive:
        if kwargs['kind'].startswith('bar'):
            interactive_types = [3]
        elif kwargs['kind'] == 'area':
            interactive_types = [2, 3]
        elif kwargs['kind'] == 'line':
            interactive_types = [2, 3]
        elif kwargs['kind'] == 'pie':
            interactive_types = None
            warnings.warn('Interactive plotting not yet available for pie plots.')
        else:
            interactive_types = [None]
    if interactive is False:
        interactive_types = [None]

    # find out if pie mode, add autopct format
    piemode = False
    if kind == 'pie':
        piemode = True
        # always the best spot for pie
        #if legend_pos == 'best':
            #legend_pos = 'lower left'
        if show_totals.endswith('plot') or show_totals.endswith('both'):
            kwargs['pctdistance'] = 0.6
            if using_tex:
                kwargs['autopct'] = r'%1.1f\%%'
            else:
                kwargs['autopct'] = '%1.1f%%'

    # copy data, make series into df
    dataframe = df.copy()
    was_series = False
    if type(dataframe) == pandas.core.series.Series:
        was_series = True
        if not cumulative:
            dataframe = DataFrame(dataframe)
        else:
            dataframe = DataFrame(dataframe.cumsum())
    else:
        # don't know if this is much good.
        if cumulative:
            dataframe = DataFrame(dataframe.cumsum())
        if len(list(dataframe.columns)) == 1:
            was_series = True
    
    # attempt to convert x axis to ints:
    try:
        dataframe.index = [int(i) for i in list(dataframe.index)]
    except:
        pass

    # remove totals and tkinter order
    if not was_series and not all(x.lower() == 'total' for x in list(dataframe.columns)):
        for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2, [0, 1, 0, 1]):
            try:
                dataframe = dataframe.drop(name, axis = ax, errors = 'ignore')
            except:
                pass
    else:
        dataframe = dataframe.drop('tkintertable-order', errors = 'ignore')
        dataframe = dataframe.drop('tkintertable-order', axis = 1, errors = 'ignore')
            
    # look at columns to see if all can be ints, in which case, set up figure
    # for depnumming
    if not was_series:
        if indices == 'guess':
            def isint(x):
                """Return True if *x* converts to a float with no fractional
                part, and False if it does not or cannot be converted."""
                try:
                    a = float(x)
                    b = int(a)
                except (ValueError, OverflowError, TypeError):
                    # ValueError: non-numeric text or NaN
                    # OverflowError: infinity
                    # TypeError: a label float() cannot take, e.g. None
                    # (`except ValueError or OverflowError` only ever
                    # caught ValueError)
                    return False
                else:
                    return a == b

            if all([isint(x) is True for x in list(dataframe.columns)]):
                indices = True
            else:
                indices = False

        # if depnumming, plot all, transpose, and rename axes
        if indices is True:
            num_to_plot = 'all'
            dataframe = dataframe.T
            if y_label is None:
                y_label = 'Percentage of all matches'
            if x_label is None:
                x_label = ''

    # set backend?
    output_formats = ['svgz', 'ps', 'emf', 'rgba', 'raw', 'pdf', 'svg', 'eps', 'png', 'pgf']
    if output_format not in output_formats:
        raise ValueError('%s output format not recognised. Must be: %s' % (output_format, ', '.join(output_formats)))
    
    # don't know if these are necessary
    if 'pdf' in output_format:
        plt.switch_backend(output_format) 
    if 'pgf' in output_format:
        plt.switch_backend(output_format)

    if num_to_plot == 'all':
        if was_series:
            if not piemode:
                num_to_plot = len(dataframe)
            else:
                num_to_plot = len(dataframe)
        else:
            if not piemode:
                num_to_plot = len(list(dataframe.columns))
            else:
                num_to_plot = len(dataframe.index)

    # explode pie, or remove if not piemode
    if piemode and not sbplt and kwargs.get('explode'):
        kwargs['explode'] = auto_explode(dataframe, 
                                        kwargs['explode'], 
                                        was_series = was_series, 
                                        num_to_plot = num_to_plot)
    else:
        kwargs.pop('explode', None)

    legend = kwargs.get('legend', False)

    #cut data short
    plotting_a_totals_column = False
    if was_series:
        if list(dataframe.columns)[0] != 'Total':
            try:
                can_be_ints = [int(x) for x in list(dataframe.index)]
                num_to_plot = len(dataframe)
            except:
                dataframe = dataframe[:num_to_plot]
        elif list(dataframe.columns)[0] == 'Total':
            plotting_a_totals_column = True
            if not 'legend' in kwargs:
                legend = False
            num_to_plot = len(dataframe)
    else:
        dataframe = dataframe.T.head(num_to_plot).T

    # remove stats fields, put p in entry text, etc.
    statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
    try:
        dataframe = dataframe.drop(statfields, axis = 1, errors = 'ignore')
    except:
        pass    
    try:
        dataframe.ix['p']
        there_are_p_vals = True
    except:
        there_are_p_vals = False
    if show_p_val:
        if there_are_p_vals:
            newnames = []
            for col in list(dataframe.columns):
                pval = dataframe[col]['p']

                def p_string_formatter(val):
                    """Format a p-value for a legend entry: three decimals,
                    or a "less than 0.001" text (TeX-escaped when TeX is in
                    use) for smaller values."""
                    if not val < 0.001:
                        return 'p = %s' % format(val, '.3f')
                    return r'p $<$ 0.001' if using_tex else 'p < 0.001'

                pstr = p_string_formatter(pval)
                newname = '%s (%s)' % (col, pstr)
                newnames.append(newname)
            dataframe.columns = newnames
            dataframe.drop(statfields, axis = 0, inplace = True, errors = 'ignore')
        else:
            warnings.warn('No p-values calculated to show.\n\nUse sort_by and keep_stats in editor() to generate these values.')
    else:
        if there_are_p_vals:
            dataframe.drop(statfields, axis = 0, inplace = True, errors = 'ignore')

    # make and set y label
    absolutes = True
    if type(dataframe) == pandas.core.frame.DataFrame:
        try:
            if not all([s.is_integer() for s in dataframe.iloc[0,:].values]):
                absolutes = False
        except:
            pass
    else:
        if not all([s.is_integer() for s in dataframe.values]):        
            absolutes = False

    #  use colormap if need be:
    if num_to_plot > 0:
        if not was_series:
            if kind in ['pie', 'line', 'area']:
                if colours:
                    if not plotting_a_totals_column:
                        if colours == 'Default':
                            colours = 'Paired'
                        kwargs['colormap'] = colours
        #else:

            if colours:
                if colours == 'Default':
                    colours = 'Paired'
                kwargs['colormap'] = colours

    if piemode:
        if num_to_plot > 0:
            if colours == 'Default':
                colours = 'Paired'
            kwargs['colormap'] = colours
        else:
            if num_to_plot > 0:
                if colours == 'Default':
                    colours = 'Paired'
                kwargs['colormap'] = colours
    
    # multicoloured bar charts
    if colours:
        if kind.startswith('bar'):
            if len(list(dataframe.columns)) == 1:
                if not black_and_white:
                    import numpy as np
                    the_range = np.linspace(0, 1, num_to_plot)
                    cmap = plt.get_cmap(colours)
                    kwargs['colors'] = [cmap(n) for n in the_range]
                # make a bar width ... ? ...
                #kwargs['width'] = (figsize[0] / float(num_to_plot)) / 1.5


    # reversing legend option
    if reverse_legend is True:
        rev_leg = True
    elif reverse_legend is False:
        rev_leg = False

    # show legend or don't, guess whether to reverse based on kind
    if kind in ['bar', 'barh', 'area', 'line', 'pie']:
        if was_series:
            legend = False
        if kind == 'pie':
            if pie_legend:
                legend = True
            else:
                legend = False
    if kind in ['barh', 'area']:
        if reverse_legend == 'guess':
            rev_leg = True
    if not 'rev_leg' in locals():
        rev_leg = False

    # the default legend placement
    if legend_pos is True:
        legend_pos = 'best'

    # cut dataframe if just_totals
    try:
        tst = dataframe['Combined total']
        dataframe = dataframe.head(num_to_plot)
    except:
        pass
    
    # rotate automatically
    if 'rot' not in kwargs:
        if not was_series:
            xvals = [str(i) for i in list(dataframe.index)[:num_to_plot]]
            #if 'kind' in kwargs:
                #if kwargs['kind'] in ['barh', 'area']:
                    #xvals = [str(i) for i in list(dataframe.columns)[:num_to_plot]]
        else:
            xvals = [str(i) for i in list(dataframe.columns)[:num_to_plot]]
        if len(max(xvals, key=len)) > 6:
            if not piemode:
                kwargs['rot'] = 45

    # no title for subplots because ugly,
    if title and not sbplt:
        kwargs['title'] = title
        
    # no interactive subplots yet:
    if sbplt and interactive:
        import warnings
        interactive = False
        warnings.warn('No interactive subplots yet, sorry.')
        return
        
    # not using pandas for labels or legend anymore.
    #kwargs['labels'] = None
    #kwargs['legend'] = False

    if legend:
        if num_to_plot > 6:
            if not kwargs.get('ncol'):
                kwargs['ncol'] = num_to_plot / 7
        # kwarg options go in leg_options
        leg_options = {'framealpha': .8,
                       'shadow': kwargs.get('shadow', False),
                       'ncol': kwargs.pop('ncol', 1)}    

        # determine legend position based on this dict
        if legend_pos:
            possible = {'best': 0, 'upper right': 1, 'upper left': 2, 'lower left': 3, 'lower right': 4, 
                        'right': 5, 'center left': 6, 'center right': 7, 'lower center': 8, 'upper center': 9, 
                        'center': 10, 'o r': 2, 'outside right': 2, 'outside upper right': 2, 
                        'outside center right': 'center left', 'outside lower right': 'lower left'}

            if type(legend_pos) == int:
                the_loc = legend_pos
            elif type(legend_pos) == str:
                try:
                    the_loc = possible[legend_pos]
                except KeyError:
                    raise KeyError('legend_pos value must be one of:\n%s\n or an int between 0-10.' %', '.join(list(possible.keys())))
            leg_options['loc'] = the_loc
            #weirdness needed for outside plot
            if legend_pos in ['o r', 'outside right', 'outside upper right']:
                leg_options['bbox_to_anchor'] = (1.02, 1)
            if legend_pos == 'outside center right':
                leg_options['bbox_to_anchor'] = (1.02, 0.5)
            if legend_pos == 'outside lower right':
                leg_options['loc'] == 'upper right'
                leg_options['bbox_to_anchor'] = (0.5, 0.5)
        
        # a bit of distance between legend and plot for outside legends
        if type(legend_pos) == str:
            if legend_pos.startswith('o'):
                leg_options['borderaxespad'] = 1

    if not piemode:
        if show_totals.endswith('both') or show_totals.endswith('legend'):
            dataframe = rename_data_with_total(dataframe, 
                                           was_series = was_series, 
                                           using_tex = using_tex, 
                                           absolutes = absolutes)
    else:
        if pie_legend:
            if show_totals.endswith('both') or show_totals.endswith('legend'):
                dataframe = rename_data_with_total(dataframe, 
                                           was_series = was_series, 
                                           using_tex = using_tex, 
                                           absolutes = absolutes)

    if piemode:
        if partial_pie:
            dataframe = dataframe / 100.0

    # some pie things
    if piemode:
        if not sbplt:
            kwargs['y'] = list(dataframe.columns)[0]
            if pie_legend:
                kwargs['legend'] = False
                if was_series:
                    leg_options['labels'] = list(dataframe.index)
                else:
                    leg_options['labels'] = list(dataframe.columns)
        else:
            if pie_legend:
                kwargs['legend'] = False
                if was_series:
                    leg_options['labels'] = list(dataframe.index)
                else:
                    leg_options['labels'] = list(dataframe.index)   
    
    def filler(df):
        """Express every row of df as percentages of that row's total.

        The work is done on a transposed copy, so the frame passed in is
        not modified; the result has the same orientation as df.
        """
        by_row = df.T.copy()
        for label in by_row.columns:
            row_total = by_row[label].sum()
            by_row[label] = by_row[label] * 100.0 / row_total
        return by_row.T

    areamode = False
    if kind == 'area':
        areamode = True

    if legend is False:
        kwargs['legend'] = False

    # line highlighting option for interactive!
    if interactive:
        if 2 in interactive_types:
            if kind == 'line':
                kwargs['marker'] = ','
        if not piemode:
            kwargs['alpha'] = 0.1
    
    # convert dates --- works only in my current case!
    if plotting_a_totals_column or not was_series:
        try:
            can_it_be_int = int(list(dataframe.index)[0])
            can_be_int = True
        except:
            can_be_int = False
        if can_be_int:
            if 1500 < int(list(dataframe.index)[0]):
                if 2050 > int(list(dataframe.index)[0]):
                    n = pandas.PeriodIndex([d for d in list(dataframe.index)], freq='A')
                    dataframe = dataframe.set_index(n)

        if kwargs.get('filled'):
            if areamode or kind.startswith('bar'):
                dataframe = filler(dataframe)
            kwargs.pop('filled', None)

    MARKERSIZE = 4
    COLORMAP = {
            0: {'marker': None, 'dash': (None,None)},
            1: {'marker': None, 'dash': [5,5]},
            2: {'marker': "o", 'dash': (None,None)},
            3: {'marker': None, 'dash': [1,3]},
            4: {'marker': "s", 'dash': [5,2,5,2,5,10]},
            5: {'marker': None, 'dash': [5,3,1,2,1,10]},
            6: {'marker': 'o', 'dash': (None,None)},
            7: {'marker': None, 'dash': [5,3,1,3]},
            8: {'marker': "1", 'dash': [1,3]},
            9: {'marker': "*", 'dash': [5,5]},
            10: {'marker': "2", 'dash': [5,2,5,2,5,10]},
            11: {'marker': "s", 'dash': (None,None)}
            }

    HATCHES = {
            0:  {'color': '#dfdfdf', 'hatch':"/"},
            1:  {'color': '#6f6f6f', 'hatch':"\\"},
            2:  {'color': 'b', 'hatch':"|"},
            3:  {'color': '#dfdfdf', 'hatch':"-"},
            4:  {'color': '#6f6f6f', 'hatch':"+"},
            5:  {'color': 'b', 'hatch':"x"}
            }

    if black_and_white:
        if kind == 'line':
            kwargs['linewidth'] = 1

        cmap = plt.get_cmap('Greys')
        new_cmap = truncate_colormap(cmap, 0.25, 0.95)
        if kind == 'bar':
            # darker if just one entry
            if len(dataframe.columns) == 1:
                new_cmap = truncate_colormap(cmap, 0.70, 0.90)
        kwargs['colormap'] = new_cmap

    class dummy_context_mgr():
        """No-op context manager, used to plot without a style context.

        Entering yields None and exiting never suppresses an exception, so
        the body of the ``with`` runs as if no context were there at all.
        Possibly made obsolete by the 'classic' style in newer matplotlib.
        """

        def __enter__(self):
            # nothing to set up and nothing useful to bind with ``as``
            return None

        def __exit__(self, one, two, three):
            # a false return lets any exception raised in the body propagate
            return False

    with plt.style.context((style)) if style != 'matplotlib' else dummy_context_mgr():

        if not sbplt:
            # check if negative values, no stacked if so
            if areamode:
                kwargs['legend'] = False
                if dataframe.applymap(lambda x: x < 0.0).any().any():
                    kwargs['stacked'] = False
                    rev_leg = False
            ax = dataframe.plot(figsize = figsize, **kwargs)
            if areamode:
                handles, labels = plt.gca().get_legend_handles_labels()
                del handles
                del labels
        else:
            plt.gcf().set_tight_layout(False)
            if not piemode:
                ax = dataframe.plot(figsize = figsize, **kwargs)
            else:
                ax = dataframe.plot(figsize = figsize, **kwargs)
                handles, labels = plt.gca().get_legend_handles_labels()
                plt.legend( handles, labels, loc = leg_options['loc'], bbox_to_anchor = (0,-0.1,1,1),
                bbox_transform = plt.gcf().transFigure )

                # this line allows layouts with missing plots
                # i.e. layout = (5, 2) with only nine plots
                plt.gcf().set_tight_layout(False)
                
        if 'rot' in kwargs:
            if kwargs['rot'] != 0 and kwargs['rot'] != 90:
                labels = [item.get_text() for item in ax.get_xticklabels()]
                ax.set_xticklabels(labels, rotation = kwargs['rot'], ha='right')

        if transparent:
            plt.gcf().patch.set_facecolor('white')
            plt.gcf().patch.set_alpha(0)

        if black_and_white:
            if kind == 'line':
                # white background
                # change everything to black and white with interesting dashes and markers
                c = 0
                for line in ax.get_lines():
                    line.set_color('black')
                    #line.set_width(1)
                    line.set_dashes(COLORMAP[c]['dash'])
                    line.set_marker(COLORMAP[c]['marker'])
                    line.set_markersize(MARKERSIZE)
                    c += 1
                    if c == len(list(COLORMAP.keys())):
                        c = 0

        # draw legend with proper placement etc
        if legend:
            if not piemode and not sbplt:
                if 3 not in interactive_types:
                    handles, labels = plt.gca().get_legend_handles_labels()
                    # area doubles the handles and labels. this removes half:
                    if areamode:
                        handles = handles[-len(handles) / 2:]
                        labels = labels[-len(labels) / 2:]
                    if rev_leg:
                        handles = handles[::-1]
                        labels = labels[::-1]
                    lgd = plt.legend(handles, labels, **leg_options)

    if interactive:
        # 1 = highlight lines
        # 2 = line labels
        # 3 = legend switches
        ax = plt.gca()
        # fails for piemode
        lines = ax.lines
        handles, labels = plt.gca().get_legend_handles_labels()
        if 1 in interactive_types:
            plugins.connect(plt.gcf(), HighlightLines(lines))

        if 3 in interactive_types:
            plugins.connect(plt.gcf(), InteractiveLegendPlugin(lines, labels, alpha_unsel=0.0))

        for i, l in enumerate(lines):
            y_vals = l.get_ydata()
            x_vals = l.get_xdata()
            x_vals = [str(x) for x in x_vals]
            if absolutes:
                ls = ['%s (%s: %d)' % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals)]
            else:
                ls = ['%s (%s: %.2f%%)' % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals)]
            if 2 in interactive_types:
                #if 'kind' in kwargs and kwargs['kind'] == 'area':
                tooltip_line = mpld3.plugins.LineLabelTooltip(lines[i], labels[i])
                mpld3.plugins.connect(plt.gcf(), tooltip_line)
                #else:
                if kind == 'line':
                    tooltip_point = mpld3.plugins.PointLabelTooltip(l, labels = ls)
                    mpld3.plugins.connect(plt.gcf(), tooltip_point)
        
    if piemode:
        if not sbplt:
            plt.axis('equal')
            ax.get_xaxis().set_visible(False)
            ax.get_yaxis().set_visible(False)

    # add x label
    # this could be revised now!
    # if time series period, it's year for now
    if type(dataframe.index) == pandas.tseries.period.PeriodIndex:
        x_label = 'Year'

    if x_label is not False:
        if type(x_label) == str:
            plt.xlabel(x_label)
        else:
            check_x_axis = list(dataframe.index)[0] # get first entry# get second entry of first entry (year, count)
            try:
                if type(dataframe.index) == pandas.tseries.period.PeriodIndex:
                    x_label = 'Year'
                check_x_axis = int(check_x_axis)
                if 1500 < check_x_axis < 2050:
                    x_label = 'Year'
                else:
                    x_label = 'Group'
            except:
                x_label = 'Group'

        if not sbplt:
            if not piemode:
                plt.xlabel(x_label)

    def is_number(s):
        """Return True if s can be made into a float or a complex number.

        s may be a string or an already-numeric object. Anything that
        neither float() nor complex() accepts --- unparseable text as well
        as objects of an unsupported type, such as None --- gives False
        instead of raising.
        """
        for convert in (float, complex):
            try:
                convert(s)
            except (TypeError, ValueError):
                # float() rejects complex values with TypeError, not
                # ValueError, so both must fall through to complex()
                continue
            return True
        return False

    # for now, always turn off sci notation
    from matplotlib.ticker import ScalarFormatter
    if type(dataframe.index) != pandas.tseries.period.PeriodIndex:
        try:
            if all(is_number(s) for s in list(dataframe.index)):
                plt.gca().xaxis.set_major_formatter(ScalarFormatter()) 
        except:
            pass
    try:
        if all(is_number(s) for s in list(dataframe.columns)):
            plt.gca().yaxis.set_major_formatter(ScalarFormatter()) 
    except:
        pass

    # y labelling
    y_l = False
    if not absolutes:
        y_l = 'Percentage'
    else:
        y_l = 'Absolute frequency'
    
    def suplabel(axis,label,label_prop=None,
                 labelpad=5,
                 ha='center',va='center'):
        ''' Put a single x or y label on the whole current figure
        The figure-level counterpart of an axis label, in the way that
        suptitle is for titles
        axis       - string: "x" or "y", in either case
        label      - string
        label_prop - keyword dictionary for Text
        labelpad   - padding from the axis (default: 5)
        ha         - horizontal alignment (default: "center")
        va         - vertical alignment (default: "center")
        Raises Exception when axis is neither "x" nor "y"
        '''
        fig = plt.gcf()
        # the label is offset from the outermost edge of all the subplots
        boxes = [subplot.get_position() for subplot in fig.axes]
        left = min([box.xmin for box in boxes])
        bottom = min([box.ymin for box in boxes])
        dpi = fig.dpi
        which = axis.lower()
        if which == "y":
            # vertical text, to the left of the leftmost subplot
            rotation, x, y = 90., left - float(labelpad)/dpi, 0.5
        elif which == 'x':
            # horizontal text, below the lowest subplot
            rotation, x, y = 0., 0.5, bottom - float(labelpad)/dpi
        else:
            raise Exception("Unexpected axis: x or y")
        text_kwargs = dict() if label_prop is None else label_prop
        fig.text(x, y, label,
                 rotation=rotation,
                 transform=fig.transFigure,
                 ha=ha, va=va,
                 **text_kwargs)

    if y_label is not False:
        if not sbplt:
            if not piemode:
                if type(y_label) == str:
                    plt.ylabel(y_label)
                else:
                    plt.ylabel(y_l)
        else:
            if type(y_label) == str:
                the_y = y_label
            else:
                the_y = y_l
            #suplabel('y', the_y, labelpad = 1.5)
            plt.gcf().text(0.04, 0.5, the_y, va='center', rotation='vertical')
            #plt.subplots_adjust(left=0.5)
        
        #    if not piemode:
        #        if type(y_label) == str:
        #            plt.ylabel(y_label)
        #        else:
        #            plt.ylabel(y_l)


    # hacky: turn legend into subplot titles :)
    if sbplt:
        # title the big plot
        #plt.gca().suptitle(title, fontsize = 16)
        #plt.subplots_adjust(top=0.9)
        # get all axes
        if 'layout' not in kwargs:
            axes = [l for index, l in enumerate(ax)]
        else:
            axes = []
            cols = [l for index, l in enumerate(ax)]
            for col in cols:
                for bit in col:
                    axes.append(bit)
    
        # set subplot titles
        for index, a in enumerate(axes):
            try:
                titletext = list(dataframe.columns)[index]
            except:
                pass
            a.set_title(titletext)
            try:
                a.legend_.remove()
            except:
                pass
            # remove axis labels for pie plots
            if piemode:
                a.axes.get_xaxis().set_visible(False)
                a.axes.get_yaxis().set_visible(False)
                a.axis('equal')

            # show grid
            a.grid(b=kwargs.get('grid', False))
            kwargs.pop('grid', None)
    
    # add sums to bar graphs and pie graphs
    # doubled right now, no matter

    if not sbplt:
        if kind.startswith('bar'):
            width = ax.containers[0][0].get_width()

        # show grid
        ax.grid(b=kwargs.get('grid', False))
        kwargs.pop('grid', None)

    if was_series:
        the_y_limit = plt.ylim()[1]
        if show_totals.endswith('plot') or show_totals.endswith('both'):
            # make plot a bit higher if putting these totals on it
            plt.ylim([0,the_y_limit * 1.05])
            for i, label in enumerate(list(dataframe.index)):
                if len(dataframe.ix[label]) == 1:
                    score = dataframe.ix[label][0]
                else:
                    if absolutes:
                        score = dataframe.ix[label].sum()
                    else:
                        #import warnings
                        #warnings.warn("It's not possible to determine total percentage from individual percentages.")
                        continue
                if not absolutes:
                    plt.annotate('%.2f' % score, (i, score), ha = 'center', va = 'bottom')
                else:
                    plt.annotate(score, (i, score), ha = 'center', va = 'bottom')
    else:
        the_y_limit = plt.ylim()[1]
        if show_totals.endswith('plot') or show_totals.endswith('both'):
            for i, label in enumerate(list(dataframe.columns)):
                if len(dataframe[label]) == 1:
                    score = dataframe[label][0]
                else:
                    if absolutes:
                        score = dataframe[label].sum()
                    else:
                        #import warnings
                        #warnings.warn("It's not possible to determine total percentage from individual percentages.")
                        continue
                if not absolutes:
                    plt.annotate('%.2f' % score, (i, score), ha = 'center', va = 'bottom')
                else:
                    plt.annotate(score, (i, score), ha = 'center', va = 'bottom')        

    plt.subplots_adjust(left=0.1)
    plt.subplots_adjust(bottom=0.18)

    if 'layout' not in kwargs:
        if not sbplt:
            plt.tight_layout()

    if save:
        import os
        if running_python_tex:
            imagefolder = '../images'
        else:
            imagefolder = 'images'

        savename = get_savename(imagefolder, save = save, title = title, ext = output_format)

        if not os.path.isdir(imagefolder):
            os.makedirs(imagefolder)

        # save image and get on with our lives
        if legend_pos.startswith('o'):
            plt.gcf().savefig(savename, dpi=150, bbox_extra_artists=(lgd,), bbox_inches='tight', format = output_format)
        else:
            plt.gcf().savefig(savename, dpi=150, format = output_format)
        time = strftime("%H:%M:%S", localtime())
        if os.path.isfile(savename):
            print('\n' + time + ": " + savename + " created.")
        else:
            raise ValueError("Error making %s." % savename)

    if dragmode:
        plt.legend().draggable()


    if sbplt:
        plt.subplots_adjust(right=.8)
        plt.subplots_adjust(left=.1)

    if not interactive and not running_python_tex and not running_spider \
        and not tk:
        plt.gcf().show()
        return
    elif running_spider or tk:
        return plt

    if interactive:
        plt.subplots_adjust(right=.8)
        plt.subplots_adjust(left=.1)
        try:
            ax.legend_.remove()
        except:
            pass
        return mpld3.display()
Exemple #43
0
def plotter(title,
            df,
            x_label = None,
            y_label = None,
            style = 'ggplot',
            figsize = (8, 4),
            save = False,
            legend_pos = 'best',
            reverse_legend = 'guess',
            num_to_plot = 7,
            tex = 'try',
            colours = 'Paired',
            cumulative = False,
            pie_legend = True,
            partial_pie = False,
            show_totals = False,
            transparent = False,
            output_format = 'png',
            interactive = False,
            black_and_white = False,
            show_p_val = False,
            indices = 'guess',
            **kwargs):
    """plot interrogator() or editor() output.

    **kwargs are for pandas first, which can then send them through to matplotlib.plot():

    http://pandas.pydata.org/pandas-docs/dev/generated/pandas.DataFrame.plot.html
    http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.plot

    pie_legend: False to label slices rather than give legend
    show_totals: where to show percent/abs frequencies: False, 'plot', 'legend', or 'both'

    """

    import corpkit
    import os
    import matplotlib as mpl
    if interactive:
        import matplotlib.pyplot as plt, mpld3
    else:
        import matplotlib.pyplot as plt
    from matplotlib import rc
    import pandas
    import pandas as pd
    from pandas import DataFrame

    import numpy
    from time import localtime, strftime
    from corpkit.tests import check_pytex, check_spider, check_t_kinter

    if interactive:
        import mpld3
        import collections
        from mpld3 import plugins, utils
        from plugins import InteractiveLegendPlugin, HighlightLines

    tk = check_t_kinter()

    running_python_tex = check_pytex()
    # incorrect spelling of spider on purpose
    running_spider = check_spider()

    def truncate_colormap(cmap, minval=0.0, maxval=1.0, n=100):
        """Build a colormap from the minval-maxval slice of cmap.

        cmap is sampled at n evenly spaced points between minval and maxval,
        which cuts off its extreme values --- no pure white.
        """
        from matplotlib.colors import LinearSegmentedColormap
        import numpy as np
        label = 'trunc({n},{a:.2f},{b:.2f})'.format(n=cmap.name, a=minval, b=maxval)
        samples = cmap(np.linspace(minval, maxval, n))
        return LinearSegmentedColormap.from_list(label, samples)

    def get_savename(imagefolder, save = False, title = False, ext = 'png'):
        """Come up with the savename for the image.

        imagefolder: directory the image will be saved in
        save:        a string to base the filename on; any other value means
                     the filename is derived from title instead
        title:       plot title, used when save is not a string
        ext:         file extension, with or without its leading dot

        Returns imagefolder joined with the slugified name plus ext.
        Raises ValueError when save is not a string and title is empty,
        because there is then nothing to name the file after.
        """
        import os

        def urlify(s):
            "Turn title into filename"
            import re
            s = s.lower()
            s = re.sub(r"[^\w\s-]", '', s)
            s = re.sub(r"\s+", '-', s)
            s = re.sub(r"-(textbf|emph|textsc|textit)", '-', s)
            return s

        if not ext.startswith('.'):
            ext = '.' + ext

        if isinstance(save, str):
            stem = save
        elif title:
            stem = title
        else:
            raise ValueError('Cannot make a savename: save must be a string, or title must be given.')

        # urlify() strips dots, so an extension already typed into the name
        # has to come off first; otherwise 'plot.png' turns into 'plotpng.png'
        if len(stem) > len(ext) and stem.lower().endswith(ext.lower()):
            stem = stem[:-len(ext)]

        return os.path.join(imagefolder, urlify(stem) + ext)

    def rename_data_with_total(dataframe, was_series = False, using_tex = False, absolutes = True):
        """Append each entry's total to its name, e.g. for legend labels.

        dataframe:  the data being plotted
        was_series: True if the entries are the rows (index) of a one-column
                    frame, False if they are the columns
        using_tex:  escape the percent sign with a backslash for LaTeX
        absolutes:  True to label with summed counts, 'name (n=N)'; False to
                    label with the percentage, 'name (P %)'

        With percentages in a multi-column frame the names are kept as they
        are: no label is computed for that case.

        Returns the relabelled frame. For a series this is a new one-column
        frame named 'Total'; otherwise the columns of the frame passed in are
        renamed in place and that same frame is returned.
        """
        if was_series:
            entries = list(dataframe.index)
            # entries are rows, so look them up through the transpose
            by_entry = dataframe.T
        else:
            entries = list(dataframe.columns)
            by_entry = dataframe

        if using_tex:
            perc_template = '%s (%.2f\\%%)'
        else:
            perc_template = '%s (%.2f %%)'

        the_labs = []
        for w in entries:
            if absolutes:
                the_labs.append('%s (n=%d)' % (w, by_entry[w].sum()))
            elif was_series:
                # positional: the single column may carry any label, so a
                # plain [0] would be taken as a (missing) label
                perc = by_entry[w].iloc[0]
                the_labs.append(perc_template % (w, perc))
            else:
                the_labs.append(w)

        if not was_series:
            dataframe.columns = the_labs
            return dataframe

        vals = list(dataframe.iloc[:, 0].values)
        renamed = pd.DataFrame(vals, index = the_labs)
        renamed.columns = ['Total']
        return renamed

    def auto_explode(dataframe, input, was_series = False, num_to_plot = 7):
        """Build the explode list for a pie chart.

        input names the slices to pull out: a single label or position, or a
        list of them. Labels are looked up in the index when was_series is
        true and in the columns otherwise. Returns a list of num_to_plot
        offsets: 0.1 for each selected slice, 0 for the rest.
        """
        offsets = [0] * num_to_plot
        names = list(dataframe.index if was_series else dataframe.columns)

        # a lone label or position is treated as a one-item list
        if type(input) in (str, int):
            input = [input]
        if type(input) != list:
            return offsets

        for wanted in input:
            position = names.index(wanted) if type(wanted) == str else wanted
            offsets[position] = 0.1
        return offsets

    # are we doing subplots?
    sbplt = False
    if 'subplots' in kwargs:
        if kwargs['subplots'] is True:
            sbplt = True

    if colours is True:
        colours = 'Paired'

    styles = ['dark_background', 'bmh', 'grayscale', 'ggplot', 'fivethirtyeight']
    if style not in styles:
        raise ValueError('Style %s not found. Use %s' % (style, ', '.join(styles)))

    if 'savepath' in kwargs.keys():
        mpl.rcParams['savefig.directory'] = kwargs['savepath']
        del kwargs['savepath']

    mpl.rcParams['savefig.bbox'] = 'tight'

    # try to use tex
    # TO DO:
    # make some font kwargs here
    using_tex = False
    mpl.rcParams['font.family'] = 'sans-serif'
    mpl.rcParams['text.latex.unicode'] = True
    
    if tex == 'try' or tex is True:
        try:
            rc('text', usetex=True)
            rc('font', **{'family': 'serif', 'serif': ['Computer Modern']})
            using_tex = True
        except:
            matplotlib.rc('font', family='sans-serif') 
            matplotlib.rc('font', serif='Helvetica Neue') 
            matplotlib.rc('text', usetex='false') 
            rc('text', usetex=False)
    else:
        rc('text', usetex=False)  

    if interactive:
        using_tex = False 

    if show_totals is False:
        show_totals = 'none'

    # find out what kind of plot we're making, and enable
    # or disable interactive values if need be
    if 'kind' not in kwargs:
        kwargs['kind'] = 'line'

    if interactive:
        if kwargs['kind'].startswith('bar'):
            interactive_types = [3]
        elif kwargs['kind'] == 'area':
            interactive_types = [2, 3]
        elif kwargs['kind'] == 'line':
            interactive_types = [2, 3]
        elif kwargs['kind'] == 'pie':
            interactive_types = None
            warnings.warn('Interactive plotting not yet available for pie plots.')
        else:
            interactive_types = [None]
    if interactive is False:
        interactive_types = [None]

    # find out if pie mode, add autopct format
    piemode = False
    if 'kind' in kwargs:
        if kwargs['kind'] == 'pie':
            piemode = True
            # always the best spot for pie
            #if legend_pos == 'best':
                #legend_pos = 'lower left'
            if show_totals.endswith('plot') or show_totals.endswith('both'):
                kwargs['pctdistance'] = 0.6
                if using_tex:
                    kwargs['autopct'] = r'%1.1f\%%'
                else:
                    kwargs['autopct'] = '%1.1f%%'

    #if piemode:
        #if partial_pie:
            #kwargs['startangle'] = 180

    kwargs['subplots'] = sbplt

    # copy data, make series into df
    dataframe = df.copy()
    was_series = False
    if type(dataframe) == pandas.core.series.Series:
        was_series = True
        if not cumulative:
            dataframe = DataFrame(dataframe)
        else:
            dataframe = DataFrame(dataframe.cumsum())
    else:
        # don't know if this is much good.
        if cumulative:
            dataframe = DataFrame(dataframe.cumsum())
        if len(list(dataframe.columns)) == 1:
            was_series = True
    
    # attempt to convert x axis to ints:
    try:
        dataframe.index = [int(i) for i in list(dataframe.index)]
    except:
        pass

    # remove totals and tkinter order
    if not was_series:
        for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2, [0, 1, 0, 1]):
            dataframe = dataframe.drop(name, axis = ax, errors = 'ignore')
    else:
        dataframe = dataframe.drop('tkintertable-order', errors = 'ignore')
        dataframe = dataframe.drop('tkintertable-order', axis = 1, errors = 'ignore')
            
    # look at columns to see if all can be ints, in which case, set up figure
    # for depnumming
    if not was_series:
        if indices == 'guess':
            def isint(x):
                """Return True if x converts to a float with no fractional part.

                Anything float() or int() rejects --- unparseable text, NaN,
                infinity, unsupported types such as None --- counts as not
                an integer and gives False.
                """
                try:
                    a = float(x)
                    b = int(a)
                except (TypeError, ValueError, OverflowError):
                    # the types must be a tuple: `except A or B` evaluates
                    # to `except A` and lets B (e.g. int(inf)) escape
                    return False
                else:
                    return a == b

            if all([isint(x) is True for x in list(dataframe.columns)]):
                indices = True
            else:
                indices = False

        # if depnumming, plot all, transpose, and rename axes
        if indices is True:
            num_to_plot = 'all'
            dataframe = dataframe.T
            if y_label is None:
                y_label = 'Percentage of all matches'
            if x_label is None:
                x_label = ''

    # set backend?
    output_formats = ['svgz', 'ps', 'emf', 'rgba', 'raw', 'pdf', 'svg', 'eps', 'png', 'pgf']
    if output_format not in output_formats:
        raise ValueError('%s output format not recognised. Must be: %s' % (output_format, ', '.join(output_formats)))
    
    # don't know if these are necessary
    if 'pdf' in output_format:
        plt.switch_backend(output_format) 
    if 'pgf' in output_format:
        plt.switch_backend(output_format)

    if num_to_plot == 'all':
        if was_series:
            if not piemode:
                num_to_plot = len(dataframe)
            else:
                num_to_plot = len(dataframe)
        else:
            if not piemode:
                num_to_plot = len(list(dataframe.columns))
            else:
                num_to_plot = len(dataframe.index)

    # explode pie, or remove if not piemode
    if 'explode' in kwargs:
        if not piemode:
            del kwargs['explode']
    if piemode:
        if 'explode' in kwargs:
            if not sbplt:
                kwargs['explode'] = auto_explode(dataframe, 
                                             kwargs['explode'], 
                                             was_series = was_series, 
                                             num_to_plot = num_to_plot)

    if 'legend' in kwargs:
        legend = kwargs['legend']
    else:
        legend = True

    #cut data short
    plotting_a_totals_column = False
    if was_series:
        if list(dataframe.columns)[0] != 'Total':
            try:
                can_be_ints = [int(x) for x in list(dataframe.index)]
                num_to_plot = len(dataframe)
            except:
                dataframe = dataframe[:num_to_plot]
        elif list(dataframe.columns)[0] == 'Total':
            plotting_a_totals_column = True
            if not 'legend' in kwargs:
                legend = False
            num_to_plot = len(dataframe)
    else:
        dataframe = dataframe.T.head(num_to_plot).T

    # remove stats fields, put p in entry text, etc.
    statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
    try:
        dataframe = dataframe.drop(statfields, axis = 1)
    except:
        pass    
    try:
        dataframe.ix['p']
        there_are_p_vals = True
    except:
        there_are_p_vals = False
    if show_p_val:
        if there_are_p_vals:
            newnames = []
            for col in list(dataframe.columns):
                pval = dataframe[col]['p']
                newname = '%s (p=%s)' % (col, format(pval, '.5f'))
                newnames.append(newname)
            dataframe.columns = newnames
            dataframe.drop(statfields, axis = 0, inplace = True)
        else:
            warnings.warn('No p-values calculated to show.\n\nUse sort_by and keep_stats in editor() to generate these values.')
    else:
        if there_are_p_vals:
            dataframe.drop(statfields, axis = 0, inplace = True)

    # make and set y label
    absolutes = True
    if type(dataframe) == pandas.core.frame.DataFrame:
        try:
            if not all([s.is_integer() for s in dataframe.iloc[0,:].values]):
                absolutes = False
        except:
            pass
    else:
        if not all([s.is_integer() for s in dataframe.values]):        
            absolutes = False

    #  use colormap if need be:
    if num_to_plot > 0:
        if not was_series:
            if 'kind' in kwargs:
                if kwargs['kind'] in ['pie', 'line', 'area']:
                    if colours:
                        if not plotting_a_totals_column:
                            if colours == 'Default':
                                colours = 'Paired'
                            kwargs['colormap'] = colours
        #else:
            if colours:
                if colours == 'Default':
                    colours = 'Paired'
                kwargs['colormap'] = colours

    if piemode:
        if num_to_plot > 0:
            if colours == 'Default':
                colours = 'Paired'
            kwargs['colormap'] = colours
        else:
            if num_to_plot > 0:
                if colours == 'Default':
                    colours = 'Paired'
                kwargs['colormap'] = colours
        #else:
            #if len(dataframe.T.columns) < 8:
                #try:
                    #del kwargs['colormap']
                #except:
                    #pass
    
    # multicoloured bar charts
    if 'kind' in kwargs:
        if colours:
            if kwargs['kind'].startswith('bar'):
                if len(list(dataframe.columns)) == 1:
                    if not black_and_white:
                        import numpy as np
                        the_range = np.linspace(0, 1, num_to_plot)
                        cmap = plt.get_cmap(colours)
                        kwargs['colors'] = [cmap(n) for n in the_range]
                    # make a bar width ... ?
                    #kwargs['width'] = (figsize[0] / float(num_to_plot)) / 1.5


    # reversing legend option
    if reverse_legend is True:
        rev_leg = True
    elif reverse_legend is False:
        rev_leg = False

    # show legend or don't, guess whether to reverse based on kind
    if 'kind' in kwargs:
        if kwargs['kind'] in ['bar', 'barh', 'area', 'line', 'pie']:
            if was_series:
                legend = False
            if kwargs['kind'] == 'pie':
                if pie_legend:
                    legend = True
                else:
                    legend = False
        if kwargs['kind'] in ['barh', 'area']:
            if reverse_legend == 'guess':
                rev_leg = True
    if not 'rev_leg' in locals():
        rev_leg = False

    # the default legend placement
    if legend_pos is True:
        legend_pos = 'best'

    # cut dataframe if just_totals
    try:
        tst = dataframe['Combined total']
        dataframe = dataframe.head(num_to_plot)
    except:
        pass
    
    # rotate automatically
    if 'rot' not in kwargs:
        if not was_series:
            xvals = [str(i) for i in list(dataframe.index)[:num_to_plot]]
            #if 'kind' in kwargs:
                #if kwargs['kind'] in ['barh', 'area']:
                    #xvals = [str(i) for i in list(dataframe.columns)[:num_to_plot]]
        else:
            xvals = [str(i) for i in list(dataframe.columns)[:num_to_plot]]
        if len(max(xvals, key=len)) > 6:
            if not piemode:
                kwargs['rot'] = 45

    # no title for subplots because ugly,
    if sbplt:
        if 'title' in kwargs:
            del kwargs['title'] 
    else:
        kwargs['title'] = title
        
    # no interactive subplots yet:


    if sbplt and interactive:
        import warnings
        interactive = False
        warnings.warn('No interactive subplots yet, sorry.')
        return
        
    # not using pandas for labels or legend anymore.
    #kwargs['labels'] = None
    #kwargs['legend'] = False

    if legend:
        # kwarg options go in leg_options
        leg_options = {'framealpha': .8}
        if 'shadow' in kwargs:
            leg_options['shadow'] = True
        if 'ncol' in kwargs:
            leg_options['ncol'] = kwargs['ncol']
            del kwargs['ncol']
        else:
            if num_to_plot > 6:
                leg_options['ncol'] = num_to_plot / 7

        # determine legend position based on this dict
        if legend_pos:
            possible = {'best': 0, 'upper right': 1, 'upper left': 2, 'lower left': 3, 'lower right': 4, 
                        'right': 5, 'center left': 6, 'center right': 7, 'lower center': 8, 'upper center': 9, 
                        'center': 10, 'o r': 2, 'outside right': 2, 'outside upper right': 2, 
                        'outside center right': 'center left', 'outside lower right': 'lower left'}

            if type(legend_pos) == int:
                the_loc = legend_pos
            elif type(legend_pos) == str:
                try:
                    the_loc = possible[legend_pos]
                except KeyError:
                    raise KeyError('legend_pos value must be one of:\n%s\n or an int between 0-10.' %', '.join(possible.keys()))
            leg_options['loc'] = the_loc
            #weirdness needed for outside plot
            if legend_pos in ['o r', 'outside right', 'outside upper right']:
                leg_options['bbox_to_anchor'] = (1.02, 1)
            if legend_pos == 'outside center right':
                leg_options['bbox_to_anchor'] = (1.02, 0.5)
            if legend_pos == 'outside lower right':
                leg_options['loc'] == 'upper right'
                leg_options['bbox_to_anchor'] = (0.5, 0.5)
        
        # a bit of distance between legend and plot for outside legends
        if type(legend_pos) == str:
            if legend_pos.startswith('o'):
                leg_options['borderaxespad'] = 1

    if not piemode:
        if show_totals.endswith('both') or show_totals.endswith('legend'):
            dataframe = rename_data_with_total(dataframe, 
                                           was_series = was_series, 
                                           using_tex = using_tex, 
                                           absolutes = absolutes)
    else:
        if pie_legend:
            if show_totals.endswith('both') or show_totals.endswith('legend'):
                dataframe = rename_data_with_total(dataframe, 
                                           was_series = was_series, 
                                           using_tex = using_tex, 
                                           absolutes = absolutes)

    if piemode:
        if partial_pie:
            dataframe = dataframe / 100.0

    # some pie things
    if piemode:
        if not sbplt:
            kwargs['y'] = list(dataframe.columns)[0]
            if pie_legend:
                kwargs['legend'] = False
                if was_series:
                    leg_options['labels'] = list(dataframe.index)
                else:
                    leg_options['labels'] = list(dataframe.columns)
        else:
            if pie_legend:
                kwargs['legend'] = False
                if was_series:
                    leg_options['labels'] = list(dataframe.index)
                else:
                    leg_options['labels'] = list(dataframe.index)   
    
    areamode = False
    if 'kind' in kwargs:
        if kwargs['kind'] == 'area':
            areamode = True        

    if legend is False:
        kwargs['legend'] = False

    # cumulative grab first col
    if cumulative:
        kwargs['y'] = list(dataframe.columns)[0]

    # line highlighting option for interactive!
    if interactive:
        if 2 in interactive_types:
            if kwargs['kind'] == 'line':
                kwargs['marker'] = ','
        if not piemode:
            kwargs['alpha'] = 0.1
    
    # convert dates --- works only in my current case!
    if plotting_a_totals_column or not was_series:
        try:
            can_it_be_int = int(list(dataframe.index)[0])
            can_be_int = True
        except:
            can_be_int = False
        if can_be_int:
            if 1500 < int(list(dataframe.index)[0]):
                if 2050 > int(list(dataframe.index)[0]):
                    n = pd.PeriodIndex([d for d in list(dataframe.index)], freq='A')
                    dataframe = dataframe.set_index(n)

    MARKERSIZE = 4
    COLORMAP = {
            0: {'marker': None, 'dash': (None,None)},
            1: {'marker': None, 'dash': [5,5]},
            2: {'marker': "o", 'dash': (None,None)},
            3: {'marker': None, 'dash': [1,3]},
            4: {'marker': "s", 'dash': [5,2,5,2,5,10]},
            5: {'marker': None, 'dash': [5,3,1,2,1,10]},
            6: {'marker': 'o', 'dash': (None,None)},
            7: {'marker': None, 'dash': [5,3,1,3]},
            8: {'marker': "1", 'dash': [1,3]},
            9: {'marker': "*", 'dash': [5,5]},
            10: {'marker': "2", 'dash': [5,2,5,2,5,10]},
            11: {'marker': "s", 'dash': (None,None)}
            }

    HATCHES = {
            0:  {'color': '#dfdfdf', 'hatch':"/"},
            1:  {'color': '#6f6f6f', 'hatch':"\\"},
            2:  {'color': 'b', 'hatch':"|"},
            3:  {'color': '#dfdfdf', 'hatch':"-"},
            4:  {'color': '#6f6f6f', 'hatch':"+"},
            5:  {'color': 'b', 'hatch':"x"}
            }

    if black_and_white:
        if kwargs['kind'] == 'line':
            kwargs['linewidth'] = 1

        cmap = plt.get_cmap('Greys')
        new_cmap = truncate_colormap(cmap, 0.25, 0.95)
        if kwargs['kind'] == 'bar':
            # darker if just one entry
            if len(dataframe.columns) == 1:
                new_cmap = truncate_colormap(cmap, 0.70, 0.90)
        kwargs['colormap'] = new_cmap

    # use styles and plot

    with plt.style.context((style)):

        if not sbplt:
            # check if negative values, no stacked if so
            if areamode:
                if dataframe.applymap(lambda x: x < 0.0).any().any():
                    kwargs['stacked'] = False
                    rev_leg = False
            ax = dataframe.plot(figsize = figsize, **kwargs)
        else:
            if not piemode and not sbplt:
                ax = dataframe.plot(figsize = figsize, **kwargs)
            else:
                ax = dataframe.plot(figsize = figsize, **kwargs)
                handles, labels = plt.gca().get_legend_handles_labels()
                plt.legend( handles, labels, loc = leg_options['loc'], bbox_to_anchor = (0,-0.1,1,1),
                bbox_transform = plt.gcf().transFigure )
                if not tk:
                    plt.show()
                    return
        if 'rot' in kwargs:
            if kwargs['rot'] != 0 and kwargs['rot'] != 90:
                labels = [item.get_text() for item in ax.get_xticklabels()]
                ax.set_xticklabels(labels, rotation = kwargs['rot'], ha='right')

        if transparent:
            plt.gcf().patch.set_facecolor('white')
            plt.gcf().patch.set_alpha(0)

        if black_and_white:
            #plt.grid()
            plt.gca().set_axis_bgcolor('w')
            if kwargs['kind'] == 'line':
                # white background

                # change everything to black and white with interesting dashes and markers
                c = 0
                for line in ax.get_lines():
                    line.set_color('black')
                    #line.set_width(1)
                    line.set_dashes(COLORMAP[c]['dash'])
                    line.set_marker(COLORMAP[c]['marker'])
                    line.set_markersize(MARKERSIZE)
                    c += 1
                    if c == len(COLORMAP.keys()):
                        c = 0

        if legend:
            if not piemode and not sbplt:
                if 3 not in interactive_types:
                    if not rev_leg:
                        lgd = plt.legend(**leg_options)
                    else:
                        handles, labels = plt.gca().get_legend_handles_labels()
                        lgd = plt.legend(handles[::-1], labels[::-1], **leg_options)

            #if black_and_white:
                #lgd.set_facecolor('w')

        #if interactive:
            #if legend:
                #lgd.set_title("")
        #if not sbplt:
            #if 'layout' not in kwargs:
                #plt.tight_layout()

    if interactive:
        # 1 = highlight lines
        # 2 = line labels
        # 3 = legend switches
        ax = plt.gca()
        # fails for piemode
        lines = ax.lines
        handles, labels = plt.gca().get_legend_handles_labels()
        if 1 in interactive_types:
            plugins.connect(plt.gcf(), HighlightLines(lines))

        if 3 in interactive_types:
            plugins.connect(plt.gcf(), InteractiveLegendPlugin(lines, labels, alpha_unsel=0.0))

        for i, l in enumerate(lines):
            y_vals = l.get_ydata()
            x_vals = l.get_xdata()
            x_vals = [str(x) for x in x_vals]
            if absolutes:
                ls = ['%s (%s: %d)' % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals)]
            else:
                ls = ['%s (%s: %.2f%%)' % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals)]
            if 2 in interactive_types:
                #if 'kind' in kwargs and kwargs['kind'] == 'area':
                tooltip_line = mpld3.plugins.LineLabelTooltip(lines[i], labels[i])
                mpld3.plugins.connect(plt.gcf(), tooltip_line)
                #else:
                if kwargs['kind'] == 'line':
                    tooltip_point = mpld3.plugins.PointLabelTooltip(l, labels = ls)
                    mpld3.plugins.connect(plt.gcf(), tooltip_point)
        
            # works:
            #plugins.connect(plt.gcf(), plugins.LineLabelTooltip(l, labels[i]))


        #labels = ["Point {0}".format(i) for i in range(num_to_plot)]
        #tooltip = plugins.LineLabelTooltip(lines)
        #mpld3.plugins.connect(plt.gcf(), mpld3.plugins.PointLabelTooltip(lines))

    if piemode:
        if not sbplt:
            plt.axis('equal')
            ax.get_xaxis().set_visible(False)
            ax.get_yaxis().set_visible(False)


    # add x label
    # this could be revised now!
    # if time series period, it's year for now
    if type(dataframe.index) == pandas.tseries.period.PeriodIndex:
        x_label = 'Year'

    if x_label is not False:
        if type(x_label) == str:
            plt.xlabel(x_label)
        else:
            check_x_axis = list(dataframe.index)[0] # get first entry# get second entry of first entry (year, count)
            try:
                if type(dataframe.index) == pandas.tseries.period.PeriodIndex:
                    x_label = 'Year'
                check_x_axis = int(check_x_axis)
                if 1500 < check_x_axis < 2050:
                    x_label = 'Year'
                else:
                    x_label = 'Group'
            except:
                x_label = 'Group'

        if not sbplt:
            if not piemode:
                plt.xlabel(x_label)

    # no offsets for numerical x and y values
    if type(dataframe.index) != pandas.tseries.period.PeriodIndex:
        try:
            # check if x axis can be an int
            check_x_axis = list(dataframe.index)[0]
            can_it_be_int = int(check_x_axis)
            # if so, set these things
            from matplotlib.ticker import ScalarFormatter
            plt.gca().xaxis.set_major_formatter(ScalarFormatter()) 
        except:
            pass

    # same for y axis
    try:
        # check if x axis can be an int
        check_y_axis = list(dataframe.columns)[0]
        can_it_be_int = int(check_y_axis)
        # if so, set these things
        from matplotlib.ticker import ScalarFormatter
        plt.gca().yaxis.set_major_formatter(ScalarFormatter()) 
    except:
        pass

    # y labelling
    y_l = False
    if not absolutes:
        y_l = 'Percentage'
    else:
        y_l = 'Absolute frequency'
    
    if y_label is not False:
        if not sbplt:
            if not piemode:
                if type(y_label) == str:
                    plt.ylabel(y_label)
                else:
                    plt.ylabel(y_l)

    # hacky: turn legend into subplot titles :)
    if sbplt:
        # title the big plot
        #plt.suptitle(title, fontsize = 16)
        # get all axes
        if 'layout' not in kwargs:
            axes = [l for index, l in enumerate(ax)]
        else:
            axes = []
            cols = [l for index, l in enumerate(ax)]
            for col in cols:
                for bit in col:
                    axes.append(bit)
    
        # set subplot titles
    
        for index, a in enumerate(axes):
            try:
                titletext = list(dataframe.columns)[index]
            except:
                pass
            a.set_title(titletext)
            try:
                a.legend_.remove()
            except:
                pass
            # remove axis labels for pie plots
            if piemode:
                a.axes.get_xaxis().set_visible(False)
                a.axes.get_yaxis().set_visible(False)
                a.axis('equal')
    
    # add sums to bar graphs and pie graphs
    # doubled right now, no matter

    if not sbplt:
        if 'kind' in kwargs:
            if kwargs['kind'].startswith('bar'):
                width = ax.containers[0][0].get_width()

    if was_series:
        the_y_limit = plt.ylim()[1]
        if show_totals.endswith('plot') or show_totals.endswith('both'):
            # make plot a bit higher if putting these totals on it
            plt.ylim([0,the_y_limit * 1.05])
            for i, label in enumerate(list(dataframe.index)):
                if len(dataframe.ix[label]) == 1:
                    score = dataframe.ix[label][0]
                else:
                    if absolutes:
                        score = dataframe.ix[label].sum()
                    else:
                        #import warnings
                        #warnings.warn("It's not possible to determine total percentage from individual percentages.")
                        continue
                if not absolutes:
                    plt.annotate('%.2f' % score, (i, score), ha = 'center', va = 'bottom')
                else:
                    plt.annotate(score, (i, score), ha = 'center', va = 'bottom')
    else:
        the_y_limit = plt.ylim()[1]
        if show_totals.endswith('plot') or show_totals.endswith('both'):
            for i, label in enumerate(list(dataframe.columns)):
                if len(dataframe[label]) == 1:
                    score = dataframe[label][0]
                else:
                    if absolutes:
                        score = dataframe[label].sum()
                    else:
                        #import warnings
                        #warnings.warn("It's not possible to determine total percentage from individual percentages.")
                        continue
                if not absolutes:
                    plt.annotate('%.2f' % score, (i, score), ha = 'center', va = 'bottom')
                else:
                    plt.annotate(score, (i, score), ha = 'center', va = 'bottom')        

    #if not running_python_tex:
        #plt.gcf().show()

    plt.subplots_adjust(left=0.1)
    plt.subplots_adjust(bottom=0.18)
    #if 'layout' not in kwargs:
        #plt.tight_layout()



    if save:
        import os
        if running_python_tex:
            imagefolder = '../images'
        else:
            imagefolder = 'images'

        savename = get_savename(imagefolder, save = save, title = title, ext = output_format)

        if not os.path.isdir(imagefolder):
            os.makedirs(imagefolder)

        # save image and get on with our lives
        if legend_pos.startswith('o'):
            plt.gcf().savefig(savename, dpi=150, bbox_extra_artists=(lgd,), bbox_inches='tight', format = output_format)
        else:
            plt.gcf().savefig(savename, dpi=150, format = output_format)
        time = strftime("%H:%M:%S", localtime())
        if os.path.isfile(savename):
            print '\n' + time + ": " + savename + " created."
        else:
            raise ValueError("Error making %s." % savename)

    if not interactive and not running_python_tex and not running_spider and not tk:
        plt.show()
        return
    if running_spider or tk or sbplt:
        return plt

    if interactive:
        plt.subplots_adjust(right=.8)
        plt.subplots_adjust(left=.1)
        try:
            ax.legend_.remove()
        except:
            pass
        return mpld3.display()
Exemple #44
0
# -*- coding:utf-8 -*-
import numpy as np
from pandas import Series, DataFrame

# --- Sums -------------------------------------------------------------
# The printed heading is the Chinese word for "sum".
print('求和')
df = DataFrame(
    [[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
    columns=['one', 'two'],
    index=list('abcd'),
)
print(df)
# Sum down each column (NaN entries are skipped by default).
print(df.sum(axis=0))
# Sum across each row.
print(df.sum(axis=1))

# --- Means ------------------------------------------------------------
# The printed heading is the Chinese word for "mean".
print('平均数')
# With skipna=False any row containing a NaN yields NaN.
print(df.mean(skipna=False, axis=1))
# Default behaviour: NaN entries are ignored when averaging a row.
print(df.mean(axis=1))

# --- Other summaries --------------------------------------------------
# The printed heading is the Chinese word for "other".
print('其它')
# Index label of the maximum value in each column.
print(df.idxmax())
# Running total down each column.
print(df.cumsum())
# Summary statistics for the numeric columns.
print(df.describe())
# describe() on non-numeric data reports count / unique / top / freq.
obj = Series(list('aabc') * 4)
print(obj.describe())
Exemple #45
0
# NOTE(review): this looks like an interactive-session transcript -- the bare
# expressions below only display a value when echoed by a REPL, and the names
# Series, DataFrame and np are presumably imported earlier in the session.

# Series whose index contains repeated labels ('a' and 'b' each appear twice).
obj = Series(range(5),index=['a','a','b','b','c'])
obj
# False here, because the index labels are not all distinct.
obj.index.is_unique
# A repeated label selects every matching entry (result is a Series) ...
obj['a']
# ... while a label that occurs once selects a single scalar value.
obj['c']
# The same idea with a DataFrame: 4x3 random values, repeated row labels.
df = DataFrame(np.random.randn(4,3),index=['a','a','b','b'])
df
# Select all rows labelled 'b'.
# NOTE(review): the `.ix` indexer was removed in pandas 1.0; `.loc['b']` is
# the label-based equivalent on current versions.
df.ix['b']
# Small frame with missing values, used for the descriptive statistics below.
df = DataFrame([[1.4,np.nan],[7.1,-4.5],[np.nan,np.nan],[0.75,-1.3]],index=['a','b','c','d'],columns=['one','two'])
df
# Column-wise sums (NaN entries are skipped by default).
df.sum()
# Row-wise sums.
df.sum(axis=1)
# Row-wise means; with skipna=False a row containing any NaN yields NaN.
df.mean(axis=1,skipna=False)
# Row-wise means ignoring NaN entries.
df.mean(axis=1)
# Index label at which each column reaches its maximum.
df.idxmax()
# Running totals down each column.
df.cumsum()
# Running totals across each row.
df.cumsum(axis=1)
# Summary statistics for the numeric columns.
df.describe()
# For non-numeric data describe() reports count / unique / top / freq instead.
obj = Series(['a','a','b','c'] * 4)
obj.describe()
obj
%run Dataframe.py
from pandas_datareader import data
all_data = {}
for ticker in ['AAPL','IBM','MSFT','GOOG']:
    all_data[ticker] = data.get_data_google(ticker,'1/1/2000','1/1/2010')
price = DataFrame({tic: data['Adj Close'] for tic, data in all_data.iteriterms()})
price = DataFrame({tic: data['Adj Close'] for tic, data in all_data.iteritems()})
all_data.iteritems()
a,b in for a, b in all_data.iteritems()
a,b for a, b in all_data.iteritems()