Example No. 1
def deal_outlier():
    np.random.seed(12345)
    data=DataFrame(np.random.randn(1000,4))
    print data.describe()
    print data[3][np.abs(data[3])>3]
    print data[(np.abs(data)>3).any(1)]
    data[np.abs(data)>3]=np.sign(data)*3
    print data.describe()
Example No. 2
def slide_15():
    np.random.seed(12345)
    data = DataFrame(np.random.randn(1000, 4))
    print data.describe()

    col = data[3]
    print col[np.abs(col) > 3]

    print data[(np.abs(data) > 3).any(1)]
    data[np.abs(data) > 3] = np.sign(data) * 3
    print data.describe()
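The ±3 capping used in the two examples above can also be expressed with DataFrame.clip. A minimal sketch, not taken from either example, assuming a pandas version where clip accepts lower/upper:

import numpy as np
from pandas import DataFrame

np.random.seed(12345)
data = DataFrame(np.random.randn(1000, 4))
# cap every value outside [-3, 3]; same effect as data[np.abs(data) > 3] = np.sign(data) * 3
capped = data.clip(lower=-3, upper=3)
print(capped.describe())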
Example No. 3
    def test_describe_no_numeric(self):
        df = DataFrame({'A': ['foo', 'foo', 'bar'] * 8,
                        'B': ['a', 'b', 'c', 'd'] * 6})
        desc = df.describe()
        expected = DataFrame(dict((k, v.describe())
                                  for k, v in compat.iteritems(df)),
                             columns=df.columns)
        assert_frame_equal(desc, expected)

        ts = tm.makeTimeSeries()
        df = DataFrame({'time': ts.index})
        desc = df.describe()
        self.assertEqual(desc.time['first'], min(ts.index))
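For reference, describe() on an all-object frame reports count/unique/top/freq rather than numeric statistics; a minimal standalone sketch, independent of the test above:

from pandas import DataFrame

df = DataFrame({'A': ['foo', 'foo', 'bar'] * 8,
                'B': ['a', 'b', 'c', 'd'] * 6})
print(df.describe())  # index is count, unique, top, freq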
Example No. 4
def pd_04():
    obj=Series(range(4),index=['d','a','b','c'])
    print obj
    print obj.sort_index()
    frame=DataFrame(np.arange(8).reshape(2,4),index=['three','one'],columns=['d','a','b','c'])
    print frame.sort_index()
    print frame.sort_index(axis=1)
    print frame.sort_index(ascending=False)
    obj1=Series([4,7,-3,2])
    print obj1.order()
    print frame.sort_index(by='b')
    print frame.sort_index(by=['a','b'])
    print frame.describe()
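Series.order() and DataFrame.sort_index(by=...) used above were removed in later pandas releases; sort_values is the modern spelling. A short sketch of the equivalents, assuming pandas >= 0.20 (not part of the original example):

import numpy as np
from pandas import DataFrame, Series

frame = DataFrame(np.arange(8).reshape(2, 4), index=['three', 'one'], columns=['d', 'a', 'b', 'c'])
obj1 = Series([4, 7, -3, 2])
print(obj1.sort_values())              # replaces the removed obj1.order()
print(frame.sort_values(by='b'))       # replaces the removed frame.sort_index(by='b')
print(frame.sort_values(by=['a', 'b']))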
Example No. 5
class Describe(object):

    def setup(self):
        self.df = DataFrame({
            'a': np.random.randint(0, 100, int(1e6)),
            'b': np.random.randint(0, 100, int(1e6)),
            'c': np.random.randint(0, 100, int(1e6))
        })

    def time_series_describe(self):
        self.df['a'].describe()

    def time_dataframe_describe(self):
        self.df.describe()
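To sanity-check this asv-style benchmark class outside the asv runner, the timing methods can be called directly; a sketch with an ad-hoc timeit harness (the harness is mine, not part of the benchmark suite):

import timeit
import numpy as np
from pandas import DataFrame

bench = Describe()
bench.setup()
print(timeit.timeit(bench.time_series_describe, number=10))     # Series.describe timing
print(timeit.timeit(bench.time_dataframe_describe, number=10))  # DataFrame.describe timing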
Example No. 6
    def calcCVA(self, expectedExposure=array):
        cvaData = DataFrame()
        cvaData['t'] = self.timesteps[1:]
        cvaData['discountFactor'] = self.discountFactors
        cvaData['pd'] = self.pD[1:]
        cvaData['1-R'] = [1 - self.recoveryRate] * len(self.pD[1:])
        cvaData['exposure'] = [getTPlusFromList(expectedExposure, i, True) for i in range(len(expectedExposure))]
        cvaData['cvaPerTimeStep'] = cvaData['discountFactor'] * cvaData['pd'] * cvaData['1-R'] * cvaData['exposure']
        cva = cvaData['cvaPerTimeStep'].sum()
        cvaData.describe()
        print cvaData
        print 'CVA = ', cva
        return cva
Example No. 7
    def test_iloc_getitem_doc_issue(self):

        # multi axis slicing issue with single block
        # surfaced in GH 6059

        arr = np.random.randn(6, 4)
        index = date_range('20130101', periods=6)
        columns = list('ABCD')
        df = DataFrame(arr, index=index, columns=columns)

        # defines ref_locs
        df.describe()

        result = df.iloc[3:5, 0:2]
        str(result)
        result.dtypes

        expected = DataFrame(arr[3:5, 0:2], index=index[3:5],
                             columns=columns[0:2])
        tm.assert_frame_equal(result, expected)

        # for dups
        df.columns = list('aaaa')
        result = df.iloc[3:5, 0:2]
        str(result)
        result.dtypes

        expected = DataFrame(arr[3:5, 0:2], index=index[3:5],
                             columns=list('aa'))
        tm.assert_frame_equal(result, expected)

        # related
        arr = np.random.randn(6, 4)
        index = list(range(0, 12, 2))
        columns = list(range(0, 8, 2))
        df = DataFrame(arr, index=index, columns=columns)

        df._data.blocks[0].mgr_locs
        result = df.iloc[1:5, 2:4]
        str(result)
        result.dtypes
        expected = DataFrame(arr[1:5, 2:4], index=index[1:5],
                             columns=columns[2:4])
        tm.assert_frame_equal(result, expected)
Example No. 8
def combine_models():

    model_nums = [1, 2, 3]
    # model_nums = [2, 3]
    model_paths = ['model%03d.y_test.csv' % i for i in model_nums]
    assert all(os.path.exists(path) for path in model_paths)
    # y1 = pd.read_csv('model001.y_test.csv').set_index('job_id')
    # y2 = pd.read_csv('model002.y_test.csv').set_index('job_id')
    # y3 = pd.read_csv('model003.y_test.csv').set_index('job_id')

    models = [pd.read_csv(path).set_index('job_id') for path in model_paths]

    path = 'all/jobs_all.csv'
    df = get_data(path)
    df_train, df_test = split_train_test(df)
    y_data = np.ones((len(df_test), len(models)), dtype=int) * -1
    y = DataFrame(y_data, columns=model_nums, index=df_test.index)

    for d in [y] + models:
        print(d.describe())
    for d in [y] + models:
        print(d.shape, len(y) - len(d), type(d))

    y_indexes = set(y.index)
    print('y_indexes: %s' % sorted(y_indexes)[:10])

    for c, d in zip(model_nums, models):
        d_indexes = set(d.index)
        print('c=%s, d_indexes: %s' % (c, sorted(d_indexes)[:10]))
        assert d_indexes.issubset(y_indexes), (len(d_indexes - y_indexes))
        y[c].loc[d.index] = d['hat']

    def func(row):
        return all(x == -1 for x in row)

    # empties = y.apply(func, axis=1)
    # print('empties: %d' % len(empties))
    # print(y[empties])

    def vote(row):
        print(row)
        for j in 1, 3, 2:
            if row[j] != -1:
                return row[j]
        # assert False, row
        return -3

    print(y.iloc[:20, :])
    y_series = y.iloc[:20, :].apply(vote, axis=1)
    assert False
    y_test = DataFrame(y_series, columns=['hat'], index=y.index)
    y_test.to_csv('%s.y_test.csv' % 'model004v', index_label='job_id')
    print(y_test.columns)
    print(y_test.describe())
    print(y_test.iloc[:10, :])
Example No. 9
    def test_describe_objects(self):
        df = DataFrame({"C1": ['a', 'a', 'c'], "C2": ['d', 'd', 'f']})
        result = df.describe()
        expected = DataFrame({"C1": [3, 2, 'a', 2], "C2": [3, 2, 'd', 2]},
                             index=['count', 'unique', 'top', 'freq'])
        assert_frame_equal(result, expected)

        df = DataFrame({"C1": pd.date_range('2010-01-01', periods=4, freq='D')})
        df.loc[4] = pd.Timestamp('2010-01-04')
        result = df.describe()
        expected = DataFrame({"C1": [5, 4, pd.Timestamp('2010-01-01'),
                                     pd.Timestamp('2010-01-04'),
                                     pd.Timestamp('2010-01-04'), 2]},
                             index=['count', 'unique', 'first', 'last', 'top',
                                    'freq'])
        assert_frame_equal(result, expected)

        # mix time and str
        df['C2'] = ['a', 'a', 'b', 'c', 'a']
        result = df.describe()
        # when mixing datetime / obj the index gets reordered.
        expected['C2'] = [5, 3, np.nan, np.nan, 'a', 3]
        assert_frame_equal(result, expected)

        # just str
        expected = DataFrame({'C2': [5, 3, 'a', 4]},
                             index=['count', 'unique', 'top', 'freq'])
        result = df[['C2']].describe()

        # mix of time, str, numeric
        df['C3'] = [2, 4, 6, 8, 2]
        result = df.describe()
        expected = DataFrame({"C3": [5., 4.4, 2.607681, 2., 2., 4., 6., 8.]},
                             index=['count', 'mean', 'std', 'min', '25%',
                                    '50%', '75%', 'max'])
        assert_frame_equal(result, expected)
        assert_frame_equal(df.describe(), df[['C3']].describe())

        assert_frame_equal(df[['C1', 'C3']].describe(), df[['C3']].describe())
        assert_frame_equal(df[['C2', 'C3']].describe(), df[['C3']].describe())
Example No. 10
def messages_data(soup,message_csv):
    messages = scrape_element(soup, 'messages', '.Message')
    msg_lengths = []
    pd.set_option('display.max_colwidth', -1)
    for k, v in messages.items():
        msg_lengths.append(len(v))
        text = Series(str(np.array(v.encode('utf-8'))))
        print text
        text.to_csv(message_csv, sep=',', header=False, index=False, mode='a')
    df_msg_lgth = DataFrame(msg_lengths)
    df_msg_describe = DataFrame(df_msg_lgth.describe()).T
    cols = df_msg_describe.columns
    df_msg_describe.columns = ['msg_' + c for c in cols]
    return df_msg_describe
Example No. 11
def edbSave():
    'Fetch the edb codes from the user clipboard and call the API to retrieve the corresponding edb indicator data'

    # Get the codes from the clipboard plus the start and end dates entered by the user
    codes = getCodeFromClipboard()
    start = sDate()
    end = eDate()

    data = w.edb(codes, start, end, "Fill=Previous")
    datachg = [d.strftime('%y-%m-%d') for d in data.Times]
    df = DataFrame(data.Data, index=data.Codes, columns=datachg).T
    print('-' * 85)
    print(df)
    print('-' * 85)
    print('Summary statistics:')
    print(df.describe())
    print("sum", " " * 3, str(df.sum()).split(sep="    ")[1].rjust(10))
    return df
Example No. 12
def StatisticDescribeDatas():
    """
    Perform descriptive statistical analysis on the data
    :return:
    """
    from matplotlib import pyplot as plt
    from pandas import DataFrame

    laborary_operation = ArchlaboraryOperation()
    laborary_operation.prepareForProjectDataTable()
    res, _ = laborary_operation.getResponse()
    if res:
        _res_data = laborary_operation.getData()
        columns = []
        datas = []
        for _h in _res_data['properties']:
            columns.append(_h.label)
        for _data_item in _res_data['records']:
            _tmp_array = []
            for _pi in _res_data['properties']:
                if _res_data['records'][_data_item][_pi.label]:
                    _val = _res_data['records'][_data_item][_pi.label].loadValueLabel()
                    _tmp_array.append(float(_val) if _val.isdigit() else _val)
            datas.append(_tmp_array)
        df = DataFrame(datas, columns=columns)
        fig = plt.figure()
        #ax = fig.add_subplot(1, 1, 1)
        ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])
        _d = df.describe()
        print("echo describe ......")
        print(_d)
        ax.plot(_d, 'r--')
        for seq in range(0, len(_d)):
            ax.text(seq, _d.values[seq][0], "%s(%s)" % (str(_d.index[seq]), str(_d.values[seq][0])))
        buf = StringIO()
        plt.savefig(buf, dpi=50, format="png")  # savefig takes 'format', not 'fmt'
        response = make_response(buf.getvalue())
        response.headers['Content-Type'] = 'Image/png'
        return response
Example No. 13
df.applymap(f)
print df
print "*"*15

print "Redefine the dataframe"
df = pd.DataFrame(data={"A":[1,2], "B":[2.6,1.3]})
print df
print "Add columns derived from the existing ones"
df["C"] = df["A"]+df["B"]
df["D"] = df["A"]*3
df["E"] = np.sqrt(df["A"])
print df
print "*"*15
print "Information available from a dataframe"
print " dataframe description"
print df.describe()
print " covariance "
print df.cov()
print " correlation "
print df.corr()
print "*"*15

print " Create another dataframe with random values (1000 rows and 2 columns)"
print " DataFrame(np.random.randn(1000,2),columns=['x','y'])"
plot_df = DataFrame(np.random.randn(1000,2),columns=['x','y'])
print plot_df
print "Show the plots"
plot_df.plot()
plot_df.hist()

Example No. 14
df5["sum_col"] = df5.apply(sum_two_cols, axis=1)

print(df5)

import math


def int_float_squares(series):
    return pd.Series({"int_sq": series["int_col"] ** 2, "flt_sq": series["float_col"] ** 2})


print(df.apply(int_float_squares, axis=1))

### 7. Basic Stats ###

print(df.describe())
print(df.cov())
print(df.corr())

### 8. Merge and Join ###

print(df)
other = DataFrame({"str_col": ["a", "b"], "some_val": [1, 2]})
print(other)
print(pd.merge(df, other, on="str_col", how="inner"))
print(pd.merge(df, other, on="str_col", how="outer"))
print(pd.merge(df, other, on="str_col", how="left"))
print(pd.merge(df, other, on="str_col", how="right"))

### 9. Plot ###
Example No. 15
                  squeeze=True,
                  date_parser=parser)
price = series.iloc[:, [6]].fillna(method='ffill')
open = series.iloc[:, [0]].fillna(method='ffill')
high = series.iloc[:, [1]].fillna(method='ffill')

model = ARIMA(price, order=(5, 1, 0), missing='nan')
model_fit = model.fit(disp=0)
print(model_fit.summary())
# plot residual errors
residuals = DataFrame(model_fit.resid)
residuals.plot()
pyplot.show()
residuals.plot(kind='kde')
pyplot.show()
print(residuals.describe())

# price = read_csv('data/bitstamp.csv', header=0,parse_dates=[0], index_col=0, squeeze=True, date_parser=parser)
# X = price.iloc[:, [6]].fillna(method='ffill').head(100000).values
X = price.values
openValues = open.values
highValues = high.values
size = int(len(X) * 0.66)
series = read_csv('CMPE-256-Large-Scale-Analytics-/data/bitstamp.csv',
                  header=0,
                  parse_dates=[0],
                  index_col=0,
                  squeeze=True,
                  date_parser=parser)
# X = series.iloc[:,[6]].fillna(method = 'ffill').head(1000).values
price = series.iloc[:, [6]].fillna(method='ffill').head(1000).values
Example No. 16
# Create a DataFrame
smp = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nebada', 'Nebada'],
       'year': [2000, 2001, 2002, 2001, 2002],
       'pop': [1.5, 1.6, 1.7, 3.5, 4.3]
       }
frame = DataFrame(smp)

# Accessing the elements of a DataFrame
frame.year  # frame$year
frame['year']  # frame$year
frame.head()  # head
frame.tail()  # tail
frame2 = DataFrame(
    smp, index=['one', 'two', 'three', 'four', 'five'])  # add an index
frame2.ix['one']
frame2.describe()  # summary
print(frame2.describe())

# Read in the data
data = pd.read_csv('stock_px.csv')
print(data)
xlsx_file = pd.ExcelFile('stock_px.xlsx')  # requires openpyxl; xls files also work
xlsx_file.sheet_names
data = xlsx_file.parse('stock_px')
print(data)

# Read data from the web → http://docs.scipy.org/doc/numpy/reference/generated/numpy.DataSource.html
ds = np.DataSource(None)
f = ds.open('https://dl.dropbox.com/u/956851/game_modified.csv')
d_web = pd.read_csv(f)
print(d_web)
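frame2.ix['one'] above relies on the .ix indexer, which was removed in pandas 1.0; .loc is the label-based replacement. A minimal sketch, not part of the original example:

from pandas import DataFrame

smp = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nebada', 'Nebada'],
       'year': [2000, 2001, 2002, 2001, 2002],
       'pop': [1.5, 1.6, 1.7, 3.5, 4.3]}
frame2 = DataFrame(smp, index=['one', 'two', 'three', 'four', 'five'])
print(frame2.loc['one'])  # label-based row access; replaces the removed frame2.ix['one']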
Example No. 17
obj['a']

## Summarizing and computing descriptive statistics
df = DataFrame([[1.4,np.nan],[7.1,-4.5],[np.nan,np.nan],[0.75,-1.3]],
    index=['a','b','c','d'], columns=['one','two'])
# column-wise
df.sum()
# row-wise
df.sum(axis=1)
# NA values are excluded by default, but this can be disabled with skipna
df.mean(axis=1,skipna=False)
# index of the maximum value
df.idxmax()
# cumulative sums
df.cumsum()
df.describe()
# correlation coefficients
returns.MSFT.corr(returns.IBM)
returns.corr()
returns.cov()
returns.corrwith(returns.IBM)

## Unique values, value counts, and membership
obj = Series(['c','a','d','a','a','b','b','c','c'])
uniques = obj.unique()
# count occurrences of each value
obj.value_counts()
# value_counts sorts by default, but sorting can be turned off
pd.value_counts(obj.values, sort=False)
# membership test
mask = obj.isin(['b','c'])
Example No. 18
    'int_col': [1, 2, 6, 8, -1],
    'float_col': [0.1, 0.2, 0.2, 10.1, None],
    'str_col': ['a', 'b', None, 'c', 'a'],
    'groc_col': ['apples', 'bananas', 'coconuts', 'dogfood', None],
    'rev_col': range(4, -1, -1)
})
df2 = DataFrame({
    'first_col': [13, 12, -6, -8, -11],
    'second_col': [10.1, 10.2, 10.2, 110.1, None],
    'str_col': ['a', 'b', None, 'c', 'X'],
    'groc_col': [None, 'bananas', 'coconuts', 'dogfood', None],
    'rev_col': range(4, -1, -1)
})

#stats
df1.describe()  #only shows numbers

#gh.ix[:,['float_col', 'int_col']] less elegant
df1[['float_col', 'int_col']]

df1.fillna(value="waiting")

df1['div_col'] = df1['float_col'] / df1['int_col']

mean = df1['rev_col'].mean()
df1['mean_col'] = mean

new = pd.merge(df1, df2, how='outer', on='str_col')

#quick plotting
import numpy as np
Example No. 19
# and descending.
equipos.sort_index(ascending=1)
equipos.sort_index(ascending=0)


# d) Selecting the 'socios' column, show the result of sorting by
# values.
equipos.sort_values(by='socios')


##
# Exercise 4
##
# a) We keep working with the DataFrame created in exercise 1. Show
# a summary of its information.
equipos.describe()

# b) To enrich our data, concatenate to our DataFrame
# a DataFrame object with the following information:
new_data = {'equipo': ['Atletico de Madrid'],
            'titulos': [29],
            'socios': [48008]}

equipos = equipos.append(new_data,ignore_index=True)


# c) Create a new column 'posicion' with the following data:
posicion_values = ['13', np.nan, '3', np.nan, '5', np.nan]

equipos['posicion'] = posicion_values
Example No. 20
import datetime
aonao.loc[(aonao.AO > 0) & (aonao.NAO < 0)
          & (aonao.index > datetime.datetime(1980, 1, 1))
          & (aonao.index < datetime.datetime(1989, 1, 1)),
          'NAO'].plot(kind='barh')
plt.close()

#let's do some statistics
aonao.mean()
aonao.max()
aonao.min()
#mean row-wise
aonao.mean(1)

#gets most statistical information
aonao.describe()

#annual ('A') mean
AO_mm = AO.resample("A").mean()
AO_mm.plot(style='g--')
plt.close()

#median
AO_mm = AO.resample("A").median()
AO_mm.plot()
plt.savefig('AnnualMedianValues.png')
plt.close()

#rolling mean
aonao.rolling(window=12, center=False).mean().plot(style='-g')
plt.savefig('RollingMean.png')
Example No. 21
d = {'a': 1, 'b': 2, 'c': 3, 'd': 4}
x3 = Series(d)
print(x1)
print(x2)
print(x3)

data = {
    '语文': [66, 95, 93, 90, 80],
    '英语': [65, 85, 92, 88, 90],
    '数学': [30, 98, 96, 77, 90]
}
df1 = DataFrame(data)
df2 = DataFrame(data,
                index=['张飞', '关羽', '赵云', '黄忠', '典韦'],
                columns=['英语', '数学', '语文'])
print(df1)

df2 = df2.drop(columns=['语文'])
df2 = df2.drop(index=['张飞'])
df2.rename(columns={'数学': '几何'}, inplace=True)
print(df2.describe())

pysqldf = lambda sql: sqldf(sql, globals())
sql = "select * from df1"
print(pysqldf(sql))

score = DataFrame(pd.read_excel('data.xlsx'))
score.to_excel('data1.xlsx')
# print(score)
Example No. 22
               ])

print(df)

# In[2]:

import matplotlib.pyplot as plt
import seaborn as sns

# In[3]:

df.info()

# In[4]:

df.describe()

# In[5]:

sns.pairplot(df)

# In[6]:

sns.distplot(df['StockIndexPrice'], hist_kws=dict(edgecolor="black"))

# In[7]:

df.corr()

# In[8]:
Example No. 23
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
items = {'apple':{'count':10,'price':1500},
         'banana': {'count':5, 'price': 15000},
         'melon': { 'count':7,'price': 1000},
         'kiwi': {'count':20,'price': 500},
         'mango': {'count':30,'price': 1500},
         'orange': { 'count':4,'price': 700}}
data = DataFrame(items).T
print(data)
print("===============")
# .describe() -> built-in pandas summary: count, mean, std, min/max, median, and so on
print(data.describe()) # describe the data
Example No. 24
def collect_stats(df: pd.DataFrame):
    # TODO: also collect stats for str and datetime columns
    return df.describe([q / 100 for q in range(5, 100, 5)])
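A small usage sketch for the helper above; the sample frame is illustrative only, not from the original source:

import numpy as np
import pandas as pd

df = pd.DataFrame({'a': np.arange(100), 'b': np.random.rand(100)})
stats = collect_stats(df)                 # describe() with percentiles in 5% steps
print(stats.loc[['5%', '50%', '95%']])    # selected quantile rows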
Example No. 25
# -*- coding: utf-8 -*-

import numpy as np
from pandas import Series, DataFrame

df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
print df
print df.sum()
print df.sum(axis=1)
print

print df.mean(axis=1, skipna=False)
print df.mean(axis=1)
print

print df.idxmax()
print df.cumsum()
print df.describe()
obj = Series(['a', 'a', 'b', 'c'] * 4)
print obj.describe()
Example No. 26
# Split-out validation dataset
array = dataset.values
X = array[:,0:4]
Y = array[:,4]
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)

# Test options and evaluation metric
seed = 7
'''

scoring = 'accuracy'
dataframe = DataFrame()
dataframe = read_csv('lags_12months_featurespower_trainset.csv', header=0)
print(dataframe.describe())
# split into input and output
features = dataframe[['t-6', 't-4', 't-2']].copy()
array_feature = features.values
X_train = array_feature[2:226, :]
X_validation = array_feature[227:276, :]

frame_train = read_csv('Trainset.csv', header=0)

array_frame_train = frame_train.values

Y_train = array_frame_train[27:251, 1]

Y_validation = array_frame_train[252:301, 1]

# Spot Check Algorithms
Example No. 27
             ignore_index=True)

#Delete a column
empDf['dummy'] = 1
empDf
del empDf['dummy']
empDf

#Delete a row
empDf.drop(1)

#Sort a Data Frame
empDf.sort_index(axis=1)
empDf.sort(['isManager', 'name'])

empDf.describe()
empDf.id.corr(empDf.deptId)

#Iterate through a DataFrame
for rowNum, row in auto_data.iterrows():
    for colName, col in row.iteritems():
        #if  pd.isnull(col) :
        print(pd.isnull(col), rowNum, colName)

#----------------------------------------------------------------------------
#                   Data Operations
#----------------------------------------------------------------------------

carDict = {
    'ID': [1, 2, 3, 4, 5],
    'MODEL': ['Taurus', 'Edge', 'Camry', 'Corolla', 'HighLander'],
Example No. 28
empDf.append(Series([5,False,'Derek',2],
                    index=['id','isManager','name','deptId']),
             ignore_index=True)
empDf

#Deleting a column
empDf['dummy']=1
empDf
del empDf['dummy']
empDf

#Deleting a row
empDf.drop(1)

#Sorting a data frame
empDf.sort_index(axis=1)
empDf.sort(['isManager','name'])

empDf.describe()
empDf.id.corr(empDf.deptId)

#Iterate through a data frame
for rowNum, row in auto_data.iterrows():
    for colName, col in row.iteritems():
        if pd.isnull(col):
            print(pd.isnull(col),rowNum,colName)

Example No. 29
    def _save(self, data: pd.DataFrame) -> None:
        df = data.describe(**self._describe_args)
        df.index.name = "Statistics"

        df.reset_index(inplace=True)
        super()._save(df)
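The core of the _save step above, outside the dataset class; a standalone sketch where the describe_args contents are an assumption, not taken from the original code:

import pandas as pd

describe_args = {'percentiles': [0.1, 0.9]}   # hypothetical arguments for describe()
data = pd.DataFrame({'x': range(10), 'y': range(10, 20)})
df = data.describe(**describe_args)
df.index.name = 'Statistics'
df.reset_index(inplace=True)
print(df)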
Example No. 30
def main():
    # ------------------------------------------------------
    # Creating connection to db
    # ------------------------------------------------------
    # postgres development db engine (registry to case_study db)
    engine = create_engine('postgresql://localhost/case_study')
    # for production database make sure you use something like:
    # engine = create_engine('postgresql://*****:*****@dns/database_name'))
    # also use the following if interested in using psycopg2 database api (for Oracle db's lookup cxOracle)
    # engine = create_engine('postgresql+psycopg2://username:password@dns/database_name'))

    # connection obj to postgres db engine
    connection = engine.connect()

    with open('../Results/table_names.txt', 'w') as f_table_names:
        for table_name in engine.table_names():
            f_table_names.write(table_name + '\n')

    # ------------------------------------------------------
    # Available tables in the db
    # ------------------------------------------------------
    skip_list = [
        # 'biannual_data_1', 'biannual_data_2',
        # 'monthly_data_1', 'monthly_data_2', 'monthly_data_3'
        # ,'monthly_data_4', 'monthly_data_5', 'monthly_data_6', 'monthly_data_7', 'monthly_data_8'
        # ,'monthly_data_9', 'monthly_data_10', 'monthly_data_11', 'monthly_data_12'
        'quarterly_data_1',
        'quarterly_data_2',
        'quarterly_data_3',
        'quarterly_data_4',
        'airport',
        'carrier_history',
        'system_fields',
        'yearly_data'
    ]

    for table_name in engine.table_names():
        if table_name in skip_list:
            continue
        else:
            print(35 * '-')
            print('Current table -->', table_name)
            print(35 * '-')

            # --------------------------------------------------------------
            # Analysing data set using pandas data-frame (df) data structure
            # --------------------------------------------------------------
            #
            stmt = "SELECT * FROM {} where unique_carrier in ('EV', 'WN', 'DL') and origin in" \
                   " ('PHX','LAS','IAH','SFO','LAX','DEN','DFW','ATL','ORD','EWR','MDW','LGA');".format(table_name)
            result_proxy = connection.execute(stmt)
            results = result_proxy.fetchall()

            # select column names of current table
            stmt = "SELECT column_name FROM information_schema.columns WHERE table_name = '{}';".format(
                table_name)
            result_proxy = connection.execute(stmt)
            col_names = result_proxy.fetchall()
            col_names = [name[0]
                         for name in col_names]  # remove str from tuple

            df = DataFrame(results, columns=col_names)

            print(100 * '#')

            # some df attributes
            print('\nDimensions of df\n', 50 * '-')
            print(df.shape)
            print('\nData-types of df\n', 50 * '-')
            print(df.dtypes)

            # some df methods
            print('\nTop 5 rows:\n', 50 * '-')
            print(df.head())  # see first 5 rows

            print('\ndf description (categorical columns only):\n', 50 * '-')
            cat_analytics = df.describe(include=['object'
                                                 ])  # 'int64', 'float64']))
            cat_analytics.to_csv(
                '../Results/lim_cat_analytics_{}.csv'.format(table_name),
                encoding='utf-8')  #, index=False)
            print(cat_analytics)

            print('\ndf description (numerical columns only)\n', 50 * '-')
            num_analytics = df.describe()
            num_analytics.to_csv(
                '../Results/lim_num_analytics_{}.csv'.format(table_name),
                encoding='utf-8')  #, index=False)
            print(num_analytics)

            print(100 * '#')

    connection.close()
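A minimal standalone illustration of the two describe() calls used above; the sample frame is my own, not data from the case_study database:

import pandas as pd

df = pd.DataFrame({'unique_carrier': ['EV', 'WN', 'DL', 'WN'],
                   'dep_delay': [5.0, -2.0, 11.0, 0.0]})
print(df.describe(include=['object']))  # categorical columns: count, unique, top, freq
print(df.describe())                    # numeric columns: count, mean, std, quartiles, min, max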
Example No. 31
def main():
    """
    Calculation and aggregation of summary statistics
    """

    # Summary of statistics
    # return is not ndarray
    df = DataFrame([[1.4, np.nan],
                    [7.1, -4.5],
                    [np.nan, np.nan],
                    [0.75, -1.3]],
                   index=list('abcd'),
                   columns=['one', 'two'])
    print df
    print df.sum()
    print df.sum(axis=1)
    print df.mean(axis=1) # exclude nan
    print df.mean(axis=1, skipna=False)
    print df.idxmin()
    print df.idxmax()
    print df.cumsum()
    print df.describe()
    # values are not number
    obj = Series(list('aabc') * 4)
    print obj.describe()


    methods = ['count', 'min', 'max', # 'argmin', 'argmax',
               'quantile', 'median', 'mad', 'var', 'std',
               'skew', 'kurt', 'cummin', 'cummax', 'cumprod',
               'diff', 'pct_change']

    for method in methods:
        print u'「{0}」'.format(method)
        print getattr(df, method)()
        print ''

    # Correspond and Covariance
    all_data = {}
    lst = [] # ['AAPL', 'IBM', 'MSFT'] #, 'GOOG']:
    for ticket in lst: #, 'GOOG']:
        # IOError: after 3 tries, Yahoo! did not return a 200
        # for url 'http://ichart.finance.yahoo.com/table.csv?s=GOOG&a=0&b=1&c=2000&d=0&e=1&f=2010&g=d&ignore=.csv'
        all_data[ticket] = pd.io.data.get_data_yahoo(ticket, '1/1/2000', '1/1/2010')
    price = DataFrame({tic: data['Adj Close'] for tic, data in all_data.iteritems()})
    volume = DataFrame({tic: data['Volume'] for tic, data in all_data.iteritems()})
    if all_data:
        returns = price.pct_change()
        print returns.tail()
        print ''
        print returns.MSFT.corr(returns.IBM)
        print returns.MSFT.cov(returns.IBM)
        print ''
        print returns.corr()
        print returns.cov()
        print ''
        print returns.corrwith(returns.IBM)
        print returns.corrwith(volume)

    # unique, frequency, belong
    print '',''
    obj = Series(list('cadaabbcc'))
    uniques = obj.unique()
    print uniques
    print obj.value_counts()
    print pd.value_counts(obj.values, sort=False)
    mask = obj.isin(['b', 'c'])
    print mask
    print obj[mask]

    data = DataFrame({
        'Qu1' : [1,3,4,3,4],
        'Qu2' : [2,3,1,2,3],
        'Qu3' : [1,5,2,4,4],
    })
    print data
    print data.apply(pd.value_counts).fillna(0)
Example No. 32
def create_fip(year = None):
    assert year is not None
    # fip: "fichier d'imposition des personnes" (personal income-tax filing records)
    """
    Creates a 'fipDat' table containing all these 'fip individuals'
    """
    # Some individuals are declared as 'personne à charge' (pac) on 'tax forms'
    # but are not present in the erf or eec tables.
    # We add them to ensure consistency between concepts.

    temporary_store = TemporaryStore.create(file_name = "erfs")

    replace = create_replace(year)

    erfs_survey_collection = SurveyCollection.load(
        collection = 'erfs', config_files_directory = config_files_directory)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))

    log.info(u"Démarrage de 03_fip")

    # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992')
    # when a child is invalid, he appears twice in anaisenf (example: F1900G1900 is a single invalid child born in 1990)
    erfFoyVar = ['declar', 'anaisenf']
    foyer = survey.get_values(table = replace["foyer"], variables = erfFoyVar)
    foyer.replace({'anaisenf': {'NA': np.nan}}, inplace = True)

    log.info(u"Etape 1 : on récupere les personnes à charge des foyers")
    log.info(u"    1.1 : Création des codes des enfants")
    foyer['anaisenf'] = foyer['anaisenf'].astype('string')
    nb_pac_max = len(max(foyer['anaisenf'], key=len)) / 5
    log.info(u"il ya a au maximum {} pac par foyer".format(nb_pac_max))

    # Separating the string coding the pac of each "déclaration".
    # Creating a list containing the new variables.

    # Creating the multi_index for the columns
    multi_index_columns = []
    assert int(nb_pac_max) == nb_pac_max, "nb_pac_max = {} which is not an integer".format(nb_pac_max)
    nb_pac_max = int(nb_pac_max)
    for i in range(1, nb_pac_max + 1):
        pac_tuples_list = [
            (i, 'declaration'),
            (i, 'type_pac'),
            (i, 'naia')
            ]
        multi_index_columns += pac_tuples_list

    columns = MultiIndex.from_tuples(
        multi_index_columns,
        names = ['pac_number', 'variable']
        )
    fip = DataFrame(np.random.randn(len(foyer), 3 * nb_pac_max), columns = columns)
    log.info("{}".format(fip.describe()))
    log.info("{}".format(fip.info()))

    for i in range(1, nb_pac_max + 1):  # TODO: using values to deal with mismatching indexes
        fip[(i, 'declaration')] = foyer['declar'].values
        fip[(i, 'type_pac')] = foyer['anaisenf'].str[5 * (i - 1)].values
        fip[(i, 'naia')] = foyer['anaisenf'].str[5 * (i - 1) + 1: 5 * i].values

    fip = fip.stack("pac_number")
    fip.reset_index(inplace = True)
    fip.drop(['level_0'], axis = 1, inplace = True)

    log.info(u"    1.2 : elimination des foyers fiscaux sans pac")
    # Clearing missing values and changing data format
    fip = fip[(fip.type_pac.notnull()) & (fip.naia != 'an') & (fip.naia != '')].copy()
    fip = fip.sort(columns = ['declaration', 'naia', 'type_pac'])
    # TODO: check if useful
    fip.set_index(["declaration", "pac_number"], inplace = True)
    fip = fip.reset_index()
    fip.drop(['pac_number'], axis = 1, inplace = True)
    # TODO: rajouter la case I : "Dont enfants titulaires de la carte d’invalidité"
    assert fip.type_pac.isin(["F", "G", "H", "I", "J", "N", "R"]).all(), "Certains type de PAC sont inconnus"
    # TODO: find a more explicit message

#    control(fip, debug=True, verbose=True, verbose_columns=['naia'])

    log.info(u"    1.3 : on enlève les individus F pour lesquels il existe un individu G")
    type_FG = fip[fip.type_pac.isin(['F', 'G'])].copy()  # Filtre pour ne travailler que sur F & G

    type_FG['same_pair'] = type_FG.duplicated(subset = ['declaration', 'naia'], take_last = True)
    type_FG['is_twin'] = type_FG.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_FG['to_keep'] = ~(type_FG['same_pair']) | type_FG['is_twin']
    # Note: keep the rows whose declar/naia pairs differ, plus the twins,
    #       then drop the others (both F and G)
    log.info(u"longueur fip {}".format(len(fip)))

    fip['to_keep'] = np.nan
    fip.update(type_FG)

    log.info(u"    1.4 : on enlève les H pour lesquels il y a un I")
    type_HI = fip[fip.type_pac.isin(['H', 'I'])].copy()
    type_HI['same_pair'] = type_HI.duplicated(subset = ['declaration', 'naia'], take_last = True)
    type_HI['is_twin'] = type_HI.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_HI['to_keep'] = (~(type_HI['same_pair']) | (type_HI['is_twin'])).values

    fip.update(type_HI)
    fip['to_keep'] = fip['to_keep'].fillna(True)
    log.info(u"nb lines to keep = {} / nb initial lines {}".format(len(fip[fip['to_keep']]), len(fip)))

    indivifip = fip[fip['to_keep']].copy()
    del indivifip['to_keep'], fip, type_FG, type_HI
    #
    # control(indivifip, debug=True)

    log.info(u"Step 2 : matching indivifip with eec file")
    indivi = temporary_store['indivim_{}'.format(year)]
    pac = indivi[(indivi.persfip.notnull()) & (indivi.persfip == 'pac')].copy()
    assert indivifip.naia.notnull().all(), "Il y a des valeurs manquantes de la variable naia"

    pac['naia'] = pac.naia.astype('int32')  # TODO: was float in pac fix upstream
    indivifip['naia'] = indivifip.naia.astype('int32')
    pac['key1'] = zip(pac.naia, pac['declar1'].str[:29])
    pac['key2'] = zip(pac.naia, pac['declar2'].str[:29])
    indivifip['key'] = zip(indivifip.naia.values, indivifip['declaration'].str[:29].values)
    assert pac.naia.dtype == indivifip.naia.dtype, \
        "Les dtypes de pac.naia {} et indvifip.naia {} sont différents".format(pac.naia.dtype, indivifip.naia.dtype)

    fip = indivifip[~(indivifip.key.isin(pac.key1.values))].copy()
    fip = fip[~(fip.key.isin(pac.key2.values))].copy()

    log.info(u"    2.1 new fip created")
#   We build a dataframe to link the pac to their type and noindiv
    tmp_pac1 = pac[['noindiv', 'key1']].copy()
    tmp_pac2 = pac[['noindiv', 'key2']].copy()
    tmp_indivifip = indivifip[['key', 'type_pac', 'naia']].copy()

    pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner')
    log.info(u"longueur pacInd1 {}".format(len(pac_ind1)))
    pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner')
    log.info(u"longueur pacInd2 {}".format(len(pac_ind2)))
    log.info(u"pacInd1 & pacInd2 créés")

    log.info("{}".format(pac_ind1.duplicated().sum()))
    log.info("{}".format(pac_ind2.duplicated().sum()))

    del pac_ind1['key1'], pac_ind2['key2']

    if len(pac_ind1.index) == 0:
        if len(pac_ind2.index) == 0:
                log.info(u"Warning : no link between pac and noindiv for both pacInd1&2")
        else:
            log.info(u"Warning : pacInd1 is an empty data frame")
            pacInd = pac_ind2
    elif len(pac_ind2.index) == 0:
        log.info(u"Warning : pacInd2 is an empty data frame")
        pacInd = pac_ind1
    else:
        pacInd = concat([pac_ind2, pac_ind1])
    log.info("{}{}{}".format(len(pac_ind1), len(pac_ind2), len(pacInd)))
    log.info("{}".format(pac_ind2.type_pac.isnull().sum()))
    log.info("{}".format(pacInd.type_pac.value_counts()))

    log.info(u"    2.2 : pacInd created")

    log.info(u"doublons noindiv, type_pac {}".format(pacInd.duplicated(['noindiv', 'type_pac']).sum()))
    log.info(u"doublons noindiv seulement {}".format(pacInd.duplicated('noindiv').sum()))
    log.info(u"nb de NaN {}".format(pacInd.type_pac.isnull().sum()))

    del pacInd["key"]
    pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))].copy()
    # pacIndiv.reset_index(inplace=True)
    log.info("{}".format(pacIndiv.columns))

    temporary_store['pacIndiv_{}'.format(year)] = pacIndiv

    log.info("{}".format(pacIndiv.type_pac.value_counts()))
    gc.collect()

# # We keep the fip in the menage of their parents because it is used in to
# # build the famille. We should build an individual ident (ménage) for the fip that are
# # older than 18 since they are not in their parents' menage according to the eec

# individec1 <- subset(indivi, (declar1 %in% fip$declar) & (persfip=="vous"))
# individec1 <- individec1[,c("declar1","noidec","ident","rga","ztsai","ztsao")]
# individec1 <- upData(individec1,rename=c(declar1="declar"))
# fip1       <- merge(fip,individec1)
# indivi$noidec <- as.numeric(substr(indivi$declar1,1,2))
    log.info("{}".format(indivi['declar1'].str[0:2].value_counts()))
    log.info("{}".format(indivi['declar1'].str[0:2].describe()))
    log.info("{}".format(indivi['declar1'].str[0:2].notnull().all()))
    log.info("{}".format(indivi.info()))
    selection = indivi['declar1'].str[0:2] != ""
    indivi['noidec'] = indivi.declar1[selection].str[0:2].astype('int32')  # To be used later to set idfoy

    individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) & (indivi.persfip == "vous")]
    individec1 = individec1[["declar1", "noidec", "ident", "rga", "ztsai", "ztsao"]].copy()
    individec1 = individec1.rename(columns = {'declar1': 'declaration'})
    fip1 = fip.merge(individec1, on = 'declaration')
    log.info(u"    2.3 : fip1 created")

# # TODO: declar2 is not handled for now
# # individec2 <- subset(indivi, (declar2 %in% fip$declar) & (persfip=="vous"))
# # individec2 <- individec2[,c("declar2","noidec","ident","rga","ztsai","ztsao")]
# # individec2 <- upData(individec2,rename=c(declar2="declar"))
# # fip2 <-merge(fip,individec2)

    individec2 = indivi[(indivi.declar2.isin(fip.declaration.values)) & (indivi['persfip'] == "vous")]
    individec2 = individec2[["declar2", "noidec", "ident", "rga", "ztsai", "ztsao"]].copy()
    individec2.rename(columns = {'declar2': 'declaration'}, inplace = True)
    fip2 = fip.merge(individec2)
    log.info(u"    2.4 : fip2 created")

    fip1.duplicated().value_counts()
    fip2.duplicated().value_counts()

    fip = concat([fip1, fip2])

    fip['persfip'] = 'pac'
    fip['year'] = year
    fip['year'] = fip['year'].astype('float')  # BUG: no year column in the DataFrame
    fip['noi'] = 99
    fip['noicon'] = None
    fip['noindiv'] = fip['declaration']
    fip['noiper'] = None
    fip['noimer'] = None
    fip['declar1'] = fip['declaration']  # TODO: declar ?
    fip['naim'] = 99
    fip['lien'] = None
    fip['quelfic'] = 'FIP'
    fip['acteu'] = None
    fip['agepf'] = fip['year'] - fip.naia.astype('float')
    fip['lpr'] = (fip['agepf'] <= 20) * 3 + (fip['agepf'] > 20) * 4
    fip['stc'] = None
    fip['contra'] = None
    fip['titc'] = None
    fip['mrec'] = None
    fip['forter'] = None
    fip['rstg'] = None
    fip['retrai'] = None
    fip['cohab'] = None
    fip['sexe'] = None
    fip['persfip'] = "pac"
    fip['agepr'] = None
    fip['actrec'] = (fip['agepf'] <= 15) * 9 + (fip['agepf'] > 15) * 5

## TODO: actrec problem for fip children aged 16 to 20: we do not know whether they are students or employed */
## TODO: problem with the birth months of FIP children: check whether these values can be recovered. Alexis: clearly not

# Reassigning noi for fip children if they are more than one per foyer fiscal
# while ( any(duplicated( fip[,c("noi","ident")]) ) ) {
#   dup <- duplicated( fip[, c("noi","ident")])
#   tmp <- fip[dup,"noi"]
#   fip[dup, "noi"] <- (tmp-1)
# }
    # TODO: is the dup vector correct?
    fip["noi"] = fip["noi"].astype("int64")
    fip["ident"] = fip["ident"].astype("int64")

    fip_tmp = fip[['noi', 'ident']]

    while any(fip.duplicated(cols=['noi', 'ident'])):
        fip_tmp = fip.loc[:, ['noi', 'ident']]
        dup = fip_tmp.duplicated()
        tmp = fip.loc[dup, 'noi']
        log.info("{}".format(len(tmp)))
        fip.loc[dup, 'noi'] = tmp.astype('int64') - 1

    fip['idfoy'] = 100 * fip['ident'] + fip['noidec']
    fip['noindiv'] = 100 * fip['ident'] + fip['noi']
    fip['type_pac'] = 0
    fip['key'] = 0

    log.info("{}".format(fip.duplicated('noindiv').value_counts()))
    temporary_store['fipDat_{}'.format(year)] = fip
    del fip, fip1, individec1, indivifip, indivi, pac
    log.info(u"fip sauvegardé")
Example No. 33
    def test_column_dups_operations(self):

        def check(result, expected=None):
            if expected is not None:
                assert_frame_equal(result, expected)
            result.dtypes
            str(result)

        # assignment
        # GH 3687
        arr = np.random.randn(3, 2)
        idx = lrange(2)
        df = DataFrame(arr, columns=['A', 'A'])
        df.columns = idx
        expected = DataFrame(arr, columns=idx)
        check(df, expected)

        idx = date_range('20130101', periods=4, freq='Q-NOV')
        df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
                       columns=['a', 'a', 'a', 'a'])
        df.columns = idx
        expected = DataFrame(
            [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx)
        check(df, expected)

        # insert
        df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
                       columns=['foo', 'bar', 'foo', 'hello'])
        df['string'] = 'bah'
        expected = DataFrame([[1, 1, 1, 5, 'bah'], [1, 1, 2, 5, 'bah'],
                              [2, 1, 3, 5, 'bah']],
                             columns=['foo', 'bar', 'foo', 'hello', 'string'])
        check(df, expected)
        with assertRaisesRegexp(ValueError, 'Length of value'):
            df.insert(0, 'AnotherColumn', range(len(df.index) - 1))

        # insert same dtype
        df['foo2'] = 3
        expected = DataFrame([[1, 1, 1, 5, 'bah', 3], [1, 1, 2, 5, 'bah', 3],
                              [2, 1, 3, 5, 'bah', 3]],
                             columns=['foo', 'bar', 'foo', 'hello',
                                      'string', 'foo2'])
        check(df, expected)

        # set (non-dup)
        df['foo2'] = 4
        expected = DataFrame([[1, 1, 1, 5, 'bah', 4], [1, 1, 2, 5, 'bah', 4],
                              [2, 1, 3, 5, 'bah', 4]],
                             columns=['foo', 'bar', 'foo', 'hello',
                                      'string', 'foo2'])
        check(df, expected)
        df['foo2'] = 3

        # delete (non dup)
        del df['bar']
        expected = DataFrame([[1, 1, 5, 'bah', 3], [1, 2, 5, 'bah', 3],
                              [2, 3, 5, 'bah', 3]],
                             columns=['foo', 'foo', 'hello', 'string', 'foo2'])
        check(df, expected)

        # try to delete again (it's not consolidated)
        del df['hello']
        expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3],
                              [2, 3, 'bah', 3]],
                             columns=['foo', 'foo', 'string', 'foo2'])
        check(df, expected)

        # consolidate
        df = df.consolidate()
        expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3],
                              [2, 3, 'bah', 3]],
                             columns=['foo', 'foo', 'string', 'foo2'])
        check(df, expected)

        # insert
        df.insert(2, 'new_col', 5.)
        expected = DataFrame([[1, 1, 5., 'bah', 3], [1, 2, 5., 'bah', 3],
                              [2, 3, 5., 'bah', 3]],
                             columns=['foo', 'foo', 'new_col', 'string',
                                      'foo2'])
        check(df, expected)

        # insert a dup
        assertRaisesRegexp(ValueError, 'cannot insert',
                           df.insert, 2, 'new_col', 4.)
        df.insert(2, 'new_col', 4., allow_duplicates=True)
        expected = DataFrame([[1, 1, 4., 5., 'bah', 3],
                              [1, 2, 4., 5., 'bah', 3],
                              [2, 3, 4., 5., 'bah', 3]],
                             columns=['foo', 'foo', 'new_col',
                                      'new_col', 'string', 'foo2'])
        check(df, expected)

        # delete (dup)
        del df['foo']
        expected = DataFrame([[4., 5., 'bah', 3], [4., 5., 'bah', 3],
                              [4., 5., 'bah', 3]],
                             columns=['new_col', 'new_col', 'string', 'foo2'])
        assert_frame_equal(df, expected)

        # dup across dtypes
        df = DataFrame([[1, 1, 1., 5], [1, 1, 2., 5], [2, 1, 3., 5]],
                       columns=['foo', 'bar', 'foo', 'hello'])
        check(df)

        df['foo2'] = 7.
        expected = DataFrame([[1, 1, 1., 5, 7.], [1, 1, 2., 5, 7.],
                              [2, 1, 3., 5, 7.]],
                             columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
        check(df, expected)

        result = df['foo']
        expected = DataFrame([[1, 1.], [1, 2.], [2, 3.]],
                             columns=['foo', 'foo'])
        check(result, expected)

        # multiple replacements
        df['foo'] = 'string'
        expected = DataFrame([['string', 1, 'string', 5, 7.],
                              ['string', 1, 'string', 5, 7.],
                              ['string', 1, 'string', 5, 7.]],
                             columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
        check(df, expected)

        del df['foo']
        expected = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]], columns=[
                             'bar', 'hello', 'foo2'])
        check(df, expected)

        # values
        df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=['x', 'x'])
        result = df.values
        expected = np.array([[1, 2.5], [3, 4.5]])
        self.assertTrue((result == expected).all().all())

        # rename, GH 4403
        df4 = DataFrame(
            {'TClose': [22.02],
             'RT': [0.0454],
             'TExg': [0.0422]},
            index=MultiIndex.from_tuples([(600809, 20130331)],
                                         names=['STK_ID', 'RPT_Date']))

        df5 = DataFrame({'STK_ID': [600809] * 3,
                         'RPT_Date': [20120930, 20121231, 20130331],
                         'STK_Name': [u('饡驦'), u('饡驦'), u('饡驦')],
                         'TClose': [38.05, 41.66, 30.01]},
                        index=MultiIndex.from_tuples(
                            [(600809, 20120930),
                             (600809, 20121231),
                             (600809, 20130331)],
                            names=['STK_ID', 'RPT_Date']))

        k = pd.merge(df4, df5, how='inner', left_index=True, right_index=True)
        result = k.rename(
            columns={'TClose_x': 'TClose', 'TClose_y': 'QT_Close'})
        str(result)
        result.dtypes

        expected = (DataFrame([[0.0454, 22.02, 0.0422, 20130331, 600809,
                                u('饡驦'), 30.01]],
                              columns=['RT', 'TClose', 'TExg',
                                       'RPT_Date', 'STK_ID', 'STK_Name',
                                       'QT_Close'])
                    .set_index(['STK_ID', 'RPT_Date'], drop=False))
        assert_frame_equal(result, expected)

        # reindex is invalid!
        df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                       columns=['bar', 'a', 'a'])
        self.assertRaises(ValueError, df.reindex, columns=['bar'])
        self.assertRaises(ValueError, df.reindex, columns=['bar', 'foo'])

        # drop
        df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                       columns=['bar', 'a', 'a'])
        result = df.drop(['a'], axis=1)
        expected = DataFrame([[1], [1], [1]], columns=['bar'])
        check(result, expected)
        result = df.drop('a', axis=1)
        check(result, expected)

        # describe
        df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
                       columns=['bar', 'a', 'a'], dtype='float64')
        result = df.describe()
        s = df.iloc[:, 0].describe()
        expected = pd.concat([s, s, s], keys=df.columns, axis=1)
        check(result, expected)

        # check column dups with index equal and not equal to df's index
        df = DataFrame(np.random.randn(5, 3), index=['a', 'b', 'c', 'd', 'e'],
                       columns=['A', 'B', 'A'])
        for index in [df.index, pd.Index(list('edcba'))]:
            this_df = df.copy()
            expected_ser = pd.Series(index.values, index=this_df.index)
            expected_df = DataFrame.from_items([('A', expected_ser),
                                                ('B', this_df['B']),
                                                ('A', expected_ser)])
            this_df['A'] = index
            check(this_df, expected_df)

        # operations
        for op in ['__add__', '__mul__', '__sub__', '__truediv__']:
            df = DataFrame(dict(A=np.arange(10), B=np.random.rand(10)))
            expected = getattr(df, op)(df)
            expected.columns = ['A', 'A']
            df.columns = ['A', 'A']
            result = getattr(df, op)(df)
            check(result, expected)

        # multiple assignments that change dtypes
        # the location indexer is a slice
        # GH 6120
        df = DataFrame(np.random.randn(5, 2), columns=['that', 'that'])
        expected = DataFrame(1.0, index=range(5), columns=['that', 'that'])

        df['that'] = 1.0
        check(df, expected)

        df = DataFrame(np.random.rand(5, 2), columns=['that', 'that'])
        expected = DataFrame(1, index=range(5), columns=['that', 'that'])

        df['that'] = 1
        check(df, expected)
Example No. 34
"""
Student: Max Sorto
Class: IT5090G - Aasheim
Date: 03/31/2016
Assignment: Lab10
"""

from pandas import DataFrame

data = {'name':['Joe','John','Mary','Lee'],
	'quiz 1':[100,87,99,78],
	'quiz 2':[45,78,90,88],
	'assign 1':[98,82,93,78],
	'assign 2':[100,87,99,78]
}

frame = DataFrame(data)
print frame
print '\n'
print frame.describe()
Example No. 35
        lambda x: 1 if x == 'x' or x == 'X' else 0)
    df_labels['Asco\nTristeza\nRabia'] = df_labels[
        'Asco\nTristeza\nRabia'].apply(lambda x: 1
                                       if x == 'x' or x == 'X' else 0)

    df_labels = df_labels[[
        'id', 'tweet_id', 'user_name', 'QUIEN', 'Asesinato', 'Violacion',
        'Agresion \nsexual', 'Maltrato', 'Acoso', 'Miedo',
        'Asco\nTristeza\nRabia', 'full_text'
    ]]
    df_labels.set_index('id', inplace=True)
    df_labels = df_labels[isfinite(df_labels['QUIEN'])]

    concat_df_labels = concat([concat_df_labels, df_labels], axis=0)

concat_df_labels.describe()

concat_df_labels.sample(5)

concat_df_labels.columns


def get_category(x):
    if x['Asesinato']:
        return 0
    elif x['Violacion']:
        return 1
    elif x['Agresion \nsexual']:
        return 2
    elif x['Maltrato']:
        return 3
Example No. 36
#2
data = np.random.randn(1000) # Normally distributed
cats = pd.qcut(data, 4) # Cut into quartiles
cats

pd.value_counts(cats)

pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])


### Detecting and filtering outliers
#1
np.random.seed(12345)
data = DataFrame(np.random.randn(1000, 4))
data.describe()

col = data[3]
col[np.abs(col) > 3]

data[(np.abs(data) > 3).any(1)]

#2
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()


### Permutation and random sampling
#1
df = DataFrame(np.arange(5 * 4).reshape((5, 4)))
sampler = np.random.permutation(5)
Example No. 37
for r in range(repeats):
    print("For iteration: " + str(r))

    # fit the model
    lstm_model = fit_lstm(train_scaled, 1, 3000, 4)
    # forecast the entire training dataset to build up state for forecasting
    train_reshaped = train_scaled[:, 0].reshape(len(train_scaled), 1, 1)
    lstm_model.predict(train_reshaped, batch_size=1)
    # walk-forward validation on the test data
    predictions = list()
    for i in range(len(test_scaled)):
        # make one-step forecast
        X, y = test_scaled[i, 0:-1], test_scaled[i, -1]
        yhat = forecast_lstm(lstm_model, 1, X)
        # invert scaling
        yhat = invert_scale(scaler, X, yhat)
        # invert differencing
        yhat = inverse_difference(raw_values, yhat, len(test_scaled) + 1 - i)
        # store forecast
        predictions.append(yhat)
    # report performance
    rmse = sqrt(mean_squared_error(raw_values[-12:], predictions))
    print('%d) Test RMSE: %.3f' % (r + 1, rmse))
    error_scores.append(rmse)

# summarize results
results = DataFrame()
results['rmse'] = error_scores
print(results.describe())
results.boxplot()
pyplot.show()
Example No. 38
#2
data = np.random.randn(1000)  # Normally distributed
# qcut is similar to cut, but bins the data by sample quantiles
cats = pd.qcut(data, 4)  # Cut into quartiles
cats

pd.value_counts(cats)

pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])

### Detecting and filtering outliers
# 1. generate a random array following the normal distribution
np.random.seed(12345)
data = DataFrame(np.random.randn(1000, 4))
# look at the distribution of the data
data.describe()

col = data[3]
# find the values whose absolute value is greater than 3
col[np.abs(col) > 3]

data[(np.abs(data) > 3).any(1)]

# 2. np.sign returns the sign of each element
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()

### Permutation and random sampling
# 1. create a random DataFrame
df = DataFrame(np.arange(5 * 4).reshape((5, 4)))
# sort the data
Example No. 39
print dict_of_lists,'\n'
        
#convert the list of values for each key in dictionary to Series
countries = Series(dict_of_lists['countries'])
gold = Series(dict_of_lists['gold'])
silver = Series(dict_of_lists['silver'])
bronze= Series(dict_of_lists['bronze'])
       
#construct a dictionary of Series' that can be turned in to a DataFrame
medal_tally_dict = {'countries' : countries, 'gold': gold, 'silver': silver, 'bronze': bronze}
df = DataFrame(medal_tally_dict)
print df,'\n'

#DataFrame and Series properties
print df[['countries','gold']],'\n'
print df.describe(),'\n'
print 'The gold series in the dataframe is of dtype : ',df['gold'].dtype 
print 'Number countries : ',len(df['countries'])  
print 'The mean of golds won where bronze medals >= 2 : ',df['gold'][df['bronze']>=2].mean()
print 'Mean of gold and silver medal counts : ',(df['gold']+df['silver']).mean()
print 'Mean of golds : ',df['gold'].mean()
print 'Mean of bronzes : ',df['bronze'].mean()
print 'Max number of golds won by a country : ',df['gold'].max()
print 'Sum of golds won by all countries : ',df['gold'].sum()

# WIP
def standardize_data(values):
    standardized_values = (values - values.mean() ) / values.std()
    print standardized_values,'\n'

with open('C:\omnica\data\managers1.csv','w') as csvfile:
Example No. 40
    def test_column_dups_operations(self):
        def check(result, expected=None):
            if expected is not None:
                assert_frame_equal(result, expected)
            result.dtypes
            str(result)

        # assignment
        # GH 3687
        arr = np.random.randn(3, 2)
        idx = list(range(2))
        df = DataFrame(arr, columns=["A", "A"])
        df.columns = idx
        expected = DataFrame(arr, columns=idx)
        check(df, expected)

        idx = date_range("20130101", periods=4, freq="Q-NOV")
        df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
                       columns=["a", "a", "a", "a"])
        df.columns = idx
        expected = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
                             columns=idx)
        check(df, expected)

        # insert
        df = DataFrame(
            [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
            columns=["foo", "bar", "foo", "hello"],
        )
        df["string"] = "bah"
        expected = DataFrame(
            [[1, 1, 1, 5, "bah"], [1, 1, 2, 5, "bah"], [2, 1, 3, 5, "bah"]],
            columns=["foo", "bar", "foo", "hello", "string"],
        )
        check(df, expected)
        with pytest.raises(ValueError, match="Length of value"):
            df.insert(0, "AnotherColumn", range(len(df.index) - 1))

        # insert same dtype
        df["foo2"] = 3
        expected = DataFrame(
            [[1, 1, 1, 5, "bah", 3], [1, 1, 2, 5, "bah", 3],
             [2, 1, 3, 5, "bah", 3]],
            columns=["foo", "bar", "foo", "hello", "string", "foo2"],
        )
        check(df, expected)

        # set (non-dup)
        df["foo2"] = 4
        expected = DataFrame(
            [[1, 1, 1, 5, "bah", 4], [1, 1, 2, 5, "bah", 4],
             [2, 1, 3, 5, "bah", 4]],
            columns=["foo", "bar", "foo", "hello", "string", "foo2"],
        )
        check(df, expected)
        df["foo2"] = 3

        # delete (non dup)
        del df["bar"]
        expected = DataFrame(
            [[1, 1, 5, "bah", 3], [1, 2, 5, "bah", 3], [2, 3, 5, "bah", 3]],
            columns=["foo", "foo", "hello", "string", "foo2"],
        )
        check(df, expected)

        # try to delete again (it's not consolidated)
        del df["hello"]
        expected = DataFrame(
            [[1, 1, "bah", 3], [1, 2, "bah", 3], [2, 3, "bah", 3]],
            columns=["foo", "foo", "string", "foo2"],
        )
        check(df, expected)

        # consolidate
        df = df._consolidate()
        expected = DataFrame(
            [[1, 1, "bah", 3], [1, 2, "bah", 3], [2, 3, "bah", 3]],
            columns=["foo", "foo", "string", "foo2"],
        )
        check(df, expected)

        # insert
        df.insert(2, "new_col", 5.0)
        expected = DataFrame(
            [[1, 1, 5.0, "bah", 3], [1, 2, 5.0, "bah", 3],
             [2, 3, 5.0, "bah", 3]],
            columns=["foo", "foo", "new_col", "string", "foo2"],
        )
        check(df, expected)

        # insert a dup
        with pytest.raises(ValueError, match="cannot insert"):
            df.insert(2, "new_col", 4.0)

        df.insert(2, "new_col", 4.0, allow_duplicates=True)
        expected = DataFrame(
            [
                [1, 1, 4.0, 5.0, "bah", 3],
                [1, 2, 4.0, 5.0, "bah", 3],
                [2, 3, 4.0, 5.0, "bah", 3],
            ],
            columns=["foo", "foo", "new_col", "new_col", "string", "foo2"],
        )
        check(df, expected)

        # delete (dup)
        del df["foo"]
        expected = DataFrame(
            [[4.0, 5.0, "bah", 3], [4.0, 5.0, "bah", 3], [4.0, 5.0, "bah", 3]],
            columns=["new_col", "new_col", "string", "foo2"],
        )
        assert_frame_equal(df, expected)

        # dup across dtypes
        df = DataFrame(
            [[1, 1, 1.0, 5], [1, 1, 2.0, 5], [2, 1, 3.0, 5]],
            columns=["foo", "bar", "foo", "hello"],
        )
        check(df)

        df["foo2"] = 7.0
        expected = DataFrame(
            [[1, 1, 1.0, 5, 7.0], [1, 1, 2.0, 5, 7.0], [2, 1, 3.0, 5, 7.0]],
            columns=["foo", "bar", "foo", "hello", "foo2"],
        )
        check(df, expected)

        result = df["foo"]
        expected = DataFrame([[1, 1.0], [1, 2.0], [2, 3.0]],
                             columns=["foo", "foo"])
        check(result, expected)

        # multiple replacements
        df["foo"] = "string"
        expected = DataFrame(
            [
                ["string", 1, "string", 5, 7.0],
                ["string", 1, "string", 5, 7.0],
                ["string", 1, "string", 5, 7.0],
            ],
            columns=["foo", "bar", "foo", "hello", "foo2"],
        )
        check(df, expected)

        del df["foo"]
        expected = DataFrame([[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]],
                             columns=["bar", "hello", "foo2"])
        check(df, expected)

        # values
        df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=["x", "x"])
        result = df.values
        expected = np.array([[1, 2.5], [3, 4.5]])
        assert (result == expected).all().all()

        # rename, GH 4403
        df4 = DataFrame(
            {
                "RT": [0.0454],
                "TClose": [22.02],
                "TExg": [0.0422]
            },
            index=MultiIndex.from_tuples([(600809, 20130331)],
                                         names=["STK_ID", "RPT_Date"]),
        )

        df5 = DataFrame(
            {
                "RPT_Date": [20120930, 20121231, 20130331],
                "STK_ID": [600809] * 3,
                "STK_Name": ["饡驦", "饡驦", "饡驦"],
                "TClose": [38.05, 41.66, 30.01],
            },
            index=MultiIndex.from_tuples(
                [(600809, 20120930), (600809, 20121231), (600809, 20130331)],
                names=["STK_ID", "RPT_Date"],
            ),
        )

        k = pd.merge(df4, df5, how="inner", left_index=True, right_index=True)
        result = k.rename(columns={
            "TClose_x": "TClose",
            "TClose_y": "QT_Close"
        })
        str(result)
        result.dtypes

        expected = DataFrame(
            [[0.0454, 22.02, 0.0422, 20130331, 600809, "饡驦", 30.01]],
            columns=[
                "RT",
                "TClose",
                "TExg",
                "RPT_Date",
                "STK_ID",
                "STK_Name",
                "QT_Close",
            ],
        ).set_index(["STK_ID", "RPT_Date"], drop=False)
        assert_frame_equal(result, expected)

        # reindex is invalid!
        df = DataFrame([[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]],
                       columns=["bar", "a", "a"])
        msg = "cannot reindex from a duplicate axis"
        with pytest.raises(ValueError, match=msg):
            df.reindex(columns=["bar"])
        with pytest.raises(ValueError, match=msg):
            df.reindex(columns=["bar", "foo"])

        # drop
        df = DataFrame([[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]],
                       columns=["bar", "a", "a"])
        result = df.drop(["a"], axis=1)
        expected = DataFrame([[1], [1], [1]], columns=["bar"])
        check(result, expected)
        result = df.drop("a", axis=1)
        check(result, expected)

        # describe
        df = DataFrame(
            [[1, 1, 1], [2, 2, 2], [3, 3, 3]],
            columns=["bar", "a", "a"],
            dtype="float64",
        )
        result = df.describe()
        s = df.iloc[:, 0].describe()
        expected = pd.concat([s, s, s], keys=df.columns, axis=1)
        check(result, expected)

        # check column dups with index equal and not equal to df's index
        df = DataFrame(
            np.random.randn(5, 3),
            index=["a", "b", "c", "d", "e"],
            columns=["A", "B", "A"],
        )
        for index in [df.index, pd.Index(list("edcba"))]:
            this_df = df.copy()
            expected_ser = pd.Series(index.values, index=this_df.index)
            expected_df = DataFrame(
                {
                    "A": expected_ser,
                    "B": this_df["B"],
                    "A": expected_ser
                },
                columns=["A", "B", "A"],
            )
            this_df["A"] = index
            check(this_df, expected_df)

        # operations
        for op in ["__add__", "__mul__", "__sub__", "__truediv__"]:
            df = DataFrame(dict(A=np.arange(10), B=np.random.rand(10)))
            expected = getattr(df, op)(df)
            expected.columns = ["A", "A"]
            df.columns = ["A", "A"]
            result = getattr(df, op)(df)
            check(result, expected)

        # multiple assignments that change dtypes
        # the location indexer is a slice
        # GH 6120
        df = DataFrame(np.random.randn(5, 2), columns=["that", "that"])
        expected = DataFrame(1.0, index=range(5), columns=["that", "that"])

        df["that"] = 1.0
        check(df, expected)

        df = DataFrame(np.random.rand(5, 2), columns=["that", "that"])
        expected = DataFrame(1, index=range(5), columns=["that", "that"])

        df["that"] = 1
        check(df, expected)
Ejemplo n.º 41
0
    return df


df = df.apply(plus, axis=1, args=(
    2,
    3,
))
print(df)

# Data statistics
# describe() is a one-stop statistical summary that quickly gives a comprehensive view of the data
df1 = DataFrame({
    'name': ['ZhangFei', 'GuanYu', 'a', 'b', 'c', 'd'],
    'data1': range(6)
})
print(df1, df1.describe())
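
# A follow-up sketch (not in the original snippet): by default describe() only
# summarizes the numeric column 'data1'; include='all' also reports
# count/unique/top/freq for the string column 'name'.
print(df1.describe(include='all'))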

# Merging data tables
df1 = DataFrame({
    'name': ['ZhangFei', 'GuanYu', 'a', 'b', 'c'],
    'data1': range(5)
})
df2 = DataFrame({
    'name': ['ZhangFei', 'GuanYu', 'A', 'B', 'C'],
    'data2': range(5)
})
# 1. Join on a specified column
df3 = pd.merge(df1, df2, on='name')
print(df3)
# 2. Inner join
# An inner join is merge's default behaviour; it keeps only the intersection of the keys. Here the key shared by df1 and df2 is 'name'
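# The original example stops here; a minimal sketch (not part of the original) of the
# inner join it describes (inner is the default, and 'name' is the shared key):
df_inner = pd.merge(df1, df2, how='inner')
print(df_inner)  # keeps only rows whose 'name' appears in both frames ('ZhangFei', 'GuanYu')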
Ejemplo n.º 42
0
df.idxmax()  # index of the max value in each column
'''
one    b
two    b
'''
print df.cumsum()  # cumulative sum of each column
'''
   one  two
a  1.0  NaN
b  8.0  4.0
c  NaN  NaN
d  8.0  5.0
'''
print df.describe()  # summary statistics for each column of the DataFrame
'''
            one      two
count  3.000000  2.00000
mean   2.666667  2.50000
std    3.785939  2.12132
min    0.000000  1.00000
25%         NaN      NaN
50%         NaN      NaN
75%         NaN      NaN
max    7.000000  4.00000
'''
obj = Series([2, 4, 8, 4], index=['a', 'a', 'b', 'c'])
print obj.describe()  # summary statistics for a Series
'''
count    4.000000
mean     4.500000
std      2.516611
min      2.000000
25%      3.500000
50%      4.000000
75%      5.000000
max      8.000000
dtype: float64
'''
print()

#limits - Define the number of bins (Categories) you want to create using the precision parameter and the 2nd argument.
print (pd.cut(prime_nos,3,precision=1))
print()

#--------- NEXT LECTURE ---------#

#Observations on a dataframe - what you do when given a dataframe - initial view
df = DataFrame(np.random.randn(1000,5))

#basic observation
print (df.head()) #Returns the top 5 rows including dataframe header
print (df.tail()) #Returns the last 5 rows

print (df.describe()) #Gives basic statistical info about the data of the dataframe

column = df[0]
print (column.head())
print()
print (column[np.abs(column)>3])
print()

print (df[(np.abs(df)>3).any(1)]) #Rows where any column has an absolute value greater than 3

df[(np.abs(df)>3)] = np.sign(df)*5 #Replace values beyond +/-3 with +/-5, keeping their sign

#print (df.describe())


pyplot.show()


# fit model
model = ARIMA(series, order=(5,1,0))
model_fit = model.fit(disp=0)
print(model_fit.summary())

# plot residual errors
residuals = DataFrame(model_fit.resid)
residuals.plot()
pyplot.show()

residuals.plot(kind='kde')
pyplot.show()
print(residuals.describe())
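
# A quick follow-up sketch (not part of the original script): a residual mean close to
# zero in the summary above suggests the ARIMA(5,1,0) fit is not systematically biased.
print(residuals.mean())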

# http://www.statsmodels.org/devel/generated/statsmodels.tsa.arima_model.ARIMA.predict.html

X = series.values
size = int(len(X) * 0.66)
train, test = X[0:size], X[size:len(X)]
history = [x for x in train]
predictions = list()
for t in range(len(test)):
	model = ARIMA(history, order=(5,1,0))
	model_fit = model.fit(disp=0)
	output = model_fit.forecast()
	yhat = output[0]
	predictions.append(yhat)
	obs = test[t]
	history.append(obs)  # roll the window forward so the next fit includes the latest observation
Ejemplo n.º 45
0
import numpy as np
randn = np.random.randn
import pandas as pd
from pandas import Series, DataFrame

np.random.seed(12345)

data = DataFrame(np.random.randn(1000, 4))
print data.describe()

col = data[3]
print col[np.abs(col) > 3]
print data[(np.abs(data) > 3).any(1)]  #To select all rows having a value exceeding 3 or -3, you can use the any method on a boolean DataFrame:
data[np.abs(data) > 3] = np.sign(data) * 3
print data.describe()
p=DataFrame(predicted_probs)


# In[186]:

p.shape


# In[187]:

p.head(2)


# In[188]:

p.describe()


# In[189]:

get_ipython().magic(u'pinfo lr.predict_proba')


# In[190]:

p1=p[0]
p2=p[1]


# In[192]:
Ejemplo n.º 47
0
df
df = DataFrame([[1.4, np.nan], [7.1, -4.5],
                [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
df
df.sum() # column sums (down each column)
df.sum(axis=1) # row sums (across each row)
df
(7.10 - 4.5)/2 # manual check of the mean of row 'b'
df.mean(axis=1, skipna=False)
df
df.idxmax()
df
df.cumsum() # accumulation (cumulative sum down each column)
df.describe() # multiple summary statistics in one shot.
obj = Series(['a', 'a', 'b', 'c'] * 4)
obj
obj.describe()
## Correlation and Covariance
import pandas.io.data as web
all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010')
    
price = DataFrame({tic: data['Adj Close']
                   for tic, data in all_data.iteritems()})
price
volume = DataFrame({tic: data['Volume'] 
Ejemplo n.º 48
-1
def descriptiveStatsDataFrame():
    df = DataFrame([[1.4, np.nan], [7, 5], [np.nan, np.nan], [7,10]], index=['a','b','c','d'], columns=['one','two'])
    print (df)
    print ('Column Sum: \n{}'.format(df.sum(axis=0)))
    print ('Row Sum: \n{}'.format(df.sum(axis=1)))
    print ('Do not skip NA: \n{}'.format(df.sum(axis=1, skipna=False)))
    print ('Index with min Value: \n{}'.format(df.idxmin()))
    print ('Summary Statistic: \n{}'.format(df.describe()))
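
# Hedged usage sketch: the helper takes no arguments and prints each summary in turn.
descriptiveStatsDataFrame()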