def deal_outlier():
    np.random.seed(12345)
    data = DataFrame(np.random.randn(1000, 4))
    print data.describe()
    print data[3][np.abs(data[3]) > 3]
    print data[(np.abs(data) > 3).any(1)]
    data[np.abs(data) > 3] = np.sign(data) * 3
    print data.describe()
def slide_15():
    np.random.seed(12345)
    data = DataFrame(np.random.randn(1000, 4))
    print data.describe()
    col = data[3]
    print col[np.abs(col) > 3]
    print data[(np.abs(data) > 3).any(1)]
    data[np.abs(data) > 3] = np.sign(data) * 3
    print data.describe()
def test_describe_no_numeric(self):
    df = DataFrame({'A': ['foo', 'foo', 'bar'] * 8,
                    'B': ['a', 'b', 'c', 'd'] * 6})
    desc = df.describe()
    expected = DataFrame(dict((k, v.describe())
                              for k, v in compat.iteritems(df)),
                         columns=df.columns)
    assert_frame_equal(desc, expected)

    ts = tm.makeTimeSeries()
    df = DataFrame({'time': ts.index})
    desc = df.describe()
    self.assertEqual(desc.time['first'], min(ts.index))
def pd_04():
    obj = Series(range(4), index=['d', 'a', 'b', 'c'])
    print obj
    print obj.sort_index()
    frame = DataFrame(np.arange(8).reshape(2, 4),
                      index=['three', 'one'],
                      columns=['d', 'a', 'b', 'c'])
    print frame.sort_index()
    print frame.sort_index(axis=1)
    print frame.sort_index(ascending=False)
    obj1 = Series([4, 7, -3, 2])
    print obj1.order()
    print frame.sort_index(by='b')
    print frame.sort_index(by=['a', 'b'])
    print frame.describe()
class Describe(object):
    def setup(self):
        self.df = DataFrame({
            'a': np.random.randint(0, 100, int(1e6)),
            'b': np.random.randint(0, 100, int(1e6)),
            'c': np.random.randint(0, 100, int(1e6))
        })

    def time_series_describe(self):
        self.df['a'].describe()

    def time_dataframe_describe(self):
        self.df.describe()
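# The class above reads like an asv ("airspeed velocity") benchmark, where
# methods prefixed time_ are timed automatically after setup() runs. As a
# standalone sketch, the same two measurements can be approximated with the
# standard-library timeit (frame contents copied from the class above;
# number=10 is an arbitrary repeat count, not asv's adaptive choice):
import timeit

import numpy as np
from pandas import DataFrame

df = DataFrame({c: np.random.randint(0, 100, int(1e6)) for c in 'abc'})

for label, stmt in [('series', lambda: df['a'].describe()),
                    ('frame', lambda: df.describe())]:
    secs = timeit.timeit(stmt, number=10) / 10
    print('%s describe: %.4f s per call' % (label, secs))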
def calcCVA(self, expectedExposure):
    # note: the original signature was "expectedExposure=array", which bound
    # the numpy array *function* as the default and was almost certainly an
    # accidental stand-in for a type hint; an unreachable "return True" after
    # "return cva" has also been dropped.
    cvaData = DataFrame()
    cvaData['t'] = self.timesteps[1:]
    cvaData['discountFactor'] = self.discountFactors
    cvaData['pd'] = self.pD[1:]
    cvaData['1-R'] = [1 - self.recoveryRate] * len(self.pD[1:])
    cvaData['exposure'] = [getTPlusFromList(expectedExposure, i, True)
                           for i in range(len(expectedExposure))]
    cvaData['cvaPerTimeStep'] = (cvaData['discountFactor'] * cvaData['pd'] *
                                 cvaData['1-R'] * cvaData['exposure'])
    cva = cvaData['cvaPerTimeStep'].sum()
    cvaData.describe()
    print cvaData
    print 'CVA = ', cva
    return cva
def test_iloc_getitem_doc_issue(self):
    # multi axis slicing issue with single block
    # surfaced in GH 6059
    arr = np.random.randn(6, 4)
    index = date_range('20130101', periods=6)
    columns = list('ABCD')
    df = DataFrame(arr, index=index, columns=columns)

    # defines ref_locs
    df.describe()

    result = df.iloc[3:5, 0:2]
    str(result)
    result.dtypes
    expected = DataFrame(arr[3:5, 0:2], index=index[3:5],
                         columns=columns[0:2])
    tm.assert_frame_equal(result, expected)

    # for dups
    df.columns = list('aaaa')
    result = df.iloc[3:5, 0:2]
    str(result)
    result.dtypes
    expected = DataFrame(arr[3:5, 0:2], index=index[3:5],
                         columns=list('aa'))
    tm.assert_frame_equal(result, expected)

    # related
    arr = np.random.randn(6, 4)
    index = list(range(0, 12, 2))
    columns = list(range(0, 8, 2))
    df = DataFrame(arr, index=index, columns=columns)

    df._data.blocks[0].mgr_locs
    result = df.iloc[1:5, 2:4]
    str(result)
    result.dtypes
    expected = DataFrame(arr[1:5, 2:4], index=index[1:5],
                         columns=columns[2:4])
    tm.assert_frame_equal(result, expected)
def combine_models():
    model_nums = [1, 2, 3]
    # model_nums = [2, 3]
    model_paths = ['model%03d.y_test.csv' % i for i in model_nums]
    assert all(os.path.exists(path) for path in model_paths)

    # y1 = pd.read_csv('model001.y_test.csv').set_index('job_id')
    # y2 = pd.read_csv('model002.y_test.csv').set_index('job_id')
    # y3 = pd.read_csv('model003.y_test.csv').set_index('job_id')
    models = [pd.read_csv(path).set_index('job_id') for path in model_paths]

    path = 'all/jobs_all.csv'
    df = get_data(path)
    df_train, df_test = split_train_test(df)

    y_data = np.ones((len(df_test), len(models)), dtype=int) * -1
    y = DataFrame(y_data, columns=model_nums, index=df_test.index)

    for d in [y] + models:
        print(d.describe())
    for d in [y] + models:
        print(d.shape, len(y) - len(d), type(d))

    y_indexes = set(y.index)
    print('y_indexes: %s' % sorted(y_indexes)[:10])
    for c, d in zip(model_nums, models):
        d_indexes = set(d.index)
        print('c=%s, d_indexes: %s' % (c, sorted(d_indexes)[:10]))
        assert d_indexes.issubset(y_indexes), (len(d_indexes - y_indexes))
        y[c].loc[d.index] = d['hat']

    def func(row):
        return all(x == -1 for x in row)

    # empties = y.apply(func, axis=1)
    # print('empties: %d' % len(empties))
    # print(y[empties])

    def vote(row):
        print(row)
        for j in 1, 3, 2:
            if row[j] != -1:
                return row[j]
        # assert False, row
        return -3

    print(y.iloc[:20, :])
    y_series = y.iloc[:20, :].apply(vote, axis=1)
    assert False  # debug stop: halts before any output is written

    y_test = DataFrame(y_series, columns=['hat'], index=y.index)
    y_test.to_csv('%s.y_test.csv' % 'model004v', index_label='job_id')
    print(y_test.columns)
    print(y_test.describe())
    print(y_test.iloc[:10, :])
def test_describe_objects(self):
    df = DataFrame({"C1": ['a', 'a', 'c'], "C2": ['d', 'd', 'f']})
    result = df.describe()
    expected = DataFrame({"C1": [3, 2, 'a', 2], "C2": [3, 2, 'd', 2]},
                         index=['count', 'unique', 'top', 'freq'])
    assert_frame_equal(result, expected)

    df = DataFrame({"C1": pd.date_range('2010-01-01', periods=4, freq='D')})
    df.loc[4] = pd.Timestamp('2010-01-04')
    result = df.describe()
    expected = DataFrame({"C1": [5, 4, pd.Timestamp('2010-01-01'),
                                 pd.Timestamp('2010-01-04'),
                                 pd.Timestamp('2010-01-04'), 2]},
                         index=['count', 'unique', 'first', 'last',
                                'top', 'freq'])
    assert_frame_equal(result, expected)

    # mix time and str
    df['C2'] = ['a', 'a', 'b', 'c', 'a']
    result = df.describe()
    # when a mix of datetime / obj, the index gets reordered.
    expected['C2'] = [5, 3, np.nan, np.nan, 'a', 3]
    assert_frame_equal(result, expected)

    # just str
    expected = DataFrame({'C2': [5, 3, 'a', 4]},
                         index=['count', 'unique', 'top', 'freq'])
    result = df[['C2']].describe()

    # mix of time, str, numeric
    df['C3'] = [2, 4, 6, 8, 2]
    result = df.describe()
    expected = DataFrame({"C3": [5., 4.4, 2.607681, 2., 2., 4., 6., 8.]},
                         index=['count', 'mean', 'std', 'min', '25%',
                                '50%', '75%', 'max'])
    assert_frame_equal(result, expected)

    assert_frame_equal(df.describe(), df[['C3']].describe())
    assert_frame_equal(df[['C1', 'C3']].describe(), df[['C3']].describe())
    assert_frame_equal(df[['C2', 'C3']].describe(), df[['C3']].describe())
def messages_data(soup, message_csv):
    messages = scrape_element(soup, 'messages', '.Message')
    msg_lengths = []
    pd.set_option('display.max_colwidth', -1)
    for k, v in messages.items():
        msg_lengths.append(len(v))
        text = Series(str(np.array(v.encode('utf-8'))))
        print text
        text.to_csv(message_csv, sep=',', header=False, index=False, mode='a')
    df_msg_lgth = DataFrame(msg_lengths)
    df_msg_describe = DataFrame(df_msg_lgth.describe()).T
    cols = df_msg_describe.columns
    df_msg_describe.columns = ['msg_' + c for c in cols]
    return df_msg_describe
def edbSave():
    'Fetch the EDB codes from the user clipboard and call the API for the indicator data'
    # read the codes from the clipboard plus the start / end dates entered by the user
    codes = getCodeFromClipboard()
    start = sDate()
    end = eDate()
    data = w.edb(codes, start, end, "Fill=Previous")
    datachg = [d.strftime('%y-%m-%d') for d in data.Times]
    df = DataFrame(data.Data, index=data.Codes, columns=datachg).T
    print('-' * 85)
    print(df)
    print('-' * 85)
    print('Summary statistics:')
    print(df.describe())
    # crude alignment hack: pull the numeric part out of the printed Series
    print("sum", " " * 3, str(df.sum()).split(sep=" ")[1].rjust(10))
    return df
def StatisticDescribeDatas():
    """
    Run a descriptive statistical analysis of the data
    :return:
    """
    from matplotlib import pyplot as plt
    from pandas import DataFrame

    laborary_operation = ArchlaboraryOperation()
    laborary_operation.prepareForProjectDataTable()
    res, _ = laborary_operation.getResponse()
    if res:
        _res_data = laborary_operation.getData()
        columns = []
        datas = []
        for _h in _res_data['properties']:
            columns.append(_h.label)
        for _data_item in _res_data['records']:
            _tmp_array = []
            for _pi in _res_data['properties']:
                if _res_data['records'][_data_item][_pi.label]:
                    _val = _res_data['records'][_data_item][_pi.label].loadValueLabel()
                    _tmp_array.append(float(_val) if _val.isdigit() else _val)
            datas.append(_tmp_array)
        df = DataFrame(datas, columns=columns)

        fig = plt.figure()
        # ax = fig.add_subplot(1, 1, 1)
        ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])
        _d = df.describe()
        print("echo describe ......")
        print(_d)
        ax.plot(_d, 'r--')
        for seq in range(0, len(_d)):
            ax.text(seq, _d.values[seq][0],
                    "%s(%s)" % (str(_d.index[seq]), str(_d.values[seq][0])))

        buf = StringIO()
        plt.savefig(buf, dpi=50, fmt="png")
        response = make_response(buf.getvalue())
        response.headers['Content-Type'] = 'Image/png'
        return response
df.applymap(f)
print df
print "*" * 15
print "Define the dataframe again"
df = pd.DataFrame(data={"A": [1, 2], "B": [2.6, 1.3]})
print df
print "Add columns by combining the existing ones"
df["C"] = df["A"] + df["B"]
df["D"] = df["A"] * 3
df["E"] = np.sqrt(df["A"])
print df
print "*" * 15
print "Information available on a dataframe"
print "  description of the dataframe"
print df.describe()
print "  covariance "
print df.cov()
print "  correlation "
print df.corr()
print "*" * 15
print "  Create another dataframe with random values (1000 rows and 2 columns)"
print "  DataFrame(np.random.randn(1000,2),columns=['x','y'])"
plot_df = DataFrame(np.random.randn(1000, 2), columns=['x', 'y'])
print plot_df
print "Show the plots"
plot_df.plot()
plot_df.hist()
df5["sum_col"] = df5.apply(sum_two_cols, axis=1) print(df5) import math def int_float_squares(series): return pd.Series({"int_sq": series["int_col"] ** 2, "flt_sq": series["float_col"] ** 2}) print(df.apply(int_float_squares, axis=1)) ### 7. Basic Stats ### print(df.describe()) print(df.cov()) print(df.corr()) ### 8. Merge and Join ### print(df) other = DataFrame({"str_col": ["a", "b"], "some_val": [1, 2]}) print(other) print(pd.merge(df, other, on="str_col", how="inner")) print(pd.merge(df, other, on="str_col", how="outer")) print(pd.merge(df, other, on="str_col", how="left")) print(pd.merge(df, other, on="str_col", how="right")) ### 9. Plot ###
                  squeeze=True, date_parser=parser)
price = series.iloc[:, [6]].fillna(method='ffill')
open = series.iloc[:, [0]].fillna(method='ffill')  # note: shadows the builtin open()
high = series.iloc[:, [1]].fillna(method='ffill')

model = ARIMA(price, order=(5, 1, 0), missing='nan')
model_fit = model.fit(disp=0)
print(model_fit.summary())

# plot residual errors
residuals = DataFrame(model_fit.resid)
residuals.plot()
pyplot.show()
residuals.plot(kind='kde')
pyplot.show()
print(residuals.describe())

# price = read_csv('data/bitstamp.csv', header=0, parse_dates=[0], index_col=0,
#                  squeeze=True, date_parser=parser)
# X = price.iloc[:, [6]].fillna(method='ffill').head(100000).values
X = price.values
openValues = open.values
highValues = high.values
size = int(len(X) * 0.66)

series = read_csv('CMPE-256-Large-Scale-Analytics-/data/bitstamp.csv', header=0,
                  parse_dates=[0], index_col=0, squeeze=True, date_parser=parser)
# X = series.iloc[:,[6]].fillna(method = 'ffill').head(1000).values
price = series.iloc[:, [6]].fillna(method='ffill').head(1000).values
# Build a DataFrame
smp = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nebada', 'Nebada'],
       'year': [2000, 2001, 2002, 2001, 2002],
       'pop': [1.5, 1.6, 1.7, 3.5, 4.3]}
frame = DataFrame(smp)

# Accessing DataFrame elements
frame.year       # frame$year
frame['year']    # frame$year
frame.head()     # head
frame.tail()     # tail

frame2 = DataFrame(smp, index=['one', 'two', 'three', 'four', 'five'])  # add an index
frame2.ix['one']
frame2.describe()  # summary
print(frame2.describe())

# Read data from disk
data = pd.read_csv('stock_px.csv')
print(data)
xlsx_file = pd.ExcelFile('stock_px.xlsx')  # requires openpyxl; xls works too
xlsx_file.sheet_names
data = xlsx_file.parse('stock_px')
print(data)

# Read data from the web
# -> http://docs.scipy.org/doc/numpy/reference/generated/numpy.DataSource.html
ds = np.DataSource(None)
f = ds.open('https://dl.dropbox.com/u/956851/game_modified.csv')
d_web = pd.read_csv(f)
print(d_web)
obj['a']

## Summarizing and computing descriptive statistics
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
# over columns
df.sum()
# over rows
df.sum(axis=1)
# NA is excluded by default, but that can be disabled with skipna
df.mean(axis=1, skipna=False)
# index of the maximum value
df.idxmax()
# cumulative sum
df.cumsum()
df.describe()

# correlation coefficients
returns.MSFT.corr(returns.IBM)
returns.corr()
returns.cov()
returns.corrwith(returns.IBM)

## Unique values, value counts, and membership
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
uniques = obj.unique()
# count occurrences
obj.value_counts()
# value_counts sorts by default, but sorting can be turned off
pd.value_counts(obj.values, sort=False)
# membership test
mask = obj.isin(['b', 'c'])
    'int_col': [1, 2, 6, 8, -1],
    'float_col': [0.1, 0.2, 0.2, 10.1, None],
    'str_col': ['a', 'b', None, 'c', 'a'],
    'groc_col': ['apples', 'bananas', 'coconuts', 'dogfood', None],
    'rev_col': range(4, -1, -1)
})
df2 = DataFrame({
    'first_col': [13, 12, -6, -8, -11],
    'second_col': [10.1, 10.2, 10.2, 110.1, None],
    'str_col': ['a', 'b', None, 'c', 'X'],
    'groc_col': [None, 'bananas', 'coconuts', 'dogfood', None],
    'rev_col': range(4, -1, -1)
})

# stats
df1.describe()  # only shows numbers

# gh.ix[:,['float_col', 'int_col']] less elegant
df1[['float_col', 'int_col']]
df1.fillna(value="waiting")
df1['div_col'] = df1['float_col'] / df1['int_col']
mean = df1['rev_col'].mean()
df1['mean_col'] = mean
new = pd.merge(df1, df2, how='outer', on='str_col')

# quick plotting
import numpy as np
# as descending.
equipos.sort_index(ascending=1)
equipos.sort_index(ascending=0)

# d) Selecting the 'socios' column, show the result of sorting by values.
equipos.sort_values(by='socios')

##
# Exercise 4
##
# a) We keep working with the DataFrame created in exercise 1. Show a summary
# of its information.
equipos.describe()

# b) To enrich our data, concatenate to our DataFrame a DataFrame object with
# the following information:
new_data = {'equipo': ['Atletico de Madrid'],
            'titulos': [29],
            'socios': [48008]}
equipos = equipos.append(new_data, ignore_index=True)

# c) Create a new column 'posicion' with the following data:
posicion_values = ['13', np.nan, '3', np.nan, '5', np.nan]
equipos['posicion'] = posicion_values
import datetime
aonao.loc[(aonao.AO > 0) & (aonao.NAO < 0) &
          (aonao.index > datetime.datetime(1980, 1, 1)) &
          (aonao.index < datetime.datetime(1989, 1, 1)), 'NAO'].plot(kind='barh')
plt.close()

# let's do some statistics
aonao.mean()
aonao.max()
aonao.min()
# mean row-wise
aonao.mean(1)
# gets most statistical information
aonao.describe()

# annual ('A') mean
AO_mm = AO.resample("A").mean()
AO_mm.plot(style='g--')
plt.close()

# median
AO_mm = AO.resample("A").median()
AO_mm.plot()
plt.savefig('AnnualMedianValues.png')
plt.close()

# rolling mean
aonao.rolling(window=12, center=False).mean().plot(style='-g')
plt.savefig('RollingMean.png')
d = {'a': 1, 'b': 2, 'c': 3, 'd': 4}
x3 = Series(d)
print(x1)
print(x2)
print(x3)

data = {'语文': [66, 95, 93, 90, 80],
        '英语': [65, 85, 92, 88, 90],
        '数学': [30, 98, 96, 77, 90]}
df1 = DataFrame(data)
df2 = DataFrame(data, index=['张飞', '关羽', '赵云', '黄忠', '典韦'],
                columns=['英语', '数学', '语文'])
print(df1)

df2 = df2.drop(columns=['语文'])
df2 = df2.drop(index=['张飞'])
df2.rename(columns={'数学': '几何'}, inplace=True)
print(df2.describe())

pysqldf = lambda sql: sqldf(sql, globals())
sql = "select * from df1"
print(pysqldf(sql))

score = DataFrame(pd.read_excel('data.xlsx'))
score.to_excel('data1.xlsx')
# print(score)
])
print(df)

# In[2]:

import matplotlib.pyplot as plt
import seaborn as sns

# In[3]:

df.info()

# In[4]:

df.describe()

# In[5]:

sns.pairplot(df)

# In[6]:

sns.distplot(df['StockIndexPrice'], hist_kws=dict(edgecolor="black"))

# In[7]:

df.corr()

# In[8]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

items = {'apple': {'count': 10, 'price': 1500},
         'banana': {'count': 5, 'price': 15000},
         'melon': {'count': 7, 'price': 1000},
         'kiwi': {'count': 20, 'price': 500},
         'mango': {'count': 30, 'price': 1500},
         'orange': {'count': 4, 'price': 700}}
data = DataFrame(items).T
print(data)
print("===============")

# .describe() -> built-in pandas function: count, mean, standard deviation,
# min/max, median, and so on
print(data.describe())  # summarizes the data
def collect_stats(df: pd.DataFrame):
    # TODO: also collect stats for str and datetime columns
    return df.describe([q / 100 for q in range(5, 100, 5)])
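# A sketch of one way to address the TODO above: describe(include='all') folds
# object columns into the summary (count/unique/top/freq rows) alongside the
# numeric percentile rows. This is an assumption about the desired behaviour,
# not the original project's code. Note that recent pandas treats datetime
# columns numerically by default, while 1.x versions gated that behind a
# datetime_is_numeric flag.
import pandas as pd

def collect_all_stats(df: pd.DataFrame) -> pd.DataFrame:
    # same 5%..95% percentile grid as collect_stats above
    percentiles = [q / 100 for q in range(5, 100, 5)]
    return df.describe(percentiles, include='all')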
# -*- coding: utf-8 -*-
import numpy as np
from pandas import Series, DataFrame

df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
print df
print df.sum()
print df.sum(axis=1)
print
print df.mean(axis=1, skipna=False)
print df.mean(axis=1)
print
print df.idxmax()
print df.cumsum()
print df.describe()

obj = Series(['a', 'a', 'b', 'c'] * 4)
print obj.describe()
# Split-out validation dataset
array = dataset.values
X = array[:, 0:4]
Y = array[:, 4]
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, Y, test_size=validation_size, random_state=seed)

# Test options and evaluation metric
seed = 7
'''
scoring = 'accuracy'

dataframe = DataFrame()
dataframe = read_csv('lags_12months_featurespower_trainset.csv', header=0)
print(dataframe.describe())

# split into input and output
features = dataframe[['t-6', 't-4', 't-2']].copy()
array_feature = features.values
X_train = array_feature[2:226, :]
X_validation = array_feature[227:276, :]

frame_train = read_csv('Trainset.csv', header=0)
array_frame_train = frame_train.values
Y_train = array_frame_train[27:251, 1]
Y_validation = array_frame_train[252:301, 1]

# Spot Check Algorithms
                     ignore_index=True)

# Delete a column
empDf['dummy'] = 1
empDf
del empDf['dummy']
empDf

# Delete a row
empDf.drop(1)

# Sort a Data Frame
empDf.sort_index(axis=1)
empDf.sort(['isManager', 'name'])

empDf.describe()
empDf.id.corr(empDf.deptId)

# Iterate through a DataFrame
for rowNum, row in auto_data.iterrows():
    for colName, col in row.iteritems():
        # if pd.isnull(col):
        print(pd.isnull(col), rowNum, colName)

# ----------------------------------------------------------------------------
# Data Operations
# ----------------------------------------------------------------------------
carDict = {'ID': [1, 2, 3, 4, 5],
           'MODEL': ['Taurus', 'Edge', 'Camry', 'Corolla', 'HighLander'],
empDf.append(Series([5, False, 'Derek', 2],
                    index=['id', 'isManager', 'name', 'deptId']),
             ignore_index=True)
empDf

# Deleting a column
empDf['dummy'] = 1
empDf
del empDf['dummy']
empDf

# Deleting a row
empDf.sort_index(axis=1)
empDf.sort(['isManager', 'name'])

empDf.describe()
empDf.id.corr(empDf.deptId)

# Iterate through a data frame
for rowNum, row in auto_data.iterrows():
    for colName, col in row.iteritems():
        if pd.isnull(col):
            print(pd.isnull(col), rowNum, colName)
def _save(self, data: pd.DataFrame) -> None:
    df = data.describe(**self._describe_args)
    df.index.name = "Statistics"
    df.reset_index(inplace=True)
    super()._save(df)
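# The _save override above funnels a frame through describe() before handing it
# to the parent dataset's writer. A minimal sketch of how such a class might be
# wired up end to end; the class name, the CSV backing, and the describe_args
# plumbing are illustrative assumptions, not the original project's API:
import pandas as pd

class DescribeCSVWriter:
    """Hypothetical writer that saves describe() output instead of raw data."""

    def __init__(self, filepath, describe_args=None):
        self._filepath = filepath
        # forwarded verbatim to DataFrame.describe()
        self._describe_args = describe_args or {}

    def _save(self, data: pd.DataFrame) -> None:
        df = data.describe(**self._describe_args)
        df.index.name = "Statistics"
        df.reset_index(inplace=True)
        df.to_csv(self._filepath, index=False)

writer = DescribeCSVWriter('stats.csv',
                           describe_args={'percentiles': [0.05, 0.5, 0.95]})
writer._save(pd.DataFrame({'x': range(100)}))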
def main():
    # ------------------------------------------------------
    # Creating connection to db
    # ------------------------------------------------------
    # postgres development db engine (registry to case_study db)
    engine = create_engine('postgresql://localhost/case_study')
    # for production database make sure you use something like:
    # engine = create_engine('postgresql://*****:*****@dns/database_name')
    # also use the following if interested in using psycopg2 database api
    # (for Oracle db's lookup cxOracle)
    # engine = create_engine('postgresql+psycopg2://username:password@dns/database_name')

    # connection obj to postgres db engine
    connection = engine.connect()

    with open('../Results/table_names.txt', 'w') as f_table_names:
        for table_name in engine.table_names():
            f_table_names.write(table_name + '\n')
        f_table_names.close()

    # ------------------------------------------------------
    # Available tables in the db
    # ------------------------------------------------------
    skip_list = [
        # 'biannual_data_1', 'biannual_data_2',
        # 'monthly_data_1', 'monthly_data_2', 'monthly_data_3'
        # ,'monthly_data_4', 'monthly_data_5', 'monthly_data_6', 'monthly_data_7', 'monthly_data_8'
        # ,'monthly_data_9', 'monthly_data_10', 'monthly_data_11', 'monthly_data_12'
        'quarterly_data_1', 'quarterly_data_2', 'quarterly_data_3', 'quarterly_data_4',
        'airport', 'carrier_history', 'system_fields', 'yearly_data'
    ]

    for table_name in engine.table_names():
        if table_name in skip_list:
            continue
        else:
            print(35 * '-')
            print('Current table -->', table_name)
            print(35 * '-')

            # --------------------------------------------------------------
            # Analysing data set using pandas data-frame (df) data structure
            # --------------------------------------------------------------
            stmt = "SELECT * FROM {} where unique_carrier in ('EV', 'WN', 'DL') and origin in" \
                   " ('PHX','LAS','IAH','SFO','LAX','DEN','DFW','ATL','ORD','EWR','MDW','LGA');".format(table_name)
            result_proxy = connection.execute(stmt)
            results = result_proxy.fetchall()

            # select column names of current table
            stmt = "SELECT column_name FROM information_schema.columns WHERE table_name = '{}';".format(
                table_name)
            result_proxy = connection.execute(stmt)
            col_names = result_proxy.fetchall()
            col_names = [name[0] for name in col_names]  # remove str from tuple

            df = DataFrame(results, columns=col_names)

            print(100 * '#')
            # some df attributes
            print('\nDimensions of df\n', 50 * '-')
            print(df.shape)
            print('\nData-types of df\n', 50 * '-')
            print(df.dtypes)

            # some df methods
            print('\nTop 5 rows:\n', 50 * '-')
            print(df.head())  # see first 5 rows

            print('\ndf description (categorical columns only):\n', 50 * '-')
            cat_analytics = df.describe(include=['object'])  # 'int64', 'float64'])
            cat_analytics.to_csv(
                '../Results/lim_cat_analytics_{}.csv'.format(table_name),
                encoding='utf-8')  # , index=False)
            print(cat_analytics)

            print('\ndf description (numerical columns only)\n', 50 * '-')
            num_analytics = df.describe()
            num_analytics.to_csv(
                '../Results/lim_num_analytics_{}.csv'.format(table_name),
                encoding='utf-8')  # , index=False)
            print(num_analytics)
            print(100 * '#')

    connection.close()
def main():
    """
    Calculation and aggregation of summary statistics
    """
    # Summary of statistics
    # return is not ndarray
    df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
                   index=list('abcd'),
                   columns=['one', 'two'])
    print df
    print df.sum()
    print df.sum(axis=1)
    print df.mean(axis=1)  # exclude nan
    print df.mean(axis=1, skipna=False)
    print df.idxmin()
    print df.idxmax()
    print df.cumsum()
    print df.describe()

    # values are not number
    obj = Series(list('aabc') * 4)
    print obj.describe()

    methods = ['count', 'min', 'max',
               # 'argmin', 'argmax',
               'quantile', 'median', 'mad', 'var', 'std', 'skew', 'kurt',
               'cummin', 'cummax', 'cumprod', 'diff', 'pct_change']
    for method in methods:
        print u'「{0}」'.format(method)
        print getattr(df, method)()
        print ''

    # Correspond and Covariance
    all_data = {}
    lst = []  # ['AAPL', 'IBM', 'MSFT']  #, 'GOOG']:
    for ticket in lst:  # , 'GOOG']:
        # IOError: after 3 tries, Yahoo! did not return a 200
        # for url 'http://ichart.finance.yahoo.com/table.csv?s=GOOG&a=0&b=1&c=2000&d=0&e=1&f=2010&g=d&ignore=.csv'
        all_data[ticket] = pd.io.data.get_data_yahoo(ticket, '1/1/2000', '1/1/2010')

    price = DataFrame({tic: data['Adj Close']
                       for tic, data in all_data.iteritems()})
    volume = DataFrame({tic: data['Volume']
                        for tic, data in all_data.iteritems()})

    if all_data:
        returns = price.pct_change()
        print returns.tail()
        print ''
        print returns.MSFT.corr(returns.IBM)
        print returns.MSFT.cov(returns.IBM)
        print ''
        print returns.corr()
        print returns.cov()
        print ''
        print returns.corrwith(returns.IBM)
        print returns.corrwith(volume)

    # unique, frequency, belong
    print '', ''
    obj = Series(list('cadaabbcc'))
    uniques = obj.unique()
    print uniques
    print obj.value_counts()
    print pd.value_counts(obj.values, sort=False)

    mask = obj.isin(['b', 'c'])
    print mask
    print obj[mask]

    data = DataFrame({
        'Qu1': [1, 3, 4, 3, 4],
        'Qu2': [2, 3, 1, 2, 3],
        'Qu3': [1, 5, 2, 4, 4],
    })
    print data
    print data.apply(pd.value_counts).fillna(0)
def create_fip(year = None):
    assert year is not None
    # fip: fichier d'imposition des personnes (personal tax return file)
    """
    Creates a 'fipDat' table containing all these 'fip individuals'
    """
    # Some individuals are declared as 'personne à charge' (pac) on 'tax forms'
    # but are not present in the erf or eec tables.
    # We add them to ensure consistency between concepts.
    temporary_store = TemporaryStore.create(file_name = "erfs")
    replace = create_replace(year)

    erfs_survey_collection = SurveyCollection.load(
        collection = 'erfs', config_files_directory = config_files_directory)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))

    log.info(u"Démarrage de 03_fip")

    # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992')
    # when a child is invalid, he appears twice in anaisenf (example: F1900G1900 is a single invalid child born in 1990)
    erfFoyVar = ['declar', 'anaisenf']
    foyer = survey.get_values(table = replace["foyer"], variables = erfFoyVar)
    foyer.replace({'anaisenf': {'NA': np.nan}}, inplace = True)

    log.info(u"Etape 1 : on récupere les personnes à charge des foyers")
    log.info(u"    1.1 : Création des codes des enfants")
    foyer['anaisenf'] = foyer['anaisenf'].astype('string')
    nb_pac_max = len(max(foyer['anaisenf'], key=len)) / 5
    log.info(u"il ya a au maximum {} pac par foyer".format(nb_pac_max))

    # Separating the string coding the pac of each "déclaration".
    # Creating a list containing the new variables.

    # Creating the multi_index for the columns
    multi_index_columns = []
    assert int(nb_pac_max) == nb_pac_max, \
        "nb_pac_max = {} which is not an integer".format(nb_pac_max)
    nb_pac_max = int(nb_pac_max)
    for i in range(1, nb_pac_max + 1):
        pac_tuples_list = [
            (i, 'declaration'),
            (i, 'type_pac'),
            (i, 'naia')
            ]
        multi_index_columns += pac_tuples_list

    columns = MultiIndex.from_tuples(
        multi_index_columns,
        names = ['pac_number', 'variable']
        )
    fip = DataFrame(np.random.randn(len(foyer), 3 * nb_pac_max), columns = columns)
    log.info("{}".format(fip.describe()))
    log.info("{}".format(fip.info()))

    for i in range(1, nb_pac_max + 1):  # TODO: using values to deal with mismatching indexes
        fip[(i, 'declaration')] = foyer['declar'].values
        fip[(i, 'type_pac')] = foyer['anaisenf'].str[5 * (i - 1)].values
        fip[(i, 'naia')] = foyer['anaisenf'].str[5 * (i - 1) + 1: 5 * i].values

    fip = fip.stack("pac_number")
    fip.reset_index(inplace = True)
    fip.drop(['level_0'], axis = 1, inplace = True)

    log.info(u"    1.2 : elimination des foyers fiscaux sans pac")
    # Clearing missing values and changing data format
    fip = fip[(fip.type_pac.notnull()) & (fip.naia != 'an') & (fip.naia != '')].copy()
    fip = fip.sort(columns = ['declaration', 'naia', 'type_pac'])
    # TODO: check if useful
    fip.set_index(["declaration", "pac_number"], inplace = True)
    fip = fip.reset_index()
    fip.drop(['pac_number'], axis = 1, inplace = True)

    # TODO: add box I: "Dont enfants titulaires de la carte d'invalidité"
    assert fip.type_pac.isin(["F", "G", "H", "I", "J", "N", "R"]).all(), \
        "Certains type de PAC sont inconnus"
    # TODO: find a more explicit message

    # control(fip, debug=True, verbose=True, verbose_columns=['naia'])

    log.info(u"    1.3 : on enlève les individus F pour lesquels il existe un individu G")
    type_FG = fip[fip.type_pac.isin(['F', 'G'])].copy()  # filter so we only work on F & G
    type_FG['same_pair'] = type_FG.duplicated(subset = ['declaration', 'naia'], take_last = True)
    type_FG['is_twin'] = type_FG.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_FG['to_keep'] = ~(type_FG['same_pair']) | type_FG['is_twin']
    # Note: we keep those whose declar/naia pairs differ, plus the twins,
    # then drop the others (both F and G)
    log.info(u"longueur fip {}".format(len(fip)))

    fip['to_keep'] = np.nan
    fip.update(type_FG)

    log.info(u"    1.4 : on enlève les H pour lesquels il y a un I")
    type_HI = fip[fip.type_pac.isin(['H', 'I'])].copy()
    type_HI['same_pair'] = type_HI.duplicated(subset = ['declaration', 'naia'], take_last = True)
    type_HI['is_twin'] = type_HI.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_HI['to_keep'] = (~(type_HI['same_pair']) | (type_HI['is_twin'])).values

    fip.update(type_HI)
    fip['to_keep'] = fip['to_keep'].fillna(True)
    log.info(u"nb lines to keep = {} / nb initial lines {}".format(
        len(fip[fip['to_keep']]), len(fip)))

    indivifip = fip[fip['to_keep']].copy()
    del indivifip['to_keep'], fip, type_FG, type_HI

    # control(indivifip, debug=True)

    log.info(u"Step 2 : matching indivifip with eec file")
    indivi = temporary_store['indivim_{}'.format(year)]
    pac = indivi[(indivi.persfip.notnull()) & (indivi.persfip == 'pac')].copy()
    assert indivifip.naia.notnull().all(), "Il y a des valeurs manquantes de la variable naia"

    pac['naia'] = pac.naia.astype('int32')  # TODO: was float in pac, fix upstream
    indivifip['naia'] = indivifip.naia.astype('int32')
    pac['key1'] = zip(pac.naia, pac['declar1'].str[:29])
    pac['key2'] = zip(pac.naia, pac['declar2'].str[:29])
    indivifip['key'] = zip(indivifip.naia.values, indivifip['declaration'].str[:29].values)
    assert pac.naia.dtype == indivifip.naia.dtype, \
        "Les dtypes de pac.naia {} et indvifip.naia {} sont différents".format(
            pac.naia.dtype, indivifip.naia.dtype)

    fip = indivifip[~(indivifip.key.isin(pac.key1.values))].copy()
    fip = fip[~(fip.key.isin(pac.key2.values))].copy()
    log.info(u"    2.1 new fip created")

    # We build a dataframe to link the pac to their type and noindiv
    tmp_pac1 = pac[['noindiv', 'key1']].copy()
    tmp_pac2 = pac[['noindiv', 'key2']].copy()
    tmp_indivifip = indivifip[['key', 'type_pac', 'naia']].copy()

    pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner')
    log.info(u"longueur pacInd1 {}".format(len(pac_ind1)))
    pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner')
    log.info(u"longueur pacInd2 {}".format(len(pac_ind2)))
    log.info(u"pacInd1 & pacInd2 créés")

    log.info("{}".format(pac_ind1.duplicated().sum()))
    log.info("{}".format(pac_ind2.duplicated().sum()))

    del pac_ind1['key1'], pac_ind2['key2']

    if len(pac_ind1.index) == 0:
        if len(pac_ind2.index) == 0:
            log.info(u"Warning : no link between pac and noindiv for both pacInd1&2")
        else:
            log.info(u"Warning : pacInd1 is an empty data frame")
            pacInd = pac_ind2
    elif len(pac_ind2.index) == 0:
        log.info(u"Warning : pacInd2 is an empty data frame")
        pacInd = pac_ind1
    else:
        pacInd = concat([pac_ind2, pac_ind1])
    log.info("{}{}{}".format(len(pac_ind1), len(pac_ind2), len(pacInd)))
    log.info("{}".format(pac_ind2.type_pac.isnull().sum()))
    log.info("{}".format(pacInd.type_pac.value_counts()))
    log.info(u"    2.2 : pacInd created")

    log.info(u"doublons noindiv, type_pac {}".format(
        pacInd.duplicated(['noindiv', 'type_pac']).sum()))
    log.info(u"doublons noindiv seulement {}".format(pacInd.duplicated('noindiv').sum()))
    log.info(u"nb de NaN {}".format(pacInd.type_pac.isnull().sum()))

    del pacInd["key"]
    pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))].copy()
    # pacIndiv.reset_index(inplace=True)
    log.info("{}".format(pacIndiv.columns))

    temporary_store['pacIndiv_{}'.format(year)] = pacIndiv

    log.info("{}".format(pacIndiv.type_pac.value_counts()))
    gc.collect()

    # We keep the fip in the menage of their parents because it is used to
    # build the famille. We should build an individual ident (ménage) for the
    # fip that are older than 18, since they are not in their parents' menage
    # according to the eec.
    # individec1 <- subset(indivi, (declar1 %in% fip$declar) & (persfip=="vous"))
    # individec1 <- individec1[,c("declar1","noidec","ident","rga","ztsai","ztsao")]
    # individec1 <- upData(individec1,rename=c(declar1="declar"))
    # fip1 <- merge(fip,individec1)
    # indivi$noidec <- as.numeric(substr(indivi$declar1,1,2))

    log.info("{}".format(indivi['declar1'].str[0:2].value_counts()))
    log.info("{}".format(indivi['declar1'].str[0:2].describe()))
    log.info("{}".format(indivi['declar1'].str[0:2].notnull().all()))
    log.info("{}".format(indivi.info()))
    selection = indivi['declar1'].str[0:2] != ""
    indivi['noidec'] = indivi.declar1[selection].str[0:2].astype('int32')  # to be used later to set idfoy

    individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) &
                        (indivi.persfip == "vous")]
    individec1 = individec1[["declar1", "noidec", "ident", "rga", "ztsai", "ztsao"]].copy()
    individec1 = individec1.rename(columns = {'declar1': 'declaration'})
    fip1 = fip.merge(individec1, on = 'declaration')
    log.info(u"    2.3 : fip1 created")

    # TODO: declar2 is not handled for now
    # individec2 <- subset(indivi, (declar2 %in% fip$declar) & (persfip=="vous"))
    # individec2 <- individec2[,c("declar2","noidec","ident","rga","ztsai","ztsao")]
    # individec2 <- upData(individec2,rename=c(declar2="declar"))
    # fip2 <-merge(fip,individec2)
    individec2 = indivi[(indivi.declar2.isin(fip.declaration.values)) &
                        (indivi['persfip'] == "vous")]
    individec2 = individec2[["declar2", "noidec", "ident", "rga", "ztsai", "ztsao"]].copy()
    individec2.rename(columns = {'declar2': 'declaration'}, inplace = True)
    fip2 = fip.merge(individec2)
    log.info(u"    2.4 : fip2 created")

    fip1.duplicated().value_counts()
    fip2.duplicated().value_counts()

    fip = concat([fip1, fip2])

    fip['persfip'] = 'pac'
    fip['year'] = year
    fip['year'] = fip['year'].astype('float')  # BUG: no year column in the DF
    fip['noi'] = 99
    fip['noicon'] = None
    fip['noindiv'] = fip['declaration']
    fip['noiper'] = None
    fip['noimer'] = None
    fip['declar1'] = fip['declaration']  # TODO: declar ?
    fip['naim'] = 99
    fip['lien'] = None
    fip['quelfic'] = 'FIP'
    fip['acteu'] = None
    fip['agepf'] = fip['year'] - fip.naia.astype('float')
    fip['lpr'] = (fip['agepf'] <= 20) * 3 + (fip['agepf'] > 20) * 4
    fip['stc'] = None
    fip['contra'] = None
    fip['titc'] = None
    fip['mrec'] = None
    fip['forter'] = None
    fip['rstg'] = None
    fip['retrai'] = None
    fip['cohab'] = None
    fip['sexe'] = None
    fip['persfip'] = "pac"
    fip['agepr'] = None
    fip['actrec'] = (fip['agepf'] <= 15) * 9 + (fip['agepf'] > 15) * 5

    # TODO: actrec problem for fip children between 16 and 20: we do not know
    #       whether they are students or employed
    # TODO: problem with the months of FIP children: see whether those values
    #       can be recovered; Alexis: clearly not

    # Reassigning noi for fip children if they are more than one per foyer fiscal
    # while ( any(duplicated( fip[,c("noi","ident")]) ) ) {
    #   dup <- duplicated( fip[, c("noi","ident")])
    #   tmp <- fip[dup,"noi"]
    #   fip[dup, "noi"] <- (tmp-1)
    # }
    # TODO: is the dup vector correct?
    fip["noi"] = fip["noi"].astype("int64")
    fip["ident"] = fip["ident"].astype("int64")

    fip_tmp = fip[['noi', 'ident']]

    while any(fip.duplicated(cols=['noi', 'ident'])):
        fip_tmp = fip.loc[:, ['noi', 'ident']]
        dup = fip_tmp.duplicated()
        tmp = fip.loc[dup, 'noi']
        log.info("{}".format(len(tmp)))
        fip.loc[dup, 'noi'] = tmp.astype('int64') - 1

    fip['idfoy'] = 100 * fip['ident'] + fip['noidec']
    fip['noindiv'] = 100 * fip['ident'] + fip['noi']
    fip['type_pac'] = 0
    fip['key'] = 0

    log.info("{}".format(fip.duplicated('noindiv').value_counts()))
    temporary_store['fipDat_{}'.format(year)] = fip
    del fip, fip1, individec1, indivifip, indivi, pac
    log.info(u"fip sauvegardé")
def test_column_dups_operations(self):

    def check(result, expected=None):
        if expected is not None:
            assert_frame_equal(result, expected)
        result.dtypes
        str(result)

    # assignment
    # GH 3687
    arr = np.random.randn(3, 2)
    idx = lrange(2)
    df = DataFrame(arr, columns=['A', 'A'])
    df.columns = idx
    expected = DataFrame(arr, columns=idx)
    check(df, expected)

    idx = date_range('20130101', periods=4, freq='Q-NOV')
    df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
                   columns=['a', 'a', 'a', 'a'])
    df.columns = idx
    expected = DataFrame(
        [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx)
    check(df, expected)

    # insert
    df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
                   columns=['foo', 'bar', 'foo', 'hello'])
    df['string'] = 'bah'
    expected = DataFrame([[1, 1, 1, 5, 'bah'], [1, 1, 2, 5, 'bah'],
                          [2, 1, 3, 5, 'bah']],
                         columns=['foo', 'bar', 'foo', 'hello', 'string'])
    check(df, expected)
    with assertRaisesRegexp(ValueError, 'Length of value'):
        df.insert(0, 'AnotherColumn', range(len(df.index) - 1))

    # insert same dtype
    df['foo2'] = 3
    expected = DataFrame([[1, 1, 1, 5, 'bah', 3], [1, 1, 2, 5, 'bah', 3],
                          [2, 1, 3, 5, 'bah', 3]],
                         columns=['foo', 'bar', 'foo', 'hello',
                                  'string', 'foo2'])
    check(df, expected)

    # set (non-dup)
    df['foo2'] = 4
    expected = DataFrame([[1, 1, 1, 5, 'bah', 4], [1, 1, 2, 5, 'bah', 4],
                          [2, 1, 3, 5, 'bah', 4]],
                         columns=['foo', 'bar', 'foo', 'hello',
                                  'string', 'foo2'])
    check(df, expected)
    df['foo2'] = 3

    # delete (non dup)
    del df['bar']
    expected = DataFrame([[1, 1, 5, 'bah', 3], [1, 2, 5, 'bah', 3],
                          [2, 3, 5, 'bah', 3]],
                         columns=['foo', 'foo', 'hello', 'string', 'foo2'])
    check(df, expected)

    # try to delete again (its not consolidated)
    del df['hello']
    expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3],
                          [2, 3, 'bah', 3]],
                         columns=['foo', 'foo', 'string', 'foo2'])
    check(df, expected)

    # consolidate
    df = df.consolidate()
    expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3],
                          [2, 3, 'bah', 3]],
                         columns=['foo', 'foo', 'string', 'foo2'])
    check(df, expected)

    # insert
    df.insert(2, 'new_col', 5.)
    expected = DataFrame([[1, 1, 5., 'bah', 3], [1, 2, 5., 'bah', 3],
                          [2, 3, 5., 'bah', 3]],
                         columns=['foo', 'foo', 'new_col', 'string', 'foo2'])
    check(df, expected)

    # insert a dup
    assertRaisesRegexp(ValueError, 'cannot insert',
                       df.insert, 2, 'new_col', 4.)
    df.insert(2, 'new_col', 4., allow_duplicates=True)
    expected = DataFrame([[1, 1, 4., 5., 'bah', 3],
                          [1, 2, 4., 5., 'bah', 3],
                          [2, 3, 4., 5., 'bah', 3]],
                         columns=['foo', 'foo', 'new_col',
                                  'new_col', 'string', 'foo2'])
    check(df, expected)

    # delete (dup)
    del df['foo']
    expected = DataFrame([[4., 5., 'bah', 3], [4., 5., 'bah', 3],
                          [4., 5., 'bah', 3]],
                         columns=['new_col', 'new_col', 'string', 'foo2'])
    assert_frame_equal(df, expected)

    # dup across dtypes
    df = DataFrame([[1, 1, 1., 5], [1, 1, 2., 5], [2, 1, 3., 5]],
                   columns=['foo', 'bar', 'foo', 'hello'])
    check(df)

    df['foo2'] = 7.
    expected = DataFrame([[1, 1, 1., 5, 7.], [1, 1, 2., 5, 7.],
                          [2, 1, 3., 5, 7.]],
                         columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
    check(df, expected)

    result = df['foo']
    expected = DataFrame([[1, 1.], [1, 2.], [2, 3.]],
                         columns=['foo', 'foo'])
    check(result, expected)

    # multiple replacements
    df['foo'] = 'string'
    expected = DataFrame([['string', 1, 'string', 5, 7.],
                          ['string', 1, 'string', 5, 7.],
                          ['string', 1, 'string', 5, 7.]],
                         columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
    check(df, expected)

    del df['foo']
    expected = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                         columns=['bar', 'hello', 'foo2'])
    check(df, expected)

    # values
    df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=['x', 'x'])
    result = df.values
    expected = np.array([[1, 2.5], [3, 4.5]])
    self.assertTrue((result == expected).all().all())

    # rename, GH 4403
    df4 = DataFrame(
        {'TClose': [22.02],
         'RT': [0.0454],
         'TExg': [0.0422]},
        index=MultiIndex.from_tuples([(600809, 20130331)],
                                     names=['STK_ID', 'RPT_Date']))

    df5 = DataFrame({'STK_ID': [600809] * 3,
                     'RPT_Date': [20120930, 20121231, 20130331],
                     'STK_Name': [u('饡驦'), u('饡驦'), u('饡驦')],
                     'TClose': [38.05, 41.66, 30.01]},
                    index=MultiIndex.from_tuples(
                        [(600809, 20120930),
                         (600809, 20121231),
                         (600809, 20130331)],
                        names=['STK_ID', 'RPT_Date']))

    k = pd.merge(df4, df5, how='inner', left_index=True, right_index=True)
    result = k.rename(
        columns={'TClose_x': 'TClose', 'TClose_y': 'QT_Close'})
    str(result)
    result.dtypes

    expected = (DataFrame([[0.0454, 22.02, 0.0422, 20130331, 600809,
                            u('饡驦'), 30.01]],
                          columns=['RT', 'TClose', 'TExg',
                                   'RPT_Date', 'STK_ID', 'STK_Name',
                                   'QT_Close'])
                .set_index(['STK_ID', 'RPT_Date'], drop=False))
    assert_frame_equal(result, expected)

    # reindex is invalid!
    df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                   columns=['bar', 'a', 'a'])
    self.assertRaises(ValueError, df.reindex, columns=['bar'])
    self.assertRaises(ValueError, df.reindex, columns=['bar', 'foo'])

    # drop
    df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                   columns=['bar', 'a', 'a'])
    result = df.drop(['a'], axis=1)
    expected = DataFrame([[1], [1], [1]], columns=['bar'])
    check(result, expected)
    result = df.drop('a', axis=1)
    check(result, expected)

    # describe
    df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
                   columns=['bar', 'a', 'a'], dtype='float64')
    result = df.describe()
    s = df.iloc[:, 0].describe()
    expected = pd.concat([s, s, s], keys=df.columns, axis=1)
    check(result, expected)

    # check column dups with index equal and not equal to df's index
    df = DataFrame(np.random.randn(5, 3),
                   index=['a', 'b', 'c', 'd', 'e'],
                   columns=['A', 'B', 'A'])
    for index in [df.index, pd.Index(list('edcba'))]:
        this_df = df.copy()
        expected_ser = pd.Series(index.values, index=this_df.index)
        expected_df = DataFrame.from_items([('A', expected_ser),
                                            ('B', this_df['B']),
                                            ('A', expected_ser)])
        this_df['A'] = index
        check(this_df, expected_df)

    # operations
    for op in ['__add__', '__mul__', '__sub__', '__truediv__']:
        df = DataFrame(dict(A=np.arange(10), B=np.random.rand(10)))
        expected = getattr(df, op)(df)
        expected.columns = ['A', 'A']
        df.columns = ['A', 'A']
        result = getattr(df, op)(df)
        check(result, expected)

    # multiple assignments that change dtypes
    # the location indexer is a slice
    # GH 6120
    df = DataFrame(np.random.randn(5, 2), columns=['that', 'that'])
    expected = DataFrame(1.0, index=range(5), columns=['that', 'that'])

    df['that'] = 1.0
    check(df, expected)

    df = DataFrame(np.random.rand(5, 2), columns=['that', 'that'])
    expected = DataFrame(1, index=range(5), columns=['that', 'that'])

    df['that'] = 1
    check(df, expected)
""" Student: Max Sorto Class: IT5090G - Aasheim Date: 03/31/2016 Assignment: Lab10 """ from pandas import DataFrame data = {'name':['Joe','John','Mary','Lee'], 'quiz 1':[100,87,99,78], 'quiz 2':[45,78,90,88], 'assign 1':[98,82,93,78], 'assign 2':[100,87,99,78] } frame = DataFrame(data) print frame print '\n' print frame.describe()
    lambda x: 1 if x == 'x' or x == 'X' else 0)
df_labels['Asco\nTristeza\nRabia'] = df_labels[
    'Asco\nTristeza\nRabia'].apply(lambda x: 1 if x == 'x' or x == 'X' else 0)
df_labels = df_labels[[
    'id', 'tweet_id', 'user_name', 'QUIEN', 'Asesinato', 'Violacion',
    'Agresion \nsexual', 'Maltrato', 'Acoso', 'Miedo',
    'Asco\nTristeza\nRabia', 'full_text'
]]
df_labels.set_index('id', inplace=True)
df_labels = df_labels[isfinite(df_labels['QUIEN'])]
concat_df_labels = concat([concat_df_labels, df_labels], axis=0)

concat_df_labels.describe()
concat_df_labels.sample(5)
concat_df_labels.columns

def get_category(x):
    if x['Asesinato']:
        return 0
    elif x['Violacion']:
        return 1
    elif x['Agresion \nsexual']:
        return 2
    elif x['Maltrato']:
        return 3
#2
data = np.random.randn(1000)  # Normally distributed
cats = pd.qcut(data, 4)  # Cut into quartiles
cats
pd.value_counts(cats)
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])

### Detecting and filtering outliers
#1
np.random.seed(12345)
data = DataFrame(np.random.randn(1000, 4))
data.describe()
col = data[3]
col[np.abs(col) > 3]
data[(np.abs(data) > 3).any(1)]
#2
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()

### Permutation and random sampling
#1
df = DataFrame(np.arange(5 * 4).reshape((5, 4)))
sampler = np.random.permutation(5)
for r in range(repeats):
    print("For iteration: " + str(r))
    # fit the model
    lstm_model = fit_lstm(train_scaled, 1, 3000, 4)
    # forecast the entire training dataset to build up state for forecasting
    train_reshaped = train_scaled[:, 0].reshape(len(train_scaled), 1, 1)
    lstm_model.predict(train_reshaped, batch_size=1)
    # walk-forward validation on the test data
    predictions = list()
    for i in range(len(test_scaled)):
        # make one-step forecast
        X, y = test_scaled[i, 0:-1], test_scaled[i, -1]
        yhat = forecast_lstm(lstm_model, 1, X)
        # invert scaling
        yhat = invert_scale(scaler, X, yhat)
        # invert differencing
        yhat = inverse_difference(raw_values, yhat, len(test_scaled) + 1 - i)
        # store forecast
        predictions.append(yhat)
    # report performance
    rmse = sqrt(mean_squared_error(raw_values[-12:], predictions))
    print('%d) Test RMSE: %.3f' % (r + 1, rmse))
    error_scores.append(rmse)

# summarize results
results = DataFrame()
results['rmse'] = error_scores
print(results.describe())
results.boxplot()
pyplot.show()
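# The summary step at the end of this loop is a handy pattern on its own:
# collect one score per repeat, wrap the list in a DataFrame, then describe()
# and boxplot() the spread. A self-contained sketch with synthetic scores
# standing in for the LSTM results above:
import numpy as np
from pandas import DataFrame
from matplotlib import pyplot

np.random.seed(1)
error_scores = list(np.random.normal(loc=90.0, scale=5.0, size=30))

results = DataFrame()
results['rmse'] = error_scores
print(results.describe())  # mean/std expose the run-to-run variance
results.boxplot()
pyplot.show()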
#2
data = np.random.randn(1000)  # Normally distributed
# qcut is similar to cut, but it bins the data by quantiles
cats = pd.qcut(data, 4)  # Cut into quartiles
cats
pd.value_counts(cats)
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])

### Detecting and filtering outliers
#1 generate a normally distributed random array
np.random.seed(12345)
data = DataFrame(np.random.randn(1000, 4))
# inspect the distribution of the data
data.describe()
col = data[3]
# find the values whose absolute value exceeds 3
col[np.abs(col) > 3]
data[(np.abs(data) > 3).any(1)]
#2 np.sign returns the sign of each value
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()

### Permutation and random sampling
#1 build a DataFrame
df = DataFrame(np.arange(5 * 4).reshape((5, 4)))
# permute the rows
print dict_of_lists, '\n'

# convert the list of values for each key in dictionary to Series
countries = Series(dict_of_lists['countries'])
gold = Series(dict_of_lists['gold'])
silver = Series(dict_of_lists['silver'])
bronze = Series(dict_of_lists['bronze'])

# construct a dictionary of Series that can be turned into a DataFrame
medal_tally_dict = {'countries': countries, 'gold': gold,
                    'silver': silver, 'bronze': bronze}
df = DataFrame(medal_tally_dict)
print df, '\n'

# DataFrame and Series properties
print df[['countries', 'gold']], '\n'
print df.describe(), '\n'
print 'The gold series in the dataframe is of dtype : ', df['gold'].dtype
print 'Number countries : ', len(df['countries'])
print 'The mean of golds won where bronze medals are at least 2 : ', df['gold'][df['bronze'] >= 2].mean()
print 'Mean of gold and silver medal counts : ', (df['gold'] + df['silver']).mean()
print 'Mean of golds : ', df['gold'].mean()
print 'Mean of bronzes : ', df['bronze'].mean()
print 'Max number of golds won by a country : ', df['gold'].max()
print 'Sum of golds won by all countries : ', df['gold'].sum()

# WIP
def standardize_data(values):
    standardized_values = (values - values.mean()) / values.std()
    print standardized_values, '\n'

with open('C:\omnica\data\managers1.csv', 'w') as csvfile:
def test_column_dups_operations(self):

    def check(result, expected=None):
        if expected is not None:
            assert_frame_equal(result, expected)
        result.dtypes
        str(result)

    # assignment
    # GH 3687
    arr = np.random.randn(3, 2)
    idx = list(range(2))
    df = DataFrame(arr, columns=["A", "A"])
    df.columns = idx
    expected = DataFrame(arr, columns=idx)
    check(df, expected)

    idx = date_range("20130101", periods=4, freq="Q-NOV")
    df = DataFrame(
        [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=["a", "a", "a", "a"]
    )
    df.columns = idx
    expected = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx)
    check(df, expected)

    # insert
    df = DataFrame(
        [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
        columns=["foo", "bar", "foo", "hello"],
    )
    df["string"] = "bah"
    expected = DataFrame(
        [[1, 1, 1, 5, "bah"], [1, 1, 2, 5, "bah"], [2, 1, 3, 5, "bah"]],
        columns=["foo", "bar", "foo", "hello", "string"],
    )
    check(df, expected)
    with pytest.raises(ValueError, match="Length of value"):
        df.insert(0, "AnotherColumn", range(len(df.index) - 1))

    # insert same dtype
    df["foo2"] = 3
    expected = DataFrame(
        [[1, 1, 1, 5, "bah", 3], [1, 1, 2, 5, "bah", 3], [2, 1, 3, 5, "bah", 3]],
        columns=["foo", "bar", "foo", "hello", "string", "foo2"],
    )
    check(df, expected)

    # set (non-dup)
    df["foo2"] = 4
    expected = DataFrame(
        [[1, 1, 1, 5, "bah", 4], [1, 1, 2, 5, "bah", 4], [2, 1, 3, 5, "bah", 4]],
        columns=["foo", "bar", "foo", "hello", "string", "foo2"],
    )
    check(df, expected)
    df["foo2"] = 3

    # delete (non dup)
    del df["bar"]
    expected = DataFrame(
        [[1, 1, 5, "bah", 3], [1, 2, 5, "bah", 3], [2, 3, 5, "bah", 3]],
        columns=["foo", "foo", "hello", "string", "foo2"],
    )
    check(df, expected)

    # try to delete again (its not consolidated)
    del df["hello"]
    expected = DataFrame(
        [[1, 1, "bah", 3], [1, 2, "bah", 3], [2, 3, "bah", 3]],
        columns=["foo", "foo", "string", "foo2"],
    )
    check(df, expected)

    # consolidate
    df = df._consolidate()
    expected = DataFrame(
        [[1, 1, "bah", 3], [1, 2, "bah", 3], [2, 3, "bah", 3]],
        columns=["foo", "foo", "string", "foo2"],
    )
    check(df, expected)

    # insert
    df.insert(2, "new_col", 5.0)
    expected = DataFrame(
        [[1, 1, 5.0, "bah", 3], [1, 2, 5.0, "bah", 3], [2, 3, 5.0, "bah", 3]],
        columns=["foo", "foo", "new_col", "string", "foo2"],
    )
    check(df, expected)

    # insert a dup
    with pytest.raises(ValueError, match="cannot insert"):
        df.insert(2, "new_col", 4.0)

    df.insert(2, "new_col", 4.0, allow_duplicates=True)
    expected = DataFrame(
        [
            [1, 1, 4.0, 5.0, "bah", 3],
            [1, 2, 4.0, 5.0, "bah", 3],
            [2, 3, 4.0, 5.0, "bah", 3],
        ],
        columns=["foo", "foo", "new_col", "new_col", "string", "foo2"],
    )
    check(df, expected)

    # delete (dup)
    del df["foo"]
    expected = DataFrame(
        [[4.0, 5.0, "bah", 3], [4.0, 5.0, "bah", 3], [4.0, 5.0, "bah", 3]],
        columns=["new_col", "new_col", "string", "foo2"],
    )
    assert_frame_equal(df, expected)

    # dup across dtypes
    df = DataFrame(
        [[1, 1, 1.0, 5], [1, 1, 2.0, 5], [2, 1, 3.0, 5]],
        columns=["foo", "bar", "foo", "hello"],
    )
    check(df)

    df["foo2"] = 7.0
    expected = DataFrame(
        [[1, 1, 1.0, 5, 7.0], [1, 1, 2.0, 5, 7.0], [2, 1, 3.0, 5, 7.0]],
        columns=["foo", "bar", "foo", "hello", "foo2"],
    )
    check(df, expected)

    result = df["foo"]
    expected = DataFrame([[1, 1.0], [1, 2.0], [2, 3.0]], columns=["foo", "foo"])
    check(result, expected)

    # multiple replacements
    df["foo"] = "string"
    expected = DataFrame(
        [
            ["string", 1, "string", 5, 7.0],
            ["string", 1, "string", 5, 7.0],
            ["string", 1, "string", 5, 7.0],
        ],
        columns=["foo", "bar", "foo", "hello", "foo2"],
    )
    check(df, expected)

    del df["foo"]
    expected = DataFrame(
        [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "hello", "foo2"]
    )
    check(df, expected)

    # values
    df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=["x", "x"])
    result = df.values
    expected = np.array([[1, 2.5], [3, 4.5]])
    assert (result == expected).all().all()

    # rename, GH 4403
    df4 = DataFrame(
        {"RT": [0.0454], "TClose": [22.02], "TExg": [0.0422]},
        index=MultiIndex.from_tuples(
            [(600809, 20130331)], names=["STK_ID", "RPT_Date"]
        ),
    )

    df5 = DataFrame(
        {
            "RPT_Date": [20120930, 20121231, 20130331],
            "STK_ID": [600809] * 3,
            "STK_Name": ["饡驦", "饡驦", "饡驦"],
            "TClose": [38.05, 41.66, 30.01],
        },
        index=MultiIndex.from_tuples(
            [(600809, 20120930), (600809, 20121231), (600809, 20130331)],
            names=["STK_ID", "RPT_Date"],
        ),
    )

    k = pd.merge(df4, df5, how="inner", left_index=True, right_index=True)
    result = k.rename(columns={"TClose_x": "TClose", "TClose_y": "QT_Close"})
    str(result)
    result.dtypes

    expected = DataFrame(
        [[0.0454, 22.02, 0.0422, 20130331, 600809, "饡驦", 30.01]],
        columns=[
            "RT",
            "TClose",
            "TExg",
            "RPT_Date",
            "STK_ID",
            "STK_Name",
            "QT_Close",
        ],
    ).set_index(["STK_ID", "RPT_Date"], drop=False)
    assert_frame_equal(result, expected)

    # reindex is invalid!
    df = DataFrame(
        [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"]
    )
    msg = "cannot reindex from a duplicate axis"
    with pytest.raises(ValueError, match=msg):
        df.reindex(columns=["bar"])
    with pytest.raises(ValueError, match=msg):
        df.reindex(columns=["bar", "foo"])

    # drop
    df = DataFrame(
        [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"]
    )
    result = df.drop(["a"], axis=1)
    expected = DataFrame([[1], [1], [1]], columns=["bar"])
    check(result, expected)
    result = df.drop("a", axis=1)
    check(result, expected)

    # describe
    df = DataFrame(
        [[1, 1, 1], [2, 2, 2], [3, 3, 3]],
        columns=["bar", "a", "a"],
        dtype="float64",
    )
    result = df.describe()
    s = df.iloc[:, 0].describe()
    expected = pd.concat([s, s, s], keys=df.columns, axis=1)
    check(result, expected)

    # check column dups with index equal and not equal to df's index
    df = DataFrame(
        np.random.randn(5, 3),
        index=["a", "b", "c", "d", "e"],
        columns=["A", "B", "A"],
    )
    for index in [df.index, pd.Index(list("edcba"))]:
        this_df = df.copy()
        expected_ser = pd.Series(index.values, index=this_df.index)
        expected_df = DataFrame(
            {"A": expected_ser, "B": this_df["B"]}, columns=["A", "B", "A"]
        )
        this_df["A"] = index
        check(this_df, expected_df)

    # operations
    for op in ["__add__", "__mul__", "__sub__", "__truediv__"]:
        df = DataFrame(dict(A=np.arange(10), B=np.random.rand(10)))
        expected = getattr(df, op)(df)
        expected.columns = ["A", "A"]
        df.columns = ["A", "A"]
        result = getattr(df, op)(df)
        check(result, expected)

    # multiple assignments that change dtypes
    # the location indexer is a slice
    # GH 6120
    df = DataFrame(np.random.randn(5, 2), columns=["that", "that"])
    expected = DataFrame(1.0, index=range(5), columns=["that", "that"])

    df["that"] = 1.0
    check(df, expected)

    df = DataFrame(np.random.rand(5, 2), columns=["that", "that"])
    expected = DataFrame(1, index=range(5), columns=["that", "that"])

    df["that"] = 1
    check(df, expected)
    return df

df = df.apply(plus, axis=1, args=(2, 3,))
print(df)

# Summary statistics
# describe() is a one-stop statistics bundle that quickly gives us a full picture of the data
df1 = DataFrame({'name': ['ZhangFei', 'GuanYu', 'a', 'b', 'c', 'd'],
                 'data1': range(6)})
print(df1, df1.describe())

# Merging tables
df1 = DataFrame({'name': ['ZhangFei', 'GuanYu', 'a', 'b', 'c'],
                 'data1': range(5)})
df2 = DataFrame({'name': ['ZhangFei', 'GuanYu', 'A', 'B', 'C'],
                 'data2': range(5)})

# 1. join on a specified column
df3 = pd.merge(df1, df2, on='name')
print(df3)

# 2. inner join
# inner join is merge's default behaviour; it is effectively the intersection
# of the keys, and here the key shared by df1 and df2 is name
df.idxmax()  # index of each column's maximum value
'''
one    b
two    b
'''

print df.cumsum()  # cumulative sum of each column
'''
   one  two
a  1.0  NaN
b  8.0  4.0
c  NaN  NaN
d  8.0  5.0
'''

print df.describe()  # summary statistics for every DataFrame column
'''
            one      two
count  3.000000  2.00000
mean   2.666667  2.50000
std    3.785939  2.12132
min    0.000000  1.00000
25%         NaN      NaN
50%         NaN      NaN
75%         NaN      NaN
max    7.000000  4.00000
'''

obj = Series([2, 4, 8, 4], index=['a', 'a', 'b', 'c'])
print obj.describe()  # summary statistics for a Series
'''
print()

# limits - Define the number of bins (Categories) you want to create using the
# precision parameter and the 2nd argument.
print(pd.cut(prime_nos, 3, precision=1))
print()

#--------- NEXT LECTURE ---------#
# Observations on a dataframe - what you do when given a dataframe - initial view
df = DataFrame(np.random.randn(1000, 5))

# basic observation
print(df.head())      # Returns the top 5 rows including dataframe header
print(df.tail())      # Returns the last 5 rows
print(df.describe())  # Gives basic statistical info about the data of the dataframe

column = df[0]
print(column.head())
print()
print(column[np.abs(column) > 3])
print()
print(df[(np.abs(df) > 3).any(1)])

df[(np.abs(df) > 3)] = np.sign(df) * 5
# print(df.describe())
pyplot.show()

# fit model
model = ARIMA(series, order=(5, 1, 0))
model_fit = model.fit(disp=0)
print(model_fit.summary())

# plot residual errors
residuals = DataFrame(model_fit.resid)
residuals.plot()
pyplot.show()
residuals.plot(kind='kde')
pyplot.show()
print(residuals.describe())

# http://www.statsmodels.org/devel/generated/statsmodels.tsa.arima_model.ARIMA.predict.html
X = series.values
size = int(len(X) * 0.66)
train, test = X[0:size], X[size:len(X)]
history = [x for x in train]
predictions = list()
for t in range(len(test)):
    model = ARIMA(history, order=(5, 1, 0))
    model_fit = model.fit(disp=0)
    output = model_fit.forecast()
    yhat = output[0]
    predictions.append(yhat)
    obs = test[t]
import numpy as np
randn = np.random.randn
import pandas as pd
from pandas import Series, DataFrame

np.random.seed(12345)
data = DataFrame(np.random.randn(1000, 4))
print data.describe()

col = data[3]
print col[np.abs(col) > 3]

# To select all rows having a value exceeding 3 or -3, you can use the
# any method on a boolean DataFrame:
print data[(np.abs(data) > 3).any(1)]

data[np.abs(data) > 3] = np.sign(data) * 3
print data.describe()
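# The np.sign(data) * 3 assignment above caps values outside [-3, 3] at +/-3
# while preserving their sign. The same capping can be expressed in one call
# with DataFrame.clip; this sketch targets Python 3 and current pandas, unlike
# the Python 2 snippet above:
import numpy as np
from pandas import DataFrame

np.random.seed(12345)
data = DataFrame(np.random.randn(1000, 4))

# clip(lower, upper) truncates every value to the [-3, 3] interval,
# matching what data[np.abs(data) > 3] = np.sign(data) * 3 does
capped = data.clip(-3, 3)
print(capped.describe())
assert capped.abs().max().max() <= 3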
p = DataFrame(predicted_probs)

# In[186]:

p.shape

# In[187]:

p.head(2)

# In[188]:

p.describe()

# In[189]:

get_ipython().magic(u'pinfo lr.predict_proba')

# In[190]:

p1 = p[0]
p2 = p[1]

# In[192]:
df
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
df
df.sum()         # columns sum
df.sum(axis=1)   # sum row by row
df
(7.10 - 4.5) / 2
df.mean(axis=1, skipna=False)
df
df.idxmax()
df
df.cumsum()      # accumulation
df.describe()    # multiple summary statistics in one shot.

obj = Series(['a', 'a', 'b', 'c'] * 4)
obj
obj.describe()

## Correlation and Covariance
import pandas.io.data as web

all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010')

price = DataFrame({tic: data['Adj Close']
                   for tic, data in all_data.iteritems()})
price
volume = DataFrame({tic: data['Volume']
def descriptiveStatsDataFrame():
    df = DataFrame([[1.4, np.nan], [7, 5], [np.nan, np.nan], [7, 10]],
                   index=['a', 'b', 'c', 'd'],
                   columns=['one', 'two'])
    print(df)
    print('Column Sum: \n{}'.format(df.sum(axis=0)))
    print('Row Sum: \n{}'.format(df.sum(axis=1)))
    print('Do not skip NA: \n{}'.format(df.sum(axis=1, skipna=False)))
    print('Index with min Value: \n{}'.format(df.idxmin()))
    print('Summary Statistic: \n{}'.format(df.describe()))
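# describe() also accepts a percentiles argument when the default quartiles
# are not enough; a short sketch extending the function above (the 10th/90th
# percentiles are an arbitrary choice):
import numpy as np
from pandas import DataFrame

df = DataFrame([[1.4, np.nan], [7, 5], [np.nan, np.nan], [7, 10]],
               index=['a', 'b', 'c', 'd'], columns=['one', 'two'])

# count, mean, std, min, max and the median are always included
print(df.describe(percentiles=[0.1, 0.9]))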