def pd_dataframe():
    """Print a step-by-step tour of basic ``DataFrame`` column handling.

    Builds a small state/year/population table, then prints it after each
    manipulation: reordering columns, adding an all-NaN column, selecting a
    column and a row, overwriting a column with a scalar, an array and a
    ``Series``, and finally adding and deleting a derived column.

    Uses the module-level names ``DataFrame``, ``Series`` and ``np``.
    Returns ``None``; all results go to standard output.
    """
    data = {
        'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
    }
    frame = DataFrame(data)
    print(frame)
    # Passing ``columns`` fixes the column order.
    print(DataFrame(data, columns=['year', 'state', 'pop']))

    # 'debt' is not a key of ``data``, so that column starts out as all NaN.
    frame2 = DataFrame(
        data,
        columns=['year', 'state', 'pop', 'debt'],
        index=['one', 'two', 'three', 'four', 'five'],
    )
    print(frame2)
    print(frame2['state'])
    print(frame2.year)
    # Label-based row lookup (``.ix`` no longer exists in current pandas).
    print(frame2.loc['three'])

    # Bracket assignment always targets the column, never a plain attribute.
    frame2['debt'] = 16.5
    print(frame2)
    frame2['debt'] = np.arange(5.)
    print(frame2)

    # A Series is aligned on the index; rows it does not mention become NaN.
    val = Series(data=[-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
    frame2['debt'] = val
    print(frame2)

    frame2['eastern'] = frame2.state == 'Ohio'
    print(frame2)
    del frame2['eastern']
    print(frame2.columns)
# Name: first_name, dtype: object
print("\n")
print(df.loc[1])  # loc: label-based access -- the row whose index label is 1
print("\n")
print(df["age"].iloc[1:])  # iloc: position-based access -- from position 1 onward
print("\n")

# The scalar np.nan is broadcast to every index label, so all values are NaN.
s = pd.Series(np.nan, index=[49, 48, 47, 46, 45, 1, 2, 3, 4, 5])
print(s)
print("\n")
print(s.iloc[3:])  # positional: prints from position 3 to the end
print("\n")
print(s.loc[:3])  # label-based: prints up to and including the label 3
print("\n")
print(df.age > 40)
print("\n")

# Assign new data to a column.  Bracket assignment always writes a column;
# ``df.debt = ...`` would only set a plain Python attribute if no "debt"
# column existed yet.
df["debt"] = df.age > 40
print(df)
#   first_name last_name  age           city   debt
# 0      Jason    Miller   42  San Francisco   True
# 1      Molly  Jacobson   52      Baltimore   True
# 2       Tina       Ali   36          Miami  False
# 3       Jake    Milner   24        Douglas  False
# 4        Amy     Cooze   73         Boston   True
print("\n=======================================================\n")

values = Series(data=["M", "F", "F"], index=[0, 1, 3])
print(values)
print("\n")
# Create a "sex" column; values are placed at the rows whose index labels
# appear in ``values`` (0, 1 and 3).
df["sex"] = values
print(df)
print("\n")
df["first_name"]  # data access: select a single column

df.loc[1]  # loc: access a specific row, keyed by index label
df.loc[1:2]
df.iloc[1:]  # iloc: keyed by index position

import numpy as np

s = pd.Series(np.nan, index=[49, 48, 47, 46, 45, 1, 2, 3, 4, 5])
s.loc[:3]
s.iloc[:3]

df
# Assign new data.  Bracket assignment guarantees that a real "debt" column
# is written, so the ``del df["debt"]`` below cannot fail with a KeyError.
df["debt"] = df.age > 40
df
df.T  # transpose
df.values
df.to_csv()
del df["debt"]
df

## Selection & Drop
df.head(3)
# NOTE(review): the original selected "first_data"; this snippet only ever
# refers to a "first_name" column, so that looked like a typo -- confirm.
df[["first_name", "age"]].head(3)
df[:3]  # slicing without a column name works on rows
columns=['year', 'state', 'pop', 'debt'], index=['one', 'two', 'three', 'four', 'five']) frame2 # 2 ways to retrieve a column: frame2['state'] frame2.year ## rows retrieval: frame2.ix['three'] frame2[2:] frame2['debt'] = np.arange(5) frame2 val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five']) frame2.debt = val frame2 frame2['eastern'] = frame2['state'] == 'Ohio' frame2 ## delete a column del frame2['eastern'] frame2.columns ### passing nested dictionary to DataFrame: pop = { 'Nevada': { 2001: 2.4, 2002: 2.9 },
# Build the table with an extra 'debt' column and string row labels.
frame2 = DataFrame(
    data,
    index=['one', 'two', 'three', 'four', 'five'],
    columns=['year', 'state', 'pop', 'debt'],
)
print(frame2)
print(frame2.state)
print(frame2['year'])
print("-------------")
print(frame2.iloc[1:3])  # positional slice: rows at positions 1 and 2
print("-------------")
print(frame2.loc[['three', 'one']])  # label lookup: rows 'three' and 'one', in that order
# .ix is deprecated: use .loc for label-based indexing and .iloc for
# positional indexing.
print('2----------------')

# Overwrite the whole column with one scalar.
frame2['debt'] = 16.5
print(frame2)
# Overwrite the column element-wise from a numpy array.
frame2['debt'] = np.arange(5)
print(frame2)
print()

print('用Series指定要修改的索引及其对应的值,没有指定的默认数据用NaN。')
# Only the labels present in the Series receive a value.
val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
print(frame2)
print()

print('赋值给新列')
# True where the state is 'Ohio'.
frame2['eastern'] = frame2['state'] == 'Ohio'
print(frame2)
print(frame2.columns)
print()
def main():
    """Print a guided tour of pandas ``Series``, ``DataFrame`` and ``Index``.

    The walkthrough has three parts: ``Series`` construction, indexing,
    arithmetic and NaN handling; ``DataFrame`` construction, column and row
    access, and column assignment/deletion; and ``Index`` objects
    (immutability, identity and membership tests).

    Uses the module-level names ``copy``, ``sys``, ``np``, ``pd``,
    ``Series`` and ``DataFrame``.  Returns ``None``; everything is written to
    standard output.
    """
    # ---- Series -----------------------------------------------------------
    # A Series is like a 1-d array.
    lst = [4, 7, -5, 3]
    obj = Series(lst)
    print(obj)
    print(obj.values)
    print(obj.index)

    obj2 = Series(copy.deepcopy(lst), index=['d', 'b', 'a', 'c'])
    print(obj2)
    print(obj2.index)

    # numpy-ndarray-like behaviour
    print(obj2['a'])
    obj2['d'] = 6
    print(obj2[['c', 'a', 'd']])
    print(obj2)
    print(obj2[obj2 > 0])
    print(obj2 * 2)
    print(np.exp(obj2))

    # dict-like behaviour
    print('b' in obj2, 'e' in obj2)

    # dict -> Series
    sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
    obj3 = Series(sdata)
    print(obj3)

    # A label that is missing from the dict gets the value NaN.
    states = ['Calfornia', 'Ohio', 'Oregon', 'Texas']
    obj4 = Series(sdata, index=states)
    print(obj4)

    # pandas functions
    print(pd.isnull(obj4))
    print(pd.notnull(obj4))

    # Series method
    print(obj4.isnull())

    # The sum is NaN for every label that exists on only one side.
    print(obj3)
    print(obj4)
    print(obj3 + obj4)

    # Names for the Series and for its index
    obj4.name = 'population'
    obj4.index.name = 'state'
    print(obj4)

    # Replace the index
    obj4.index = ['a', 'b', 'c', 'd']
    print(obj4)

    # ---- DataFrame --------------------------------------------------------
    # A DataFrame is like a spreadsheet (table), similar to an R data frame.
    print('', '')
    data = {
        'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
    }
    frame = DataFrame(data)
    print(frame)

    # Choose the column order
    print(DataFrame(data, columns=['year', 'state', 'pop']))

    # Named index labels; a column that is not in ``data`` is all NaN.
    frame2 = DataFrame(
        data,
        columns=['year', 'state', 'pop', 'debt'],
        index=['one', 'two', 'three', 'four', 'five'],
    )
    print(frame2)
    print(frame2.columns)
    print(frame2['state'])
    print(frame2.year)

    # Row data by label (``.ix`` no longer exists in current pandas).
    print(frame2.loc['three'])

    # Change column values.  Bracket assignment always targets the column.
    frame2['debt'] = 16.5
    print(frame2.debt)
    # The array length must match the number of rows (5); a different length
    # is an error.
    frame2['debt'] = np.arange(5.)
    print(frame2.debt)

    # Rows whose label is missing from the Series become NaN.
    val = Series([-1.2, -1.5, -1.7], index=['one', 'three', 'five'])
    frame2['debt'] = val
    print(frame2)

    # A new column must be created with brackets, not as ``frame2.eastern``.
    frame2['eastern'] = frame2.state == 'Ohio'
    print(frame2)

    # Likewise a column is deleted with brackets, not as ``frame2.eastern``.
    del frame2['eastern']
    print(frame2)

    # Nested dict -> DataFrame: outer keys become columns, inner keys become
    # the index.  NOTE(review): the original comment said both are sorted;
    # key ordering may depend on the pandas version -- confirm.
    pop = {
        'Ohio': {2002: 3.6, 2000: 1.5, 2001: 1.7},
        'Nevada': {2001: 2.4, 2002: 2.9},
    }
    frame3 = DataFrame(pop)
    print(frame3)

    # Transpose
    print(frame3.T)

    # With explicit index/columns the given order is used as-is.
    print(DataFrame(pop, index=[2001, 2002, 2000], columns=['Ohio', 'Nevada']))

    # ``values`` returns an ndarray, which has a single dtype; columns of
    # different types are cast to a common one.
    print(frame3.values)
    print(frame3.values.dtype)
    print(frame2.values)
    print(frame2.values.dtype)
    print(frame2['year'].values.dtype)

    # ---- Index objects (IntIndex, DateTimeIndex, PeriodIndex) -------------
    obj = Series(range(3), index='a b c'.split(' '))
    index = obj.index
    print(type(index))
    print(index.dtype)

    # An Index is immutable: item assignment raises TypeError.
    try:
        index[1] = 'd'
    except TypeError:
        print(sys.exc_info())

    # ``is`` compares identity, so a freshly built equal Index is not "is".
    print(obj.index is pd.Index(['a', 'b', 'c']))
    index = pd.Index(np.arange(3))
    obj2 = Series(range(3), index=index)
    print(obj2.index is index)

    # Index used like a set
    print(frame3)
    print(type(frame3.columns))
    print('Ohio' in frame3.columns)
    print(2003 in frame3.index)
    # ``append`` returns a new Index; ``index`` itself is printed unchanged.
    print(index.append(pd.Index(['d'])))
    print(index)
# result
# year     2002
# state    good
# pop       2.4
# debt      NaN
# Name: three, dtype: object

# x['dept'] = 16.5  # modify a whole column
# result
#        year state  pop debt  dept
# one    2000    ok  3.7  NaN  16.5
# two    2001    ok  3.6  NaN  16.5
# three  2002  good  2.4  NaN  16.5
# four   2003   bad  0.9  NaN  16.5

# Modify the elements with a numpy array.  Bracket assignment always writes
# the column; attribute assignment would only do so if "debt" already exists.
x['debt'] = n.arange(4)
# result
#        year state  pop  debt
# one    2000    ok  3.7     0
# two    2001    ok  3.6     1
# three  2002  good  2.4     2
# four   2003   bad  0.9     3

val = Series([-1.2, -1.5, -1.7, 0], index=['one', 'two', 'four', 'six'])
# Modify the elements with a Series; the DataFrame's row index is unchanged.
x['debt'] = val
# result
#        year state  pop  debt
# one    2000    ok  3.7  -1.2
# two    2001    ok  3.6  -1.5
# three  2002  good  2.4   NaN
# four   2003   bad  0.9  -1.7
# Column-oriented source data: one list per column.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}
frame = DataFrame(data)

# 'debt' is not a key of ``data``, so it is created as an all-NaN column.
frame2 = DataFrame(
    data,
    columns=['year', 'state', 'pop', 'debt'],
    index=['one', 'two', 'three', 'four', 'five'],
)
frame2['state']
frame2.values
frame2.year

# ``.ix`` no longer exists in current pandas: use ``.loc`` for label lookups
# and ``.iloc`` for positional lookups.
frame2.loc['three']
frame2.iloc[3]

frame2['debt'] = 16.5
# frame2['debt'] = [16.5, 17, 26]  # would fail: 3 values for 5 rows
frame2['debt'] = np.arange(5.)

# Assignment from a Series aligns on the index; missing labels become NaN.
val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val

frame2['eastern'] = frame2.state == 'Ohio'
del frame2['eastern']

# dict of dicts into DataFrame: outer keys are columns, inner keys the index
pop = {
    'Nevada': {2001: 2.5, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
frame3 = DataFrame(pop)
frame3.index.name = 'year'
frame3.columns.name = 'state'

frameTest = DataFrame(frame3)
# Labels 1, 2, 3 are not in frame3's index, so the result holds only NaN.
frameTest = DataFrame(frame3, index=[1, 2, 3])

# pandas Index object
# In[29]:
frame2.state

# In[30]:
frame2.year

# In[31]:
frame2.loc["three"]

# In[32]:
# Bracket assignment always writes the column; attribute assignment would
# only set a plain attribute if "debt" were not already a column.
frame2['debt'] = np.arange(5) + 1
frame2

# In[33]:
# Rows whose label is not in the Series ('one' and 'four') become NaN.
val = Series([-1.2, -1.5, -1.7], index=['two', 'three', 'five'])
frame2['debt'] = val

# In[34]:
frame2

# In[35]:
frame2['eastern'] = frame2.state == 'Ohio'
} frame = DataFrame(data) # print(frame) # print(type(frame.year)) frame2 = DataFrame(data, columns=['year', 'state', 'pop']) frame3 = DataFrame(data, columns=['year', 'state', 'pop', 'debt'], index=['one', 'two', 'three', 'four', 'five']) # print(frame3['state']) # print(frame3.state) # print(frame3.ix['one']) frame3.debt = 100 frame3.debt = np.arange(5.) # print(np.arange(6.)) # print(type(np.arange(6.))) temp_series = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five']) frame3.debt = temp_series frame3['ddd'] = np.nan #추가할때나 지울때는 인덱스 형식으로 해야함 frame3.ddd 안됨 # print(frame3) frame3['eastern'] = frame3.state == 'Ohio' # print(frame3)
# Sample records, one list per column.
raw_data = {
    'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
    'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
    'age': [42, 52, 36, 24, 73],
    'city': ['San Francisco', 'Baltimore', 'Miami', 'Douglas', 'Boston'],
}

df = pd.DataFrame(raw_data, columns=['first_name', 'last_name', 'age', 'city'])
print(df)
# Passing a subset of the keys as ``columns`` keeps only those columns.
print(pd.DataFrame(raw_data, columns=['first_name', 'age']))
print(df.loc[1])

# 'debt' is not a key of ``raw_data``, so it is created as an all-NaN column.
df_1 = pd.DataFrame(
    raw_data,
    columns=['first_name', 'last_name', 'age', 'city', 'debt'],
)
print(df_1)
# Bracket assignment always writes the column, whether or not it exists yet.
df_1['debt'] = df_1.age > 40
print(df_1)
print(df_1.values)
print(df_1.to_csv())

print(df['first_name'].head(2))
print(df[['first_name', 'last_name']].head(2))
""" data = { 'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'], 'year': [2000, 2001, 2002, 2001, 2002], 'pop': [1.5, 1.7, 3.6, 2.4, 2.9] } frame = DataFrame(data) frame DataFrame(data, columns=['year', 'state', 'pop']) frame2 = DataFrame(data, columns=['year', 'state', 'pop', 'debt'], index=['one', 'two', 'three', 'four', 'five']) frame2['state'] frame2.year frame2.ix['three'] frame2.debt = 16.5 frame2['debt'] = np.arange(5.) frame2.debt = 'NaN' val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five']) frame2['debt'] = val frame2['eastern'] = frame2.state == 'Ohio' del frame2['eastern'] frame2.columns pop = { 'Nevada': { 2001: 2.4, 2002: 2.9 }, 'Ohio': { 2000: 1.5, 2001: 1.7,
print(DataFrame(data))
print(DataFrame(data, columns=['year', 'state', 'pop']))  # designate the column order
print()

print("""## Designate index and column, if column not exist, set 'NaN' in this column:""")
frame2 = DataFrame(
    data,
    columns=['year', 'state', 'pop', 'debt'],
    index=['one', 'two', 'three', 'four', 'five'],
)
print(frame2)
print(frame2['state'])
print(frame2.year)
# Label-based row lookup (``.ix`` no longer exists in current pandas).
print(frame2.loc['three'])

frame2['debt'] = 16.5  # modify the whole column using a single value
print(frame2)
frame2['debt'] = np.arange(5)  # modify the whole column using an array
print(frame2)
print()

print("""## Use Series to designate the index and value for modification, the unspecified value set to 'NaN':""")
val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
print(frame2)
print()

print("## Assign value to new column:")
frame2['eastern'] = (frame2.state == 'Ohio')
print(frame2)
print(frame2.columns)
'pop':[1.5, 1.7, 3.6, 2.4, 2.9]} print DataFrame(data) print DataFrame(data, columns = ['year', 'state', 'pop']) # 指定列顺序 print print '指定索引,在列中指定不存在的列,默认数据用NaN。' frame2 = DataFrame(data, columns = ['year', 'state', 'pop', 'debt'], index = ['one', 'two', 'three', 'four', 'five']) print frame2 print frame2['state'] print frame2.year print frame2.ix['three'] frame2['debt'] = 16.5 # 修改一整列 print frame2 frame2.debt = np.arange(5) # 用numpy数组修改元素 print frame2 print print '用Series指定要修改的索引及其对应的值,没有指定的默认数据用NaN。' val = Series([-1.2, -1.5, -1.7], index = ['two', 'four', 'five']) frame2['debt'] = val print frame2 print print '赋值给新列' frame2['eastern'] = (frame2.state == 'Ohio') # 如果state等于Ohio为True print frame2 print frame2.columns print
# five   2002  Nevada  2.9   NaN
print(frame2.year)
# one      2000
# two      2001
# three    2002
# four     2001
# five     2002

# A specific row, selected by index label (``.ix`` no longer exists in
# current pandas; ``.loc`` is the label-based replacement).
print(frame2.loc['three'])
# year     2002
# state    Ohio
# pop       3.6
# debt      NaN

# Bracket assignment always writes the column; attribute assignment would
# only set a plain attribute if "debt" were not already a column.
frame2['debt'] = 16.5
print(frame2)
#        year   state  pop  debt
# one    2000    Ohio  1.5  16.5
# two    2001    Ohio  1.7  16.5
# three  2002    Ohio  3.6  16.5
# four   2001  Nevada  2.4  16.5
# five   2002  Nevada  2.9  16.5

frame2['debt'] = np.arange(5)
print(frame2)
#        year   state  pop  debt
# one    2000    Ohio  1.5     0
# two    2001    Ohio  1.7     1
# three  2002    Ohio  3.6     2
# four   2001  Nevada  2.4     3
'pop': [1.5, 1.7, 3.6, 2.4, 2.9] } print DataFrame(data) print DataFrame(data, columns=['year', 'state', 'pop']) print frame2 = DataFrame(data, columns=['year', 'state', 'pop', 'debt'], index=['one', 'two', 'three', 'four', 'five']) print frame2 print frame2['state'] print frame2.year print frame2.ix['three'] frame2['debt'] = 16.5 print frame2 frame2.debt = np.arange(5) print frame2 print val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five']) frame2['debt'] = val print frame2 print frame2['eastern'] = (frame2.state == 'Ohio') print frame2 print frame2.columns print pop = { 'Nevada': {
'pop': [1.5, 1.7, 3.6, 2.4, 2.9] } frame = DataFrame(data) frame2 = DataFrame(data, columns=['year', 'state', 'pop', 'debt'], index=['one', 'two', 'three', 'four', 'five']) frame2['state'] frame2.values frame2.year frame2.ix['three'] frame2.ix[3] frame2['debt'] = 16.5 #frame2['debt'] = [16.5, 17, 26] frame2.debt = np.arange(5.) val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five']) frame2.debt = val frame2['eastern'] = frame2.state == 'Ohio' del frame2['eastern'] # dict of dicts into DataFrame pop = { 'Nevada': { 2001: 2.5, 2002: 2.9 }, 'Ohio': { 2000: 1.5, 2001: 1.7, 2002: 3.6