Example #1
0
def pd_02():
    """Demonstrate detecting, dropping and filling missing values in pandas.

    Every intermediate result is printed to stdout. The function takes no
    arguments and returns ``None``.
    """
    # Single-argument print(...) behaves identically on Python 2 and 3.
    # Both np.nan and None are reported as missing by isnull()/dropna().
    string_data = Series(['a', 'b', 'c', np.nan, 'e', None])
    print(string_data)
    print(string_data.isnull())
    print(string_data.dropna())

    df = DataFrame(np.random.randn(7, 3))
    # .loc replaces the removed .ix indexer; label slices include the end
    # label, so this blanks rows 0..4 of column 1 and rows 0..2 of column 2.
    df.loc[:4, 1] = np.nan
    df.loc[:2, 2] = np.nan
    print(df)
    print(df.dropna())
    print(df.fillna(0))
    # A dict gives one fill value per column label; the frame has columns
    # 0, 1 and 2 only, so the entry for label 3 matches nothing.
    print(df.fillna({1: 0.5, 3: -1}))
    # fillna() returned new objects above, so df still contains its NaNs.
    print(df)
    df.fillna(0, inplace=True)
    print(df)
Example #2
0
[NA, NA, NA], [NA, 6.5, 3.]])
# NOTE(review): `data`, `NA` and the first `df` used below are expected to be
# defined before this excerpt -- confirm against the full file.
data
# dropna() with no arguments drops every row containing at least one NA value.
cleaned = data.dropna()
cleaned
cleaned = data.dropna()
# how='all' drops only the rows in which every value is NA.
data.dropna(how='all')
data[4] = NA
data
# With axis=1 the same rule is applied to columns instead of rows.
data.dropna(axis=1, how='all')
## Filling in Missing Data
df
df.fillna(0)
df = DataFrame(np.random.randn(7, 3))
df
# .loc replaces the removed .ix indexer; label slices include the end label.
df.loc[:4, 1]
df.loc[:4, 1] = NA
df.loc[:4, 1]
df.loc[:2, 2] = NA
df
# thresh=3 keeps only the rows that have at least 3 non-NA values.
df.dropna(thresh=3)
df
# A dict gives one fill value per column label.
df.fillna({1: 0.5, 3: -1})
df
# inplace=True modifies df itself; the result is bound to the throwaway name _.
_ = df.fillna(0, inplace=True)
df
a = 5
_ = a + 3
a
df = DataFrame(np.random.randn(6, 3))
df.loc[2:, 1] = NA
df.loc[4:, 2] = NA
Example #3
0
data_nan_dataframe = DataFrame([[1., 6.5, 3.],
                                [1., np.nan, np.nan],
                                [np.nan, np.nan, np.nan],
                                [np.nan, 6.5, 3]])
# print(data_nan_dataframe)
# By default dropna() removes every row that contains an NA value.
# print(data_nan_dataframe.dropna())
# how='all' removes only the rows that are entirely NA.
# print(data_nan_dataframe.dropna(how='all'))
# An axis can be specified.
# print(data_nan_dataframe.dropna(axis=1))

# For time series:
# to keep only rows with enough observations, use the thresh argument.
data_nan_dataframe_time = DataFrame(np.random.randn(7, 3))
# .loc replaces the removed .ix indexer; label slices include the end label.
data_nan_dataframe_time.loc[:4, 1] = np.nan
data_nan_dataframe_time.loc[:2, 2] = np.nan
# print(data_nan_dataframe_time)
# print(data_nan_dataframe_time.dropna(thresh=3))
# print(data_nan_dataframe_time.dropna())


# Filling in missing data

# Use fillna to fill missing values.
# print(data_nan_dataframe_time.fillna(0))
# A dict can be passed to fillna to use a different fill value per column.
# print(data_nan_dataframe_time.fillna({1: 0.5, 3: -1}))
# fillna returns a new object by default and leaves the original unchanged;
# pass inplace=True to modify the original object instead.
_ = data_nan_dataframe_time.fillna(0, inplace=True)
0  1.0  6.5  3.0
1  1.0  NaN  NaN
3  NaN  6.5  3.0
'''
# NOTE(review): `data` and `NA` are expected to be defined before this
# excerpt -- confirm against the full file.
data[0] = NA
# axis=1 with how='all' drops only the columns whose values are all NA.
print(data.dropna(axis=1, how='all'))
'''
     1    2
0  6.5  3.0
1  NaN  NaN
2  NaN  NaN
3  6.5  3.0
'''
data = DataFrame(numpy.arange(21).reshape(7, 3))
# .loc replaces the removed .ix indexer; label slices include the end label.
# NOTE(review): recent pandas may warn about writing NaN into the integer
# columns created above -- confirm against the pandas version in use.
data.loc[:4, 1] = NA
data.loc[:2, 2] = NA
print(data)
'''
    0     1     2
0   0   NaN   NaN
1   3   NaN   NaN
2   6   NaN   NaN
3   9   NaN  11.0
4  12   NaN  14.0
5  15  16.0  17.0
6  18  19.0  20.0
'''
# thresh=2 keeps only the rows that have at least 2 non-NA values.
print(data.dropna(thresh=2))
data = Series([1, NA, 3.5, NA, 7])

# Drop the missing entries ...
data.dropna()
# ... or select the non-missing ones with a boolean mask.
data[data.notnull()]


data = DataFrame([[1, 6.5, 3], [1, NA, NA], [NA, NA, NA], [NA, 6.5, 3]])


# Drop only the rows that are entirely NA.
data.dropna(how='all')


df = DataFrame(np.random.randn(7, 3))
# .loc replaces the removed .ix indexer; label slices include the end label.
df.loc[:4, 1] = NA
df.loc[:2, 2] = NA

# thresh=3 keeps only the rows that have at least 3 non-NA values.
df.dropna(thresh=3)


# Filling in missing data

# Returns a new object.
df.fillna(0)
# Modifies the existing object in place:
_ = df.fillna(0, inplace=True)
#********************************************************


from pandas import Series,DataFrame
import pandas as pd
Example #6
0
# Correlation and covariance
df = DataFrame(np.random.randn(100, 3), columns=list('abc'))
df.corr()
df.cov()
# Correlation of every column with column 'a'.
df.corrwith(df['a'])

# Unique values, value counts, membership
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
uniques = obj.unique()
obj.value_counts()
mask = obj.isin(['b', 'c'])
obj[mask]

# Dealing with missing data
df = DataFrame(np.random.randn(7, 3))
# .loc replaces the removed .ix indexer; label slices include the end label.
df.loc[:4, 1] = np.nan
df.loc[:2, 2] = np.nan

# thresh=3 keeps only the rows that have at least 3 non-NA values.
df.dropna(thresh=3)
df.fillna(0)
# A dict gives one fill value per column label.
df.fillna({1: 0.5, 3: -1})
# bfill() replaces the deprecated fillna(method='bfill') spelling.
df.bfill()
# limit caps how many consecutive NaNs are filled.
df.bfill(limit=2)

# read and write data 
'''
pd.read_csv(file_path, sep=',', header=None, index_col, names, skiprows, na_values, nrows, chunksize)
df.to_csv(sys.stdout, na_rep='NULL', index=False, header=False)
'''

# manually working with delimited formats