Example #1
0
    # For every target level, find the contiguous run of altitude indices
    # whose altitude lies within +/- 1.0 of lev[z] and record it as a slice.
    for z in xrange(nz):
        # Start from the full index range and mask indices below the window...
        dum = np.ma.masked_where(da.altitude.values < (lev[z] - 1.0),
                                 np.arange(da.altitude.shape[0]))
        # ...then also mask indices above the window.
        dum = np.ma.masked_where(da.altitude.values > (lev[z] + 1.0), dum)
        # NOTE(review): slice() excludes its stop value, so the last in-window
        # index is not part of the selection -- confirm this is intended.
        tuples_ls.append(slice(dum.compressed()[0], dum.compressed()[-1]))

    # For each altitude window, group the data into latitude bins with edges
    # arange(lat_b1, lat_b2, lat_step), label each bin by its centre, and
    # take the mean of each bin.
    vertlevels = [
        da.isel(altitude=i).groupby_bins(
            'lat',
            np.arange(lat_b1, lat_b2, lat_step),
            labels=(np.arange(lat_b1, lat_b2, lat_step) +
                    lat_step / 2)[:-1]).mean() for i in tuples_ls
    ]

    # Stack the per-level results along 'altitude', relabel that axis with the
    # target levels and rename the bin dimension back to 'lat'.
    xavgvel = xr.concat(vertlevels, 'altitude')
    xavgvel['altitude'] = lev
    xavgvel = xavgvel.rename({'lat_bins': 'lat'})
    # Write the binned field as variable 'ta' to a NetCDF file named from
    # in_year and month_converter(in_month).
    xavgvel_ds = xavgvel.to_dataset(name='ta')
    xavgvel_ds.to_netcdf('SABER_temp_' + in_year + month_converter(in_month) +
                         '_binned.nc')
    #sys.exit()

# Combine the files listed in `infiles` into a single dataset whose output
# name carries the first and last processed year.
in_year_arr = np.array(in_year_ls)
s_year = str(np.min(in_year_arr))
e_year = str(np.max(in_year_arr))
# NOTE(review): recent xarray releases require combine='nested' whenever
# concat_dim is passed -- confirm against the installed xarray version.
ds = xr.open_mfdataset(infiles, concat_dim='time')
# Rebuild the time axis as consecutive month-end stamps, one per time step.
# The start is written as 'YYYY-01' because a bare six-digit 'YYYY01' string
# is ambiguous and may be parsed as YYMMDD.
ds['time'] = pd.date_range(start=s_year + '-01',
                           periods=ds.time.shape[0],
                           freq='M')
ds.to_netcdf('SABER_temp_' + s_year + '01' + '_' + e_year + '12_binned_zm.nc')
Example #2
0
data_list = np.asarray(data_list)

# Keep column 0 (used as the date index) plus columns 4 and 6, which become
# the 'close' and 'adj close' columns.
selected_data = data_list[:, [0, 4, 6]]

df = pd.DataFrame(
    data=selected_data[:, 1:],
    index=selected_data[:, 0],
    columns=['close', 'adj close'],
    dtype='float64',
)

# Reference for pandas interpolation http://pandas.pydata.org/pandas-docs/stable/missing_data.html
# Add the missing calendar dates to the frame so they can be interpolated.
# NOTE: df1 is the same object as df, so converting the index below also
# converts df's index.
df1 = df
idx = pd.date_range('12-29-2006', '12-31-2016')
df1.index = pd.DatetimeIndex(df1.index)
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df1 = df1.reindex(idx, fill_value=np.nan)
# df1.count gives 2518
interpolated_df = df1.interpolate()
interpolated_df.count()  # gives 3651 count

# Drop the three leading rows that were only added so interpolation had a
# starting point.
interpolated_df = interpolated_df[3:]

# NOTE(review): try_parsing_data below iterates `data_format`, which is not
# defined in the visible code; it probably means this `date_format` list.
date_format = ["%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S+%f"]

def try_parsing_data(text):
    for fmt in data_format:
        # return datetime.strptime(text, fmt)
        try:
Example #3
0
df = pd.read_csv("./911.csv")

# Extract the category of every call: the part of the title before ": ".
temp_list = df["title"].str.split(": ").tolist()
cate_list = [i[0] for i in temp_list]
cate_df = pd.DataFrame(np.array(cate_list).reshape((df.shape[0], 1)),
                       columns=["cate"])
print(cate_df)

# The "cate" column has to exist on df before it can be used as a group key.
df["cate"] = cate_df["cate"]
print(df.groupby(by="cate").count()["title"])

# Time series
# Signature reference: pd.date_range(start=None, end=None, periods=None, freq='D')
# Kept as a comment because pandas requires exactly three of
# start / end / periods / freq, so calling it with these defaults raises.
'''  periods  时期 每段时间
D--day
M--month  每月最后一天  MS 每月第一天  BMS 每月第一个工作日
B工作日
H每小时
T/min 每分    S 每秒    L 每毫秒   BM 每月最后一个工作日
start和end以及freq配合能够生成start和end范围内以频率freq的一组时间索引
start和periods以及freq配合能够生成以start开始的频率为freq的periods个时间索引


df = pd.DataFrame(np.random.rand(10),index=index)
df["timeStamp"] = pd.to_datetime(df["timeStamp"],format="")
format参数大部分情况可以不用写,对于pandas无法格式化的时间字符串可以用该参数,比如包含中文
百度  python时间格式化
'''
Example #4
0
print(d)

# Interpolate missing values using the datetime index as the x-axis.
d = c.interpolate(method="time")
print(d)

# Drop every row that contains at least one missing value.
new_c1 = c.dropna()
print(new_c1)

# Drop only the rows in which every value is missing.
new_c1 = c.dropna(how="all")
print(new_c1)

# Keep only the rows that have at least two non-missing values.
new_c1 = c.dropna(thresh=2)
print(new_c1)

# Reindex onto a complete daily range so absent days show up as NaN rows.
print(c)
dt = pd.date_range("01-01-2017", "01-11-2017")
idx = pd.DatetimeIndex(dt)
c = c.reindex(idx)
print(c)

# fillna(method=...) is deprecated in recent pandas; ffill()/bfill() are the
# direct equivalents.
# Forward-fill, carrying a value across at most one consecutive gap.
new_c = c.ffill(limit=1)
print(new_c)

# Backward-fill down the rows.
new_c = c.bfill()
print(new_c)

# Backward-fill across the columns of each row.
new_c = c.bfill(axis="columns")
print(new_c)

c = pd.read_csv('F:/freelencing projects/pandas_tutorial/fillna.csv',
                parse_dates=["day"])
Example #5
0
print(df.loc[:, ['A', 'B']])  # all rows, columns A and B
# Only the row labelled 20180101, columns A and B; loc always selects by label.
print(df.loc['20180101', ['A', 'B']])
#### select by position: iloc
print(df.iloc[3])  # the row at position 3 (the fourth row)
print(df.iloc[3, 0])  # row position 3, column position 0
print(df.iloc[3:5, 1:3])  # slice by position
print(df.iloc[[1, 3, 5], 1:3])  # non-contiguous rows picked one by one
#### mixed select by label & position
# DataFrame.ix was removed in pandas 1.0; take rows by position with iloc,
# then columns by label with loc.
print(df.iloc[3:].loc[:, ['A', 'C']])
#### Boolean indexing select
print(df[df.A > 8])  # rows whose A value is greater than 8
### Filter rows with isin()
# NOTE(review): assumes df has a 'high' column -- not shown in this fragment.
df[df['high'].isin([0.00, 9.00])]  # rows whose 'high' value is 0 or 9

## Setting values in pandas
dates = pd.date_range('20180101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
### Assign by position with iloc
df.iloc[2, 2] = 111  # third row, third column
### Assign by label with loc
df.loc['20180102', 'B'] = 222
### boolean indexing
df[df.A > 4] = 0  # every column of the rows where A > 4 becomes 0
# Only column A of the rows where A > 4 becomes 0; a single .loc call avoids
# chained assignment, which may silently write to a copy.
df.loc[df.A > 4, 'A'] = 0
df['F'] = np.nan  # add column F filled with NaN
# Add column E; the Series uses the same row labels as the frame so the
# values align row by row.
df['E'] = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20180101', periods=6))

## Handling missing values
dates = pd.date_range('20180101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
Example #6
0
# Fit the model and print its summary.
# NOTE(review): `model` is created outside this fragment -- presumably a
# statsmodels ARIMA model; confirm.
restult = model.fit()
# delta = ARIMAModel.fittedvalues - diff.iloc[:,0]
# score = 1 - delta.var()/diff.var()
# print(score)
print(restult.summary())
# Forecast from 2018-05-31 through 2019-05-31.
p = restult.predict('20180531', '20190531', dynamic=True, typ='levels')
# Plot the forecast together with the training data on one figure.
plt.figure(figsize=(6, 6))
plt.xticks(rotation=45)
plt.plot(p)
plt.plot(data_train)
plt.show()


# Undo differencing on predicted values.
def revert(diffValues, *lastValue):
    """Rebuild level values from differenced values by running sums.

    One pass is made per entry of ``lastValue``: the running total starts at
    that entry and each difference is added in turn. The output of one pass
    is the input of the next, so several entries undo several rounds of
    differencing.

    Args:
        diffValues: Iterable of differences.
        *lastValue: Starting value for each pass, applied in order.

    Returns:
        A list of restored values, or ``diffValues`` itself when no starting
        value is given.
    """
    current = diffValues
    for start in lastValue:
        running = start
        restored = []
        for delta in current:
            running = delta + running
            restored.append(running)
        current = restored
    return current


# Turn the differenced forecast back into levels, starting from 11457.
r = revert(p, 11457)
# Label the restored values with 12 month-end dates beginning May 2018.
data_index = pd.date_range('2018-05-01', periods=12, freq='M')
predict_data = pd.DataFrame({'time': data_index, 'amount_money': r})
# Write the frame built above, without the integer row index.
predict_data.to_csv('F:\\zhongying\\prodict_data.csv', index=False)
Example #7
0
# NOTE(review): `offset`, `now`, `Series` and `timedelta` come from earlier in
# the session; `offset` is presumably a pandas DateOffset -- confirm.
offset.rollforward(now)
offset.rollback(now)
ts = Series(np.random.randn(20),
            index=pd.date_range('1/15/2000', periods=20, freq='4d'))
# Group by the date each timestamp rolls forward to, then average.
ts.groupby(offset.rollforward).mean()
# resample(..., how='mean') is no longer accepted by pandas; the aggregation
# is chained onto the resampler instead.
ts.resample('M').mean()
import pytz
pytz.common_timezones[-5:]
tz = pytz.timezone('US/Eastern')
tz
# Six daily timestamps without a timezone.
rng = pd.date_range('3/9/2012 9:30', periods=6, freq='D')
ts = Series(np.random.randn(len(rng)), index=rng)
ts
print(ts.index.tz)
# The same kind of range, created timezone-aware in UTC.
pd.date_range('3/9/2012 9:30', periods=10, freq='D', tz='UTC')
# Attach UTC to the naive series; `ts` itself is left unchanged.
ts_utc = ts.tz_localize('UTC')
ts
ts_utc
ts_utc.index
print(ts.index.tz)
ts_utc.tz_convert('US/Eastern')
ts_utc.index
# Shift the index three times, by timedelta(10, 0, 0) each time (10 days if
# this is datetime.timedelta). A string such as '10Y' cannot be added to a
# DatetimeIndex, so that attempt is not kept.
ts_utc.index = ts_utc.index + timedelta(10, 0, 0)
ts_utc.index
ts_utc.index = ts_utc.index + timedelta(10, 0, 0)
ts_utc.index
ts_utc.index = ts_utc.index + timedelta(10, 0, 0, 0)
Example #8
0
print(df1)

# Build a frame from a dict of mixed inputs; the scalar entries are repeated
# to match the four rows given by the Series, array and Categorical.
df2 = pd.DataFrame({'A': 1,
                    'B': pd.Timestamp('20191018'),
                    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                    'D': np.array([3] * 4, dtype='int32'),
                    'E': pd.Categorical(['test', 'train', 'test', 'train']),
                    'F': 'foo'})
print(df2)
print(df2.dtypes)
print(df2.index)
print(df2.columns)
print(df2.values)
print(df2.describe())
print(df2.T)
# Sort the columns by label, in descending order.
print(df2.sort_index(axis=1, ascending=False))
# Sort the rows by the values in column B.
print(df2.sort_values(by='B'))

# Six consecutive days, one label for each row of the 6x4 frame below.
dates = pd.date_range('20191018', periods=6)
df = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])
'''
#####
print(pd.date_range('20190101', periods=6))

s = pd.Series([1, 2, 3, 4])
print(s)

data = pd.DataFrame([[1,3,4], [2,3,4]], index=['a','b'], columns=['a','b','c'])
print(data)
'''