# For every target level, record the slice of altitude indices whose altitude
# lies within +/- 1.0 of that level.
for z in range(nz):
    dum = np.ma.masked_where(da.altitude.values < (lev[z] - 1.0),
                             np.arange(da.altitude.shape[0]))
    dum = np.ma.masked_where(da.altitude.values > (lev[z] + 1.0), dum)
    # NOTE(review): slice() excludes its stop index, so the last in-range
    # altitude index is not part of the slice -- confirm this is intended.
    tuples_ls.append(slice(dum.compressed()[0], dum.compressed()[-1]))

# Latitude bin edges and the bin-centre labels, computed once for all levels.
lat_edges = np.arange(lat_b1, lat_b2, lat_step)
lat_centres = (lat_edges + lat_step / 2)[:-1]

# Average each altitude slab inside every latitude bin.
vertlevels = [
    da.isel(altitude=i).groupby_bins('lat', lat_edges, labels=lat_centres).mean()
    for i in tuples_ls
]

xavgvel = xr.concat(vertlevels, 'altitude')
xavgvel['altitude'] = lev
xavgvel = xavgvel.rename({'lat_bins': 'lat'})
xavgvel_ds = xavgvel.to_dataset(name='ta')
xavgvel_ds.to_netcdf('SABER_temp_' + in_year + month_converter(in_month) + '_binned.nc')
#sys.exit()

# Merge the per-month files and stamp them with a monthly time axis.
in_year_arr = np.array(in_year_ls)
s_year = str(np.min(in_year_arr))
e_year = str(np.max(in_year_arr))
ds = xr.open_mfdataset(infiles, concat_dim='time')
# 'YYYY-MM' is used instead of a bare 'YYYYMM' string: without the separator
# the value is not ISO 8601 and may be misread as YYMMDD.
ds['time'] = pd.date_range(start=s_year + '-01', periods=ds.time.shape[0], freq='M')
ds.to_netcdf('SABER_temp_' + s_year + '01' + '_' + e_year + '12_binned_zm.nc')
data_list = np.asarray(data_list)

# Select the date (column 0) plus the close and adjusted-close values
# (columns 4 and 6) for each day.
selected_data = data_list[:, [0, 4, 6]]
df = pd.DataFrame(data=selected_data[0:, 1:],
                  index=selected_data[0:, 0],
                  columns=['close', 'adj close'],
                  dtype='float64')

# Reference for pandas interpolation
# http://pandas.pydata.org/pandas-docs/stable/missing_data.html
# Add the missing calendar dates to the frame so they can be interpolated.
df1 = df
idx = pd.date_range('12-29-2006', '12-31-2016')
df1.index = pd.DatetimeIndex(df1.index)
df1 = df1.reindex(idx, fill_value=np.nan)
# df1.count gives 2518
interpolated_df = df1.interpolate()
interpolated_df.count()  # gives 3651 count

# Drop the extra leading date rows that were only added so that
# interpolation had a starting value.
interpolated_df = interpolated_df[3:]

date_format = ["%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S+%f"]

# NOTE(review): the visible source ends in the middle of the definition below
# (it stops right after `try:`), so the fragment is kept verbatim instead of
# being completed by guesswork. It iterates `data_format`, which is not defined
# in the visible code; `date_format` above is presumably what was meant.
# def try_parsing_data(text): for fmt in data_format: # return datetime.strptime(text, fmt) try:
df = pd.read_csv("./911.csv")

# Extract the category: the part of each title before the first ": ".
# print(df["title"].str.split(": "))
temp_list = df["title"].str.split(": ").tolist()
cate_list = [i[0] for i in temp_list]
cate_df = pd.DataFrame(np.array(cate_list).reshape((df.shape[0], 1)), columns=["cate"])
print(cate_df)

# The grouping below needs a "cate" column on df; without this assignment
# groupby(by="cate") raises KeyError.
df["cate"] = cate_df["cate"]
print(df.groupby(by="cate").count()["title"])

# Time series notes.
# Signature reminder (not executable as written -- date_range needs exactly
# three of start/end/periods/freq):
#   pd.date_range(start=None, end=None, periods=None, freq='D')
# The bare string below holds the original notes. In short: it lists frequency
# aliases (D day, M month end, MS month start, BMS first business day of the
# month, B business day, H hour, T/min minute, S second, L millisecond,
# BM last business day of the month); start + end + freq yields the timestamps
# in that range at the given frequency; start + periods + freq yields `periods`
# timestamps beginning at start; and the `format` argument of pd.to_datetime is
# usually unnecessary except for strings pandas cannot parse on its own.
''' periods 时期 每段时间 D--day M--month 每月最后一天 MS 每月第一天 BMS 每月第一个工作日 B工作日 H每小时 T/min 每分 S 每秒 L 每毫秒 BM 每月最后一个工作日 start和end以及freq配合能够生成start和end范围内以频率freq的一组时间索引 start和periods以及freq配合能够生成以start开始的频率为freq的periods个时间索引 df = pd.DataFrame(np.random.rand(10),index=index) df["timeStamp"] = pd.to_datetime(df["timeStamp"],format="") format参数大部分情况可以不用写,对于pandas无法格式化的时间字符串可以用该参数,不i如包含中文 百度 python时间格式化 '''
print(d)

# Interpolate missing values, weighting by the spacing of the time index.
d = c.interpolate(method="time")
print(d)

# Dropping rows: any NaN, all-NaN only, or fewer than two non-NaN values.
new_c1 = c.dropna()
print(new_c1)
new_c1 = c.dropna(how="all")
print(new_c1)
new_c1 = c.dropna(thresh=2)
print(new_c1)
print(c)

# Re-index onto a complete daily range so absent days appear as NaN rows.
dt = pd.date_range("01-01-2017", "01-11-2017")
idx = pd.DatetimeIndex(dt)
c = c.reindex(idx)
print(c)

# Fill the gaps. ffill()/bfill() replace the deprecated fillna(method=...).
new_c = c.ffill(limit=1)
print(new_c)
new_c = c.bfill()
print(new_c)
new_c = c.bfill(axis="columns")
print(new_c)

c = pd.read_csv('F:/freelencing projects/pandas_tutorial/fillna.csv', parse_dates=["day"])
print(df.loc[:, ['A', 'B']])  # all rows, columns A and B
# Only the row labelled 20180101, columns A and B; loc always selects by label.
print(df.loc['20180101', ['A', 'B']])

#### select by position: iloc
print(df.iloc[3])  # the row at position 3 (i.e. the fourth row)
print(df.iloc[3, 0])  # row position 3, column position 0
print(df.iloc[3:5, 1:3])  # slice selection
print(df.iloc[[1, 3, 5], 1:3])  # non-contiguous rows picked one by one

#### mixed select by label & position
# The .ix indexer no longer exists in pandas; select rows by position with
# iloc, then columns by label.
print(df.iloc[3:][['A', 'C']])

#### Boolean indexing select
print(df[df.A > 8])  # rows whose A value is greater than 8

### Use isin() to keep rows whose column value is in a given list
# NOTE(review): a 'high' column is not created anywhere in the visible code --
# confirm df actually has it at this point.
df[df['high'].isin([0.00, 9.00])]  # rows where 'high' is 0 or 9

## Setting values in pandas
dates = pd.date_range('20180101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])

### Assign with iloc
df.iloc[2, 2] = 111  # third row, third column
### Assign with loc
df.loc['20180102', 'B'] = 222
### Boolean indexing
df[df.A > 4] = 0  # every column of the rows where A > 4 becomes 0
# Only column A of the rows where A > 4 becomes 0; a single .loc call is used
# because chained assignment (df.A[mask] = ...) may write to a temporary copy.
df.loc[df.A > 4, 'A'] = 0
df['F'] = np.nan  # add column F filled with NaN
# Add column E: the Series must carry the same row labels as the frame so the
# values align.
df['E'] = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20180101', periods=6))

## Handling missing values
dates = pd.date_range('20180101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
restult = model.fit()
# delta = ARIMAModel.fittedvalues - diff.iloc[:,0]
# score = 1 - delta.var()/diff.var()
# print(score)
print(restult.summary())

# Forecast
p = restult.predict('20180531', '20190531', dynamic=True, typ='levels')
plt.figure(figsize=(6, 6))
plt.xticks(rotation=45)
plt.plot(p)
plt.plot(data_train)
plt.show()


def revert(diffValues, *lastValue):
    """Undo differencing by cumulatively summing from each seed value.

    Args:
        diffValues: iterable of differenced values.
        *lastValue: seed values, applied one after another in the order
            given; each pass replaces ``diffValues`` with its running sum
            started from that seed.

    Returns:
        A list of restored values, or ``diffValues`` itself when no seed
        is supplied.
    """
    for seed in lastValue:
        restored = []
        running = seed
        for dv in diffValues:
            running = dv + running
            restored.append(running)
        diffValues = restored
    return diffValues


# Restore the first-order-differenced forecast to the original scale.
r = revert(p, 11457)
# One month-end timestamp per restored value, so both columns always have the
# same length.
data_index = pd.date_range('2018-05-01', periods=len(r), freq='M')
predict_data = pd.DataFrame({'time': data_index, 'amount_money': r})
predict_data.to_csv('F:\\zhongying\\prodict_data.csv', index=False)
# Roll a timestamp forward / backward to the nearest offset boundary.
offset.rollforward(now)
offset.rollback(now)

ts = Series(np.random.randn(20),
            index=pd.date_range('1/15/2000', periods=20, freq='4d'))
# Group by the rolled-forward date, then the equivalent resample.
ts.groupby(offset.rollforward).mean()
ts.resample('M').mean()

import pytz
pytz.common_timezones[-5:]
tz = pytz.timezone('US/Eastern')
tz

# A naive (time-zone unaware) daily series.
rng = pd.date_range('3/9/2012 9:30', periods=6, freq='D')
ts = Series(np.random.randn(len(rng)), index=rng)
ts
print(ts.index.tz)

# The same kind of range can be created time-zone aware from the start.
pd.date_range('3/9/2012 9:30', periods=10, freq='D', tz='UTC')

# Localize the naive series to UTC, then convert to another zone.
ts_utc = ts.tz_localize('UTC')
ts
ts_utc
ts_utc.index
print(ts.index.tz)
ts_utc.tz_convert('US/Eastern')
ts_utc.index

# Shift the index: a string cannot be added to a DatetimeIndex, so the
# ten-year shift uses a DateOffset.
# NOTE(review): '10Y' was presumably meant as "ten years" -- confirm.
ts_utc.index = ts_utc.index + pd.DateOffset(years=10)
# Each of the following steps moves the index forward by ten days.
ts_utc.index = ts_utc.index + timedelta(10, 0, 0)
ts_utc.index
ts_utc.index = ts_utc.index + timedelta(10, 0, 0)
ts_utc.index
ts_utc.index = ts_utc.index + timedelta(10, 0, 0, 0,)
print(df1)

# A frame built from a dict of heterogeneous columns; scalars are broadcast
# to the length of the longest column.
df2 = pd.DataFrame({
    'A': 1,
    'B': pd.Timestamp('20191018'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(['test', 'train', 'test', 'train']),
    'F': 'foo',
})
print(df2)
print(df2.dtypes)
print(df2.index)
print(df2.columns)
print(df2.values)
print(df2.describe())
print(df2.T)
print(df2.sort_index(axis=1, ascending=False))
print(df2.sort_values(by='B'))

# Six dates are required to label the six rows of the 6x4 array below.
dates = pd.date_range('20191018', periods=6)
df = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])

# Disabled scratch code, kept as a bare string literal.
''' ##### print(pd.date_range('20190101', periods=6)) s = pd.Series([1, 2, 3, 4]) print(s) data = pd.DataFrame([[1,3,4], [2,3,4]], index=['a','b'], columns=['a','b','c']) print(data) '''