def to_datetime(self, time_feats):
    """Convert selected columns of self.data to datetime format.

    time_feats is a list of the features to be changed to datetime format.
    """
    for col in self.data:
        if col in time_feats:
            self.data[col] = pd.to_datetime(self.data[col])
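# A minimal usage sketch, assuming the method above lives on a small wrapper class whose
# DataFrame is stored in self.data; the Wrapper class and column names are hypothetical.
import pandas as pd

class Wrapper:
    def __init__(self, data):
        self.data = data

    to_datetime = to_datetime  # reuse the module-level method defined above

w = Wrapper(pd.DataFrame({'signup': ['2020-01-01', '2020-02-15'], 'score': [1, 2]}))
w.to_datetime(['signup'])
print(w.data.dtypes)  # 'signup' is now datetime64[ns]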
def get_fremont_data(filename='Fremont.csv', url=FREMONT_URL, force_download=False):
    """Download and cache the Fremont data."""
    if force_download or not os.path.exists(filename):
        print("downloading data")
        urlretrieve(url, filename)
    data = pd.read_csv(filename, index_col='Date')
    try:
        data.index = pd.to_datetime(data.index, format='%m/%d/%Y %I:%M:%S %p')
    except TypeError:
        data.index = pd.to_datetime(data.index)
    data.columns = ['West', 'East']
    data['Total'] = data['West'] + data['East']
    return data
def get_fremont_data(filename=FREMONT_FILE, url=FREMONT_URL, force_download=False):
    """Download and cache the Fremont bike data

    Parameters
    ----------
    filename : string (optional)
        Location of the downloaded file
    url : string (optional)
        Web location of the data
    force_download : bool (optional)
        If True, force re-download of the data

    Returns
    -------
    data : pandas.DataFrame
        The Fremont bridge data
        index: Date (datetime)
        columns: East, West, Total (number)
    """
    if force_download or not os.path.exists(filename):
        urlretrieve(url, filename)
    cols = [
        "Date",
        "Fremont Bridge East Sidewalk",
        "Fremont Bridge West Sidewalk"
    ]
    data = pd.read_csv(filename, index_col='Date', usecols=cols)
    try:
        data.index = pd.to_datetime(data.index, format='%m/%d/%Y %I:%M:%S %p')
    except TypeError:
        data.index = pd.to_datetime(data.index)
    data.columns = ['East', 'West']
    data['Total'] = data['East'] + data['West']
    return data
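# A brief usage sketch, assuming FREMONT_FILE / FREMONT_URL are defined and that os,
# pandas as pd, and urllib.request.urlretrieve are imported at module level; the weekly
# resample is only an illustration.
if __name__ == '__main__':
    data = get_fremont_data()          # downloads on the first call, reads the cache afterwards
    weekly = data.resample('W').sum()  # aggregate hourly counts into weekly totals
    print(weekly.head())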
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import calendar
import numpy as np

crimes = pd.read_csv("Chicago_Crimes_2012_to_2017.csv")
crimes.head(2)

# Keep only 2015 and 2016
crimes = crimes[(crimes['Year'] == 2016) | (crimes['Year'] == 2015)]
crimes['Date'] = pd.to_datetime(crimes['Date'], format='%m/%d/%Y %I:%M:%S %p')

crimes['Month'] = crimes['Date'].dt.month.apply(lambda x: calendar.month_abbr[x])
crimes['Month'] = pd.Categorical(crimes['Month'], categories=[
    'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'
], ordered=True)

crimes['Weekday'] = crimes['Date'].dt.day_name()
crimes['Weekday'] = pd.Categorical(crimes['Weekday'], categories=[
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday'
], ordered=True)
crimes.head()
import pandas as pd
import plotly.offline as pyo
import plotly.graph_objs as go

# Load CSV file from Datasets folder
df = pd.read_csv('../Datasets/CoronaTimeSeries.csv')
df['Date'] = pd.to_datetime(df['Date'])

# Preparing data
trace1 = go.Scatter(x=df['Date'], y=df['Death'], mode='lines', name='Death')
trace2 = go.Scatter(x=df['Date'], y=df['Recovered'], mode='lines', name='Recovered')
trace3 = go.Scatter(x=df['Date'], y=df['Unrecovered'], mode='lines', name='Unrecovered')
data = [trace1, trace2, trace3]

# Preparing layout
layout = go.Layout(
    title='Corona Virus Death and Recovered Cases From 2020-01-22 to 2020-03-17',
    xaxis_title="Date",
    yaxis_title="Number of cases")

# Plot the figure and save it to an HTML file
fig = go.Figure(data=data, layout=layout)
pyo.plot(fig, filename='multilinechart.html')
def process_log_file(cur, filepath):
    # open log file
    df = pd.read_json(filepath, lines=True)

    # filter by NextSong action
    df = df[df['page'] == 'NextSong']

    # convert timestamp column to datetime
    t = pd.to_datetime(df['ts'], unit='ms')

    # insert time data records (.dt.week was removed in pandas 2.0, so use isocalendar())
    time_data = pd.concat([
        t, t.dt.hour, t.dt.day, t.dt.isocalendar().week,
        t.dt.month, t.dt.year, t.dt.weekday
    ], axis=1)
    column_labels = [
        'start_time', 'hour', 'day', 'week', 'month', 'year', 'weekday'
    ]
    time_df = pd.DataFrame(data=time_data.values, columns=column_labels)

    for i, row in time_df.iterrows():
        cur.execute(time_table_insert, list(row))

    # load user table
    user_df = df[['userID', 'firstName', 'lastName', 'gender', 'level']]

    # insert user records
    for i, row in user_df.iterrows():
        cur.execute(user_table_insert, row)

    # insert songplay records
    for index, row in df.iterrows():
        # get songid and artistid from song and artist tables
        cur.execute(song_select, (row.song, row.artist, row.length))
        results = cur.fetchone()

        if results:
            songid, artistid = results
        else:
            songid, artistid = None, None

        # insert songplay record
        songplay_data = (
            pd.to_datetime(row.ts, unit='ms'), row.userID, row.level,
            songid, artistid, row.sessionId, row.location, row.userAgent
        )
        cur.execute(songplay_table_insert, songplay_data)
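# A hedged usage sketch (not part of the original ETL script): process_log_file expects a
# DB-API cursor plus SQL statements such as time_table_insert defined elsewhere; psycopg2,
# the connection string, and the data directory below are placeholders for illustration only.
import glob
import psycopg2

conn = psycopg2.connect("host=127.0.0.1 dbname=<your_db> user=<user> password=<password>")
cur = conn.cursor()
for filepath in glob.glob('log_data/**/*.json', recursive=True):  # hypothetical data directory
    process_log_file(cur, filepath)
    conn.commit()
conn.close()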
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# 1. obtain dataset
data = pd.read_csv("~~")

# 2. process the data
# a. reduce the range of data: 2.0 < x < 2.5 and 1.0 < y < 1.5
data = data.query("x < 2.5 & x > 2.0 & y < 1.5 & y > 1.0")

# b. convert the timestamp into concrete date/time features
time_value = pd.to_datetime(data["time"], unit="s")
print(time_value.values)
date = pd.DatetimeIndex(time_value)
data["day"] = date.day
data["weekday"] = date.weekday
data["hour"] = date.hour

# c. filter out places with few check-ins
place_count = data.groupby("place_id").count()["row_id"]
data_final = data[data["place_id"].isin(place_count[place_count > 3].index.values)]

# feature values + target value
x = data_final[["x", "y", "accuracy", "day", "weekday", "hour"]]
y = data_final["place_id"]

# 3. standard scaler
import datetime
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller                  # ADF test
from statsmodels.stats.diagnostic import acorr_ljungbox         # white-noise (Ljung-Box) test
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf   # plots for choosing the model order
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.arima_model import ARMA

df = pd.read_csv('', encoding='utf-8', index_col='')

# 1
dataframe = pd.DataFrame({'time': df[], 'values': df[]})
df['time'] = pd.to_datetime(df['time'])  # convert the default index into a time index
df.set_index("time", inplace=True)

# 2
df.index = pd.to_datetime(df.index)
ts = df[]

# use differencing to make the series stationary
def stationarity(timeseries):
    diff1 = timeseries.diff(1).dropna()
    diff2 = diff1.diff(1)
    diff1.plot(color='red', title='diff 1', figsize=(10, 4))
    diff2.plot(color='black', title='diff 2', figsize=(10, 4))

# ADF test
x = np.array(diff1['value'])  # check whether first- or second-order differencing is needed
adftest = adfuller(x, autolag='AIC')
Age20.plot(kind='hist', bins=80, rwidth=0.8, alpha=0.3, range=(0, 80))
plt.show()

# Time series
test = pd.read_csv('./test.csv', parse_dates=True, index_col='Date')  # use the time column as the index
test = test.drop(columns='Unnamed: 0')  # drop the default index column

test.loc['2019-08-01 08:00:00', 'Age']  # locate a single timestamp
test.loc['2019']
test.loc['2019-Oct-01']  # with standard dates, forms like 'Oct' also work

time = pd.to_datetime(['2019-08-01 08:00', '2019-08-01 20:00'])  # convert strings to standard datetime format
time
test.reindex(time)                  # re-order rows by the new timestamps; times missing from test conveniently become NaN
test.reindex(time, method='ffill')  # same as above, but NaN is filled from the previous existing row
test.reindex(time, method='bfill')  # same, but filled from the next existing row

# Time resampling: change the granularity of the data
daily_mean = test.resample('D').mean()   # by day; days with no data become NaN
daily_mean = test.resample('M').mean()   # by month
daily_mean = test.resample('6W').mean()  # every 6 weeks

# Other
test['name'].str.upper()
test['name'].str.contains('i')  # rows whose name contains 'i'
test['Date'].dt.hour            # extract the hour
import pandas as pd
from sklearn.linear_model import LinearRegression


def Temperature():
    df_GreenhouseGas = pd.read_csv('GreenhouseGas.csv', header=0)
    df_GlobalSurfaceTemperature = pd.read_csv('GlobalSurfaceTemperature.csv', header=0)
    df_CO2ppm = pd.read_csv('CO2ppm.csv')

    # Re-index each table by year as a datetime index
    df_GreenhouseGas_new = pd.DataFrame(
        df_GreenhouseGas[['N2O', 'CH4', 'CO2']].values,
        index=pd.to_datetime(df_GreenhouseGas['Year'].astype(str)),
        columns=['N2O', 'CH4', 'CO2'])
    df_GlobalSurfaceTemperature_new = pd.DataFrame(
        df_GlobalSurfaceTemperature[['Median', 'Upper', 'Lower']].values,
        index=pd.to_datetime(df_GlobalSurfaceTemperature['Year'].astype(str)),
        columns=['Median', 'Upper', 'Lower'])
    df_CO2ppm_new = pd.DataFrame(
        df_CO2ppm.iloc[:, 1].values,
        index=pd.to_datetime(df_CO2ppm['Year'].astype(str)),
        columns=['CO2_PPM'])

    df_merge = pd.concat(
        [df_GreenhouseGas_new, df_CO2ppm_new, df_GlobalSurfaceTemperature_new], axis=1)

    # Fill missing feature values forward, then backward
    feature = df_merge.iloc[:, 0:4].ffill().bfill()
    feature_train = feature['1970-01-01':'2010-01-01']
    feature_test = feature['2011-01-01':'2017-01-01']

    # Fit a separate linear regression for the median, upper, and lower temperature series
    target_Median = df_merge.iloc[:, 4]
    target_Median_train = target_Median['1970-01-01':'2010-01-01']
    model_Median = LinearRegression()
    model_Median.fit(feature_train, target_Median_train)
    MedianPredict = model_Median.predict(feature_test)

    target_Upper = df_merge.iloc[:, 5]
    target_Upper_train = target_Upper['1970-01-01':'2010-01-01']
    model_Upper = LinearRegression()
    model_Upper.fit(feature_train, target_Upper_train)
    UpperPredict = model_Upper.predict(feature_test)

    target_Lower = df_merge.iloc[:, 6]
    target_Lower_train = target_Lower['1970-01-01':'2010-01-01']
    model_Lower = LinearRegression()
    model_Lower.fit(feature_train, target_Lower_train)
    LowerPredict = model_Lower.predict(feature_test)

    return UpperPredict, MedianPredict, LowerPredict
- std: standard deviation
- min: minimum entry
- 25%: first quartile
- 50%: median or second quartile
- 75%: third quartile
- max: maximum entry
"""
data.describe()

"""INDEXING PANDAS TIME SERIES
- datetime = object
- parse_dates (boolean): transform dates to ISO 8601 (yyyy-mm-dd hh:mm:ss) format
"""
time_list = ['1992-03-08', '1992-04-12']
print(type(time_list[1]))  # however, we want it to be a datetime object
datetime_object = pd.to_datetime(time_list)
print(type(datetime_object))

# close warnings
import warnings
warnings.filterwarnings('ignore')

# In order to practice, let's take the head of the pokemon data and add a time list to it
data2 = data.head(4)  # four rows, to match the four dates below
date_list = ['1992-01-10', '1992-02-10', '1993-03-15', '1993-03-16']
datetime_object = pd.to_datetime(date_list)
data2['date'] = datetime_object
# let's make date the index
data2 = data2.set_index('date')
print(data2)
from matplotlib import pylab
import numpy as np
from datetime import datetime
import pandas as pd
import DataAPI            # data API
import seaborn as sns     # prettier matplotlib charts
sns.set_style('white')

# Example: fetch the data
secID = '510050.XSHG'
start = '20160101'
end = '20181101'
security = DataAPI.MktFundGet(
    secID=secID, beginDate=start, endDate=end, field=['tradeDate', 'closePrice'])
security['tradeDate'] = pd.to_datetime(security['tradeDate'])
security = security.set_index('tradeDate')
security.info()
security.tail()
# todo

# plot the series
security['closePrice'].plot(grid=False, figsize=(12, 8))
sns.despine()

window_short = 20   # short-term moving-average window
window_long = 120   # long-term moving-average window
SD = 0.05           # deviation threshold (5%)

# moving average via pandas' rolling mean
security['short_window'] = np.round(
    security['closePrice'].rolling(window=window_short).mean(), 2)
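# A hedged continuation sketch (not the original code): the long-term average would
# presumably be computed the same way using window_long, then plotted alongside the price.
security['long_window'] = np.round(
    security['closePrice'].rolling(window=window_long).mean(), 2)
security[['closePrice', 'short_window', 'long_window']].plot(grid=False, figsize=(12, 8))
sns.despine()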
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# 1. get the data
data = pd.read_csv('./FBlocation/train.csv')

# 2. basic data processing
# 1) reduce the data scope
data = data.query('x > 2 & x < 2.5 & y > 1 & y < 1.5')

# 2) process the timestamp
time_value = pd.to_datetime(data['time'], unit='s')  # type is Series
date = pd.DatetimeIndex(time_value)
data.loc[:, 'day'] = date.day
data.loc[:, 'weekday'] = date.weekday
data.loc[:, 'hour'] = date.hour

# 3) filter out places with too few check-ins
place_count = data.groupby('place_id').count()['row_id']
data_processed = data[data['place_id'].isin(place_count[place_count > 3].index.values)]

# 4) select feature values and target values
x = data_processed[['x', 'y', 'accuracy', 'day', 'weekday', 'hour']]
y = data_processed['place_id']

# 5) split the dataset
x_train, x_test, y_train, y_test = train_test_split(x, y)

# 3. feature engineering: standardization
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)
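# A hedged continuation sketch (not the original author's code): it simply wires up the
# KNeighborsClassifier and GridSearchCV already imported above; the parameter grid and cv
# value are illustrative choices.
estimator = KNeighborsClassifier()
param_dict = {'n_neighbors': [3, 5, 7, 9]}
estimator = GridSearchCV(estimator, param_grid=param_dict, cv=3)
estimator.fit(x_train, y_train)

print('best params:', estimator.best_params_)
print('test accuracy:', estimator.score(x_test, y_test))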