Example no. 1
def to_datetime(self, time_feats):
    """
    self.data is the DataFrame
    time_feats is a list of the features (columns) to be converted to datetime format
    """
    for col in self.data:
        if col in time_feats:
            self.data[col] = pd.to_datetime(self.data[col])
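A minimal usage sketch, assuming the method belongs to a small wrapper class (here a hypothetical DataHolder) that stores the DataFrame in self.data:

import pandas as pd

class DataHolder:
    def __init__(self, data):
        self.data = data

DataHolder.to_datetime = to_datetime  # attach the method defined above

df = pd.DataFrame({'start': ['2020-01-01', '2020-02-15'], 'value': [1, 2]})
holder = DataHolder(df)
holder.to_datetime(['start'])
print(holder.data.dtypes)  # 'start' is now datetime64[ns]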
Example no. 2
import os
from urllib.request import urlretrieve

import pandas as pd

# FREMONT_URL is assumed to be defined elsewhere in the module


def get_fremont_data(filename='Fremont.csv',
                     url=FREMONT_URL,
                     force_download=False):
    """
    Download and cache fremont data
    """
    if force_download or not os.path.exists(filename):
        print("downloading data")
        urlretrieve(url, filename)
    data = pd.read_csv(filename, index_col='Date')
    try:
        data.index = pd.to_datetime(data.index, format='%m/%d/%Y %I:%M:%S %p')
    except TypeError:
        data.index = pd.to_datetime(data.index)
    data.columns = ['West', 'East']
    data['Total'] = data['West'] + data['East']
    return data
Example no. 3
import os
from urllib.request import urlretrieve

import pandas as pd

# FREMONT_FILE and FREMONT_URL are assumed to be defined elsewhere in the module


def get_fremont_data(filename=FREMONT_FILE,
                     url=FREMONT_URL,
                     force_download=False):
    """Download and cache the Fremont bike data
    
    Parameters
    ----------
    filename: string (optional)
        Location of the downloaded file
    
    url: string (optional)
        Web location of the data
        
    force_download: bool (optional)
        if True, force re-download of the data
    
    Returns
    -------
    data: pandas.DataFrame
        The Fremont bridge data 
            index: Date (datetime)
            columns: East, West, Total (number)
    """
    if force_download or not os.path.exists(filename):
        urlretrieve(url, filename)

    cols = [
        "Date", "Fremont Bridge East Sidewalk", "Fremont Bridge West Sidewalk"
    ]

    data = pd.read_csv(filename, index_col='Date', usecols=cols)
    try:
        data.index = pd.to_datetime(data.index, format='%m/%d/%Y %I:%M:%S %p')
    except TypeError:
        data.index = pd.to_datetime(data.index)

    data.columns = ['East', 'West']
    data['Total'] = data['East'] + data['West']
    return data
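A minimal usage sketch, assuming FREMONT_FILE/FREMONT_URL are set and the CSV has already been downloaded, so no network access is needed:

import matplotlib.pyplot as plt

FREMONT_FILE = 'fremont.csv'                     # hypothetical cache location
FREMONT_URL = 'https://example.com/fremont.csv'  # hypothetical download URL

data = get_fremont_data()
print(data.head())

# Weekly totals give a quick sanity check of the seasonal pattern
data.resample('W').sum().plot()
plt.show()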
Example no. 4
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import calendar
import numpy as np

crimes = pd.read_csv("Chicago_Crimes_2012_to_2017.csv")
crimes.head(2)

crimes = crimes[(crimes['Year'] == 2016) | (crimes['Year'] == 2015)]
crimes['Date'] = pd.to_datetime(crimes['Date'], format='%m/%d/%Y %I:%M:%S %p')

crimes['Month'] = (
    crimes['Date'].dt.month).apply(lambda x: calendar.month_abbr[x])

crimes['Month'] = pd.Categorical(crimes['Month'],
                                 categories=[
                                     'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                                     'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'
                                 ],
                                 ordered=True)

crimes["Weekday"] = crimes['Data'].dt.weekday_name
crimes['Weekday'] = pd.Categorical(crimes['Weekday'],
                                   categories=[
                                       'Monday', 'Tuesday', 'Wednesday',
                                       'Thursday', 'Friday', 'Saturday',
                                       'Sunday'
                                   ],
                                   ordered=True)
crimes.head()
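The ordered Month and Weekday categoricals are presumably meant for plotting; a minimal sketch of a weekday-by-month heatmap using the seaborn/matplotlib imports above (the plot itself is an assumption, not part of the original):

# Count crimes per (weekday, month) cell and show them as a heatmap
counts = crimes.groupby(['Weekday', 'Month']).size().unstack()
plt.figure(figsize=(10, 4))
sns.heatmap(counts, cmap='viridis')
plt.title('Chicago crimes by weekday and month, 2015-2016')
plt.show()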
import pandas as pd
import plotly.offline as pyo
import plotly.graph_objs as go

# Load CSV file from Datasets folder
df = pd.read_csv('../Datasets/CoronaTimeSeries.csv')
df['Date'] = pd.to_datetime(df['Date'])

# Preparing data
trace1 = go.Scatter(x=df['Date'], y=df['Death'], mode='lines', name='Death')
trace2 = go.Scatter(x=df['Date'],
                    y=df['Recovered'],
                    mode='lines',
                    name='Recovered')
trace3 = go.Scatter(x=df['Date'],
                    y=df['Unrecovered'],
                    mode='lines',
                    name='Unrecovered')
data = [trace1, trace2, trace3]

# Preparing layout
layout = go.Layout(
    title='Corona Virus Death and Recovered Cases From 2020-01-22 to 2020-03-17',
    xaxis_title="Date",
    yaxis_title="Number of cases")

# Plot the figure and save it as an HTML file
fig = go.Figure(data=data, layout=layout)
pyo.plot(fig, filename='multilinechart.html')
import pandas as pd


def process_log_file(cur, filepath):
    # open log file
    df = pd.read_json(filepath, lines=True)

    # filter by NextSong action
    df = df[df['page'] == 'NextSong']

    # convert timestamp column to datetime
    t = pd.to_datetime(df['ts'], unit='ms')
    
    # insert time data records
    time_data = pd.concat([
        t,
        t.dt.hour,
        t.dt.day,
        t.dt.isocalendar().week,  # .dt.week was removed in newer pandas
        t.dt.month,
        t.dt.year,
        t.dt.weekday
    ], axis=1
    )
    column_labels = [
        'start_time',
        'hour',
        'day',
        'week',
        'month',
        'year',
        'weekday'
    ]
    time_df = pd.DataFrame(data=time_data.values, columns=column_labels)

    for i, row in time_df.iterrows():
        cur.execute(time_table_insert, list(row))

    # load user table
    user_df = df[['userID', 'firstName', 'lastName', 'gender', 'level']]

    # insert user records
    for i, row in user_df.iterrows():
        cur.execute(user_table_insert, row)

    # insert songplay records
    for index, row in df.iterrows():
        
        # get songid and artistid from song and artist tables
        cur.execute(song_select, (row.song, row.artist, row.length))
        results = cur.fetchone()
        
        if results:
            songid, artistid = results
        else:
            songid, artistid = None, None

        # insert songplay record
        songplay_data = (
            pd.to_datetime(row.ts, unit='ms'),
            row.userID,
            row.level,
            songid,
            artistid,
            row.sessionId,
            row.location,
            row.userAgent
        )
        cur.execute(songplay_table_insert, songplay_data)
Example no. 7
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# 1. obtain the dataset
data = pd.read_csv("~~")
# 2. process the data
# a. reduce the data range
data = data.query("x<2.5&x>2.0&y<1.5&y>1.0")
# 2.0 < x < 2.5
# 1.0 < y < 1.5

# b. convert the timestamp into concrete date/time features
time_value = pd.to_datetime(data["time"], unit="s")
print(time_value.values)
date = pd.DatetimeIndex(time_value)
data["day"] = date.day
data["weekday"] = date.weekday
data["hour"] = date.hour

# c. filter out places with few check-ins
place_count = data.groupby("place_id").count()["row_id"]
place_count[place_count > 3]  # inspect places with more than 3 check-ins
data_final = data[data["place_id"].isin(place_count[place_count > 3].index.values)]
# feature values + target value
x = data_final[["x", "y", "accuracy", "day", "weekday", "hour"]]
y = data_final["place_id"]

# 3. standard scaling (continued in the sketch below)
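The example stops at the standardization step; a minimal sketch of how the remaining steps could look, reusing only the imports already listed above (the split seed and model settings are assumptions):

# 3. split the dataset, then standardize the features
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22)
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

# 4. fit a plain KNN classifier (n_neighbors is an assumption)
estimator = KNeighborsClassifier(n_neighbors=5)
estimator.fit(x_train, y_train)

# 5. evaluate on the held-out split
print("test accuracy:", estimator.score(x_test, y_test))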
Example no. 8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from datetime import datetime
from statsmodels.tsa.stattools import adfuller  # ADF test
from statsmodels.stats.diagnostic import acorr_ljungbox  # white-noise (Ljung-Box) test
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf  # ACF/PACF plots for choosing the order

from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.arima_model import ARMA


df = pd.read_csv('', encoding='utf-8', index_col='')
# Option 1: build a time index from a column
dataframe = pd.DataFrame({'time': df[], 'values': df[]})
df['time'] = pd.to_datetime(df['time'])  # convert the default index into a time index
df.set_index("time", inplace=True)
# Option 2: convert the existing index directly
df.index = pd.to_datetime(df.index)
ts = df[]

# use differencing to make the series stationary
def stationarity(timeseries):
    diff1 = timeseries.diff(1).dropna()
    diff2 = diff1.diff(1).dropna()
    diff1.plot(color='red', title='diff 1', figsize=(10, 4))
    diff2.plot(color='black', title='diff 2', figsize=(10, 4))
    return diff1, diff2

# ADF test
diff1, diff2 = stationarity(ts)
x = np.array(diff1['value'])  # use the first or the second difference, depending on the data
adftest = adfuller(x, autolag='AIC')
Example no. 9
Age20.plot(kind='hist',bins=80,rwidth=0.8,alpha=0.3,range=(0,80))
plt.show()





# Time series
test = pd.read_csv('./test.csv', parse_dates=True, index_col='Date')  # use the date column as the index
test = test.drop(columns='Unnamed: 0')  # drop the default integer index column

test.loc['2019-08-01 08:00:00', 'Age']  # locate a single value
test.loc['2019']
test.loc['2019-Oct-01']  # with a standard datetime index, 'Oct'-style month names also work

time = pd.to_datetime(['2019-08-01 08:00', '2019-08-01 20:00'])  # convert strings to Timestamps
time
test.reindex(time)  # re-align the rows to the new timestamps; times missing from test become NaN
test.reindex(time, method='ffill')  # same as above, but fill the NaNs from the previous row
test.reindex(time, method='bfill')  # or fill them from the next row instead

# Resampling: change the granularity of the data
daily_mean = test.resample('D').mean()  # by day; days with no data become NaN
daily_mean = test.resample('M').mean()  # by month
daily_mean = test.resample('6W').mean()  # every 6 weeks


# Miscellaneous
test['name'].str.upper()
test['name'].str.contains('i')  # contains the letter 'i'
test.index.hour  # extract the hour (Date is the index here)
Example no. 10
import pandas as pd
from sklearn.linear_model import LinearRegression


def Temperature():
    df_GreenhouseGas = pd.read_csv('GreenhouseGas.csv', header=0)
    df_GlobalSurfaceTemperature = pd.read_csv('GlobalSurfaceTemperature.csv', header=0)
    df_CO2ppm = pd.read_csv('CO2ppm.csv')

    df_GreenhouseGas_new = pd.DataFrame(
        df_GreenhouseGas[['N2O', 'CH4', 'CO2']].values,
        index=pd.to_datetime(df_GreenhouseGas['Year'].astype(str)),
        columns=['N2O', 'CH4', 'CO2'])
    df_GlobalSurfaceTemperature_new = pd.DataFrame(
        df_GlobalSurfaceTemperature[['Median', 'Upper', 'Lower']].values,
        index=pd.to_datetime(df_GlobalSurfaceTemperature['Year'].astype(str)),
        columns=['Median', 'Upper', 'Lower'])
    df_CO2ppm_new = pd.DataFrame(
        df_CO2ppm.iloc[:, 1].values,
        index=pd.to_datetime(df_CO2ppm['Year'].astype(str)),
        columns=['CO2_PPM'])
    df_merge = pd.concat(
        [df_GreenhouseGas_new, df_CO2ppm_new, df_GlobalSurfaceTemperature_new],
        axis=1)

    feature = df_merge.iloc[:, 0:4].ffill().bfill()
    feature_train = feature['1970-01-01':'2010-01-01']
    feature_test = feature['2011-01-01':'2017-01-01']

    target_Median = df_merge.iloc[:, 4]
    target_Median_train = target_Median['1970-01-01':'2010-01-01']
    model_Median = LinearRegression()
    model_Median.fit(feature_train, target_Median_train)

    MedianPredict = model_Median.predict(feature_test)

    target_Upper = df_merge.iloc[:, 5]
    target_Upper_train = target_Upper['1970-01-01':'2010-01-01']
    model_Upper = LinearRegression()
    model_Upper.fit(feature_train, target_Upper_train)
    UpperPredict = model_Upper.predict(feature_test)

    target_Lower = df_merge.iloc[:, 6]
    target_Lower_train = target_Lower['1970-01-01':'2010-01-01']
    model_Lower = LinearRegression()
    model_Lower.fit(feature_train, target_Lower_train)
    LowerPredict = model_Lower.predict(feature_test)

    return UpperPredict, MedianPredict, LowerPredict
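A minimal usage sketch (assumes the three CSV files are present and that the test period holds one annual value per year for 2011-2017, as the slices above imply):

upper, median, lower = Temperature()
for year, (u, m, l) in enumerate(zip(upper, median, lower), start=2011):
    print(year, round(float(l), 3), round(float(m), 3), round(float(u), 3))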
Example no. 11
 - std: standard deviation
 - min: minimum entry
 - 25%: first quartile
 - 50%: median or second quartile
 - 75%: third quartile
 - max: maximum entry
"""
data.describe()
"""INDEXING PANDAS TIME SERIES
 - datatime = object
 - parse_dates(boolean): Transform data to ISO 8601 (yyyy-mm-dd hh:mm:ss) format
"""
time_list = ['1992-03-08', '1992-04-12']
print(type(time_list[1]))
# however, we want it to be a datetime object
datetime_object = pd.to_datetime(time_list)
print(type(datetime_object))

# close warnings
import warnings
warnings.filterwarnings('ignore')

# To practice, let's take the head of the pokemon data and add a date list to it
date_list = ['1992-01-10', '1992-02-10', '1993-03-15', '1993-03-16']
data2 = data.head(len(date_list))  # keep as many rows as there are dates
datetime_object = pd.to_datetime(date_list)
data2['date'] = datetime_object
# lets make date as index
data2 = data2.set_index('date')
print(data2)
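Since parse_dates is mentioned in the note above but not demonstrated, here is a minimal sketch (the CSV path and the 'date' column are assumptions):

# Let pandas parse the date column while reading the CSV (hypothetical file and column names)
data3 = pd.read_csv('pokemon_with_dates.csv', parse_dates=['date'], index_col='date')
print(data3.index.dtype)  # datetime64[ns]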
Example no. 12
from matplotlib import pylab
import numpy as np
from datetime import datetime
import pandas as pd
import DataAPI  # data API
import seaborn as sns  # prettier matplotlib charts
sns.set_style('white')

# Example: fetch the data
secID = '510050.XSHG'
start = '20160101'
end = '20181101'

security = DataAPI.MktFundGet(
    secID=secID, beginDate=start, endDate=end, field=['tradeDate', 'closePrice'])
security['tradeDate'] = pd.to_datetime(security['tradeDate'])
security = security.set_index('tradeDate')
security.info()

security.tail()  #todo

# plot the chart
security['closePrice'].plot(grid=False, figsize=(12, 8))
sns.despine()

window_short = 20  # short-term moving-average window
window_long = 120  # long-term moving-average window
SD = 0.05  # deviation threshold (5%)

# moving-average calculation; pd.rolling_mean was removed, so use Series.rolling().mean()
security['short_window'] = np.round(security['closePrice'].rolling(window=window_short).mean(), 2)
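The long-window average implied by window_long is not shown; a brief sketch of a plausible continuation (assumed, not from the original):

# long-term moving average plus a quick comparison plot
security['long_window'] = np.round(security['closePrice'].rolling(window=window_long).mean(), 2)
security[['closePrice', 'short_window', 'long_window']].plot(grid=False, figsize=(12, 8))
sns.despine()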
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# 1. get the data
data=pd.read_csv('./FBlocation/train.csv')

# 2. basic data processing
# 1) reduce the data scope
data=data.query('x > 2 &  x < 2.5 & y > 1 & y < 1.5')
# 2) process timestamp
time_value = pd.to_datetime(data['time'], unit='s')  # type is Series
date = pd.DatetimeIndex(time_value)
data.loc[:, 'day'] = date.day
data.loc[:, 'weekday'] = date.weekday
data.loc[:, 'hour'] = date.hour
# 3) filter places with less marks
place_count = data.groupby('place_id').count()['row_id']
data_processed = data[data['place_id'].isin(place_count[place_count > 3].index.values)]
# 4) filter characteristic values and target values
x = data_processed[['x', 'y', 'accuracy', 'day', 'weekday', 'hour']]
y = data_processed['place_id']
# 5) split the dataset
x_train, x_test, y_train, y_test = train_test_split(x, y)

# 3. feature engineering: standardization
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)
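GridSearchCV is imported above but never used; a minimal sketch of the tuning step that could follow (the parameter grid and cv value are assumptions):

# 4. KNN estimator tuned with cross-validated grid search
estimator = KNeighborsClassifier()
param_grid = {"n_neighbors": [3, 5, 7, 9, 11]}
estimator = GridSearchCV(estimator, param_grid=param_grid, cv=5)
estimator.fit(x_train, y_train)

# 5. model evaluation
print("accuracy on the test set:", estimator.score(x_test, y_test))
print("best n_neighbors:", estimator.best_params_)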