def test_ffill(self):
    result = merge_ordered(self.left, self.right, on='key', fill_method='ffill')
    expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'],
                          'lvalue': [1., 1, 2, 2, 3, 3.],
                          'rvalue': [nan, 1, 2, 3, 3, 4]})
    assert_frame_equal(result, expected)

def test_basic(self):
    result = merge_ordered(self.left, self.right, on='key')
    expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'],
                          'lvalue': [1, nan, 2, nan, 3, nan],
                          'rvalue': [nan, 1, 2, 3, nan, 4]})
    assert_frame_equal(result, expected)

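# The self.left / self.right fixtures are not shown in this excerpt. The
# following setUp is a minimal reconstruction consistent with the expected
# frames above (an assumption, not the original fixture code):
def setUp(self):
    self.left = DataFrame({'key': ['a', 'c', 'e'], 'lvalue': [1, 2, 3]})
    self.right = DataFrame({'key': ['b', 'c', 'd', 'f'], 'rvalue': [1, 2, 3, 4]})
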
def test_multigroup(self):
    left = pd.concat([self.left, self.left], ignore_index=True)
    left['group'] = ['a'] * 3 + ['b'] * 3

    result = merge_ordered(left, self.right, on='key', left_by='group',
                           fill_method='ffill')
    expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'] * 2,
                          'lvalue': [1., 1, 2, 2, 3, 3.] * 2,
                          'rvalue': [nan, 1, 2, 3, 3, 4] * 2})
    expected['group'] = ['a'] * 6 + ['b'] * 6
    assert_frame_equal(result, expected.loc[:, result.columns])

    result2 = merge_ordered(self.right, left, on='key', right_by='group',
                            fill_method='ffill')
    assert_frame_equal(result, result2.loc[:, result.columns])

    result = merge_ordered(left, self.right, on='key', left_by='group')
    assert result['group'].notna().all()

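# What left_by does, roughly: merge_ordered splits the left frame by the
# left_by column(s), merges each piece against the full right frame, and
# stacks the results. A rough equivalence sketch (not the actual pandas
# implementation; 'left' and 'self.right' as in the test above):
pieces = [
    merge_ordered(chunk.drop(columns='group'), self.right, on='key',
                  fill_method='ffill').assign(group=name)
    for name, chunk in left.groupby('group')
]
roughly_equivalent = pd.concat(pieces, ignore_index=True)
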
def test_basic(self): result = merge_ordered(self.left, self.right, on="key") expected = DataFrame( { "key": ["a", "b", "c", "d", "e", "f"], "lvalue": [1, nan, 2, nan, 3, nan], "rvalue": [nan, 1, 2, 3, nan, 4], } ) assert_frame_equal(result, expected)
def test_doc_example(self):
    left = DataFrame({'key': ['a', 'c', 'e', 'a', 'c', 'e'],
                      'lvalue': [1, 2, 3] * 2,
                      'group': list('aaabbb')})
    right = DataFrame({'key': ['b', 'c', 'd'],
                       'rvalue': [1, 2, 3]})

    result = merge_ordered(left, right, fill_method='ffill', left_by='group')

    expected = DataFrame({'group': list('aaaaabbbbb'),
                          'key': ['a', 'b', 'c', 'd', 'e'] * 2,
                          'lvalue': [1, 1, 2, 2, 3] * 2,
                          'rvalue': [nan, 1, 2, 3, 3] * 2})
    assert_frame_equal(result, expected)

import pandas as pd
import matplotlib.pyplot as plt
import datetime
import time
from scipy import signal
from collections import Counter

# build the basic data frame
eurusd = pd.read_csv('eur_usd_hist.csv')
eurusd = eurusd.drop(eurusd.index[0:6])
eurusd.index = range(len(eurusd))
eurusd.columns = ['date', 'eurusdclose', 'high', 'low']

usdjpn = pd.read_csv('jpn_hist.csv')
usdjpn = usdjpn.drop(usdjpn.index[0:6])
usdjpn.index = range(len(usdjpn))
usdjpn.columns = ['date', 'usdjpyclose', 'usdcadclose']

df = pd.merge_ordered(eurusd, usdjpn, on='date')
df = df.drop(['high', 'low'], axis=1)
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
#print(df.head())

# http://www.histdata.com/download-free-forex-historical-data/?/ascii/1-minute-bar-quotes/EURUSD
mindata = pd.read_csv('nov01_07_2011.csv', header=None)
mindata = mindata.drop(mindata.columns[[0, 1, 3, 4, 5]], axis=1)
mindata.columns = ['date', 'eurusdclose']
mindata['date'] = mindata.index
mindata['date'].iat[0]

# http://www.histdata.com/download-free-forex-historical-data/?/ascii/1-minute-bar-quotes/EURUSD
mindf_2017 = pd.read_csv('DAT_ASCII_EURUSD_M1_2017.csv', header=None, sep=';')
mindf_2017 = mindf_2017.drop(mindf_2017.columns[[0, 1, 3, 4, 5]], axis=1)
df = mindf_2017

def time_merge_ordered(self):
    merge_ordered(self.left, self.right, on='key', left_by='group')

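# This asv-style benchmark also relies on fixtures built in a setup() method
# that is not part of this excerpt. A hypothetical setup with the required
# 'key' and 'group' columns (illustrative only, not the actual benchmark):
def setup(self):
    import string
    import numpy as np
    import pandas as pd
    self.left = pd.DataFrame({
        'group': np.repeat(list(string.ascii_lowercase[:10]), 5000),
        'key': np.tile(np.arange(0, 10000, 2), 10),
        'lvalue': np.random.randn(50000),
    })
    self.right = pd.DataFrame({
        'key': np.arange(10000),
        'rvalue': np.random.randn(10000),
    })
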
df_train = pd.read_csv("../inputs/train.csv", parse_dates=['timestamp'])
df_test = pd.read_csv("../inputs/test.csv", parse_dates=['timestamp'])
# macro_cols is assumed to be defined earlier in the script
df_macro = pd.read_csv("../inputs/macro.csv", parse_dates=['timestamp'],
                       usecols=['timestamp'] + macro_cols)

# ylog will be log(1+y), as suggested by
# https://github.com/dmlc/xgboost/issues/446#issuecomment-135555130
ylog_train_all = np.log1p(df_train['price_doc'].values)
id_test = df_test['id']

df_train.drop(['id', 'price_doc'], axis=1, inplace=True)
df_test.drop(['id'], axis=1, inplace=True)

# Build df_all: concatenate df_train and df_test, then join df_macro on timestamp
num_train = len(df_train)
df_all = pd.concat([df_train, df_test])
df_all = pd.merge_ordered(df_all, df_macro, on='timestamp', how='left')
print(df_all.shape)

# Add month-year count
month_year = (df_all.timestamp.dt.month + df_all.timestamp.dt.year * 100)
month_year_cnt_map = month_year.value_counts().to_dict()
df_all['month_year_cnt'] = month_year.map(month_year_cnt_map)

# Add week-year count (note: dt.weekofyear is deprecated in newer pandas;
# dt.isocalendar().week is the replacement)
week_year = (df_all.timestamp.dt.weekofyear + df_all.timestamp.dt.year * 100)
week_year_cnt_map = week_year.value_counts().to_dict()
df_all['week_year_cnt'] = week_year.map(week_year_cnt_map)

# Add month and day-of-week
df_all['month'] = df_all.timestamp.dt.month
df_all['dow'] = df_all.timestamp.dt.dayofweek

import pandas as pd

df1 = pd.DataFrame({
    "key": ["a", "c", "e", "a", "c", "e"],
    "lvalue": [1, 2, 3, 1, 2, 3],
    "group": ["a", "a", "a", "b", "b", "b"]
})
df2 = pd.DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]})

print(pd.merge_ordered(df1, df2, fill_method="ffill", left_by="group"))

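# For reference, the values this prints match the expected frame in
# test_doc_example above (exact column order can differ across pandas
# versions):
#
#   group key  lvalue  rvalue
#   a     a    1       NaN
#   a     b    1       1.0
#   a     c    2       2.0
#   a     d    2       3.0
#   a     e    3       3.0
#   b     a    1       NaN
#   b     b    1       1.0
#   b     c    2       2.0
#   b     d    2       3.0
#   b     e    3       3.0
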
# Create a date column using the month and year columns of ur_tall
ur_tall['date'] = pd.to_datetime(ur_tall['year'] + '-' + ur_tall['month'])

# Sort ur_tall by date in ascending order
ur_sorted = ur_tall.sort_values(by='date')

# Plot the unempl_rate by date
ur_sorted.plot(y='unempl_rate', x='date')
plt.show()

# Check the inverse correlation between Dow Jones and US Treasury bond movement
# Use melt on ten_yr, unpivoting everything besides the metric column
bond_perc = ten_yr.melt(id_vars='metric', var_name='date', value_name='close')

# Use query on bond_perc to select only the rows where metric == 'close'.
# (Note: .drop(..., inplace=True) returns None, so it must not be chained;
# drop the column without inplace instead.)
bond_perc_close = bond_perc.query('metric=="close"').drop('metric', axis=1)

# Merge (ordered) dji and bond_perc_close on date with an inner join
dow_bond = pd.merge_ordered(dji, bond_perc_close, on='date', how='inner',
                            suffixes=('_dow', '_bond'))

# Plot only the close_dow and close_bond columns
dow_bond.plot(y=['close_dow', 'close_bond'], x='date', rot=90)
plt.show()

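# A quick numeric check of the inverse relationship the plot suggests
# (a hypothetical follow-up, not part of the original exercise):
print(dow_bond[['close_dow', 'close_bond']].corr())
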
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
# gftIO, _CLASSIFIER and _findValueColumn are assumed to be defined
# elsewhere in this module

def RandomForest(df_x, df_y, winSize, winStep):
    '''
    -------- parameters ----------
    df_x: DataFrame{columns=["date", "x1", "x2", ..., "xn"]}
    df_y: DataFrame{columns=["date", "y"]}
    winSize: float
    winStep: float
    --------- return ----------
    DataFrame{columns=["date", "y"]}
    assumptions:
        1. 'xi' has been sorted by 'date'
        2. the 'y' column in 'X0' has been shifted
    '''
    if isinstance(df_x, gftIO.GftTable):
        df_x = df_x.asColumnTab()
    if isinstance(df_y, dict):
        df_y = df_y["y"]
    if isinstance(df_y, gftIO.GftTable):
        df_y = df_y.asColumnTab()

    # convert parameter types
    # NOTICE: integers are regarded as O by GS, but the classifier needs int
    winSize = int(winSize)
    winStep = int(winStep)

    value_column = _findValueColumn(df_y.columns)
    # value_column: value
    # df_y.columns: Index(['date', 'value'], dtype='object')
    df_y.rename(columns={value_column: "y"}, inplace=True)
    df_y.y = pd.factorize(df_y.y)[0]

    # rename the timestamp column to 'date'
    for col_name in df_y.columns:
        if isinstance(df_y[col_name].iloc[0], pd.Timestamp):
            df_y.rename(columns={col_name: "date"}, inplace=True)
            break

    # remove meaningless columns
    df_y = df_y[["date", "y"]]

    # merge data
    df_x = df_x.sort_values("date", ascending=True)
    df_y = df_y.sort_values("date", ascending=True)
    df_y = df_y.set_index(np.arange(len(df_y)))  # reset index: start from 0

    # frequency error: if y_freq > x_freq, the data is meaningless
    ls_missing_date = [d for d in list(df_y["date"]) if d not in list(df_x["date"])]
    if len(ls_missing_date) > 0:
        raise ValueError("y_freq > x_freq. Missing date in X:", ls_missing_date)

    # slice data: remove redundant x
    if len(df_x) != len(df_y):
        ls_slice_data = [d for d in list(df_x["date"]) if d not in list(df_y["date"])]
        df_tmp_x = df_x.set_index(["date"])
        df_tmp_x = df_tmp_x.drop(ls_slice_data)
        df_x = df_tmp_x.reset_index(drop=False)

    # reset indexes: start from 0
    df_x = df_x.set_index(np.arange(len(df_x)))
    df_y = df_y.set_index(np.arange(len(df_y)))

    # data to be trained
    df_data = pd.merge_ordered(df_x, df_y, on="date", how="outer")

    # value check
    if len(df_data.index) < winSize + 1:
        raise ValueError("the number of input data is not enough")

    # rolling window: train on the previous winSize rows, predict the current row
    ls_predicted = []
    for i in range(len(df_data.index)):
        if i < winSize:
            ls_predicted += [np.nan]
        else:
            start_index = i - winSize
            # fit
            n_x_train = df_data.iloc[start_index:i, 1:-1].values
            n_y_train = df_data.iloc[start_index:i, -1].values
            _CLASSIFIER.fit(n_x_train, n_y_train)
            # predict
            n_x_test = df_data.iloc[[i], 1:-1]
            y_test = _CLASSIFIER.predict(n_x_test)[0]
            ls_predicted += [y_test]
    df_data["predicted"] = ls_predicted
    #print(ls_predicted)

    # drop na
    df_data = df_data.dropna()
    #print(df_data)

    # scores
    y_true = pd.factorize(df_data["y"])[0]
    y_pred = pd.factorize(df_data["predicted"])[0]
    num_accuracy_score = accuracy_score(y_true, y_pred)
    num_f1_score = f1_score(y_true, y_pred, average='macro')  # or: micro, weighted, None
    num_precision_score = precision_score(y_true, y_pred, average='macro')  # or: micro, weighted, None
    num_recall_score = recall_score(y_true, y_pred, average='macro')  # or: micro, weighted, None
    dict_score = {"accuracy_score": num_accuracy_score,
                  "f1_score": num_f1_score,
                  "precision_score": num_precision_score,
                  "recall_score": num_recall_score}

    # mean accuracy on the predicted labels
    y_test = df_data["predicted"].values
    X_test = df_data.iloc[:, 1:-2].values
    num_mean_accuracy = _CLASSIFIER.score(X_test, y_test)

    '''
    # feature_importances
    ls_fitness = list(zip(df_data.iloc[:, 1:-1], _CLASSIFIER.feature_importances_))
    n_fitness = np.array(list(map(list, ls_fitness)))
    df_fitness = pd.DataFrame({"feature": n_fitness[:, 0],
                               "importance": n_fitness[:, 1]})
    #print(df_fitness)
    '''

    # result
    df_data = df_data[["date", "predicted"]]
    dict_result = {"result": df_data,
                   "mean_accuracy": num_mean_accuracy,
                   "scores": dict_score}  # , "fitness": df_fitness}
    return dict_result

# Concatenate medals: medals
medals = pd.concat(medals, keys=['bronze', 'silver', 'gold'])

# then slice the index
idx = pd.IndexSlice
print(sales.loc[idx[:, 'Mediacore'], :])

# inner joins
medal_list = [bronze, silver, gold]
medals = pd.concat(medal_list, keys=['bronze', 'silver', 'gold'],
                   axis=1, join='inner')

# merging
merge_by_id = pd.merge(revenue, managers, on='branch_id')
# inner join on differently named key columns
combined = pd.merge(revenue, managers, left_on='city', right_on='branch')
# left join
pd.merge(sales, managers, left_on=['city', 'state'],
         right_on=['branch', 'state'], how='left')
# merge_ordered defaults to an outer join
tx_weather_ffill = pd.merge_ordered(austin, houston, on='date',
                                    suffixes=['_aus', '_hus'],
                                    fill_method='ffill')

# case study
# Import pandas
import pandas as pd

# Create empty dictionary: medals_dict
medals_dict = {}

for year in editions['Edition']:
    file_path = 'summer_{:d}.csv'.format(year)
    medals_dict[year] = pd.read_csv(file_path)
    medals_dict[year] = medals_dict[year][['Athlete', 'NOC', 'Medal']]
    medals_dict[year]['Edition'] = year

# Concatenate medals_dict: medals

def add_dilation_to_fxd(GZD, FXD):
    GZD['dilation'] = l_r_dilation(GZD)
    FXD = pd.merge_ordered(FXD, GZD[['timestamp', 'dilation']],
                           fill_method='ffill', left_by='timestamp')
    avg = FXD['dilation'].mean()
    FXD['dilation'] = FXD['dilation'].fillna(avg)
    return FXD

import numpy as np
import pandas as pd
from keras.layers import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.models import Sequential
import lstm, time
from sklearn import model_selection, preprocessing

"""Experimental. Doesn't work."""

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
# macro_features is assumed to be defined earlier in the script
macro = pd.read_csv("macro.csv", usecols=macro_features)

macro_train = pd.merge_ordered(train, macro, on='timestamp', how='left')
macro_test = pd.merge_ordered(test, macro, on='timestamp', how='left')

id_test = macro_test.id
y_train = macro_train["price_doc"]
x_train = macro_train.drop(["id", "timestamp", "price_doc"], axis=1)
x_test = macro_test.drop(["id", "timestamp"], axis=1)
print(x_train.shape)

# Step 1 - Perform preprocessing
for c in x_train.columns:
    if x_train[c].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(x_train[c].values))
        x_train[c] = lbl.transform(list(x_train[c].values))

        total_distance = total_distance + \
            np.sqrt(pow((dfcity.X[city_num] - dfcity.X[prev_city]), 2) +
                    pow((dfcity.Y[city_num] - dfcity.Y[prev_city]), 2)) * \
            (1 + 0.1 * ((step_num % 10 == 0) * int(not(prime_cities[prev_city]))))
        prev_city = next_city
        step_num = step_num + 1
    return total_distance

dumbest_path = list(df_cities.CityId[:].append(pd.Series([0])))
print('Total distance with the dumbest path is ' +
      "{:,}".format(total_distance(df_cities, dumbest_path)))

# ### Let us take a look at the first 100 steps of the dumbest path

df_path = pd.merge_ordered(pd.DataFrame({'CityId': dumbest_path}), df_cities,
                           on=['CityId'])
fig, ax = plt.subplots(figsize=(20, 20))
ax.plot(df_path.iloc[0:100, ]['X'], df_path.iloc[0:100, ]['Y'], marker='o')
for i, txt in enumerate(df_path.iloc[0:100, ]['CityId']):
    ax.annotate(txt, (df_path.iloc[0:100, ]['X'][i],
                      df_path.iloc[0:100, ]['Y'][i]), size=15)

# ### As we can see, the dumbest path seems pretty bad. We are sending Santa
# ### all over the map, without any consideration for him whatsoever :)

# ## Slightly better path: sort the cities in X,Y coordinate order

sorted_cities = list(df_cities.iloc[1:, ].sort_values(['X', 'Y'])['CityId'])
sorted_cities = [0] + sorted_cities + [0]
print('Total distance with the sorted city path is ' +
      "{:,}".format(total_distance(df_cities, sorted_cities)))