Example No. 1
 def test_ffill(self):
     result = merge_ordered(
         self.left, self.right, on='key', fill_method='ffill')
     expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'],
                           'lvalue': [1., 1, 2, 2, 3, 3.],
                           'rvalue': [nan, 1, 2, 3, 3, 4]})
     assert_frame_equal(result, expected)
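These test snippets reference `self.left` and `self.right` fixtures that are not shown. Reconstructed from the `expected` frames above, they presumably look roughly like the sketch below (an assumption inferred from the outputs, not the verbatim fixture code):

# Assumed fixtures, inferred from the expected outputs above (not shown in the source)
left = DataFrame({'key': ['a', 'c', 'e'], 'lvalue': [1, 2, 3]})
right = DataFrame({'key': ['b', 'c', 'd', 'f'], 'rvalue': [1, 2, 3, 4]})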
Example No. 2
    def test_basic(self):
        result = merge_ordered(self.left, self.right, on='key')
        expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'],
                              'lvalue': [1, nan, 2, nan, 3, nan],
                              'rvalue': [nan, 1, 2, 3, nan, 4]})

        assert_frame_equal(result, expected)
Example No. 3
    def test_multigroup(self):
        left = pd.concat([self.left, self.left], ignore_index=True)

        left['group'] = ['a'] * 3 + ['b'] * 3

        result = merge_ordered(left, self.right, on='key', left_by='group',
                               fill_method='ffill')
        expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'] * 2,
                              'lvalue': [1., 1, 2, 2, 3, 3.] * 2,
                              'rvalue': [nan, 1, 2, 3, 3, 4] * 2})
        expected['group'] = ['a'] * 6 + ['b'] * 6

        assert_frame_equal(result, expected.loc[:, result.columns])

        result2 = merge_ordered(self.right, left, on='key', right_by='group',
                                fill_method='ffill')
        assert_frame_equal(result, result2.loc[:, result.columns])

        result = merge_ordered(left, self.right, on='key', left_by='group')
        assert result['group'].notna().all()
Example No. 4
    def test_basic(self):
        result = merge_ordered(self.left, self.right, on="key")
        expected = DataFrame(
            {
                "key": ["a", "b", "c", "d", "e", "f"],
                "lvalue": [1, nan, 2, nan, 3, nan],
                "rvalue": [nan, 1, 2, 3, nan, 4],
            }
        )

        assert_frame_equal(result, expected)
Example No. 5
    def test_multigroup(self):
        left = pd.concat([self.left, self.left], ignore_index=True)
        # right = concat([self.right, self.right], ignore_index=True)

        left["group"] = ["a"] * 3 + ["b"] * 3
        # right['group'] = ['a'] * 4 + ['b'] * 4

        result = merge_ordered(left, self.right, on="key", left_by="group", fill_method="ffill")
        expected = DataFrame(
            {
                "key": ["a", "b", "c", "d", "e", "f"] * 2,
                "lvalue": [1.0, 1, 2, 2, 3, 3.0] * 2,
                "rvalue": [nan, 1, 2, 3, 3, 4] * 2,
            }
        )
        expected["group"] = ["a"] * 6 + ["b"] * 6

        assert_frame_equal(result, expected.loc[:, result.columns])

        result2 = merge_ordered(self.right, left, on="key", right_by="group", fill_method="ffill")
        assert_frame_equal(result, result2.loc[:, result.columns])

        result = merge_ordered(left, self.right, on="key", left_by="group")
        self.assertTrue(result["group"].notnull().all())
Example No. 6
    def test_doc_example(self):
        left = DataFrame({'key': ['a', 'c', 'e', 'a', 'c', 'e'],
                          'lvalue': [1, 2, 3] * 2,
                          'group': list('aaabbb')})

        right = DataFrame({'key': ['b', 'c', 'd'],
                           'rvalue': [1, 2, 3]})

        result = merge_ordered(left, right, fill_method='ffill',
                               left_by='group')

        expected = DataFrame({'group': list('aaaaabbbbb'),
                              'key': ['a', 'b', 'c', 'd', 'e'] * 2,
                              'lvalue': [1, 1, 2, 2, 3] * 2,
                              'rvalue': [nan, 1, 2, 3, 3] * 2})

        assert_frame_equal(result, expected)
Example No. 7
import matplotlib.pyplot as plt
import pandas as pd
import datetime
import time
from scipy import signal
from collections import Counter

# building basic data frame
eurusd = pd.read_csv('eur_usd_hist.csv')
eurusd = eurusd.drop(eurusd.index[0:6])
eurusd.index = range(len(eurusd))
eurusd.columns = ['date', 'eurusdclose', 'high', 'low']
usdjpn = pd.read_csv('jpn_hist.csv')
usdjpn = usdjpn.drop(usdjpn.index[0:6])
usdjpn.index = range(len(usdjpn))
usdjpn.columns = ['date', 'usdjpyclose', 'usdcadclose']
df = pd.merge_ordered(eurusd, usdjpn, on='date')
df = df.drop(['high', 'low'], axis=1)
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
#print(df.head())

#http://www.histdata.com/download-free-forex-historical-data/?/ascii/1-minute-bar-quotes/EURUSD
mindata = pd.read_csv('nov01_07_2011.csv', header=None)
mindata = mindata.drop(mindata.columns[[0, 1, 3, 4, 5]], axis=1)
mindata.columns = ['date', 'eurusdclose']
mindata['date'] = mindata.index
mindata['date'].iat[0]

#http://www.histdata.com/download-free-forex-historical-data/?/ascii/1-minute-bar-quotes/EURUSD
mindf_2017 = pd.read_csv('DAT_ASCII_EURUSD_M1_2017.csv', header=None, sep=';')
mindf_2017 = mindf_2017.drop(mindf_2017.columns[[0, 1, 3, 4, 5]], axis=1)
df = mindf_2017
Example No. 8
 def time_merge_ordered(self):
     merge_ordered(self.left, self.right, on='key', left_by='group')
Example No. 9
 def test_ffill(self):
     result = merge_ordered(self.left, self.right, on="key", fill_method="ffill")
     expected = DataFrame(
         {"key": ["a", "b", "c", "d", "e", "f"], "lvalue": [1.0, 1, 2, 2, 3, 3.0], "rvalue": [nan, 1, 2, 3, 3, 4]}
     )
     assert_frame_equal(result, expected)
Example No. 10
import numpy as np
import pandas as pd

df_train = pd.read_csv("../inputs/train.csv", parse_dates=['timestamp'])
df_test = pd.read_csv("../inputs/test.csv", parse_dates=['timestamp'])
df_macro = pd.read_csv("../inputs/macro.csv", parse_dates=['timestamp'], usecols=['timestamp'] + macro_cols)

# ylog will be log(1+y), as suggested by https://github.com/dmlc/xgboost/issues/446#issuecomment-135555130
ylog_train_all = np.log1p(df_train['price_doc'].values)
id_test = df_test['id']

df_train.drop(['id', 'price_doc'], axis=1, inplace=True)
df_test.drop(['id'], axis=1, inplace=True)

# Build df_all = (df_train+df_test).join(df_macro)
num_train = len(df_train)
df_all = pd.concat([df_train, df_test])
df_all = pd.merge_ordered(df_all, df_macro, on='timestamp', how='left')
print(df_all.shape)

# Add month-year
month_year = (df_all.timestamp.dt.month + df_all.timestamp.dt.year * 100)
month_year_cnt_map = month_year.value_counts().to_dict()
df_all['month_year_cnt'] = month_year.map(month_year_cnt_map)

# Add week-year count
week_year = (df_all.timestamp.dt.weekofyear + df_all.timestamp.dt.year * 100)
week_year_cnt_map = week_year.value_counts().to_dict()
df_all['week_year_cnt'] = week_year.map(week_year_cnt_map)

# Add month and day-of-week
df_all['month'] = df_all.timestamp.dt.month
df_all['dow'] = df_all.timestamp.dt.dayofweek
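As a quick check of the `month + year * 100` encoding used above, here is a minimal standalone sketch (made-up timestamps, not the competition data) showing that each year-month pair collapses to a single integer code and that the count feature is simply the frequency of that code:

import pandas as pd

ts = pd.Series(pd.to_datetime(['2015-03-15', '2015-03-30', '2016-01-02']))
month_year = ts.dt.month + ts.dt.year * 100   # 201503, 201503, 201601
cnt = month_year.map(month_year.value_counts().to_dict())
print(cnt.tolist())                           # [2, 2, 1]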
Example No. 11
import pandas as pd

df1 = pd.DataFrame({
    "key": ["a", "c", "e", "a", "c", "e"],
    "lvalue": [1, 2, 3, 1, 2, 3],
    "group": ["a", "a", "a", "b", "b", "b"]
})
df2 = pd.DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]})
print(pd.merge_ordered(df1, df2, fill_method="ffill", left_by="group"))
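For reference, the printed result should roughly match the `expected` frame from the doc-example test above (column order can differ between pandas versions):

#   key  lvalue group  rvalue
# 0   a       1     a     NaN
# 1   b       1     a     1.0
# 2   c       2     a     2.0
# 3   d       2     a     3.0
# 4   e       3     a     3.0
# 5   a       1     b     NaN
# 6   b       1     b     1.0
# 7   c       2     b     2.0
# 8   d       2     b     3.0
# 9   e       3     b     3.0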
Example No. 12
# Create a date column using the month and year columns of ur_tall
ur_tall['date'] = pd.to_datetime(ur_tall['year'] + '-' + ur_tall['month'])

# Sort ur_tall by date in ascending order
ur_sorted = ur_tall.sort_values(by='date')

# Plot the unempl_rate by date
ur_sorted.plot(y='unempl_rate', x='date')
plt.show()

# Check inverse correlation between dow jones and US treasury bond movement

# Use melt on ten_yr, unpivot everything besides the metric column
bond_perc = ten_yr.melt(id_vars='metric', var_name='date', value_name='close')

# Use query on bond_perc to select only the rows where metric=close
# Note: drop(..., inplace=True) returns None, so keep the chained call non-inplace
bond_perc_close = bond_perc.query('metric=="close"').drop('metric', axis=1)

# Merge (ordered) dji and bond_perc_close on date with an inner join
dow_bond = pd.merge_ordered(dji,
                            bond_perc_close,
                            on='date',
                            how='inner',
                            suffixes=('_dow', '_bond'))

# Plot only the close_dow and close_bond columns
dow_bond.plot(y=['close_dow', 'close_bond'], x='date', rot=90)
plt.show()
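Pandas methods called with `inplace=True` return None, which is why the chained `.query(...).drop(...)` above is written without it. A minimal standalone sketch of the difference (toy data, for illustration only):

import pandas as pd

df = pd.DataFrame({'metric': ['close', 'close', 'open'], 'val': [1.0, 2.0, 3.0]})
ret = df.drop('metric', axis=1, inplace=True)   # inplace drop mutates df ...
print(ret)                                      # ... and returns None
print(df.query('val > 1.5'))                    # non-inplace calls return a DataFrame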
Example No. 13
def RandomForest(df_x,df_y,winSize,winStep):
    '''
    -------- parameters ----------
    df_x: DataFrame{columns=["date","x1","x2", ..., "xn"]}
    df_y: DataFrame{columns=["date","y"]}
    winSize: float
    winStep: float

    --------- return ----------
    DataFrame{columns=["date","y"]}

    assumptions:
    1. 'xi' has been sorted by 'date'
    2. the 'y' column in 'X0' has been shifted

    '''

    if isinstance(df_x, gftIO.GftTable):
        df_x = df_x.asColumnTab()
    if isinstance(df_y,dict):
        df_y = df_y["y"]
    if isinstance(df_y, gftIO.GftTable):
        df_y = df_y.asColumnTab()
    
    # convert parameter type
    winSize = int(winSize)
    winStep = int(winStep) 
    
    # NOTICE: integers will be regarded as O (object) by GS, but the classifier needs int
    value_column = _findValueColumn(df_y.columns)  # value_column: value
                                                   # df_y.columns: Index(['date', 'value'], dtype='object')
    df_y.rename(columns={value_column:"y"},inplace=True)
    df_y.y=pd.factorize(df_y.y)[0]
    # rename the timestamp column to "date"
    for col_name in df_y.columns:
        if isinstance(df_y[col_name].iloc[0],pd.Timestamp):
            df_y.rename(columns={col_name:"date"},inplace=True)
            break
    # remove meaningless columns
    df_y=df_y[["date","y"]]
    
    # merge data
    df_x = df_x.sort_values("date",ascending=True)
    df_y = df_y.sort_values("date",ascending=True)
    df_y = df_y.set_index(np.arange(len(df_y))) # identify index: start from 0

    # frequency error: if y_freq > x_freq, meaningless data
    ls_missing_date=[d for d in list(df_y["date"]) if d not in list(df_x["date"])]
    if len(ls_missing_date)>0:
        raise ValueError("y_freq > x_freq. Missing date in X:", ls_missing_date)
    
    # slice data: remove redundant x
    if len(df_x)!=len(df_y):
        ls_slice_data=[d for d in list(df_x["date"]) if d not in list(df_y["date"])]
        df_tmp_x=df_x.set_index(["date"])
        df_tmp_x=df_tmp_x.drop(ls_slice_data)
        df_x=df_tmp_x.reset_index(drop=False)
    
    # identify index: start from 0
    df_x = df_x.set_index(np.arange(len(df_x)))
    df_y = df_y.set_index(np.arange(len(df_y))) 

    # data to be trained
    df_data=pd.merge_ordered(df_x,df_y,on="date",how="outer") 

    # value check
    if len(df_data.index) < winSize + 1:
        raise ValueError("the number of input data is not enough")
    
    # rolling
    ls_predicted=[]
    for i in range(len(df_data.index)):
        if i<winSize:
            ls_predicted+=[np.nan]
        else:
            start_index=i-winSize
            # fit
            n_x_train= df_data.iloc[start_index:i,1:-1].values
            n_y_train= df_data.iloc[start_index:i,-1].values
            _CLASSIFIER.fit(n_x_train, n_y_train)
            # predict
            n_x_test = df_data.iloc[[i],1:-1]
            y_test = _CLASSIFIER.predict(n_x_test)[0]
            ls_predicted += [y_test]
    
    df_data["predicted"]=ls_predicted
    #print(ls_predicted)
    
    # drop na
    df_data=df_data.dropna()
    #print(df_data)
    
    # classification scores
    y_true=pd.factorize(df_data["y"])[0]
    y_pred=pd.factorize(df_data["predicted"])[0]
    num_accuracy_score=accuracy_score(y_true,y_pred)
    #print("accuracy_score:",num_accuracy_score)
    num_f1_score=f1_score(y_true,y_pred,average='macro') # micro, weighted, None
    #print("f1_score:",num_f1_score)
    num_precision_score=precision_score(y_true, y_pred, average='macro') # micro, weighted, None
    #print("precision_score:",num_precision_score)
    num_recall_score=recall_score(y_true, y_pred, average='macro') # micro, weighted, None
    #print("recall_score:",num_recall_score)
    dict_score={"accuracy_score":num_accuracy_score, "f1_score":num_f1_score,"precision_score":num_precision_score, "recall_score":num_recall_score}
    
    # score
    y_test = df_data["predicted"].values
    X_test = df_data.iloc[:,1:-2].values
    num_mean_accuracy=_CLASSIFIER.score(X_test , y_test)
    #print(num_score)    
    
    '''
    # feature_importances
    ls_fitness=list(zip(df_data.iloc[:,1:-1],_CLASSIFIER.feature_importances_))
    n_fitness=np.array(list(map(list,ls_fitness)))
    df_fitness=pd.DataFrame({"feature":n_fitness[:,0],"importance":n_fitness[:,1]})
    #print(df_fitness)    
    '''
    
    # result
    df_data=df_data[["date","predicted"]]
    #print(df_data)
    
    dict_result = {"result":df_data,"mean_accuracy":num_mean_accuracy, "scores":dict_score} #,"fitness":df_fitness}
    #print(dict_result)
    return dict_result
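The core of the function above is a rolling-window fit/predict loop. Below is a minimal standalone sketch of that scheme, using toy data and a plain sklearn RandomForestClassifier in place of the module-level _CLASSIFIER (both the data and the classifier choice are assumptions for illustration):

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

rng = np.random.default_rng(0)
df = pd.DataFrame({'x1': rng.normal(size=30), 'x2': rng.normal(size=30)})
df['y'] = (df['x1'] > 0).astype(int)

win_size = 10
clf = RandomForestClassifier(n_estimators=10, random_state=0)
predicted = [np.nan] * win_size
for i in range(win_size, len(df)):
    window = df.iloc[i - win_size:i]                    # trailing training window
    clf.fit(window[['x1', 'x2']], window['y'])
    predicted.append(clf.predict(df.iloc[[i]][['x1', 'x2']])[0])  # predict the next row
df['predicted'] = predicted
print(df.dropna().tail())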
Example No. 14
# Concatenate medals: medals
medals = pd.concat(medals, keys=['bronze', 'silver', 'gold'])
#then slice the index 
idx = pd.IndexSlice
print(sales.loc[idx[:, 'Mediacore'], :])
#inner joins
medal_list = [bronze, silver, gold]
medals = pd.concat(medal_list, keys=['bronze', 'silver', 'gold'], axis=1, join='inner')
#merging
merge_by_id = pd.merge(revenue,managers,on='branch_id')
#inner join
combined = pd.merge(revenue,managers,left_on='city',right_on='branch')
#left join on multiple columns
pd.merge(sales, managers, left_on=['city','state'], right_on=['branch','state'], how='left')
#merge_ordered defaults to an outer join
tx_weather_ffill = pd.merge_ordered(austin, houston, on='date', suffixes=['_aus','_hus'], fill_method='ffill')

#case study

 # Import pandas
import pandas as pd

# Create empty dictionary: medals_dict
medals_dict = {}
for year in editions['Edition']:
    file_path = 'summer_{:d}.csv'.format(year)
    medals_dict[year] = pd.read_csv(file_path)
    medals_dict[year] = medals_dict[year][['Athlete', 'NOC', 'Medal']]
    medals_dict[year]['Edition'] = year
    
# Concatenate medals_dict: medals
Example No. 15
 def time_merge_ordered(self):
     merge_ordered(self.left, self.right, on="key", left_by="group")
Example No. 16
def add_dilation_to_fxd(GZD, FXD):
    GZD['dilation'] = l_r_dilation(GZD)
    FXD = pd.merge_ordered(FXD, GZD[['timestamp', 'dilation']], fill_method='ffill', left_by='timestamp')
    avg = FXD['dilation'].mean()
    FXD['dilation'] = FXD['dilation'].fillna(avg)
    return FXD
Example No. 17
 def time_merge_ordered(self):
     merge_ordered(self.left, self.right, on='key', left_by='group')
Example No. 18
import numpy as np
import pandas as pd
from keras.layers import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.models import Sequential
import lstm, time
from sklearn import model_selection, preprocessing

"""Experimental Doesn't work""""

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
macro = pd.read_csv("macro.csv", usecols=macro_features)

macro_train = pd.merge_ordered(train, macro, on='timestamp', how='left')
macro_test = pd.merge_ordered(test, macro, on='timestamp', how='left')

id_test = macro_test.id
y_train = macro_train["price_doc"]
x_train = macro_train.drop(["id", "timestamp", "price_doc"], axis=1)
x_test = macro_test.drop(["id", "timestamp"], axis=1)

print(x_train.shape)

#Step1 - Perform preprocessing
for c in x_train.columns:
    if x_train[c].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(x_train[c].values))
        x_train[c] = lbl.transform(list(x_train[c].values))
Example No. 19
        total_distance = total_distance + \
            np.sqrt(pow((dfcity.X[city_num] - dfcity.X[prev_city]),2) + pow((dfcity.Y[city_num] - dfcity.Y[prev_city]),2)) * \
            (1+ 0.1*((step_num % 10 == 0)*int(not(prime_cities[prev_city]))))
        prev_city = next_city
        step_num = step_num + 1
    return total_distance

dumbest_path = list(df_cities.CityId[:].append(pd.Series([0])))
print('Total distance with the dumbest path is '+ "{:,}".format(total_distance(df_cities,dumbest_path)))

# ### Let us take a look at the first 100 steps of the dumbest path

# In[ ]:


df_path = pd.merge_ordered(pd.DataFrame({'CityId':dumbest_path}),df_cities,on=['CityId'])
fig, ax = plt.subplots(figsize=(20,20))
ax.plot(df_path.iloc[0:100,]['X'], df_path.iloc[0:100,]['Y'],marker = 'o')
for i, txt in enumerate(df_path.iloc[0:100,]['CityId']):
    ax.annotate(txt, (df_path.iloc[0:100,]['X'][i], df_path.iloc[0:100,]['Y'][i]),size = 15)

# ### As we can see, the dumbest path seems pretty bad. We are sending Santa all over the map, without any consideration for him whatsoever :)
# 
# ## Slightly better path: sort the cities in X,Y coordinate_order

# In[ ]:


sorted_cities = list(df_cities.iloc[1:,].sort_values(['X','Y'])['CityId'])
sorted_cities = [0] + sorted_cities + [0]
print('Total distance with the sorted city path is '+ "{:,}".format(total_distance(df_cities,sorted_cities)))