# ARIMA MODEL #3: STAGE 2, MATCH EVENTS ONLY
# Let's only focus on stage 2...it makes more sense to consider match events in the
# context of the match itself, and not the buildup or post-match reaction time...
stage_2_df = longform_df >> sift(X.stage_2_ind == 1)
stage_2_df = stage_2_df.reset_index(drop=True)

# New, more thoughtful ARIMA model parameters:
# d = 1 ("first difference"): predict the delta between volumes at consecutive
#     intervals, aka "stationarizing" the time series.
# q = 1: a series displays moving-average behavior if it apparently undergoes
#     random "shocks" whose effects are felt in 2+ consecutive periods.
# TODO: what is the difference between q = 1 and q = 2?
x_mat = stage_2_df >> select(stage_2_df.home_goal, stage_2_df.away_goal,
                             stage_2_df.home_yellow, stage_2_df.away_yellow,
                             stage_2_df.home_red, stage_2_df.away_red,
                             stage_2_df.competitive_idx)

model = ARIMA(endog=stage_2_df.shorthand_search_vol, exog=x_mat,
              dates=stage_2_df.date_time, order=(0, 1, 1))

# try .predict() for a couple of matches at a time
# there may be a place in ARIMA where you have to indicate that there are
# multiple overlapping, "uncorrelated" time series
model_fit = model.fit(disp=0)  # disp=0 turns off debug information

# print the fit summary to a text file
with open('model3.txt', 'w') as f:
    print(model_fit.summary(), file=f)
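# A minimal follow-up sketch, not from the original script: inspect the fitted
# coefficients directly instead of parsing the text summary. This assumes the
# statsmodels result object exposes .params as a pandas Series (it does when
# the endog passed in is a pandas Series).
for name, value in model_fit.params.items():
    print(name, round(value, 4))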
def test_ReadPandas_dply():
    filepath = 'tests/data/pandas_table.csv'
    samples = (ReadPandas(filepath).dply()
               >> dp.select(dp.X.col1)
               >> DplyToList())
    nt.assert_equal(samples, [[1], [2], [3]])
# pip installation:
#   pip install dfply
# conda installation:
#   conda install -c tallic dfply
import pandas as pd
from dfply import *

women = pd.read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-12-08/women.csv')
df = women

# Chain operations on the data with the >> operator, or start with >>= for
# in-place operations. Columns can be specified by name (string), by integer
# position, or with the X accessor.
lowprice = diamonds >> head(10) >> tail(3)

diamonds >> select(X.carat, X.cut) >> head(3)
diamonds >> select(1, X.price, ['x', 'y']) >> head(2)
diamonds >> drop(1, X.price, ['x', 'y']) >> head(2)

# inverse (~) selection returns the same result as the drop statement above
diamonds >> select(~X.carat, ~X.color, ~X.clarity) >> head(2)

# selection filters
diamonds >> select(starts_with('c')) >> head(2)
diamonds >> drop(columns_from(X.price)) >> head(2)

# mixing techniques to select the first 2 columns, the 'depth' column and the last 2 columns
diamonds >> select(columns_to(1, inclusive=True), 'depth', columns_from(-2)) >> head(2)

'''
starts_with(prefix): find columns that start with a string prefix.
ends_with(suffix): find columns that end with a string suffix.
contains(substr): find columns that contain a substring in their name.
everything(): all columns.
columns_between(start_col, end_col, inclusive=True): find columns between a
    specified start and end column. The inclusive boolean keyword argument
    indicates whether the end column should be included or not.
'''
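# A hedged extra example (assumes the standard dfply verbs mask, group_by and
# summarize): chaining a row filter with a grouped aggregation on the built-in
# diamonds dataset.
(diamonds >>
 mask(X.cut == 'Ideal') >>
 group_by(X.color) >>
 summarize(avg_price=X.price.mean(), max_carat=X.carat.max()))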
# match(variable, colnames(iris))
# colnames(iris)[colnames(iris) %in% variable]
variable = 'SepalLength'
variable in iris.columns  # True
iris.iloc[:, iris.columns == variable]

# library(dplyr)
import dplython as dp
from dplython import (DplyFrame, X, diamonds, select, sift, sample_n,
                      sample_frac, head, arrange, mutate, group_by,
                      summarize, DelayFunction)
iris = dp.DplyFrame(iris)

# data(iris)
# data = iris %>%
#   select(Petal.Length, Petal.Width, Sepal.Length, Sepal.Width, Species)
iris >> dp.select(X.Species) >> dp.head()
iris[['Species', 'PetalLength']]
iris.drop('SepalLength', axis=1)  # drop that column
iris.drop(5, axis=0)  # drop the sixth row

# data = iris %>%
#   filter(Petal.Length > 1 & Petal.Length < 100)
iris >> dp.sift(X.PetalLength > 5)
iris[(iris['PetalLength'] > 5) & (iris['PetalLength'] < 6)]

# data = iris %>%
#   dplyr::group_by(Species) %>%
#   summarise(media = mean(Petal.Length))
iris >> dp.group_by(X.Species) >> dp.summarize(media=X.PetalLength.mean())
iris.groupby(['Species'])['PetalLength'].agg(['mean', 'sum', 'count'])
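# A hedged companion example mirroring dplyr's mutate() %>% arrange() with
# dplython; PetalRatio is an assumed derived column name, not from the source.
(iris >>
 dp.mutate(PetalRatio=X.PetalLength / X.PetalWidth) >>
 dp.arrange(X.PetalRatio) >>
 dp.head(5))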
bigr = output[output['word'].str.contains("_")]

"""FROM THIS PART, 2 STRATEGIES: SAVE THE OUTPUT AND CONTINUE WITH R, OR GO AHEAD WITH PYTHON"""

"""5 plotting"""
"""5.1 aggregating for plotting"""
from dplython import (DplyFrame, X, diamonds, select, sift, sample_n,
                      sample_frac, head, arrange, mutate, group_by,
                      summarize, DelayFunction)
from wordcloud import WordCloud
import matplotlib.pyplot as plt

dfr = DplyFrame(output)
dfr = (dfr >> group_by(X.word, X.source) >> summarize(tot=X.count.sum()))
dff = (dfr >> select(X.word, X.tot))

"""5.2 wordcloud"""
"""turn the word frequencies into a dict"""
d = {}
for a, x in dff.values:
    d[a] = x

wordcloud = WordCloud(width=1000, height=1000, background_color='white',
                      min_font_size=15,
                      max_font_size=120).generate_from_frequencies(frequencies=d)

plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()
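# A hedged alternative to the loop above (same frequency dict, assuming dff
# keeps the word/tot columns), plus saving the cloud to an assumed output path.
d = dict(zip(dff['word'], dff['tot']))
wordcloud.to_file('wordcloud.png')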
import numpy as np
import pandas as pd
import tensorflow as tf
from dplython import DplyFrame, X, select, sift


def MinMaxScaler(data):
    # numerator line and function header assumed (standard column-wise min-max scaling)
    numerator = data - np.min(data, 0)
    denominator = np.max(data, 0) - np.min(data, 0)
    return numerator / (denominator + 1e-7)


# train parameters
seq_length = 60
data_dim = 8
hidden_dim = 10
output_dim = 1
learning_rate = 0.01
iterations = 500

# last, diff_24h, diff_per_24h, bid, ask, low, high, volume
data = DplyFrame(pd.read_csv('./bitcoin_ticker.csv', delimiter=','))
data = data >> sift(X.rpt_key == 'btc_krw') >> select(
    X.last, X.diff_24h, X.diff_per_24h, X.bid, X.ask, X.low, X.high, X.volume)
data = np.asarray(data)
# data = MinMaxScaler(data)
data = tf.layers.batch_normalization(data)

x = data
y = data[:, [0]]  # "last" column used as the label

# build a dataset of sliding windows
dataX = []
dataY = []
for i in range(0, len(y) - seq_length):
    _x = x[i:i + seq_length]
    _y = y[i + seq_length]  # next close price
    print(_x, "->", _y)
    dataX.append(_x)
    dataY.append(_y)
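# A minimal continuation sketch (assumes the dataX/dataY lists built above):
# stack into arrays and split 70/30 in time order for training and testing.
dataX = np.asarray(dataX)
dataY = np.asarray(dataY)
train_size = int(len(dataY) * 0.7)
trainX, testX = dataX[:train_size], dataX[train_size:]
trainY, testY = dataY[:train_size], dataY[train_size:]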
def czMatchmaker(data, Q, precursor_fasta):
    data = pd.read_csv(
        "/Users/matteo/Documents/czMatchmaker/data/examplaryData.csv")
    data = DplyFrame(data)

    precursors = (data >>
                  sift(X.tag == 'precursor') >>
                  select(X.active, X.neutral, X.estimates))
    fragments = (data >>
                 sift(X.tag != 'precursor') >>
                 group_by(X.tag, X.active, X.broken_bond) >>
                 summarize(estimates=X.estimates.sum()))

    I_on_fragments = {}
    optiminfos = {}
    for break_point, data in fragments.groupby('broken_bond'):
        pairing, optiminfo = collect_fragments(data, Q)
        I_on_fragments[break_point] = pairing
        optiminfos[break_point] = optiminfo

    cations_fragmented_I = sum(
        sum(I_on_fragments[bP][p] for p in I_on_fragments[bP])
        for bP in I_on_fragments)

    I_no_reactions = (precursors >>
                      sift(X.active == Q, X.neutral == 0) >>
                      select(X.estimates))
    I_no_reactions = I_no_reactions.values.flatten()[0]

    prec_ETnoD_PTR_I = (precursors >>
                        sift(X.active != Q) >>
                        rename(ETnoD=X.neutral, I=X.estimates) >>
                        mutate(PTR=Q - X.ETnoD - X.active) >>
                        select(X.ETnoD, X.PTR, X.I))

    I_prec_no_frag = prec_ETnoD_PTR_I >> summarize(I=X.I.sum())
    I_prec_no_frag = I_prec_no_frag.values.flatten()[0]

    precursorNoReactions = (precursors >>
                            sift(X.active == Q) >>
                            select(X.estimates))

    prec_ETnoD_PTR_I = (prec_ETnoD_PTR_I >>
                        mutate(I_PTR=crossprod(X.PTR, X.I),
                               I_ETnoD=crossprod(X.ETnoD, X.I)) >>
                        summarize(I_PTR=X.I_PTR.sum(), I_ETnoD=X.I_ETnoD.sum()))

    I_PTR_no_frag, I_ETnoD_no_frag = prec_ETnoD_PTR_I.values.flatten()

    prob_PTR = I_PTR_no_frag / (I_PTR_no_frag + I_ETnoD_no_frag)
    prob_ETnoD = 1. - prob_PTR

    I_frags = dict(
        (bP, sum(I_on_fragments[bP][pairing] for pairing in I_on_fragments[bP]))
        for bP in I_on_fragments)
    I_frag_total = sum(I_frags[bP] for bP in I_frags)

    prob_frag = Counter(
        dict((int(bP), I_frags[bP] / I_frag_total) for bP in I_frags))
    prob_frag = [prob_frag[i] for i in range(len(precursor_fasta))]

    I_frags_PTRETnoD_total = sum(
        (Q - 1 - sum(q for cz, q in pairing)) * I_on_fragments[bP][pairing]
        for bP in I_on_fragments for pairing in I_on_fragments[bP])

    anion_meets_cation = I_frags_PTRETnoD_total + I_PTR_no_frag + I_ETnoD_no_frag
    prob_fragmentation = I_frags_PTRETnoD_total / anion_meets_cation
    prob_no_fragmentation = 1 - prob_fragmentation

    prob_no_reaction = I_no_reactions / (
        I_no_reactions + I_frag_total + I_prec_no_frag)
    prob_reaction = 1. - prob_no_reaction

    res = {}
    res['reaction'] = (prob_reaction, prob_no_reaction)
    res['fragmentation'] = (prob_fragmentation, prob_no_fragmentation)
    res['fragmentation_amino_acids'] = tuple(prob_frag)
    return res
        for i, into_col in enumerate(sp_into):
            df[into_col] = [
                row[i] if len(row) > i else None for row in splitcol
            ]
        columns = list(df.columns)
        reorder_columns = (columns[:columns.index(sp_col)] + sp_into +
                           columns[(columns.index(sp_col) + 1):-len(into_col) - 1])
        return df[reorder_columns]

    def __rrshift__(self, other):
        return self.__call__(DplyFrame(other.copy(deep=True)))


if __name__ == '__main__':
    mtcars = read_tsv('test/data/mtcars.tsv')
    mtcars = mtcars >> select(X.name, X.mpg, X.cyl)
    d = zip(map(str, mtcars['name']), map(str, mtcars['mpg']),
            map(str, mtcars['cyl']))
    d = ['|'.join(x) for x in d]
    mtcars['name'] = d
    mtcars = mtcars >> select(X.name)
    mtcars_clean = mtcars >> separate(X.name, ['name', 'mpg', 'cyl'], ' ')
    print(mtcars_clean >> head())
import pandas
from dplython import (DplyFrame, X, diamonds, select, sift, sample_n,
                      sample_frac, head, arrange, mutate, group_by,
                      summarize, DelayFunction)

diamonds >> head(5)
diamonds >> select(X.carat, X.cut, X.price) >> head(5)

d = (diamonds >>
     sift(X.carat > 4) >>
     select(X.carat, X.cut, X.depth, X.price) >>
     head(2))

(diamonds >>
 mutate(carat_bin=X.carat.round()) >>
 group_by(X.cut, X.carat_bin) >>
 summarize(avg_price=X.price.mean()))

test = df['deaths'] < 0
less_than_zero = df[test]
print(less_than_zero.shape)
print(less_than_zero.head())
test
# df['deaths_fixed'] = df['deaths_new'].apply(lambda x: 'True' if x <= 0 else 'False')
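# A hedged sketch of the commented-out idea above (df and its 'deaths' column
# are assumed from the snippet): flag non-positive death counts with a real
# boolean column instead of the strings 'True'/'False'.
df['deaths_fixed'] = df['deaths'] <= 0
print(df['deaths_fixed'].value_counts())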
def load_data(input_dir, crsrd_id):
    cctv_log = pd.read_csv(input_dir + "/ORT_CCTV_5MIN_LOG.csv")
    cctv_mst = pd.read_csv(input_dir + "/ORT_CCTV_MST.csv")

    cctv_log['DATE'] = pd.DataFrame(pd.DatetimeIndex(cctv_log['REG_DT']).date)
    cctv_log['HOUR'] = pd.DataFrame(pd.DatetimeIndex(cctv_log['REG_DT']).hour)
    cctv_log['MINUTE'] = (
        pd.DataFrame(pd.DatetimeIndex(cctv_log['REG_DT']).minute) // 30) * 30
    cctv_log['temp_DAY'] = pd.to_datetime(cctv_log['DATE']).dt.dayofweek
    cctv_log.loc[cctv_log['temp_DAY'] < 5, 'DAY'] = int(0)   # Mon - Fri
    cctv_log.loc[cctv_log['temp_DAY'] == 5, 'DAY'] = int(1)  # Sat
    cctv_log.loc[cctv_log['temp_DAY'] == 6, 'DAY'] = int(2)  # Sun

    df0 = DplyFrame(cctv_log) >> group_by(
        X.DATE, X.DAY, X.HOUR, X.MINUTE, X.CCTV_ID) >> summarize(
            GO_TRF=X.GO_BIKE.sum() + X.GO_CAR.sum() + X.GO_SUV.sum() +
            X.GO_VAN.sum() + X.GO_TRUCK.sum() + X.GO_BUS.sum() +
            X.RIGHT_BIKE.sum() + X.RIGHT_CAR.sum() + X.RIGHT_SUV.sum() +
            X.RIGHT_VAN.sum() + X.RIGHT_TRUCK.sum() + X.RIGHT_BUS.sum(),
            LEFT_TRF=X.LEFT_BIKE.sum() + X.LEFT_CAR.sum() + X.LEFT_SUV.sum() +
            X.LEFT_VAN.sum() + X.LEFT_TRUCK.sum() + X.LEFT_BUS.sum())

    # Extract records of the selected crossroad
    cctv_mst = DplyFrame(cctv_mst) >> sift(X.CRSRD_ID == crsrd_id) >> select(
        X.CRSRD_ID, X.CCTV_ID)
    df0 = pd.merge(df0, cctv_mst, how="inner", on="CCTV_ID")
    df0 = df0.sort_values(['DATE', 'HOUR', 'MINUTE', 'CCTV_ID'])

    # Time frame from the existing dataset
    tf = DplyFrame(
        df0.drop_duplicates(
            ['DATE', 'DAY', 'HOUR', 'MINUTE'], keep='last')) >> select(
                X.DATE, X.DAY, X.HOUR, X.MINUTE)

    # Process the data structure into a pivot
    cctv_list = sorted(cctv_mst['CCTV_ID'].unique())
    df1 = tf
    for cctv in cctv_list:
        a = df0 >> sift(X.CCTV_ID == cctv) >> select(
            X.DATE, X.DAY, X.HOUR, X.MINUTE, X.GO_TRF, X.LEFT_TRF)
        df1 = pd.merge(df1, a, how='left',
                       on=['DATE', 'DAY', 'HOUR', 'MINUTE'],
                       suffixes=('', '_' + str(cctv)))

    df1 = df1.set_index(['DATE', 'DAY', 'HOUR', 'MINUTE'])
    df1 = df1.fillna(df1.rolling(window=24, min_periods=1, center=True).mean())
    df1 = df1.fillna(0)
    df1 = df1.reset_index()

    df1['TOTAL_TRF'] = DplyFrame(df1.iloc[:, 4:3 + len(cctv_list) * 2].sum(
        axis=1, skipna=True))
    df1 = df1 >> sift(X.TOTAL_TRF > 0)
    print(df1)

    # Name the cctv id and direction - for tod_traffic_analyzer
    cols = [cctv + '_GO_RATE' for cctv in cctv_list]
    cols.extend([cctv + '_LEFT_RATE' for cctv in cctv_list])
    cols = sorted(cols)
    cols = ['TOD'] + cols + ['TOTAL_TRF']

    return df1, cols
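# Hypothetical usage sketch; the directory path and crossroad id below are
# placeholders, not values from the source.
df1, cols = load_data(input_dir='./data', crsrd_id=1)
print(df1.head())
print(cols)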
    for match_id, match_df in large_df_in.groupby(['match_id']):
        x_mat = x_mat_in >> sift(X.match_id == match_id)
        x_mat = x_mat.drop(columns=["match_id"])

        fit_arima_model(df_in=match_df,
                        x_mat_in=x_mat,
                        order_in=(1, 1, 0),
                        coefficients_dict=coefficients,
                        feature_set=feature_set_in,
                        match_id=str(match_id))
        print(match_id)

    json_filename = "arima_" + str(feature_set_in) + ".json"
    export_coefficients(coefficients, json_filename)


if __name__ == "__main__":
    longform_df = DplyFrame(
        pd.read_csv("../../LongForm/longform.csv",
                    dtype={'shorthand_search_vol': float}))

    stage_2_df = process_data(longform_df)

    x_mat = stage_2_df >> select(stage_2_df.match_id, stage_2_df.home_goal,
                                 stage_2_df.away_goal, stage_2_df.home_yellow,
                                 stage_2_df.away_yellow, stage_2_df.home_red,
                                 stage_2_df.away_red)

    # run model with 1st feature set
    run_arima_models(stage_2_df, x_mat, 1)