def session_aggr(sf, cols, key="session_code"):
    """Group ``sf`` by ``key`` and compute mean/std/min/max for each column in ``cols``.

    Output columns are named ``<col>_mean``, ``<col>_std``, ``<col>_min`` and
    ``<col>_max``.  Returns the grouped SFrame produced by ``sf.groupby``.
    """
    # One pass over a (suffix, aggregator) table instead of four separate dicts.
    stat_table = (
        ("mean", agg.MEAN),
        ("std", agg.STD),
        ("min", agg.MIN),
        ("max", agg.MAX),
    )
    operations = {}
    for suffix, aggregator in stat_table:
        for col in cols:
            operations["%s_%s" % (col, suffix)] = aggregator(col)
    return sf.groupby(key_column_names=[key], operations=operations)
def get_agg_cols(postfix, agg_type,
                 agg_cols=('not_skipped', 'skip_1', 'skip_2', 'skip_3')):
    """Build a turicreate aggregation dict for ``groupby``.

    Parameters
    ----------
    postfix : str
        Suffix appended to every generated output column name.
    agg_type : str
        One of ``"mean"``, ``"sum"`` or ``"count"``.
    agg_cols : sequence of str, optional
        Columns to aggregate (ignored for ``"count"``).

    Returns
    -------
    dict mapping output column name -> turicreate aggregator.

    Raises
    ------
    RuntimeError
        If ``agg_type`` is not one of the supported values.
    """
    # Default is a tuple (not a list) to avoid the mutable-default-argument trap.
    if agg_type == "mean":
        return {("%s_mean_%s" % (col, postfix)): agg.MEAN(col) for col in agg_cols}
    if agg_type == "sum":
        return {("%s_sum_%s" % (col, postfix)): agg.SUM(col) for col in agg_cols}
    if agg_type == "count":
        return {("cnt_%s" % postfix): agg.COUNT()}
    raise RuntimeError("Aggregation is not supported by this function!")
def predict(self):
    """Score news for every symbol listed in ``self.list_loc`` and build a shortlist.

    Reads one symbol per line from ``self.list_loc``, downloads news for each
    via ``self.download_news``, runs the model stored at ``self.model_loc`` on
    the combined frame, then aggregates the per-row predictions by 'stock'
    (mean prediction + row count) into ``self.shortlist`` (a pandas DataFrame).
    """
    # Renamed locals: the original shadowed the builtins `all` and `list`.
    frames = []
    with open(self.list_loc, 'r') as fp:
        symbols = fp.read().splitlines()
    for symbol in symbols:
        df = self.download_news(symbol, 1)
        frames.append(df)
    data = pd.concat(frames, ignore_index=True)
    print(data)
    sf = tc.SFrame(data)
    model = tc.load_model(self.model_loc)
    # Save predictions to an SArray
    predictions = model.predict(sf)
    sf['prediction'] = predictions
    # Per-stock mean prediction and news count.
    trade_list = sf.groupby(key_column_names='stock', operations={
        'avg': agg.MEAN('prediction'),
        'count': agg.COUNT()
    })
    # NOTE(review): a 'rise'/'drop' labelling step (avg >= 0.8 and count >= 10)
    # was left commented out in the original; re-enable deliberately if needed.
    self.shortlist = trade_list.to_dataframe()
def get_dists(sf, cols, variance_dict): for i, col in enumerate(cols): s_err = np.square(sf[col] - sf["%s_MEAN" % col]) / variance_dict[col] if i == 0: s_err_sum = s_err else: s_err_sum += s_err print(col, (i + 1) / len(cols)) return np.sqrt(s_err_sum / len(cols)) track_means = session_data.groupby( "session_code", operations={"%s_MEAN" % col: agg.MEAN(col) for col in track_feats}) session_data = batch_join(session_data, track_means, ["session_code"]) del track_means var_dict = dict(zip(track_feats, track_feats_variance)) session_data["dist_from_sess_mean"] = get_dists(session_data, track_feats, var_dict) session_data = session_data.remove_columns( ["%s_MEAN" % col for col in track_feats]) print("## v.) aggregations for total session") agg_cols_total = session_data.column_names().copy() session_cols = [ 'session_position', 'session_length', 'context_switch',
import turicreate as tc
import turicreate.aggregate as agg

# Load the housing sales data and make a deterministic 80/20 train/test split.
sales = tc.SFrame.read_csv('data/home_data.csv')
train_data, test_data = sales.random_split(.8, seed=0)

# Zipcode with the highest average sale price.
stats = sales.groupby('zipcode', operations={'mean': agg.MEAN('price')})
zipcode = stats[stats['mean'] == stats['mean'].max()]['zipcode'][0]
print(stats['mean'].max())

# Fraction of houses with living area strictly between 2000 and 4000 sqft.
# Vectorized boolean filter instead of a per-row apply() lambda — same strict
# bounds, evaluated at C speed on the SArray.
filtered = sales[(sales['sqft_living'] > 2000) & (sales['sqft_living'] < 4000)]
print('fraction: {f:.2}'.format(f=len(filtered)/len(sales)))

features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode']
advanced_features = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode',
    'condition', 'grade', 'waterfront', 'view', 'sqft_above', 'sqft_basement',
    'yr_built', 'yr_renovated', 'lat', 'long', 'sqft_living15', 'sqft_lot15'
]

# Baseline vs. extended-feature regression; evaluate the baseline on held-out data.
model = tc.linear_regression.create(train_data, features=features,
                                    target='price', validation_set=None)
advanced_model = tc.linear_regression.create(train_data, features=advanced_features,
                                             target='price', validation_set=None)
model_result = model.evaluate(dataset=test_data)
calc_distances = True break print("calculate dist_from_sess_mean:", calc_distances) def get_dists(sf, cols, variance_dict): for i, col in enumerate(cols): s_err = np.square(sf[col]-sf["%s_MEAN" % col]) / variance_dict[col] if i == 0: s_err_sum = s_err else: s_err_sum += s_err print(col, (i+1) / len(cols)) return np.sqrt(s_err_sum / len(cols)) if calc_distances: track_means = session_data.groupby("session_code", operations={"%s_MEAN" % col : agg.MEAN(col) for col in track_feats}) session_data = batch_join(session_data, track_means, ["session_code"]) del track_means var_dict = dict(zip(track_feats, track_feats_variance)) session_data["dist_from_sess_mean"] = get_dists(session_data, track_feats, var_dict) session_data = session_data.remove_columns(["%s_MEAN" % col for col in track_feats]) print("'dist_from_sess_mean' generated!") else: print("'dist_from_sess_mean' skipped!") track_cols_to_remove = list(set(track_feats)-set(track_cols)) session_data = session_data.remove_columns(track_cols_to_remove) print("track cols to remove:", track_cols_to_remove) print("## v.) aggregations for total session")