def session_aggr(sf, cols, key="session_code"):
    """Group ``sf`` by ``key`` and summarise every column in ``cols``.

    For each column four aggregates are requested, named ``<col>_mean``,
    ``<col>_std``, ``<col>_min`` and ``<col>_max``.  The operations dict is
    filled statistic by statistic (all means, then all stds, all mins, all
    maxes).

    :param sf: object exposing ``groupby(key_column_names=..., operations=...)``.
    :param cols: names of the columns to aggregate.
    :param key: name of the single grouping column.
    :return: whatever ``sf.groupby`` returns for these operations.
    """
    statistics = (
        ("%s_mean", "MEAN"),
        ("%s_std", "STD"),
        ("%s_min", "MIN"),
        ("%s_max", "MAX"),
    )
    operations = {}
    for name_template, aggregator_name in statistics:
        for column in cols:
            aggregator = getattr(agg, aggregator_name)
            operations[name_template % column] = aggregator(column)
    return sf.groupby(key_column_names=[key], operations=operations)
def get_agg_cols(postfix,
                 agg_type,
                 agg_cols=['not_skipped', 'skip_1', 'skip_2', 'skip_3']):
    """Build a groupby ``operations`` dict for one aggregation type.

    :param postfix: suffix appended to every generated output column name.
    :param agg_type: ``"mean"``, ``"sum"`` or ``"count"``.
    :param agg_cols: columns to aggregate; ignored for ``"count"``.
    :return: dict mapping output column name to an aggregator.  ``"mean"``
        and ``"sum"`` give one ``<col>_<type>_<postfix>`` entry per column;
        ``"count"`` gives the single entry ``cnt_<postfix>``.
    :raises RuntimeError: if ``agg_type`` is none of the supported values.
    """
    if agg_type == "count":
        # A row count does not depend on any particular column.
        return {("cnt_%s" % postfix): agg.COUNT()}

    operations = {}
    if agg_type == "mean":
        for column in agg_cols:
            operations["%s_mean_%s" % (column, postfix)] = agg.MEAN(column)
    elif agg_type == "sum":
        for column in agg_cols:
            operations["%s_sum_%s" % (column, postfix)] = agg.SUM(column)
    else:
        raise RuntimeError("Aggregation is not supported by this function!")
    return operations
# Example no. 3
0
    def predict(self):
        """Run the saved model over the listed entries and summarise per stock.

        Reads one entry per line from ``self.list_loc``, calls
        ``self.download_news(entry, 1)`` for each, concatenates the results,
        prints the combined frame, and scores it with the model loaded from
        ``self.model_loc``.  The mean prediction (``avg``) and row count
        (``count``) per ``stock`` value are stored in ``self.shortlist`` as
        the result of ``to_dataframe()``.
        """
        with open(self.list_loc, 'r') as fp:
            entries = fp.read().splitlines()
            # presumably each download returns a pandas DataFrame -- confirm
            frames = [self.download_news(entry, 1) for entry in entries]

        data = pd.concat(frames, ignore_index=True)
        print(data)
        sf = tc.SFrame(data)

        model = tc.load_model(self.model_loc)
        # Attach the model output as a new column next to the inputs.
        sf['prediction'] = model.predict(sf)
        #sf.explore()
        per_stock_operations = {
            'avg': agg.MEAN('prediction'),
            'count': agg.COUNT(),
        }
        trade_list = sf.groupby(key_column_names='stock',
                                operations=per_stock_operations)
        #trade_list['label'] = trade_list.apply(lambda x: 'rise' if (x['avg'] >= 0.8 and x['count'] >= 10) else 'drop')
        self.shortlist = trade_list.to_dataframe()

def get_dists(sf, cols, variance_dict):
    """Return each row's variance-scaled RMS distance from its ``*_MEAN`` columns.

    For every ``col`` in ``cols`` the squared difference between ``sf[col]``
    and ``sf["<col>_MEAN"]`` is divided by ``variance_dict[col]``; the results
    are summed over the columns, divided by the number of columns, and
    square-rooted.  After each column the column name and the fraction of
    columns processed so far are printed as a progress indicator.

    :param sf: table indexable by column name (presumably an SFrame in the
        caller -- confirm); must hold both ``col`` and ``"<col>_MEAN"``.
    :param cols: sized collection of column names to include.
    :param variance_dict: mapping from column name to the variance used to
        scale that column's squared error.
    :return: element-wise ``sqrt(sum_i(err_i) / len(cols))``.
    :raises ValueError: if ``cols`` is empty.  Previously this surfaced as an
        ``UnboundLocalError`` because the accumulator was never assigned.
    """
    n_cols = len(cols)
    if n_cols == 0:
        raise ValueError("get_dists requires at least one column in 'cols'")

    s_err_sum = None
    for i, col in enumerate(cols):
        s_err = np.square(sf[col] - sf["%s_MEAN" % col]) / variance_dict[col]
        if s_err_sum is None:
            s_err_sum = s_err
        else:
            s_err_sum += s_err
        print(col, (i + 1) / n_cols)
    return np.sqrt(s_err_sum / n_cols)


# Per-session mean of every track feature, one "<feature>_MEAN" column each.
track_means = session_data.groupby(
    key_column_names="session_code",
    operations=dict(
        ("%s_MEAN" % col, agg.MEAN(col)) for col in track_feats))
# Attach the session means to every row of that session, then drop the
# temporary table.
session_data = batch_join(session_data, track_means, ["session_code"])
del track_means

# Pair each track feature with its variance for scaling inside get_dists.
var_dict = dict(zip(track_feats, track_feats_variance))
session_data["dist_from_sess_mean"] = get_dists(
    session_data, track_feats, var_dict)
# The helper "<feature>_MEAN" columns are only needed for the distance.
session_data = session_data.remove_columns(
    ["%s_MEAN" % col for col in track_feats])

print("## v.) aggregations for total session")

agg_cols_total = session_data.column_names().copy()
session_cols = [
    'session_position', 'session_length', 'context_switch',
# Example no. 5
0
import turicreate as tc
import turicreate.aggregate as agg

# Load the house-sales table and split it into train/test parts
# (0.8 fraction, fixed seed).
sales = tc.SFrame.read_csv('data/home_data.csv')
train_data, test_data = sales.random_split(0.8, seed=0)

# Earlier approach, kept for reference:
# highest_price = sales['price'].max()
# zipcode = sales[sales['price'] == highest_price]['zipcode']
# neighborhood = sales[sales['zipcode'] == zipcode[0]]
# avg = neighborhood['price'].mean()

# Mean price per zipcode; pick the zipcode whose mean is the largest.
stats = sales.groupby(key_column_names='zipcode',
                      operations={'mean': agg.MEAN('price')})
zipcode = stats[stats['mean'] == stats['mean'].max()]['zipcode'][0]
print(stats['mean'].max())

# Share of houses with living area strictly between 2000 and 4000.
# filtered = sales[(sales['sqft_living'] > 2000) & (sales['sqft_living'] < 4000)]
filtered = sales[
    sales['sqft_living'].apply(lambda sqft: sqft > 2000 and sqft < 4000)
]
print('fraction: {f:.2}'.format(f=len(filtered) / len(sales)))

features = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'zipcode',
]

# The advanced set is the basic set followed by the extra columns.
advanced_features = features + [
    'condition',
    'grade',
    'waterfront',
    'view',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
    'lat',
    'long',
    'sqft_living15',
    'sqft_lot15',
]

# Fit one linear regression per feature set, both without a validation set.
model = tc.linear_regression.create(
    train_data, target='price', features=features, validation_set=None)
advanced_model = tc.linear_regression.create(
    train_data,
    target='price',
    features=advanced_features,
    validation_set=None)

model_result = model.evaluate(dataset=test_data)
        calc_distances = True
        break
# Log the calc_distances flag; it gates the 'dist_from_sess_mean' computation below.
print("calculate dist_from_sess_mean:", calc_distances)

def get_dists(sf, cols, variance_dict):
    """Return each row's variance-scaled RMS distance from its ``*_MEAN`` columns.

    For every ``col`` in ``cols`` the squared difference between ``sf[col]``
    and ``sf["<col>_MEAN"]`` is divided by ``variance_dict[col]``; the results
    are summed over the columns, divided by the number of columns, and
    square-rooted.  After each column the column name and the fraction of
    columns processed so far are printed as a progress indicator.

    :param sf: table indexable by column name (presumably an SFrame in the
        caller -- confirm); must hold both ``col`` and ``"<col>_MEAN"``.
    :param cols: sized collection of column names to include.
    :param variance_dict: mapping from column name to the variance used to
        scale that column's squared error.
    :return: element-wise ``sqrt(sum_i(err_i) / len(cols))``.
    :raises ValueError: if ``cols`` is empty.  Previously this surfaced as an
        ``UnboundLocalError`` because the accumulator was never assigned.
    """
    n_cols = len(cols)
    if n_cols == 0:
        raise ValueError("get_dists requires at least one column in 'cols'")

    s_err_sum = None
    for i, col in enumerate(cols):
        s_err = np.square(sf[col]-sf["%s_MEAN" % col]) / variance_dict[col]
        if s_err_sum is None:
            s_err_sum = s_err
        else:
            s_err_sum += s_err
        print(col, (i+1) / n_cols)
    return np.sqrt(s_err_sum / n_cols)

if not calc_distances:
    print("'dist_from_sess_mean' skipped!")
else:
    # Per-session mean of every track feature, one "<feature>_MEAN" column each.
    track_means = session_data.groupby(
        key_column_names="session_code",
        operations=dict(
            ("%s_MEAN" % col, agg.MEAN(col)) for col in track_feats))
    session_data = batch_join(session_data, track_means, ["session_code"])
    del track_means
    # Pair each track feature with its variance for scaling inside get_dists.
    var_dict = dict(zip(track_feats, track_feats_variance))
    session_data["dist_from_sess_mean"] = get_dists(
        session_data, track_feats, var_dict)
    # The helper "<feature>_MEAN" columns are only needed for the distance.
    session_data = session_data.remove_columns(
        ["%s_MEAN" % col for col in track_feats])
    print("'dist_from_sess_mean' generated!")

# Drop every track feature that is not also listed in track_cols.
track_cols_to_remove = list(set(track_feats) - set(track_cols))
session_data = session_data.remove_columns(track_cols_to_remove)
print("track cols to remove:", track_cols_to_remove)

print("## v.) aggregations for total session")