def _progress_multi_combiner(results):
    """Combine per-fold progress rows into one summary row per parameter set.

    Unpacks the 'metadata' and 'parameters' columns (and 'metric', when it is
    a packable column), then groups by the parameter columns, averaging each
    metric, concatenating each metadata column, and counting folds.

    Parameters
    ----------
    results : SFrame
        Progress output with packed 'metadata'/'parameters' columns and a
        'metric' column that may or may not be unpackable.

    Returns
    -------
    SFrame
        One row per distinct parameter combination with 'mean_<metric>'
        columns, concatenated metadata columns, and a 'num_folds' count.
        Column-name prefixes ('parameters.', 'metric.', 'metadata.') are
        stripped.
    """
    res = results.unpack('metadata').unpack('parameters')
    metadatas = [c for c in res.column_names() if c.startswith('metadata')]
    context = [c for c in res.column_names() if c.startswith('parameters')]

    # Unpack metrics if possible; a scalar 'metric' column raises, in which
    # case we keep it as the single metric column.
    # NOTE: narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
    # are no longer swallowed.
    try:
        res = res.unpack('metric')
        metrics = [c for c in res.column_names() if c.startswith('metric')]
    except Exception:
        metrics = ['metric']

    # Stable column order: metadata, then parameters, then metrics.
    metadatas.sort()
    context.sort()
    metrics.sort()
    res = res[metadatas + context + metrics]

    # Build aggregators: mean per metric, concat per metadata, fold count.
    aggs = {}
    for m in metrics:
        aggs['mean_' + m] = _agg.MEAN(m)
    for m in metadatas:
        aggs[m] = _agg.CONCAT(m)
    aggs['num_folds'] = _agg.COUNT()

    res = res.groupby(context, aggs)

    # Clean up column names by stripping the unpack prefixes.
    for s in ['parameters.', 'metric.', 'metadata.']:
        res = res.rename({c: c.replace(s, '') for c in res.column_names()})
    return res
def aggreagate_res(data_folder, res_path):
    """Aggregate per-file prediction CSVs into mean probabilities per source.

    Reads every CSV in *data_folder*, appends them into one SFrame, averages
    the 'prob' column per 'src_id', and saves the result as CSV.

    Parameters
    ----------
    data_folder : str
        Directory containing the per-run CSV files (each with at least
        'src_id' and 'prob' columns).
    res_path : str
        Destination path for the aggregated CSV.
    """
    res = SFrame()
    for f in os.listdir(data_folder):
        # os.path.join instead of a hard-coded "\\" separator, so the
        # function works on POSIX systems as well as Windows.
        temp_sf = SFrame.read_csv(os.path.join(data_folder, f),
                                  column_type_hints={"prob": float})
        res = res.append(temp_sf)
    res = res.groupby("src_id",
                      operations={"prob": aggregate.MEAN('prob')})
    res.save(res_path, 'csv')
def create_item_features(df):
    """Summarize ratings per business into an item-feature SFrame.

    Groups *df* by 'business_id' and computes the mean, standard deviation,
    distinct-value count, and total count of 'rating'.
    """
    ratings = df[['business_id', 'rating']]
    stats = {
        'mean_rating': agg.MEAN('rating'),
        'std_rating': agg.STD('rating'),
        'distinct_rating': agg.COUNT_DISTINCT('rating'),
        'count': agg.COUNT('rating'),
    }
    grouped = ratings.groupby(key_columns='business_id', operations=stats)
    return gl.SFrame(grouped)
def create_user_features(df):
    """Summarize ratings per user into a user-feature SFrame.

    Groups *df* by 'user_id' and computes the mean, standard deviation,
    distinct-value count, and total count of 'rating'.
    """
    ratings = df[['user_id', 'rating']]
    stats = {
        'mean_rating': agg.MEAN('rating'),
        'std_rating': agg.STD('rating'),
        'distinct_rating': agg.COUNT_DISTINCT('rating'),
        'count': agg.COUNT('rating'),
    }
    grouped = ratings.groupby(key_columns='user_id', operations=stats)
    return gl.SFrame(grouped)
def agg_traffic_24_hours(sfdf):
    """Aggregate traffic data onto a single 24-hour clock.

    Buckets every record by its recording hour (e.g. all 23:00 readings
    across all days fall into one bucket), averages each traffic channel
    per bucket, then sums across channels and returns the log of the
    per-hour totals — enabling per-hour-of-day traffic comparisons.
    """
    # Derive the HH:MM:SS bucket label from the millisecond timestamp.
    sfdf['hour'] = sfdf['time'].apply(
        lambda ts: dt.utcfromtimestamp(ts / 1e3).strftime('%H:%M:%S'))

    hourly_means = {
        'SMSsIn': agg.MEAN('smsin_tot'),
        'SMSsout': agg.MEAN('smsout_tot'),
        'CallsIn': agg.MEAN('callin_tot'),
        'CallsOut': agg.MEAN('callout_tot'),
        'WebTraff': agg.MEAN('web_tot'),
    }
    channel_cols = ['callin_tot', 'callout_tot', 'smsin_tot', 'web_tot',
                    'smsout_tot', 'hour']
    sf = gp.SFrame(data=sfdf[channel_cols])
    grouped = sf.groupby('hour', operations=hourly_means)

    # Back to pandas: index by hour as datetimes and resample hourly.
    hourly = grouped.sort('hour').to_dataframe().set_index('hour')
    hourly.index = pd.to_datetime(hourly.index)
    hourly = hourly.resample('H').sum()
    return np.log(hourly.aggregate('sum', axis='columns'))
import graphlab as gl import graphlab.aggregate as agg data_in_path = "/home/warreee/projects/2016-SS-Assignments/Assignment2/Tableau/raw_data/" data_out_path = "/home/warreee/projects/2016-SS-Assignments/Assignment2/Tableau/clean_data/" agora = gl.SFrame.read_csv(data_in_path + "agora1.txt", delimiter=' ', header=False) agora['X2'] = agora['X2'].apply(lambda x: x.replace(',', '')) agora['X3'] = agora['X3'].apply(lambda x: ':'.join(x.split(':')[0:2])) agora = agora.groupby(key_columns=['X3', 'X2'], operations={ 'average': agg.MEAN('X1') }).sort(sort_columns=['X2', 'X3']) agora.save(data_out_path + "agora.csv", format='csv') biomedisch = gl.SFrame.read_csv(data_in_path + "biomedisch1.txt", delimiter=' ', header=False) biomedisch['X2'] = biomedisch['X2'].apply(lambda x: x.replace(',', '')) biomedisch['X3'] = biomedisch['X3'].apply( lambda x: ':'.join(x.split(':')[0:2])) biomedisch = biomedisch.groupby(key_columns=['X3', 'X2'], operations={ 'average': agg.MEAN('X1') }).sort(sort_columns=['X2', 'X3']) biomedisch.save(data_out_path + "biomedisch.csv", format='csv') cba = gl.SFrame.read_csv(data_in_path + "cba1.txt", delimiter=' ',
Spyder Editor This is a temporary script file. """ import graphlab import graphlab.aggregate as agg sales = graphlab.SFrame('Course1/Week2/home_data.gl/') sales.head() ### Qn 1 ### Zipcode with highest avg house price df = sales.groupby(key_columns='zipcode', operations={'price': agg.MEAN('price')}) df = df.sort(['price']) df.tail() ### zipcode - 98039 & price - 2160606.6 ### Qn 2 sales.shape ### 21613 rows & 21 columns df = sales[(sales['sqft_living'] > 2000) & (sales['sqft_living'] < 4000)] df.shape ### 9111 rows & 21 columns fraction = 9111.0/21613.0 ### Qn 3