def user_agg(si=None):
    '''
    Loads search.gl and aggregates it by UserID to get some features.
    NB: this did not help.
    '''
    start = datetime.now()
    if si is None:
        si = load('search.gl')
    D = 2**20
    si['SQexists'] = si['SearchQuery'].apply(lambda s: s != '')
    si['SQhash'] = si['SearchQuery'].apply(lambda s: abs(hash(s)) % D)
    si['SPexists'] = si['SearchParams'].apply(lambda d: d is not None)
    f = {'pctSQE': agg.AVG('SQexists'),
         'pctSPE': agg.AVG('SPexists'),
         'numSearches': agg.COUNT(),
         'allCat': agg.CONCAT('CategoryID'),
         'allSQ': agg.CONCAT('SQhash')}
    si = si[['UserID', 'CategoryID', 'SearchParams', 'SQexists', 'SPexists', 'SQhash']]
    usr = si.groupby('UserID', f)
    usr['allSQ'] = usr['allSQ'].apply(lambda l: list(set(l)))
    usr['allCat'] = usr['allCat'].apply(lambda l: list(set(l)))
    usr_dict = sframe_to_dict('UserID', usr)
    avito2_io.put_artifact(usr_dict, 'user_si.pkl')
    print('elapsed time: %s' % (datetime.now() - start))
def _progress_multi_combiner(results):
    res = results.unpack('metadata').unpack('parameters')
    metadatas = [c for c in res.column_names() if c.startswith('metadata')]
    context = [c for c in res.column_names() if c.startswith('parameters')]

    # Unpack metrics if possible
    try:
        res = res.unpack('metric')
        metrics = [c for c in res.column_names() if c.startswith('metric')]
    except:
        metrics = ['metric']

    metadatas.sort()
    context.sort()
    metrics.sort()
    res = res[metadatas + context + metrics]

    # Get aggregators for all metrics
    aggs = {}
    for m in metrics:
        aggs['mean_' + m] = _agg.MEAN(m)
    for m in metadatas:
        aggs[m] = _agg.CONCAT(m)
    aggs['num_folds'] = _agg.COUNT()
    res = res.groupby(context, aggs)

    # Clean up column names
    for s in ['parameters.', 'metric.', 'metadata.']:
        res = res.rename({c: c.replace(s, '') for c in res.column_names()})
    return res
def weather_stations(sensor_path):
    """
    Get the weather stations and the number of sensors in each station.
    """
    sensors = get_data_from_csv(sensor_path)  # e.g. '../data/mi_sensors/'
    st = sensors.groupby(key_columns='street',
                         operations={'sensors': agg.COUNT()})
    return st
def pollution_sens_stations(path_sensors):
    """
    Get the pollution stations and the number of sensors in each station.
    """
    sen = get_data_from_csv(path_sensors)  # full path for a single file
    st = sen.groupby(key_columns='street_name',
                     operations={'sensors': agg.COUNT()})
    return st
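# Hedged usage sketch (not part of the original snippets): the paths below are
# assumptions, and `get_data_from_csv` plus the `agg` alias are expected to be
# defined as in the two helpers above. It only shows how the station counters
# might be called and inspected.
weather_st = weather_stations('../data/mi_sensors/')                      # assumed path
pollution_st = pollution_sens_stations('../data/pollution_sensors.csv')   # assumed path
print(weather_st.head())
print(pollution_st.head())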
def create_user_features(df):
    # Getting user mean df
    user_rating_mean_df = df[['user_id', 'rating']]
    user_rating_mean_df = user_rating_mean_df.groupby(
        key_columns='user_id',
        operations={
            'mean_rating': agg.MEAN('rating'),
            'std_rating': agg.STD('rating'),
            'distinct_rating': agg.COUNT_DISTINCT('rating'),
            'count': agg.COUNT('rating')
        })
    user_features = gl.SFrame(user_rating_mean_df)
    return user_features
def create_item_features(df):
    # Getting item mean df
    item_rating_mean_df = df[['business_id', 'rating']]
    item_rating_mean_df = item_rating_mean_df.groupby(
        key_columns='business_id',
        operations={
            'mean_rating': agg.MEAN('rating'),
            'std_rating': agg.STD('rating'),
            'distinct_rating': agg.COUNT_DISTINCT('rating'),
            'count': agg.COUNT('rating')
        })
    item_features = gl.SFrame(item_rating_mean_df)
    return item_features
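# Hedged usage sketch (assumptions: `ratings` is an SFrame with 'user_id',
# 'business_id' and 'rating' columns, and graphlab is imported as `gl` as in
# the functions above). It shows one way the aggregated statistics could be
# joined back onto the ratings as extra features; SFrame.join suffixes the
# duplicated statistic names coming from the item side.
user_side = create_user_features(ratings)
item_side = create_item_features(ratings)
enriched = ratings.join(user_side, on='user_id', how='left')
enriched = enriched.join(item_side, on='business_id', how='left')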
def LPA(g, src):
    # Group the neighbors of `src` by label, count the labels, and return
    # one of the most frequent labels (ties broken at random).
    nseOr = g.get_edges()
    # Edges incident to src, in either direction.
    merge = nseOr[(nseOr['__src_id'] == src) | (nseOr['__dst_id'] == src)]
    # Neighbor ids: the endpoint of each incident edge that is not src.
    merge2 = merge[merge['__src_id'] != src]
    arr1 = merge2['__src_id']
    merge3 = merge[merge['__dst_id'] != src]
    arr2 = merge3['__dst_id']
    arr1 = arr1.append(arr2)
    # Count the neighbors' labels.
    nsv = g.get_vertices(ids=arr1)
    l_count = nsv.groupby(key_columns='label',
                          operations={'count': aggregate.COUNT()})
    maxim = l_count['count'].max()
    max_labels = l_count[l_count['count'] == maxim]['label']
    # Pick one of the most frequent labels at random.
    random_label = rn.sample(range(0, len(max_labels)), 1)
    return max_labels[random_label[0]]
def count(g):
    gf = g.vertices
    l_count = gf.groupby(key_columns='label',
                         operations={'count': aggregate.COUNT()})
    return l_count
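# Hedged usage sketch (assumptions: `g` is an SGraph whose vertices carry a
# 'label' field, graphlab is imported as `gl`, and `rn`/`aggregate` are the
# aliases used by LPA and count above). One synchronous propagation sweep:
# every vertex adopts the most common label among its neighbours, then the
# resulting label histogram is printed.
verts = g.vertices
verts['label'] = gl.SArray([LPA(g, vid) for vid in verts['__id']])
g = gl.SGraph(vertices=verts, edges=g.edges)
print(count(g))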
def __init__(self, user_data='', item_data='', rating_data=''):
    if user_data:
        self._users = gl.SFrame.read_json(user_data, orient='records')
        self._users = self._users.unique()
    if item_data:
        self._items = gl.SFrame.read_json(item_data, orient='records')
        self._items = self._items.unique()
    if rating_data:
        self._ratings = gl.SFrame.read_json(rating_data, orient='records')
        self._ratings = self._ratings.unique()

    # Keep only users with at least 5 reviews.
    reviewsPerUser = self._ratings.groupby(key_columns='user_id',
                                           operations={'qtd': agg.COUNT()})
    topReviewers = reviewsPerUser[reviewsPerUser['qtd'] >= 5]
    topReviewers['marker'] = 1
    joined = self._ratings.join(topReviewers, on='user_id', how='left')
    self._ratings = joined[joined['marker'] == 1]
    self._ratings.remove_column('marker')
    self._ratings.remove_column('qtd')

    # Normalize ratings to the [0, 1] range.
    maxValue = max(self._ratings['rating'])
    scale = 1.0 / maxValue  # renamed from `range` to avoid shadowing the built-in
    print("Max rating value:")
    print(maxValue)
    self._ratings['rating'] = self._ratings['rating'].apply(lambda x: x * scale)

    self._train, self._test = gl.recommender.util.random_split_by_user(self._ratings)
    print("debug: train split")
    print(self._train)
    print("debug: test split")
    print(self._test)
# g2 = gn['genre2'].unique()
# g3 = gn['genre3'].unique()
genres = {
    'genre1': gn['genre1'].unique(),
    'genre2': gn['genre2'].unique(),
    'genre3': gn['genre3'].unique()
}
joined = scrobbles.join(gn, on='songID')
joined['ts'] = joined['ts'].apply(lambda x: parse(x))
ts = gl.TimeSeries(joined, index='ts')
ts.save('ts')

total_listens = ts.resample(dt.timedelta(days=1), agg.COUNT())
total_listens.save(rootdir + '_total_listens')

for level in ('genre1', 'genre2', 'genre3'):
    n = len(genres[level])
    for i, genre in enumerate(genres[level]):
        current = ts[ts[level] == genre].resample(dt.timedelta(days=1), agg.COUNT())
        # current.save(rootdir + level + '_' + genre)
        current.to_sframe().to_dataframe().to_pickle(
            rootdir + level + '_' + genre.replace('/', '-') + '.pkl')
        print("{} - {} ({}/{})".format(level, genre, i + 1, n))
def tweeting_language_popularity(sf):
    """Get language popularity."""
    return sf.groupby('language',
                      operations={'tweets': agg.COUNT()}
                      ).sort('tweets', ascending=False)
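# Hedged usage sketch (assumptions: a tweets CSV with a 'language' column; the
# file name is hypothetical, and the `gl`/`agg` aliases match the function above).
tweets_sf = gl.SFrame.read_csv('tweets.csv')  # hypothetical input file
by_language = tweeting_language_popularity(tweets_sf)
print(by_language.head(10))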
itera += 1
#
if init:  # first call
    sf = gl.SFrame.read_csv(file, verbose=False)
    init = False
else:
    sf = sf.append(gl.SFrame.read_csv(file, verbose=False))
#
# print(itera, " File : ", file, "| Size > ", sf.num_cols(), sf.num_rows())


# In[26]:

import graphlab.aggregate as agg
ngram_count = sf.groupby(key_columns='kl',
                         operations={'numberofcalls': agg.COUNT()})


# In[27]:

# ngram_count = ngram_count.sort('numberofcalls', ascending = False)


# In[28]:

def plotsf(sf, thetitle):
    import numpy as np
    import matplotlib.pyplot as plt
    fig = plt.gcf()  # was matplotlib.pyplot.gcf(); only the plt alias is imported
    fig.set_size_inches(15.5, 10.5)
@author: Ananthu
"""

import graphlab
import graphlab.aggregate as agg
import pandas as pd

image_train = graphlab.SFrame('Course1/Week6/image_train_data/')
image_test = graphlab.SFrame('Course1/Week6/image_test_data/')

image_train.head()
image_test.head()

### Qn 1
image_train.groupby('label', operations={'count': agg.COUNT('label')})

### Qn 2 & 3
image_train_bird = image_train[image_train['label'] == 'bird']
image_train_dog = image_train[image_train['label'] == 'dog']
image_train_cat = image_train[image_train['label'] == 'cat']
image_train_auto = image_train[image_train['label'] == 'automobile']

knn_model_bird = graphlab.nearest_neighbors.create(image_train_bird,
                                                   features=['deep_features'],
                                                   label='id')
knn_model_dog = graphlab.nearest_neighbors.create(image_train_dog,
                                                  features=['deep_features'],
                                                  label='id')
knn_model_cat = graphlab.nearest_neighbors.create(image_train_cat,
# Dumping data into csv
df_text.to_csv("df_text.csv")

# Dropping dataframe from memory
df_text.drop(df_text.index, inplace=True)

# Importing saved csv file into a graphlab SFrame
sf_text = gl.SFrame.read_csv("df_text.csv")

# Obtaining the tweet count for each user
user_count = sf_text.groupby(key_columns='user_id',
                             operations={'count': agg.COUNT()})

# Selecting users with more than 10 tweets for processing
user_count = user_count[user_count['count'] > 10]
sf_text.remove_column('X1')

# Obtaining the tweets for only the selected users
final_sf = sf_text.join(user_count, on='user_id', how='inner')
final_sf.remove_column('count')

# Saving data for processing in Spark
print("{}: {} = {} {} = {}.".format("QUESTION 2", "\nArtist with high play count", artist_playcnt_high, "\nArtist with low play count", artist_playcnt_low)) ##======= Question 3 =======## train_data, test_data = sf.random_split(0.8, seed=0) # create similarity model sim_model = gl.item_similarity_recommender.create(train_data, user_id='user_id', item_id='song') # subset users subset_test_users = test_data['user_id'].unique()[0:10000] # recommend a song to each 10k users subset_song_rec = sim_model.recommend(users=subset_test_users, k=1) song_pred_cnt = subset_song_rec.groupby(key_columns='song', operations={'rec_count': agg.COUNT()}) most_rec_song = song_pred_cnt.sort('rec_count', ascending=False)['song'][0] least_rec_song = song_pred_cnt.sort('rec_count', ascending=False)['song'][-1] print("{}: {} = {} {} = {}.".format("QUESTION 3", "\nMost recommend song to top 10k users", most_rec_song, "\nLeast recommend song to top 10k user", least_rec_song))
scatter_matrix(df_reader, alpha=0.05, figsize=(15, 15));

# df_MV = pd.read_csv('pivotids.csv', delimiter="|")
# df_CQ = pd.read_csv('pivotids_CHECKQUANT.csv', delimiter="|")
# pd.merge(tablo, df_full, how='inner', on=['DTRANSACT', 'SKLAD', 'NOMGR'], suffixes=('_left', '_right'))
# df_Quant = pd.read_csv('pivotids_QUANT.csv', delimiter="|")
pd.merge(df_CQ, df_MV, how='outer',
         on=['RECENCY', 'MINDATE', 'CUR_AGE', 'PHONE_ids', 'Response'])  # dangling left_on= removed; on/left_on are mutually exclusive

import graphlab as gl
read_file = gl.SFrame.read_csv('pivotids_input.csv', delimiter='|')
# read_file = read_file.to_dataframe()

import graphlab.aggregate as agg
neww = read_file.groupby(key_columns='PHONE_ids',
                         operations={'count': agg.COUNT()})
neww[neww['count'] > 1]

read_file.remove_column('MONETARY_VALUE_SEGM_TOVARKATEG_ENG_NOMGR')
read_file.remove_column('CHECKQUANT_SEGM_TOVARKATEG_ENG_NOMGR')
read_file.remove_column('QUANT_SEGM_TOVARKATEG_ENG_NOMGR')
read_file.remove_column('QUANT_')
read_file.remove_column('CHECKQUANT_')
read_file.remove_column('MONETARY_VALUE_')
# read_file = read_file.iloc[:,:-2].replace(np.nan, 0)

"','".join(read_file.column_names())

user_schema = {
    'conversion_status': 'Response',