Example #1
def user_agg(si=None):
  '''
  Loads search.gl and aggregates it by UserID to get some features.
  NB: this did not help.
  '''
  # Relies on: datetime, graphlab.aggregate as agg, and the project helpers
  # load(), sframe_to_dict() and avito2_io (not shown in this snippet).
  start = datetime.now()
  if si is None:
    si = load('search.gl')
  D = 2**20
  si['SQexists'] = si['SearchQuery'].apply(lambda s : s != '')
  si['SQhash']   = si['SearchQuery'].apply(lambda s : abs(hash(s)) % D)
  si['SPexists'] = si['SearchParams'].apply(lambda d : d is not None)
  
  f = {'pctSQE'      : agg.AVG('SQexists'),
       'pctSPE'      : agg.AVG('SPexists'),
       'numSearches' : agg.COUNT(),
       'allCat'      : agg.CONCAT('CategoryID'),
       'allSQ'       : agg.CONCAT('SQhash')}
       
  si = si[['UserID', 
           'CategoryID', 
           'SearchParams', 
           'SQexists', 
           'SPexists', 
           'SQhash']]
  usr = si.groupby('UserID', f)
  usr['allSQ']  = usr['allSQ'].apply(lambda l : list(set(l)))
  usr['allCat'] = usr['allCat'].apply(lambda l : list(set(l)))
  usr_dict = sframe_to_dict('UserID', usr)
  avito2_io.put_artifact(usr_dict, 'user_si.pkl')
  print('elapsed time: %s' % (datetime.now() - start))
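For illustration, a minimal sketch of the same per-user aggregation pattern on a toy SFrame (column names and values below are made up, not the real search.gl schema):

import graphlab as gl
import graphlab.aggregate as agg

toy = gl.SFrame({'UserID':      [1, 1, 2],
                 'SearchQuery': ['tv', '', 'bike'],
                 'CategoryID':  [10, 10, 20]})
toy['SQexists'] = toy['SearchQuery'].apply(lambda s: int(s != ''))

usr = toy.groupby('UserID', {'pctSQE':      agg.AVG('SQexists'),
                             'numSearches': agg.COUNT(),
                             'allCat':      agg.CONCAT('CategoryID')})
usr['allCat'] = usr['allCat'].apply(lambda l: list(set(l)))  # de-duplicate per user
usr.print_rows()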
Example #2
def _progress_multi_combiner(results):
    res = results.unpack('metadata').unpack('parameters')
    metadatas = [c for c in res.column_names() if c.startswith('metadata')]
    context = [c for c in res.column_names() if c.startswith('parameters')]

    # Unpack metrics if possible
    try:
        res = res.unpack('metric')
        metrics = [c for c in res.column_names() if c.startswith('metric')]
    except:
        # Unpacking failed; fall back to the single packed 'metric' column.
        metrics = ['metric']

    metadatas.sort()
    context.sort()
    metrics.sort()
    res = res[metadatas + context + metrics]

    # Get aggregators for all metrics
    aggs = {}
    for m in metrics:
        aggs['mean_' + m] = _agg.MEAN(m)
    for m in metadatas:
        aggs[m] = _agg.CONCAT(m)
    aggs['num_folds'] = _agg.COUNT()

    res = res.groupby(context, aggs)

    # Clean up column names
    for s in ['parameters.', 'metric.', 'metadata.']:
        res = res.rename({c: c.replace(s, '') for c in res.column_names()})

    return res
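A small sketch of what unpack does to a packed dict column, since the combiner above depends on it (toy data, assuming graphlab is imported as gl):

import graphlab as gl

res = gl.SFrame({'metric': [{'accuracy': 0.9, 'loss': 0.3},
                            {'accuracy': 0.8, 'loss': 0.5}]})
res = res.unpack('metric')  # yields columns 'metric.accuracy' and 'metric.loss'
print(res.column_names())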
Example #3
def weather_stations(sensor_path):
    """
    Get the weather stations and the number of sensors at each station.
    """
    sensors = get_data_from_csv(sensor_path)  # '../data/mi_sensors/'
    st = sensors.groupby(key_columns='street',
                         operations={'sensors': agg.COUNT()})
    return st
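A possible call site, assuming get_data_from_csv loads the sensor CSVs from that directory into an SFrame with a 'street' column:

stations = weather_stations('../data/mi_sensors/')
print(stations.sort('sensors', ascending=False).head(5))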
Example #4
def pollution_sens_stations(path_sensors):
    """
    Get the pollution stations and the number of sensors at each station.
    """
    sen = get_data_from_csv(path_sensors)  # full path for single file
    st = sen.groupby(key_columns='street_name',
                     operations={'sensors': agg.COUNT()})
    return st
Example #5
def create_user_features(df):
    # Per-user rating statistics (mean, std, distinct ratings, count)
    user_rating_mean_df = df[['user_id', 'rating']]
    user_rating_mean_df = user_rating_mean_df.groupby(
        key_columns='user_id',
        operations={
            'mean_rating': agg.MEAN('rating'),
            'std_rating': agg.STD('rating'),
            'distinct_rating': agg.COUNT_DISTINCT('rating'),
            'count': agg.COUNT('rating')
        })
    user_features = gl.SFrame(user_rating_mean_df)
    return user_features
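A hedged usage sketch on a toy ratings SFrame (values are illustrative):

import graphlab as gl
import graphlab.aggregate as agg

toy_ratings = gl.SFrame({'user_id': ['u1', 'u1', 'u2'],
                         'rating':  [4, 5, 3]})
print(create_user_features(toy_ratings))
# one row per user_id with mean_rating, std_rating, distinct_rating and count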
Example #6
def create_item_features(df):
    # Per-business rating statistics (mean, std, distinct ratings, count)
    item_rating_mean_df = df[['business_id', 'rating']]
    item_rating_mean_df = item_rating_mean_df.groupby(
        key_columns='business_id',
        operations={
            'mean_rating': agg.MEAN('rating'),
            'std_rating': agg.STD('rating'),
            'distinct_rating': agg.COUNT_DISTINCT('rating'),
            'count': agg.COUNT('rating')
        })
    item_features = gl.SFrame(item_rating_mean_df)
    return item_features
Example #7
def LPA(g, src):
    """
    One label-propagation step for vertex `src`: collect the labels of its
    neighbours, count them, and return one of the most frequent labels
    (ties broken at random; `rn` is presumably the random module).
    """
    # Edges incident to src, in either direction.
    edges = g.get_edges()
    incident = edges[(edges['__src_id'] == src) | (edges['__dst_id'] == src)]

    # Ids of the neighbouring vertices.
    neighbours = incident[incident['__src_id'] != src]['__src_id']
    neighbours = neighbours.append(incident[incident['__dst_id'] != src]['__dst_id'])

    # Count how often each label occurs among the neighbours.
    nsv = g.get_vertices(ids=neighbours)
    l_count = nsv.groupby(key_columns='label',
                          operations={'count': aggregate.COUNT()})

    # Keep the labels with the maximal count and pick one at random.
    maxim = l_count['count'].max()
    max_labels = l_count[l_count['count'] == maxim]['label']
    random_label = rn.sample(range(0, len(max_labels)), 1)
    return max_labels[random_label[0]]
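A rough sketch of an outer loop that could drive this single-vertex step until the labels stop changing; it assumes graphlab is imported as gl and rebuilds the graph with the updated labels after each synchronous sweep:

def propagate(g, max_iters=10):
    for _ in range(max_iters):
        vertices = g.get_vertices()
        new_labels = [LPA(g, vid) for vid in vertices['__id']]
        if list(vertices['label']) == new_labels:
            break  # converged: no vertex changed its label
        vertices['label'] = gl.SArray(new_labels)
        g = gl.SGraph(vertices, g.get_edges(),
                      vid_field='__id',
                      src_field='__src_id', dst_field='__dst_id')
    return g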
Example #8
def count(g):
    # Count how many vertices carry each label.
    gf = g.vertices
    l_count = gf.groupby(key_columns='label',
                         operations={'count': aggregate.COUNT()})
    return l_count
Example #9
    def __init__(self, user_data='', item_data='', rating_data=''):
        if user_data:
            self._users = gl.SFrame.read_json(user_data, orient='records')
            self._users = self._users.unique()
        if item_data:
            self._items = gl.SFrame.read_json(item_data, orient='records')
            self._items = self._items.unique()
        if rating_data:
            self._ratings = gl.SFrame.read_json(rating_data, orient='records')
            self._ratings = self._ratings.unique()
            
            reviewsPerUser = self._ratings.groupby(key_columns='user_id', operations={'qtd': agg.COUNT()})
            topReviewers = reviewsPerUser[reviewsPerUser['qtd'] >= 5]
            topReviewers['marker'] = 1
            joined = self._ratings.join(topReviewers, on='user_id', how='left')
            self._ratings = joined[joined['marker'] == 1]
            self._ratings.remove_column('marker')
            self._ratings.remove_column('qtd')

            # Normalize ratings to the [0, 1] range
            maxValue = self._ratings['rating'].max()
            scale = 1.0 / maxValue  # avoid shadowing the builtin range()
            print("Max rating value:")
            print(maxValue)
            self._ratings['rating'] = self._ratings['rating'].apply(lambda x: x * scale)
            self._train, self._test = gl.recommender.util.random_split_by_user(self._ratings)
            print("debug: train split")
            print(self._train)
            print("debug: test split")
            print(self._test)
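For reference, the marker/join filtering above can be written more directly with SFrame.filter_by; a sketch of the equivalent step using the same names as the snippet:

reviewsPerUser = self._ratings.groupby(key_columns='user_id',
                                       operations={'qtd': agg.COUNT()})
topReviewers = reviewsPerUser[reviewsPerUser['qtd'] >= 5]
# Keep only ratings made by users with at least 5 reviews.
self._ratings = self._ratings.filter_by(topReviewers['user_id'], 'user_id')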
Example #10
# g2 = gn['genre2'].unique()
# g3 = gn['genre3'].unique()

genres = {
    'genre1': gn['genre1'].unique(),
    'genre2': gn['genre2'].unique(),
    'genre3': gn['genre3'].unique()
}

joined = scrobbles.join(gn, on='songID')

joined['ts'] = joined['ts'].apply(lambda x: parse(x))

ts = gl.TimeSeries(joined, index='ts')
ts.save('ts')

total_listens = ts.resample(dt.timedelta(days=1), agg.COUNT())
total_listens.save(rootdir + '_total_listens')

for level in ('genre1', 'genre2', 'genre3'):
    n = len(genres[level])
    for i, genre in enumerate(genres[level]):

        current = ts[ts[level] == genre].resample(dt.timedelta(days=1),
                                                  agg.COUNT())
        #current.save(rootdir+level+'_'+genre)
        current.to_sframe().to_dataframe().to_pickle(rootdir + level + '_' +
                                                     genre.replace('/', '-') +
                                                     '.pkl')
        print "{} - {}  ({}/{})".format(level, genre, i + 1, n)
Example #11
def tweeting_language_popularity(sf):
    """Get language popularity."""
    return sf.groupby('language', operations={'tweets': agg.COUNT()}
                      ).sort('tweets', ascending=False)
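A minimal usage sketch with made-up data (assuming graphlab is imported as gl and graphlab.aggregate as agg):

import graphlab as gl
import graphlab.aggregate as agg

tweets = gl.SFrame({'language': ['en', 'en', 'pt', 'es', 'en']})
print(tweeting_language_popularity(tweets))  # 'en' first with 3 tweets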
Example #12

    itera += 1
    #
    if init:
        #first call
        sf = gl.SFrame.read_csv(file, verbose=False)
        init = False
    else:
        sf = sf.append(gl.SFrame.read_csv(file, verbose=False))
    #
    print(itera, " File : ", file, "| Size > ", sf.num_cols(), sf.num_rows())


# In[26]:

import graphlab.aggregate as agg
ngram_count = sf.groupby(key_columns='kl', operations={'numberofcalls': agg.COUNT()})


# In[27]:

# ngram_count = ngram_count.sort('numberofcalls', ascending = False)


# In[28]:

def plotsf(sf, thetitle):
    import numpy as np
    import matplotlib.pyplot as plt
    
    fig = plt.gcf()
    fig.set_size_inches(15.5, 10.5)
Example #13
@author: Ananthu
"""

import graphlab
import graphlab.aggregate as agg
import pandas as pd

image_train = graphlab.SFrame('Course1/Week6/image_train_data/')
image_test = graphlab.SFrame('Course1/Week6/image_test_data/')

image_train.head()
image_test.head()

### Qn 1

image_train.groupby('label', operations={'count': agg.COUNT('label')})

### Qn 2 & 3

image_train_bird = image_train[image_train['label'] == 'bird']
image_train_dog = image_train[image_train['label'] == 'dog']
image_train_cat = image_train[image_train['label'] == 'cat']
image_train_auto = image_train[image_train['label'] == 'automobile']

knn_model_bird = graphlab.nearest_neighbors.create(image_train_bird,
                                                   features=['deep_features'],
                                                   label='id')
knn_model_dog = graphlab.nearest_neighbors.create(image_train_dog,
                                                  features=['deep_features'],
                                                  label='id')
knn_model_cat = graphlab.nearest_neighbors.create(image_train_cat,
                                                  features=['deep_features'],
                                                  label='id')
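These per-label nearest-neighbour models are typically queried with rows from the test set; a small sketch (k and the query row are arbitrary):

neighbours = knn_model_cat.query(image_test[0:1], k=5)
print(neighbours)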
Example #14

# Dumping Data into csv

df_text.to_csv("df_text.csv")

# Dropping dataframe from memory

df_text.drop(df_text.index, inplace=True)

# Importing saved csv file in graphlab sframe

sf_text = gl.SFrame.read_csv("df_text.csv")

# Obtaining the tweet count for each user

user_count = sf_text.groupby(key_columns='user_id',operations={'count': agg.COUNT()})


# Selecting users with more than 10 tweets for processing

user_count = user_count[user_count['count'] > 10]

# Dropping the pandas index column that read_csv imported as 'X1'
sf_text.remove_column('X1')

# Obtaining the tweets for only selected users
final_sf = sf_text.join(user_count, on='user_id', how='inner')

final_sf.remove_column('count')

# saving data for processing in spark
Example #16
print("{}: {} = {} {} = {}.".format("QUESTION 2",
                                    "\nArtist with high play count",
                                    artist_playcnt_high,
                                    "\nArtist with low play count",
                                    artist_playcnt_low))

##======= Question 3 =======##
train_data, test_data = sf.random_split(0.8, seed=0)

# create similarity model
sim_model = gl.item_similarity_recommender.create(train_data,
                                                  user_id='user_id',
                                                  item_id='song')
# subset users
subset_test_users = test_data['user_id'].unique()[0:10000]

# recommend a song to each 10k users
subset_song_rec = sim_model.recommend(users=subset_test_users, k=1)
song_pred_cnt = subset_song_rec.groupby(key_columns='song',
                                        operations={'rec_count': agg.COUNT()})

most_rec_song = song_pred_cnt.sort('rec_count', ascending=False)['song'][0]
least_rec_song = song_pred_cnt.sort('rec_count', ascending=True)['song'][0]

print("{}: {} = {} {} = {}.".format("QUESTION 3",
                                    "\nMost recommend song to top 10k users",
                                    most_rec_song,
                                    "\nLeast recommend song to top 10k user",
                                    least_rec_song))
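If desired, the held-out data can also be used to score the similarity model; a minimal sketch (the exact metrics reported depend on the graphlab version):

eval_results = sim_model.evaluate(test_data)
print(eval_results)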
Example #17
scatter_matrix(df_reader, alpha=0.05, figsize=(15, 15));
#df_MV=pd.read_csv('pivotids.csv',delimiter="|")
#df_CQ=pd.read_csv('pivotids_CHECKQUANT.csv',delimiter="|")
#pd.merge(tablo,df_full, how='inner', on=['DTRANSACT', 'SKLAD','NOMGR'],suffixes=('_left', '_right'))
#df_Quant=pd.read_csv('pivotids_QUANT.csv',delimiter="|")
pd.merge(df_CQ, df_MV, how='outer', on=['RECENCY', 'MINDATE', 'CUR_AGE', 'PHONE_ids', 'Response'])



import graphlab as gl
read_file=gl.SFrame.read_csv('pivotids_input.csv',delimiter='|')
#read_file=read_file.to_dataframe()


import graphlab.aggregate as agg
neww = read_file.groupby(key_columns='PHONE_ids', operations={'count': agg.COUNT()})
neww[neww['count'] > 1]  # PHONE_ids that appear more than once

read_file.remove_column('MONETARY_VALUE_SEGM_TOVARKATEG_ENG_NOMGR')
read_file.remove_column('CHECKQUANT_SEGM_TOVARKATEG_ENG_NOMGR')
read_file.remove_column('QUANT_SEGM_TOVARKATEG_ENG_NOMGR')
read_file.remove_column('QUANT_')
read_file.remove_column('CHECKQUANT_')
read_file.remove_column('MONETARY_VALUE_')
#read_file=read_file.iloc[:,:-2].replace(np.nan, 0)


"','".join(read_file.column_names())

user_schema = {
    'conversion_status': 'Response',