Ejemplo n.º 1
0
def _progress_multi_combiner(results):
    """Collapse per-row results into one row per distinct parameter set.

    The ``metadata`` and ``parameters`` columns of ``results`` are unpacked
    (and ``metric`` too, when it can be unpacked).  Rows are then grouped by
    the unpacked parameter columns and aggregated:

    * every metric column ``m`` becomes ``mean_<m>`` (mean over the group),
    * every metadata column is concatenated over the group,
    * ``num_folds`` holds the number of rows in the group.

    Finally the substrings ``parameters.``, ``metric.`` and ``metadata.``
    are removed from all column names.

    Args:
        results: Table exposing ``unpack``, ``column_names``, ``groupby``
            and ``rename`` (presumably an SFrame -- confirm against caller).

    Returns:
        The grouped and renamed table.
    """
    res = results.unpack('metadata').unpack('parameters')
    metadatas = sorted(
        c for c in res.column_names() if c.startswith('metadata'))
    context = sorted(
        c for c in res.column_names() if c.startswith('parameters'))

    # Unpack metrics if possible.  When 'metric' cannot be unpacked it is
    # kept as a single column.  Only ``Exception`` is caught so that
    # KeyboardInterrupt / SystemExit are no longer swallowed here.
    try:
        res = res.unpack('metric')
    except Exception:
        metrics = ['metric']
    else:
        metrics = sorted(
            c for c in res.column_names() if c.startswith('metric'))

    res = res[metadatas + context + metrics]

    # Build the aggregators: mean per metric, concat per metadata column,
    # plus a row count per group.
    aggs = {'mean_' + m: _agg.MEAN(m) for m in metrics}
    aggs.update((m, _agg.CONCAT(m)) for m in metadatas)
    aggs['num_folds'] = _agg.COUNT()

    res = res.groupby(context, aggs)

    # Clean up column names
    for s in ['parameters.', 'metric.', 'metadata.']:
        res = res.rename({c: c.replace(s, '') for c in res.column_names()})

    return res
Ejemplo n.º 2
0
def aggreagate_res(data_folder, res_path):
    """Average ``prob`` per ``src_id`` across every CSV file in a folder.

    Every entry of ``data_folder`` is read as a CSV (with ``prob`` parsed as
    float), all rows are appended into one table, the table is grouped by
    ``src_id`` taking the mean of ``prob``, and the result is saved as CSV.

    Args:
        data_folder: Directory whose entries are all read as CSV files.
        res_path: Destination passed to ``save`` for the aggregated CSV.
    """
    res = SFrame()
    for f in os.listdir(data_folder):
        # os.path.join instead of a hard-coded "\\" so the path is also
        # valid on non-Windows platforms.
        temp_sf = SFrame.read_csv(os.path.join(data_folder, f),
                                  column_type_hints={"prob": float})
        res = res.append(temp_sf)

    res = res.groupby("src_id", operations={"prob": aggregate.MEAN('prob')})
    res.save(res_path, 'csv')
Ejemplo n.º 3
0
def create_item_features(df):
    """Return per-business rating statistics.

    Takes the ``business_id`` and ``rating`` columns of ``df``, groups by
    ``business_id`` and computes the mean, standard deviation, distinct
    count and count of ``rating`` for each group.
    """
    ratings_by_item = df[['business_id', 'rating']]
    rating_stats = {
        'mean_rating': agg.MEAN('rating'),
        'std_rating': agg.STD('rating'),
        'distinct_rating': agg.COUNT_DISTINCT('rating'),
        'count': agg.COUNT('rating'),
    }
    grouped = ratings_by_item.groupby(key_columns='business_id',
                                      operations=rating_stats)
    # Wrapped in a fresh SFrame, as the caller receives today.
    return gl.SFrame(grouped)
Ejemplo n.º 4
0
def create_user_features(df):
    """Return per-user rating statistics.

    Takes the ``user_id`` and ``rating`` columns of ``df``, groups by
    ``user_id`` and computes the mean, standard deviation, distinct count
    and count of ``rating`` for each group.
    """
    ratings_by_user = df[['user_id', 'rating']]
    rating_stats = {
        'mean_rating': agg.MEAN('rating'),
        'std_rating': agg.STD('rating'),
        'distinct_rating': agg.COUNT_DISTINCT('rating'),
        'count': agg.COUNT('rating'),
    }
    grouped = ratings_by_user.groupby(key_columns='user_id',
                                      operations=rating_stats)
    # Wrapped in a fresh SFrame, as the caller receives today.
    return gl.SFrame(grouped)
Ejemplo n.º 5
0
def agg_traffic_24_hours(sfdf):
    """
    Aggregate traffic by time of day and return the log of the total.

    Each ``time`` value is divided by 1e3 and converted with
    ``utcfromtimestamp`` (i.e. treated as epoch milliseconds) into an
    ``%H:%M:%S`` label, which is stored in a new ``hour`` column on the
    input itself.  The five traffic columns are averaged per label, the
    result is re-indexed by parsed datetime, resampled hourly with a sum,
    and the columns are summed row-wise before taking the natural log.

    NOTE(review): the grouping label keeps minutes and seconds, so rows are
    only merged before the resample when their time of day matches exactly.
    """
    def _time_of_day(millis):
        return dt.utcfromtimestamp(millis/1e3).strftime('%H:%M:%S')

    mean_ops = {
        'SMSsIn': agg.MEAN('smsin_tot'),
        'SMSsout': agg.MEAN('smsout_tot'),
        'CallsIn': agg.MEAN('callin_tot'),
        'CallsOut': agg.MEAN('callout_tot'),
        'WebTraff': agg.MEAN('web_tot'),
    }

    # Adds the label column to the caller's object, as before.
    sfdf['hour'] = sfdf['time'].apply(_time_of_day)

    wanted = ['callin_tot', 'callout_tot', 'smsin_tot',
              'web_tot', 'smsout_tot', 'hour']
    frame = gp.SFrame(data=sfdf[wanted])
    by_label = frame.groupby('hour', operations=mean_ops).sort('hour')

    hourly = by_label.to_dataframe().set_index('hour')
    hourly.index = pd.to_datetime(hourly.index)
    hourly = hourly.resample('H').sum()

    totals = hourly.aggregate('sum', axis='columns')
    return np.log(totals)
Ejemplo n.º 6
0
import graphlab as gl
import graphlab.aggregate as agg
data_in_path = "/home/warreee/projects/2016-SS-Assignments/Assignment2/Tableau/raw_data/"

data_out_path = "/home/warreee/projects/2016-SS-Assignments/Assignment2/Tableau/clean_data/"


def _clean_and_save(name):
    """Clean one raw export and write the averaged result as CSV.

    Reads ``<data_in_path><name>1.txt`` as a space-delimited, header-less
    file, strips commas from column ``X2``, truncates ``X3`` to its first
    two colon-separated fields (presumably HH:MM -- confirm against the raw
    data), averages ``X1`` per (``X3``, ``X2``) pair, sorts by ``X2`` then
    ``X3`` and saves the result to ``<data_out_path><name>.csv``.

    Returns the grouped, sorted frame that was saved.
    """
    frame = gl.SFrame.read_csv(data_in_path + name + "1.txt",
                               delimiter=' ',
                               header=False)
    frame['X2'] = frame['X2'].apply(lambda x: x.replace(',', ''))
    frame['X3'] = frame['X3'].apply(lambda x: ':'.join(x.split(':')[0:2]))
    frame = frame.groupby(key_columns=['X3', 'X2'],
                          operations={
                              'average': agg.MEAN('X1')
                          }).sort(sort_columns=['X2', 'X3'])
    frame.save(data_out_path + name + ".csv", format='csv')
    return frame


# Same module-level names as before, now produced by the shared helper
# instead of two copy-pasted pipelines.
agora = _clean_and_save("agora")

biomedisch = _clean_and_save("biomedisch")

cba = gl.SFrame.read_csv(data_in_path + "cba1.txt",
                         delimiter=' ',
Ejemplo n.º 7
0
"""
Spyder Editor

This is a temporary script file.
"""

import graphlab
import graphlab.aggregate as agg

# Load the house sales data set.
sales = graphlab.SFrame('Course1/Week2/home_data.gl/')

sales.head()

### Qn 1
### Zipcode with highest avg house price

df = sales.groupby(key_columns='zipcode', operations={'price': agg.MEAN('price')})

# Sorted by average price, so the most expensive zipcode is in the tail.
df = df.sort(['price'])
df.tail() ### zipcode - 98039 & price - 2160606.6

### Qn 2
### Fraction of houses with living area strictly between 2000 and 4000 sqft

sales.shape ### 21613 rows & 21 columns

df = sales[(sales['sqft_living'] > 2000) & (sales['sqft_living'] < 4000)]

df.shape ### 9111 rows & 21 columns

# Derived from the frames instead of hard-coding the row counts printed
# above, so the value stays correct if the data set changes.  float() keeps
# this a true division on Python 2 as well.
fraction = float(len(df)) / len(sales)

### Qn 3