from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF vectorizer that removes English stop words
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
metadata['overview'] = metadata['overview'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(metadata['overview'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape
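
# A hedged next step (assumption: the goal is content-based similarity between
# overviews): pairwise cosine similarities can be computed directly from the
# TF-IDF matrix with sklearn's linear_kernel.
from sklearn.metrics.pairwise import linear_kernel

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim.shape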

ratings = pd.DataFrame(df.groupby('user_id')['score'].mean())

# Convert the pandas dataframes to graph lab SFrames
train_data = graphlab.SFrame(train_data_df)
test_data = graphlab.SFrame(test_data_df)

# Train the model
collab_filter_model = graphlab.item_similarity_recommender.create(
    train_data,
    user_id='user_id',
    item_id='prof_id',
    target='score',  # target must be a single rating column (assumed here to be 'score')
    similarity_type='pearson')  # must be 'jaccard', 'cosine' or 'pearson'; 'pearson' suits a rating target

# Make recommendations
how_many_recommendations = 10
item_recommendation = collab_filter_model.recommend(users=which_user_ids,
                                                    k=how_many_recommendations)
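
# A quick look at the result (a hedged illustration): recommend() returns an SFrame
# with user_id, prof_id, score and rank columns.
item_recommendation.print_rows(num_rows=how_many_recommendations)
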
Example #2

train = pd.concat([
    get_dataframe(batch1),
    get_dataframe(batch2),
    get_dataframe(batch3),
    get_dataframe(batch4),
    get_dataframe(batch5)
], ignore_index=True)
test = get_dataframe(batch_test)

print train.head()
print train.shape, test.shape

gltrain = gl.SFrame(train)
gltest = gl.SFrame(test)

# Convert the raw pixel arrays into 32x32x3 graphlab images before training,
# then drop the raw column so only the image-typed column is used as a feature.
gltrain['glimage'] = gl.SArray(gltrain['image']).pixel_array_to_image(
    32, 32, 3, allow_rounding=True)
gltest['glimage'] = gl.SArray(gltest['image']).pixel_array_to_image(
    32, 32, 3, allow_rounding=True)

gltrain.remove_column("image")
gltest.remove_column("image")
gltrain.head()

model = gl.neuralnet_classifier.create(gltrain,
                                       target="label",
                                       validation_set=None)
model.evaluate(gltest)
Example #3
import pandas as pd
import graphlab as gl

orderData = pd.read_csv("Data/orders.csv")
orderProductData = pd.read_csv("Data/order_products__train.csv")
actions = pd.merge(orderData, orderProductData, on="order_id", how="outer")

actionsTraining = actions[actions["eval_set"] == "train"]
actionsTraining = actionsTraining.dropna(subset=['product_id'])
actionsTraining = actionsTraining[["user_id", "product_id"]]
actionsTraining.columns = ['user_id', 'item_id']
actionsTraining["item_id"] = actionsTraining["item_id"].astype(int)
sf = gl.SFrame(actionsTraining)

#Item to Item similarity recommender
item_similarity_recommender = gl.recommender.item_similarity_recommender.create(
    sf)

#Top-k recommendations
#Here we compute the top k recommendations for each user.
k = 5
item_similarity_top_k = item_similarity_recommender.recommend(k=k)

#Print the recommendations for the first user.
print(item_similarity_top_k[item_similarity_top_k['user_id'] == 1])

#Save the top-k recommendation table to a file.
item_similarity_top_k.save('Data/item_similarity_top_5_model')
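
# To persist the trained recommender itself (rather than the recommendation table),
# the model object can be saved and loaded back later (a hedged illustration; the
# path is arbitrary).
item_similarity_recommender.save('Data/item_similarity_recommender')
loaded_model = gl.load_model('Data/item_similarity_recommender')
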
Example #4

from flask import Flask, request, render_template
import pandas as pd
import graphlab as gl

# list_hikes, list_regions, get_info, get_hike_info and clean_df are helpers
# defined elsewhere in the project; app is assumed to be a standard Flask instance.
app = Flask(__name__)


@app.route('/choose-hike', methods=['GET', 'POST'])
def enter_hike():
    hikes = list_hikes(sf)
    regions = list_regions(sf)
    return render_template('choose-hike.html', hikes=hikes, regions=regions)


@app.route('/make-recommendations', methods=['POST', 'GET'])
def get_recommendations():
    hike = request.form.get('hike-name')
    region = request.form.get('region-name')
    miles = request.form.get('num-miles')
    elevation = request.form.get('elevation-gain')
    dog = request.form.get('dog')
    recs = model.recommend_from_interactions([hike], k=5)
    your_hike = get_info(hike)
    hike_data = get_hike_info(recs)
    return render_template('make-recommendations.html',
                           your_hike=your_hike,
                           hike_data=hike_data)


if __name__ == '__main__':
    df = pd.read_csv('../data/final_with_url.csv')
    df = clean_df(df)
    sf = gl.SFrame(df)
    model = gl.load_model('content_recommender')
    app.run(host='0.0.0.0', port=7070, debug=True)
import pandas as pd
import numpy as np
import graphlab as gl

if __name__ == "__main__":
    sample_sub_fname = "data/sample_submission.csv"
    ratings_data_fname = "data/ratings.dat"
    output_fname = "data/our_test_ratings.csv"

    ratings = gl.SFrame(ratings_data_fname, format='tsv')
    sample_sub = pd.read_csv(sample_sub_fname)
    for_prediction = gl.SFrame(sample_sub)
    rec_engine = gl.ranking_factorization_recommender.create(
        observation_data=ratings,
        user_id="user_id",
        item_id="joke_id",
        target='rating',
        solver='auto')

    sample_sub['rating'] = list(
        rec_engine.predict(for_prediction))  # update the submission with predicted ratings
    sample_sub.to_csv(output_fname, index=False)
Example #6
def boostedTrees(train,
                 labels,
                 test,
                 column_names=None,
                 target='target',
                 max_iterations=200,
                 min_child_weight=5,
                 step_size=0.2,
                 max_depth=10,
                 class_weights=None,
                 min_loss_reduction=0.5,
                 verbose=0,
                 outlier_frac=0.0,
                 outlier_method='EE',
                 rescale_pred=False):
    """
    train, labels, test are numpy matrices containing tha data 
    column_names is a list of column names of the test/train data
    target is the column name of the labels column
    Because it's graphlab and not sklearn, the calibration is not implemented (it's possible, but harder)
    Also, seemingly, setting sample weights is also not supported by graphlab
    """
    if outlier_frac > 0:
        train, labels = filter_data(train,
                                    labels,
                                    cut_outlier_frac=outlier_frac,
                                    method=outlier_method,
                                    use_caching=False)  # remove outliers
    if column_names is None:
        column_names = range(np.shape(train)[1])
    target = 'target'
    newTrain = np.vstack((train.T, labels)).T
    pdTrain = pd.DataFrame(newTrain, columns=np.append(column_names, target))
    trainFrame = gl.SFrame(pdTrain)
    del newTrain, pdTrain
    pdTest = pd.DataFrame(test, columns=column_names)
    testFrame = gl.SFrame(pdTest)
    del pdTest
    model = gl.boosted_trees_classifier.create(
        trainFrame,
        target=target,
        max_iterations=max_iterations,
        min_child_weight=min_child_weight,
        step_size=step_size,
        max_depth=max_depth,
        class_weights=class_weights,
        min_loss_reduction=min_loss_reduction,
        verbose=verbose)
    preds = model.predict_topk(testFrame, output_type='probability', k=9)
    preds['id'] = preds['id'].astype(int)
    #some hacky dataframe magic, creates Nx10 matrix (id in first column)
    preds = preds.unstack(['class', 'probability'],
                          'probs').unpack('probs', '').sort('id')

    newPreds = preds.to_dataframe().values
    newPreds = newPreds[:, 1:]  #remove the id column
    del preds, model

    assert np.shape(newPreds)[0] == np.shape(
        test)[0], "conversion failed somewhere, size doesn't match"

    if rescale_pred:
        newPreds = rescale_prior(newPreds, np.bincount(labels))
    return newPreds
Example #7
# coding: utf-8

# In[ ]:

import graphlab as gl
cdr1=gl.SFrame.read_csv("../data/01_new.csv")


## The CSV has no header row, so the first data row was read in as column names.
## Re-add those values as a one-row SFrame, append it back, then rename the columns.
sf = gl.SFrame({'1': [1], '10792228': [10792228], '18154720': [18154720],
                '2013-03-01 21:04:55': ['2013-03-01 21:04:55'], '124': [124],
                '2731712': [2731712], '508306': [508306], '101': [101],
                '101.1': [101], '0': [0]})

cdr1 = cdr1.append(sf)  # SFrame.append returns a new SFrame; reassign the result
cdr1.rename({'1': 'Call_Type', '10792228': 'Caller', '18154720': 'Callee',
             '2013-03-01 21:04:55': 'timestamp', '124': 'Duration',
             '2731712': 'X6', '508306': 'X7', '101': 'TowerID_start',
             '101.1': 'TowerID_end', '0': 'Call_Status'})

cdr1.remove_columns(['X6','X7'])

cdr1.remove_column('Call_Status')

## Split the timestamp into day, hour, minute and second components
cdr1['date-time'] = cdr1['timestamp'].str_to_datetime('%Y-%m-%d %H:%M:%S')
cdr1 = cdr1.split_datetime('date-time', limit=['day', 'hour', 'minute', 'second'])
#cdr1.remove_column('X4')

cdr1.rename({'date-time.hour': 'call hour', 'date-time.minute': 'call minute',
             'date-time.second': 'call seconds', 'date-time.day': 'day'})


cdr1.save("cdr1_churn.csv", format="csv")


#cdr2_6=gl.SFrame.read_csv("../data/02_06_new.csv")
target_users = pd.read_csv("target_users.csv", delimiter='\t')
userss = target_users['user_id'].values

item_profiles = pd.read_csv("item_profile.csv", delimiter='\t')
ids = item_profiles['id'].values
items_inattivi = item_profiles[item_profiles['active_during_test'] ==
                               0]['id'].values

index_to_ids = dict(zip(range(ids.size), ids))
ids_to_index = {v: k for k, v in index_to_ids.items()}
index_to_uds = dict(zip(range(userss.size), userss))
uds_to_index = {v: k for k, v in index_to_uds.items()}

interactions = pd.read_csv('interactions.csv', delimiter="\t")
#interactions = interactions[interactions['interaction_type'] == 1]
observations = gl.SFrame(interactions)

model6 = gl.recommender.item_similarity_recommender.create(
    observations,
    user_id='user_id',
    item_id='item_id',
    target=None,
    user_data=None,
    item_data=None,
    similarity_type='cosine',
    only_top_k=1700,
    verbose=True)
from scipy import sparse

R7 = sparse.lil_matrix((10000, 167956))

td = pd.read_csv('interactions.csv', delimiter="\t")
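
# A hedged sketch (assumption): take top-k recommendations from model6, drop items
# that are inactive during the test period, and store the scores in the sparse
# user x item matrix R7 via the index mappings built above.
recs = model6.recommend(users=list(userss), k=5)
inactive = set(items_inattivi)
for rec in recs:
    if rec['item_id'] in inactive:
        continue
    R7[uds_to_index[rec['user_id']], ids_to_index[rec['item_id']]] = rec['score']
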
import time
from scipy.sparse import csr_matrix
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
get_ipython().magic(u'matplotlib inline')

'''Check GraphLab Create version'''
from distutils.version import StrictVersion
assert (StrictVersion(graphlab.version) >= StrictVersion('1.8.5')), 'GraphLab Create must be version 1.8.5 or later.'


# ## Load the Wikipedia dataset

# In[2]:

wiki = graphlab.SFrame('datasets/people_wiki.gl/')


# As we did in previous assignments, let's extract the TF-IDF features:

# In[3]:

wiki['tf_idf'] = graphlab.text_analytics.tf_idf(wiki['text'])


# To run k-means on this dataset, we should convert the data matrix into a sparse matrix.

# In[6]:

#DAF understanding
print wiki
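
# A hedged sketch of the conversion step described above (assumption: 'tf_idf' is a
# dict column mapping word -> weight, as produced by graphlab.text_analytics.tf_idf).
import numpy as np
from scipy.sparse import csr_matrix

def sframe_to_scipy(sf, column_name):
    # Stack the dict column into (row, word, value) triples, map words to column
    # indices, and assemble a CSR matrix of the tf-idf weights.
    sf = sf[[column_name]].add_row_number('row_id')
    stacked = sf.stack(column_name, new_column_name=['word', 'value'])
    words = sorted(stacked['word'].unique())
    word_to_index = {w: i for i, w in enumerate(words)}
    row = np.array(stacked['row_id'])
    col = np.array([word_to_index[w] for w in stacked['word']])
    data = np.array(stacked['value'])
    return csr_matrix((data, (row, col)), shape=(sf.num_rows(), len(words)))

# tf_idf_matrix = sframe_to_scipy(wiki, 'tf_idf')
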
Example #10
import os
import sys
import urllib2
import json
import graphlab as gl

tweetsURL = "http://smisc-api.jacobgreenleaf.com/tweets"
tweets = json.load(urllib2.urlopen(tweetsURL))

g = gl.SGraph()

dictTweets = dict()

for tweet in tweets:
    dictTweets.setdefault('user_id', []).append(tweet['user_id'])
    dictTweets.setdefault('text', []).append(tweet['text'])
    dictTweets.setdefault('created_at', []).append(tweet['created_at'])
    dictTweets.setdefault('id_str', []).append(tweet['id_str'])

gtable = gl.SFrame(dictTweets)
gtable.save('./tweetTable')
Example #11
print(items.shape)
print(items.head())

r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_train = pd.read_csv('ua.base',
                            sep='\t',
                            names=r_cols,
                            encoding='latin-1')
ratings_test = pd.read_csv('ua.test',
                           sep='\t',
                           names=r_cols,
                           encoding='latin-1')
ratings_train.shape, ratings_test.shape

train_data = graphlab.SFrame(ratings_train)
test_data = graphlab.SFrame(ratings_test)

popularity_model = graphlab.popularity_recommender.create(train_data,
                                                          user_id='user_id',
                                                          item_id='movie_id',
                                                          target='rating')

popularity_recomm = popularity_model.recommend(users=[1, 2, 3, 4, 5], k=5)
popularity_recomm.print_rows(num_rows=25)

# Training the item-similarity model with cosine similarity
item_sim_model_cosine = graphlab.item_similarity_recommender.create(
    train_data,
    user_id='user_id',
    item_id='movie_id',
    target='rating',
    similarity_type='cosine')
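
# A hedged follow-up (assumption): graphlab.recommender.util.compare_models can be
# used to compare the popularity baseline against the item-similarity model on the
# held-out ratings.
model_comparison = graphlab.recommender.util.compare_models(
    test_data, [popularity_model, item_sim_model_cosine],
    model_names=['popularity', 'item_similarity_cosine'])
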
# coding: utf-8

# In[3]:

import graphlab

# In[4]:

sales = graphlab.SFrame('home_data.gl/')

# In[5]:

sales

# # Exploring the data for house sales

# In[7]:

graphlab.canvas.set_target("ipynb")
sales.show(view="Scatter Plot", x="sqft_living", y="price")

# # Create a simple regression model
# In[8]:

train_data, test_data = sales.random_split(.8, seed=0)

# # Build regression model

# In[9]:
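
# A hedged sketch of the simple regression model this section builds (assumption:
# predict 'price' from 'sqft_living', the columns plotted above):
sqft_model = graphlab.linear_regression.create(train_data,
                                               target='price',
                                               features=['sqft_living'],
                                               validation_set=None)
sqft_model.evaluate(test_data)
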
Example #13
#!/usr/bin/env python2.7

import sys
sys.path.append("..") 

import graphlab
import matplotlib.pyplot as plt
from regression import polynomial_sframe
from regression import get_residual_sum_of_squares

# ----------------------------------------------
# Polynomial regression, revisited
# ----------------------------------------------
print("*** Polynomial regression, revisited")
sales = graphlab.SFrame('kc_house_data.gl/')
sales = sales.sort(['sqft_living', 'price'])
training, testing = sales.random_split(.9, seed=1)

l2_small_penalty = 1e-5

# ----------------------------------------------
# Observe overfitting
# ----------------------------------------------
print("*** Observe overfitting")

def plot_data(data):    
    plt.plot(data['X1'],data['Y'],'k.')
    plt.xlabel('x')
    plt.ylabel('y')

def polynomial_features(data, deg):
    data_copy=data.copy()
    distances = {'JSD':JSD}
    out_file = d+'log_knn'

    with open(out_file,'a') as fout:
        for k in k_range:
            print 'K=%s' % k


            # comparison to previous model
            ddir = d+'knn_%s_%s_%s' % (distance,N,k)
            if os.path.exists(ddir):
                topN = gl.SArray(ddir)
            else:
                features = np.load(d+'features_'+str(k)+'.npy')
                if distance in ('cosine',):
                    features = gl.SFrame(features)
                    model=gl.nearest_neighbors.create(dataset=features,distance=distance)
                    result = model.query(features,k=N+1)
                    topN = (result[['query_label', 'reference_label', 'rank']]
                            .unstack(('rank', 'reference_label'), new_column_name='knn')
                            .sort('query_label')
                            .apply(lambda row: [row['knn'][i]
                                                for i in xrange(1, N + 2)
                                                if row['knn'][i] != row['query_label']]))
                else:
                    #neigh = NearestNeighbors(n_neighbors=N+1,algorithm='ball_tree',n_jobs=n_cores,metric='pyfunc',func=JSD)
                    #time neigh.fit(features)
                    #time topN = neigh.kneighbors(features[random_sample],return_distance=False)[:,1:]
                    #topN = gl.SArray(topN)

                    #%time topN = triple_apply_knn(features)

                    start = time.time()
                    all_dists = []
                    for i,a in enumerate(random_sample):
                        print "running sample %s/%s\r" % (i+1,sample_size)
Example #15
"""

from __future__ import print_function
import graphlab as gl
import graphlab.aggregate as agg
import os
os.chdir('c:/users/admin/documents/github/uow/uow_mlf/week 5/')


# GL Setup
pk = open('c:/users/admin/documents/github/gl_product_key.txt','r').read()
gl.product_key.set_product_key(pk)
gl.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS',4)

# Load song data
sf = gl.SFrame('song_data.gl/')

# Create users array
users = sf['user_id'].unique()


sf.head(4)
sf['song'].show()
lst_songs = sf['song'].unique()
n_songs = len(lst_songs)

# Split into train and test data (simple random split, no stratification)

train_data, test_data = sf.random_split(0.8, seed=0)

# Popularity model
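# A hedged sketch of the popularity baseline (assumption: 'user_id' and 'song'
# columns, as in the song data loaded above):
popularity_model = gl.popularity_recommender.create(train_data,
                                                    user_id='user_id',
                                                    item_id='song')
popularity_model.recommend(users=[users[0]], k=5)
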
import graphlab
import numpy as np
import matplotlib.pyplot as plt

sales = graphlab.SFrame('kc_house_data_small.gl/')

def get_numpy_data(data_sframe, features, output):
    data_sframe['constant'] = 1 # add a constant column to an SFrame
    # prepend variable 'constant' to the features list
    features = ['constant'] + features
    # select the columns of data_SFrame given by the 'features' list into the SFrame 'features_sframe'
    features_sframe = data_sframe[features]
    # this will convert the features_sframe into a numpy matrix with GraphLab Create >= 1.7!!
    features_matrix = features_sframe.to_numpy()
    # assign the column of data_sframe associated with the target to the variable 'output_sarray'
    output_sarray = data_sframe[output]
    # this will convert the SArray into a numpy array:
    output_array = output_sarray.to_numpy() # GraphLab Create>= 1.7!!
    return(features_matrix, output_array)

def normalize_features(feature_matrix):
    norms = np.linalg.norm(feature_matrix, axis=0)
    return (feature_matrix / norms,norms)

(train_and_validation, test) = sales.random_split(.8,seed=1)
(train,validation) = train_and_validation.random_split(.8,seed=1)

feature_list = ['bedrooms',
                'bathrooms',
                'sqft_living',
                'sqft_lot',
# Getting Started with SFrames

import graphlab

sf = graphlab.SFrame('people-example.csv')
sf  # we can view first few lines of table
sf.tail()  # view end of the table

# .show() visualizes any data structure in GraphLab Create
sf.show()

# If you want Canvas visualization to show up on this notebook,
# rather than popping up a new window, add this line:
graphlab.canvas.set_target('ipynb')
sf['age'].show(view='Categorical')

# Inspect columns of dataset
sf['Country']
# dtype: str
# Rows: 7
# ['United States', 'Canada', 'England', 'USA', 'Poland', 'United States', 'Switzerland']
sf['age']
# dtype: int
# Rows: 7
# [24, 23, 22, 23, 23, 22, 25]

# Some simple columnar operations
sf['age'].mean()
sf['age'].max()

# Create new columns in our SFrame
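# A minimal sketch (assumption: the CSV has 'First Name' and 'Last Name' columns,
# as in the course's people-example dataset):
sf['Full Name'] = sf['First Name'] + ' ' + sf['Last Name']
sf['age'] * 2  # SArray arithmetic is element-wise
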
    dataFolder, 't_base_weibo_user_fri_part000_9G.csv'),
                               header=False)
edegsData.rename({'X1': 'src', 'X2': 'dst'})

# In[12]:

#create graph
G = gl.SGraph()
G = G.add_edges(edges=edegsData, src_field='src', dst_field='dst')
print G.summary()

# In[17]:

#pagerank
pagerank_v = gl.pagerank.create(G, verbose=False)
print pagerank_v['training_time']

# In[18]:

node_pageValue = pagerank_v['pagerank']
node_pageValue.save(resultFolder + '/nodes_pagerank.csv', format='csv')

# In[9]:

sf = gl.SFrame()
sf2 = gl.SFrame([3, 5, 7, 8, 6, 9])
sf = sf2.append(sf)
sf

# In[ ]:
Example #19
import graphlab
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
try:
    import seaborn
except ImportError:
    pass

from distutils.version import StrictVersion
assert (StrictVersion(graphlab.version) >= StrictVersion('1.8.5')
        ), 'GraphLab Create must be version 1.8.5 or later.'
wiki = graphlab.SFrame('C:\\Machine_Learning\\Cluster_wk2_1\\people_wiki.gl\\')
wiki
wiki['word_count'] = graphlab.text_analytics.count_words(wiki['text'])
model = graphlab.nearest_neighbors.create(wiki,
                                          label='name',
                                          features=['word_count'],
                                          method='brute_force',
                                          distance='euclidean')
bo = model.query(wiki[wiki['name'] == 'Barack Obama'], label='name', k=10)


def top_words(name):
    """
    Get a table of the most frequent words in the given person's wikipedia page.
    """
    row = wiki[wiki['name'] == name]
    word_count_table = row[['word_count']].stack(
        'word_count', new_column_name=['word', 'count'])
Example #20
def create(observation_data,
           user_id='user_id',
           item_id='item_id',
           target=None,
           user_data=None,
           item_data=None,
           num_factors=32,
           regularization=1e-9,
           linear_regularization=1e-9,
           side_data_factorization=True,
           ranking_regularization=0.25,
           unobserved_rating_value=None,
           num_sampled_negative_examples=4,
           max_iterations=25,
           sgd_step_size=0,
           random_seed=0,
           binary_target=False,
           solver='auto',
           verbose=True,
           **kwargs):
    """Create a RankingFactorizationRecommender that learns latent factors for each
    user and item and uses them to make rating predictions.

    Parameters
    ----------
    observation_data : SFrame
        The dataset to use for training the model. It must contain a column of
        user ids and a column of item ids. Each row represents an observed
        interaction between the user and the item.  The (user, item) pairs
        are stored with the model so that they can later be excluded from
        recommendations if desired. It can optionally contain a target ratings
        column. All other columns are interpreted by the underlying model as
        side features for the observations.

        The user id and item id columns must be of type 'int' or 'str'. The
        target column must be of type 'int' or 'float'.

    user_id : string, optional
        The name of the column in `observation_data` that corresponds to the
        user id.

    item_id : string, optional
        The name of the column in `observation_data` that corresponds to the
        item id.

    target : string, optional
        The `observation_data` can optionally contain a column of scores
        representing ratings given by the users. If present, the name of this
        column may be specified via the `target` parameter.

    user_data : SFrame, optional
        Side information for the users.  This SFrame must have a column with
        the same name as what is specified by the `user_id` input parameter.
        `user_data` can provide any amount of additional user-specific
        information.

    item_data : SFrame, optional
        Side information for the items.  This SFrame must have a column with
        the same name as what is specified by the `item_id` input parameter.
        `item_data` can provide any amount of additional item-specific
        information.

    num_factors : int, optional
        Number of latent factors.

    regularization : float, optional
        L2 regularization for interaction terms. Default: 1e-9; a typical range
        for this parameter is between 1e-12 and 1. Setting this to 0 may cause
        numerical issues.

    linear_regularization : float, optional
        L2 regularization for linear term. Default: 1e-9; a typical range for this
        parameter is between 1e-12 and 1. Setting this to 0 may cause numerical issues.

    side_data_factorization : boolean, optional
        Use factorization for modeling any additional features beyond the user
        and item columns. If True, and side features or any additional columns are
        present, then a Factorization Machine model is trained. Otherwise, only
        the linear terms are fit to these features.  See
        :class:`graphlab.recommender.ranking_factorization_recommender.RankingFactorizationRecommender`
        for more information. Default: True.

    ranking_regularization : float, optional
        Penalize the predicted value of user-item pairs not in the
        training set. Larger values increase this penalization.
        Suggested values: 0, 0.1, 0.5, 1.  NOTE: if no target column
        is present, this parameter is ignored.

    unobserved_rating_value : float, optional
        Penalize unobserved items with a larger predicted score than this value.
        By default, the estimated 5% quantile is used (mean - 1.96*std_dev).

    num_sampled_negative_examples : integer, optional
        For each (user, item) pair in the data, the ranking sgd solver evaluates
        this many randomly chosen unseen items for the negative example step.
        Increasing this can give better performance at the expense of speed,
        particularly when the number of items is large.  Default is 4.

    binary_target : boolean, optional
        Assume the target column is composed of 0's and 1's. If True, use
        logistic loss to fit the model.

    max_iterations : int, optional
        The training algorithm will make at most this many iterations through
        the observed data. Default: 25.

    sgd_step_size : float, optional
        Step size for stochastic gradient descent. Smaller values generally
        lead to more accurate models that take more time to train. The
        default setting of 0 means that the step size is chosen by trying
        several options on a small subset of the data.

    random_seed :  int, optional
        The random seed used to choose the initial starting point for
        model training. Note that some randomness in the training is
        unavoidable, so models trained with the same random seed may still
        differ. Default: 0.

    solver : string, optional
        Name of the solver to be used to solve the regression. See the
        references for more detail on each solver. The available solvers for
        this model are:

        - *auto (default)*: automatically chooses the best solver for the data
                              and model parameters.
        - *ials*:           Implicit Alternating Least Squares [1].
        - *adagrad*:        Adaptive Gradient Stochastic Gradient Descent.
        - *sgd*:            Stochastic Gradient Descent

    verbose : bool, optional
        Enables verbose output.

    kwargs : optional
        Optional advanced keyword arguments passed in to the model
        optimization procedure. These parameters do not typically
        need to be changed. They, along with their default values, are
        given by get_default_options().

    Examples
    --------
    **Basic usage**

    When given just user and item pairs, one can create a RankingFactorizationRecommender
    as follows.

    >>> sf = graphlab.SFrame({'user_id': ["0", "0", "0", "1", "1", "2", "2", "2"],
    ...                       'item_id': ["a", "b", "c", "a", "b", "b", "c", "d"]})
    >>> from graphlab.recommender import ranking_factorization_recommender
    >>> m1 = ranking_factorization_recommender.create(sf)

    When a target column is present, one can include this to try and recommend
    items that are rated highly.

    >>> sf = graphlab.SFrame({'user_id': ["0", "0", "0", "1", "1", "2", "2", "2"],
    ...                       'item_id': ["a", "b", "c", "a", "b", "b", "c", "d"],
    ...                       'rating': [1, 3, 2, 5, 4, 1, 4, 3]})

    >>> m1 = ranking_factorization_recommender.create(sf, target='rating')


    **Including side features**

    >>> user_info = graphlab.SFrame({'user_id': ["0", "1", "2"],
    ...                              'name': ["Alice", "Bob", "Charlie"],
    ...                              'numeric_feature': [0.1, 12, 22]})
    >>> item_info = graphlab.SFrame({'item_id': ["a", "b", "c", "d"],
    ...                              'name': ["item1", "item2", "item3", "item4"],
    ...                              'dict_feature': [{'a' : 23}, {'a' : 13},
    ...                                               {'b' : 1},
    ...                                               {'a' : 23, 'b' : 32}]})
    >>> m2 = ranking_factorization_recommender.create(sf, target='rating',
    ...                                               user_data=user_info,
    ...                                               item_data=item_info)

    **Customizing ranking regularization**

    Create a model that pushes predicted ratings of unobserved user-item
    pairs toward 1 or below.

    >>> m3 = ranking_factorization_recommender.create(sf, target='rating',
    ...                                               ranking_regularization = 0.1,
    ...                                               unobserved_rating_value = 1)

    **Using the implicit alternating least squares model**

    Ranking factorization also implements implicit alternating least squares [1] as
    an alternative solver.  This is enabled using ``solver = 'ials'``.

    >>> m3 = ranking_factorization_recommender.create(sf, target='rating',
                                                      solver = 'ials')

    See Also
    --------
    :class:`graphlab.recommender.factorization_recommender.FactorizationRecommender`,
    :class:`graphlab.recommender.ranking_factorization_recommender.RankingFactorizationRecommender`

    References
    -----------

    [1] Hu, Y.; Koren, Y.; Volinsky, C. "Collaborative Filtering for Implicit
        Feedback Datasets." IEEE International Conference on Data Mining
        (ICDM 2008), IEEE (2008).

    """

    _mt._get_metric_tracker().track(
        'toolkit.recsys.ranking_factorization_recommender.create')

    method = 'ranking_factorization_recommender'

    opts = {'model_name': method}
    response = _graphlab.toolkits._main.run("recsys_init", opts)
    model_proxy = response['model']

    if user_data is None:
        user_data = _graphlab.SFrame()
    if item_data is None:
        item_data = _graphlab.SFrame()

    if target is None:
        binary_target = True

    opts = {
        'dataset': observation_data,
        'user_id': user_id,
        'item_id': item_id,
        'target': target,
        'user_data': user_data,
        'item_data': item_data,
        'nearest_items': _graphlab.SFrame(),
        'model': model_proxy,
        'random_seed': random_seed,
        'num_factors': num_factors,
        'regularization': regularization,
        'linear_regularization': linear_regularization,
        'ranking_regularization': ranking_regularization,
        'binary_target': binary_target,
        'max_iterations': max_iterations,
        'side_data_factorization': side_data_factorization,
        'num_sampled_negative_examples': num_sampled_negative_examples,
        'solver': solver,

        # Has no effect here.
        # 'verbose'                 : verbose,
        'sgd_step_size': sgd_step_size
    }

    if unobserved_rating_value is not None:
        opts["unobserved_rating_value"] = unobserved_rating_value

    if kwargs:
        try:
            possible_args = set(get_default_options()["name"])
        except (RuntimeError, KeyError):
            possible_args = set()

        bad_arguments = set(kwargs.keys()).difference(possible_args)
        if bad_arguments:
            raise TypeError("Bad Keyword Arguments: " +
                            ', '.join(bad_arguments))

        opts.update(kwargs)

    response = _graphlab.toolkits._main.run('recsys_train', opts, verbose)

    return RankingFactorizationRecommender(response['model'])
# coding: utf-8

# # Week 6: Deep features for image classification & retrieval

# In[1]:

import graphlab as gl

# In[6]:

image_train = gl.SFrame('image_train_data/')

# In[28]:

image_test = gl.SFrame('image_test_data/')

# In[33]:

gl.canvas.set_target('ipynb')

# In[34]:

image_test[0:1]['image'].show()

# ## Creating category-specific image retrieval models

# In[19]:

image_train_bird = image_train[image_train['label'] == 'bird']

# In[20]:
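
# A hedged sketch of the bird-specific retrieval model this section describes
# (assumption: the dataset carries 'deep_features' and 'id' columns as in the course data):
knn_model_bird = gl.nearest_neighbors.create(image_train_bird,
                                             features=['deep_features'],
                                             label='id')
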
print items.shape
items.head()

r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_base = pd.read_csv('ml-100k/ua.base',
                           sep='\t',
                           names=r_cols,
                           encoding='latin-1')
ratings_test = pd.read_csv('ml-100k/ua.test',
                           sep='\t',
                           names=r_cols,
                           encoding='latin-1')
ratings_base.shape, ratings_test.shape

import graphlab
train_data = graphlab.SFrame(ratings_base)
test_data = graphlab.SFrame(ratings_test)

popularity_model = graphlab.popularity_recommender.create(train_data,
                                                          user_id='user_id',
                                                          item_id='movie_id',
                                                          target='rating')

#Get recommendations for first 5 users and print them
#users = range(1,6) specifies user ID of first 5 users
#k=5 specifies top 5 recommendations to be given
popularity_recomm = popularity_model.recommend(users=range(1, 6), k=5)
popularity_recomm.print_rows(num_rows=25)

ratings_base.groupby(by='movie_id')['rating'].mean().sort_values(
    ascending=False).head(20)
Example #23
# Download data if you haven't already
import graphlab as gl
import os
'''
"Topic models" are a class of statistical models for text data.
These models typically assume documents can be described by a small set of topics,
and there is a probability of any word occurring for a given "topic".

'''
if os.path.exists('wikipedia_w0'):
    docs = gl.SFrame('wikipedia_w0')
else:
    docs = gl.SFrame.read_csv('https://static.turi.com/datasets/wikipedia/raw/w0.csv', header=False)
    docs.save('wikipedia_w0')

# Remove stopwords and convert to bag of words
docs = gl.text_analytics.count_words(docs['X1'])
docs = docs.dict_trim_by_keys(gl.text_analytics.stopwords(), exclude=True)

# Learn topic model
model = gl.topic_model.create(docs)

'''
You may get details on a subset of topics by supplying a list of topic names or topic indices,
as well as restrict the number of words returned per topic.

'''
print model.get_topics()
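
# A minimal illustration of the options described above (the topic indices and the
# word count here are arbitrary): restrict the report to a few topics and show more
# words per topic.
print model.get_topics(topic_ids=[0, 1, 2], num_words=10)
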
'''
If we just want to see the top words per topic, this code snippet will rearrange the above SFrame to do that.
'''
Example #24
"""

from __future__ import print_function
import graphlab as gl
import graphlab.aggregate as agg
import os
os.chdir('c:/users/admin/documents/github/uow/uow_mlf/week 4/')


# GL Setup
pk = open('c:/users/admin/documents/github/gl_product_key.txt','r').read()
gl.product_key.set_product_key(pk)
gl.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS',4)

# Load wikipedia people data
sf = gl.SFrame('people_wiki.gl/')


# compute TF-IDF (term frequency-inverse document frequency) for the corpus (the body of documents)
sf['word_count'] = gl.text_analytics.count_words(sf['text'])

# Note: tf-idf is low for common words and high for rare, distinctive words
tfidf = gl.text_analytics.tf_idf(sf['word_count'])
sf['tfidf'] = tfidf

##======= Question 1 =======##
# For Elton John, build the word-count and tf-idf tables, sorted by value in descending order
elton = sf[sf['name'] == 'Elton John']
elton_wc_tbl = elton[['word_count']].stack(
    'word_count', new_column_name=['word', 'count']).sort('count', ascending=False)
elton_tfidf_tbl = elton[['tfidf']].stack(
    'tfidf', new_column_name=['word', 'tfidf']).sort('tfidf', ascending=False)
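
# A hedged follow-up (assumption): build a cosine-distance nearest-neighbours model on
# the tf-idf vectors and query it for the people most similar to Elton John.
nn_model = gl.nearest_neighbors.create(sf, features=['tfidf'],
                                       label='name', distance='cosine')
nn_model.query(elton, label='name', k=5)
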
Example #25
    df2 = df[1]
    df3 = df1.append(df2,ignore_index = True)
    print 'BIDROW length {} DF length {} DF3 {}'.format(len(df1), len(df2), len(df3))
    print "Creating CSV and Updating..."
    df3.to_csv(BASE_DATA_PATH+'/trainingset3.csv', sep=',', encoding='utf-8')
    print("CSV Done !!!")



    #########################################################################################################
    # Quick analysis using the GraphLab classification library
    # This quick run gives a result of around 99.9%
    #########################################################################################################

    #Shuffle the data https://dato.com/products/create/docs/generated/graphlab.toolkits.cross_validation.shuffle.html
    data = gl.SFrame('/Users/abhishekchoudhary/Work/python/facebook/trainingset3.csv')
    data = gl.cross_validation.shuffle(data)
    folds = gl.cross_validation.KFold(data, 5)
    for train, valid in folds:
        m = gl.boosted_trees_classifier.create(train, target='label')
        print m.evaluate(valid)

    # Get the i'th fold
    (train, test) = folds[4]
    #do a quick classification analysis on the dataset
    #https://dato.com/products/create/docs/graphlab.toolkits.classifier.html
    model = gl.classifier.create(train,
                                 target='outcome',
                                 features=['bidder_id', 'auction', 'merchandise',
                                           'device', 'time', 'country', 'ip', 'url'])
    #https://dato.com/products/create/docs/generated/graphlab.toolkits.cross_validation.KFold.html#graphlab.toolkits.cross_validation.KFold

    #After above K-fold
    #https://dato.com/products/create/docs/generated/graphlab.boosted_trees_classifier.create.html
# -*- coding: utf-8 -*-
# @Author: GaNeShKuMaRm
# @Date:   2017-02-25 20:38:47
# @Last Modified by:   GaNeShKuMaRm
# @Last Modified time: 2017-02-25 20:51:27

import graphlab

baseURL = "Dataset_SFrame/image_data_with_deepfeatures/df-"
flag = True
for i in range(1, 4):
    temp_data = graphlab.SFrame(baseURL + str(i))
    if flag:
        image_data = temp_data
        flag = False
    else:
        image_data = image_data.append(temp_data)

image_data.save('Dataset_SFrame/image_data')
print "Completed!"
import sys

# gl.set_runtime_config("GRAPHLAB_CACHE_FILE_LOCATIONS", os.path.expanduser("~/data/tmp/"))

model_path = "/data/hoytak/diabetic/models/models"

train_sf = []
test_sf = []
feature_names = []
each_sf_feature_names = []

# for n in [0, "1b", '2b', 4]:
for n in [0, 1, "1b", 2, '2b', 3, 4]:    
    try: 
        print "Loading %s" % str(n)
        Xf_train = gl.SFrame(model_path + "/scores_train_%s" % str(n))
        Xf_test = gl.SFrame(model_path + "/scores_test_%s" % str(n))

        sf_feature_names = []
        
        for fn in Xf_train.column_names():
            if fn.startswith("scores"):

                key = fn
                
                idx = 0
                while key in feature_names:
                    key = fn + ".%d" % idx
                    idx += 1

                if key != fn:
Example #28
import graphlab as gl

# Load the data
##data =  gl.SFrame('https://static.turi.com/datasets/regression/yelp-data.csv')
data = gl.SFrame('/Users/wei/code/pycharm/yelp-data.csv')

# Restaurants with rating >=3 are good
data['is_good'] = data['stars'] >= 3

# Make a train-test split
train_data, test_data = data.random_split(0.8)

# Automatically picks the right model based on your data.
model = gl.classifier.create(train_data,
                             target='is_good',
                             features=[
                                 'user_avg_stars', 'business_avg_stars',
                                 'user_review_count', 'business_review_count'
                             ])

# Generate predictions (class/probabilities etc.), contained in an SFrame.
predictions = model.classify(test_data)

# Evaluate the model, with the results stored in a dictionary
results = model.evaluate(test_data)
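
# The evaluation result is a dictionary of metrics; for a classifier it includes,
# among others, accuracy and a confusion matrix (a hedged illustration):
print(results['accuracy'])
print(results['confusion_matrix'])
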
import graphlab


sales = graphlab.SFrame('D:/Tasks/projects/python/casestudy/course1/home_data.gl/')
train_data, test_data = sales.random_split(.8, seed=0)
# linear_regression.create requires a target; here price is predicted from living area
sqft_model = graphlab.linear_regression.create(train_data,
                                               target='price',
                                               features=['sqft_living'])
print(train_data.head(5))
Example #30
    #a=dict(a,**z)
    try:
        del i['_id']
    except:
        pass
    z.append(i)
df = pd.DataFrame(data=z, dtype=object)
df['price'] = df['price'].astype(str)
df['name'] = df['name'].astype(str)
df['type'] = df['type'].astype(str)
df['description'] = df['description'].astype(str)
print(df.dtypes)
users = db.users.find({})
order = db.orders
user = []
menu = graphlab.SFrame(df)
menu['word_count'] = graphlab.text_analytics.count_words(menu['description'])
tfidf = graphlab.text_analytics.tf_idf(menu['word_count'])
menu['tfidf'] = tfidf
knn_model = graphlab.nearest_neighbors.create(menu,
                                              features=['tfidf'],
                                              label='name')
print(type(users))
for i in users:
    user = i['email']
    print(user)
    orders = order.find({'email': user})
    items = []
    for i in orders:
        items += i['orders']
        print(type(items))
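
    # A hedged continuation (assumption): look up the tf-idf rows of the dishes this
    # user already ordered and query the nearest-neighbours model for similar items.
    ordered = menu.filter_by(items, 'name')
    if ordered.num_rows() > 0:
        similar = knn_model.query(ordered, label='name', k=5)
        print(similar[['query_label', 'reference_label', 'distance']])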