from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import graphlab

tfidf = TfidfVectorizer(stop_words='english')
# Replace NaN with an empty string
metadata['overview'] = metadata['overview'].fillna('')
# Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(metadata['overview'])
# Output the shape of tfidf_matrix
tfidf_matrix.shape

ratings = pd.DataFrame(df.groupby('user_id')['score'].mean())

# Convert the pandas dataframes to GraphLab SFrames
train_data = graphlab.SFrame(train_data_df)
test_data = graphlab.SFrame(test_data_df)

# Train the model. Note that target must be a single rating column and
# similarity_type one of 'jaccard', 'cosine' or 'pearson'; the 'score' column
# is assumed to hold the ratings here.
collab_filter_model = graphlab.item_similarity_recommender.create(
    train_data,
    user_id='user_id',
    item_id='prof_id',
    target='score',
    similarity_type='pearson')

# Make recommendations
how_many_recommendations = 10
item_recommendation = collab_filter_model.recommend(users=which_user_ids,
                                                    k=how_many_recommendations)
train = pd.concat([
    get_dataframe(batch1),
    get_dataframe(batch2),
    get_dataframe(batch3),
    get_dataframe(batch4),
    get_dataframe(batch5)
], ignore_index=True)
test = get_dataframe(batch_test)

print train.head()
print train.shape, test.shape

gltrain = gl.SFrame(train)
gltest = gl.SFrame(test)

model = gl.neuralnet_classifier.create(gltrain, target="label", validation_set=None)
model.evaluate(gltest)

gltrain['glimage'] = gl.SArray(gltrain['image']).pixel_array_to_image(
    32, 32, 3, allow_rounding=True)
gltest['glimage'] = gl.SArray(gltest['image']).pixel_array_to_image(
    32, 32, 3, allow_rounding=True)
gltrain.remove_column("image")
gltest.remove_column("image")
gltrain.head()
import pandas as pd
import graphlab as gl

orderData = pd.read_csv("Data/orders.csv")
orderProductData = pd.read_csv("Data/order_products__train.csv")

actions = pd.merge(orderData, orderProductData, on="order_id", how="outer")
actionsTraining = actions[actions["eval_set"] == "train"]
actionsTraining = actionsTraining.dropna(subset=['product_id'])
actionsTraining = actionsTraining[["user_id", "product_id"]]
actionsTraining.columns = ['user_id', 'item_id']
actionsTraining["item_id"] = actionsTraining["item_id"].astype(int)

sf = gl.SFrame(actionsTraining)

# Item-to-item similarity recommender
item_similarity_recommender = gl.recommender.item_similarity_recommender.create(sf)

# Top-k recommendations: calculate the top k recommendations for each user.
k = 5
item_similarity_top_k = item_similarity_recommender.recommend(k=k)

# Print the recommendations for the first user.
print(item_similarity_top_k[item_similarity_top_k['user_id'] == 1])

# Save the top-k recommendations to disk.
item_similarity_top_k.save('Data/item_similarity_top_5_model')
@app.route('/choose-hike', methods=['GET', 'POST'])
def enter_hike():
    hikes = list_hikes(sf)
    regions = list_regions(sf)
    return render_template('choose-hike.html', hikes=hikes, regions=regions)


@app.route('/make-recommendations', methods=['POST', 'GET'])
def get_recommendations():
    hike = request.form.get('hike-name')
    region = request.form.get('region-name')
    miles = request.form.get('num-miles')
    elevation = request.form.get('elevation-gain')
    dog = request.form.get('dog')
    recs = model.recommend_from_interactions([hike], k=5)
    your_hike = get_info(hike)
    hike_data = get_hike_info(recs)
    return render_template('make-recommendations.html',
                           your_hike=your_hike,
                           hike_data=hike_data)


if __name__ == '__main__':
    df = pd.read_csv('../data/final_with_url.csv')
    df = clean_df(df)
    sf = gl.SFrame(df)
    model = gl.load_model('content_recommender')
    app.run(host='0.0.0.0', port=7070, debug=True)
import pandas as pd
import numpy as np
import graphlab as gl

if __name__ == "__main__":
    sample_sub_fname = "data/sample_submission.csv"
    ratings_data_fname = "data/ratings.dat"
    output_fname = "data/our_test_ratings.csv"

    ratings = gl.SFrame(ratings_data_fname, format='tsv')
    sample_sub = pd.read_csv(sample_sub_fname)
    for_prediction = gl.SFrame(sample_sub)

    rec_engine = gl.ranking_factorization_recommender.create(
        observation_data=ratings,
        user_id="user_id",
        item_id="joke_id",
        target='rating',
        solver='auto')

    # Update the submission with the predicted ratings.
    sample_sub['rating'] = list(rec_engine.predict(for_prediction))
    sample_sub.to_csv(output_fname, index=False)
def boostedTrees(train,
                 labels,
                 test,
                 column_names=None,
                 target='target',
                 max_iterations=200,
                 min_child_weight=5,
                 step_size=0.2,
                 max_depth=10,
                 class_weights=None,
                 min_loss_reduction=0.5,
                 verbose=0,
                 outlier_frac=0.0,
                 outlier_method='EE',
                 rescale_pred=False):
    """
    train, labels, test are numpy matrices containing the data.
    column_names is a list of column names of the test/train data.
    target is the column name of the labels column.
    Because it's GraphLab and not sklearn, calibration is not implemented
    (it's possible, but harder). Setting sample weights also seems to be
    unsupported by GraphLab.
    """
    if outlier_frac > 0:
        # remove outliers
        train, labels = filter_data(train,
                                    labels,
                                    cut_outlier_frac=outlier_frac,
                                    method=outlier_method,
                                    use_caching=False)
    if column_names is None:
        column_names = range(np.shape(train)[1])
    target = 'target'
    newTrain = np.vstack((train.T, labels)).T
    pdTrain = pd.DataFrame(newTrain, columns=np.append(column_names, target))
    trainFrame = gl.SFrame(pdTrain)
    del newTrain, pdTrain
    pdTest = pd.DataFrame(test, columns=column_names)
    testFrame = gl.SFrame(pdTest)
    del pdTest
    model = gl.boosted_trees_classifier.create(
        trainFrame,
        target=target,
        max_iterations=max_iterations,
        min_child_weight=min_child_weight,
        step_size=step_size,
        max_depth=max_depth,
        class_weights=class_weights,
        min_loss_reduction=min_loss_reduction,
        verbose=verbose)
    preds = model.predict_topk(testFrame, output_type='probability', k=9)
    preds['id'] = preds['id'].astype(int)
    # some hacky dataframe magic: creates an Nx10 matrix (id in first column)
    preds = preds.unstack(['class', 'probability'],
                          'probs').unpack('probs', '').sort('id')
    newPreds = preds.to_dataframe().values
    newPreds = newPreds[:, 1:]  # remove the id column
    del preds, model
    assert np.shape(newPreds)[0] == np.shape(test)[0], \
        "conversion failed somewhere, size doesn't match"
    if rescale_pred:
        newPreds = rescale_prior(newPreds, np.bincount(labels))
    return newPreds
# coding: utf-8

# In[ ]:

import graphlab as gl

cdr1 = gl.SFrame.read_csv("../data/01_new.csv")

# The CSV file doesn't have column names, so the first data row was consumed as
# the header; this adds that row back as data.
sf = gl.SFrame({'1': [1], '10792228': [10792228], '18154720': [18154720],
                '2013-03-01 21:04:55': ['2013-03-01 21:04:55'], '124': [124],
                '2731712': [2731712], '508306': [508306], '101': [101],
                '101.1': [101], '0': [0]})
cdr1 = cdr1.append(sf)
cdr1.rename({'1': 'Call_Type', '10792228': 'Caller', '18154720': 'Callee',
             '2013-03-01 21:04:55': 'timestamp', '124': 'Duration',
             '2731712': 'X6', '508306': 'X7', '101': 'TowerID_start',
             '101.1': 'TowerID_end', '0': 'Call_Status'})
cdr1.remove_columns(['X6', 'X7'])
cdr1.remove_column('Call_Status')

# Split the date-time into days, hours, minutes and seconds.
cdr1['date-time'] = cdr1['timestamp'].str_to_datetime('%Y-%m-%d %H:%M:%S')
cdr1 = cdr1.split_datetime('date-time', limit=['day', 'hour', 'minute', 'second'])
#cdr1.remove_column('X4')
cdr1.rename({'date-time.hour': 'call hour', 'date-time.minute': 'call minute',
             'date-time.second': 'call seconds', 'date-time.day': 'day'})
cdr1.save("cdr1_churn.csv", format="csv")

#cdr2_6=gl.SFrame.read_csv("../data/02_06_new.csv")
target_users = pd.read_csv("target_users.csv", delimiter='\t')
userss = target_users['user_id'].values
item_profiles = pd.read_csv("item_profile.csv", delimiter='\t')
ids = item_profiles['id'].values
items_inattivi = item_profiles[item_profiles['active_during_test'] == 0]['id'].values

index_to_ids = dict(zip(range(ids.size), ids))
ids_to_index = {v: k for k, v in index_to_ids.items()}
index_to_uds = dict(zip(range(userss.size), userss))
uds_to_index = {v: k for k, v in index_to_uds.items()}

interactions = pd.read_csv('interactions.csv', delimiter="\t")
#interactions = interactions[interactions['interaction_type'] == 1]
observations = gl.SFrame(interactions)

model6 = gl.recommender.item_similarity_recommender.create(
    observations,
    user_id='user_id',
    item_id='item_id',
    target=None,
    user_data=None,
    item_data=None,
    similarity_type='cosine',
    only_top_k=1700,
    verbose=True)

R7 = sparse.lil_matrix((10000, 167956))
td = pd.read_csv('interactions.csv', delimiter="\t")
import time
import graphlab
from scipy.sparse import csr_matrix
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
get_ipython().magic(u'matplotlib inline')

'''Check GraphLab Create version'''
from distutils.version import StrictVersion
assert (StrictVersion(graphlab.version) >= StrictVersion('1.8.5')), \
    'GraphLab Create must be version 1.8.5 or later.'

# ## Load the Wikipedia dataset

# In[2]:

wiki = graphlab.SFrame('datasets/people_wiki.gl/')

# As we did in previous assignments, let's extract the TF-IDF features:

# In[3]:

wiki['tf_idf'] = graphlab.text_analytics.tf_idf(wiki['text'])

# To run k-means on this dataset, we should convert the data matrix into a sparse matrix.

# In[6]:

# DAF understanding
print wiki
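# A hedged sketch (not part of the original notebook) of the conversion step
# described above: map each word to a column index and build a scipy CSR matrix
# from the 'tf_idf' dictionaries. The helper name sframe_to_scipy is hypothetical.
def sframe_to_scipy(column):
    word_to_index = {}
    rows, cols, vals = [], [], []
    for row_id, doc in enumerate(column):
        for word, value in doc.iteritems():
            col_id = word_to_index.setdefault(word, len(word_to_index))
            rows.append(row_id)
            cols.append(col_id)
            vals.append(value)
    return csr_matrix((vals, (rows, cols)),
                      shape=(len(column), len(word_to_index)))

tf_idf_matrix = sframe_to_scipy(wiki['tf_idf'])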
import os
import sys
import urllib2
import json
import graphlab as gl

tweetsURL = "http://smisc-api.jacobgreenleaf.com/tweets"
tweets = json.load(urllib2.urlopen(tweetsURL))

g = gl.SGraph()

dictTweets = dict()
for tweet in tweets:
    dictTweets.setdefault('user_id', []).append(tweet['user_id'])
    dictTweets.setdefault('text', []).append(tweet['text'])
    dictTweets.setdefault('created_at', []).append(tweet['created_at'])
    dictTweets.setdefault('id_str', []).append(tweet['id_str'])

gtable = gl.SFrame(dictTweets)
gtable.save('./tweetTable')
print(items.shape)
print(items.head())

r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_train = pd.read_csv('ua.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('ua.test', sep='\t', names=r_cols, encoding='latin-1')
ratings_train.shape, ratings_test.shape

train_data = graphlab.SFrame(ratings_train)
test_data = graphlab.SFrame(ratings_test)

popularity_model = graphlab.popularity_recommender.create(train_data,
                                                          user_id='user_id',
                                                          item_id='movie_id',
                                                          target='rating')
popularity_recomm = popularity_model.recommend(users=[1, 2, 3, 4, 5], k=5)
popularity_recomm.print_rows(num_rows=25)

# Training the model for cosine similarity
# (the original snippet is cut off mid-call here; the trailing arguments are
# assumed from the comment above)
item_sim_model_cosine = graphlab.item_similarity_recommender.create(
    train_data,
    user_id='user_id',
    item_id='movie_id',
    target='rating',
    similarity_type='cosine')
# coding: utf-8

# In[3]:

import graphlab

# In[4]:

sales = graphlab.SFrame('home_data.gl/')

# In[5]:

sales

# # Exploring the data for house sales

# In[7]:

graphlab.canvas.set_target("ipynb")
sales.show(view="Scatter Plot", x="sqft_living", y="price")

# # Create a simple regression model

# In[8]:

train_data, test_data = sales.random_split(.8, seed=0)

# # Build regression model

# In[9]:
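# The In[9] cell is cut off above; a minimal sketch of the regression step,
# assuming the course's 'sqft_living' feature and 'price' target (these names
# are not shown in the original cell).
sqft_model = graphlab.linear_regression.create(train_data,
                                               target='price',
                                               features=['sqft_living'],
                                               validation_set=None)
print(test_data['price'].mean())
print(sqft_model.evaluate(test_data))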
#!/usr/bin/env python2.7

import sys
sys.path.append("..")

import graphlab
import matplotlib.pyplot as plt

from regression import polynomial_sframe
from regression import get_residual_sum_of_squares

# ----------------------------------------------
# Polynomial regression, revisited
# ----------------------------------------------
print("*** Polynomial regression, revisited")

sales = graphlab.SFrame('kc_house_data.gl/')
sales = sales.sort(['sqft_living', 'price'])

training, testing = sales.random_split(.9, seed=1)
l2_small_penalty = 1e-5

# ----------------------------------------------
# Observe overfitting
# ----------------------------------------------
print("*** Observe overfitting")

def plot_data(data):
    plt.plot(data['X1'], data['Y'], 'k.')
    plt.xlabel('x')
    plt.ylabel('y')

def polynomial_features(data, deg):
    data_copy = data.copy()
distances = {'JSD': JSD}
out_file = d + 'log_knn'
with open(out_file, 'a') as fout:
    for k in k_range:
        print 'K=%s' % k
        # comparison to previous model
        ddir = d + 'knn_%s_%s_%s' % (distance, N, k)
        if os.path.exists(ddir):
            topN = gl.SArray(ddir)
        else:
            features = np.load(d + 'features_' + str(k) + '.npy')
            if distance in ('cosine',):
                features = gl.SFrame(features)
                model = gl.nearest_neighbors.create(dataset=features,
                                                    distance=distance)
                result = model.query(features, k=N + 1)
                topN = (result[['query_label', 'reference_label', 'rank']]
                        .unstack(('rank', 'reference_label'), new_column_name='knn')
                        .sort('query_label')
                        .apply(lambda row: [row['knn'][i] for i in xrange(1, N + 2)
                                            if row['knn'][i] != row['query_label']]))
            else:
                #neigh = NearestNeighbors(n_neighbors=N+1, algorithm='ball_tree',
                #                         n_jobs=n_cores, metric='pyfunc', func=JSD)
                #time neigh.fit(features)
                #time topN = neigh.kneighbors(features[random_sample],
                #                             return_distance=False)[:, 1:]
                #topN = gl.SArray(topN)
                #%time topN = triple_apply_knn(features)
                start = time.time()
                all_dists = []
                for i, a in enumerate(random_sample):
                    print "running sample %s/%s\r" % (i + 1, sample_size)
""" from __future__ import print_function import graphlab as gl import graphlab.aggregate as agg import os os.chdir('c:/users/admin/documents/github/uow/uow_mlf/week 5/') # GL Setup pk = open('c:/users/admin/documents/github/gl_product_key.txt','r').read() gl.product_key.set_product_key(pk) gl.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS',4) # Load wikipedia people data sf = gl.SFrame('song_data.gl/') # Create users array users = sf['user_id'].unique() sf.head(4) sf['song'].show() lst_songs = sf['song'].unique() n_songs = len(lst_songs) # Split into train and test data without any thought to the class split...yeah. train_data,test_data=sf.random_split(0.8,seed=0) # Popularity model
import graphlab
import numpy as np
import matplotlib.pyplot as plt

sales = graphlab.SFrame('kc_house_data_small.gl/')

def get_numpy_data(data_sframe, features, output):
    data_sframe['constant'] = 1  # add a constant column to an SFrame
    # prepend variable 'constant' to the features list
    features = ['constant'] + features
    # select the columns of data_sframe given by the 'features' list into the
    # SFrame 'features_sframe'
    features_sframe = data_sframe[features]
    # this will convert the features_sframe into a numpy matrix with GraphLab Create >= 1.7!!
    features_matrix = features_sframe.to_numpy()
    # assign the column of data_sframe associated with the target to the
    # variable 'output_sarray'
    output_sarray = data_sframe[output]
    # this will convert the SArray into a numpy array: GraphLab Create >= 1.7!!
    output_array = output_sarray.to_numpy()
    return (features_matrix, output_array)

def normalize_features(feature_matrix):
    norms = np.linalg.norm(feature_matrix, axis=0)
    return (feature_matrix / norms, norms)

(train_and_validation, test) = sales.random_split(.8, seed=1)
(train, validation) = train_and_validation.random_split(.8, seed=1)

feature_list = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
# Getting Started with SFrames

import graphlab

sf = graphlab.SFrame('people-example.csv')

sf  # we can view first few lines of table
sf.tail()  # view end of the table

# .show() visualizes any data structure in GraphLab Create
sf.show()

# If you want Canvas visualization to show up on this notebook,
# rather than popping up a new window, add this line:
graphlab.canvas.set_target('ipynb')

sf['age'].show(view='Categorical')

# Inspect columns of dataset
sf['Country']
# dtype: str
# Rows: 7
# ['United States', 'Canada', 'England', 'USA', 'Poland', 'United States', 'Switzerland']

sf['age']
# dtype: int
# Rows: 7
# [24, 23, 22, 23, 23, 22, 25]

# Some simple columnar operations
sf['age'].mean()
sf['age'].max()

# Create new columns in our SFrame
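# A hedged continuation of the cell above (the original is cut off): one common
# way to add a derived column. 'First Name' and 'Last Name' are assumed to be
# columns of people-example.csv and are not confirmed by the original snippet.
sf['Full Name'] = sf['First Name'] + ' ' + sf['Last Name']
sf['age'] * 2  # columnar arithmetic returns a new SArray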
# (the start of this cell is cut off in the original; the read_csv call into
# edegsData is reconstructed here)
edegsData = gl.SFrame.read_csv(os.path.join(
    dataFolder, 't_base_weibo_user_fri_part000_9G.csv'), header=False)
edegsData.rename({'X1': 'src', 'X2': 'dst'})

# In[12]:

# create graph
G = gl.SGraph()
G = G.add_edges(edges=edegsData, src_field='src', dst_field='dst')
print G.summary()

# In[17]:

# pagerank
pagerank_v = gl.pagerank.create(G, verbose=False)
print pagerank_v['training_time']

# In[18]:

node_pageValue = pagerank_v['pagerank']
node_pageValue.save(resultFolder + '/nodes_pagerank.csv', format='csv')

# In[9]:

sf = gl.SFrame()
sf2 = gl.SFrame([3, 5, 7, 8, 6, 9])
sf = sf2.append(sf)
sf

# In[ ]:
import graphlab
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
try:
    import seaborn
except ImportError:
    pass

from distutils.version import StrictVersion
assert (StrictVersion(graphlab.version) >= StrictVersion('1.8.5')), \
    'GraphLab Create must be version 1.8.5 or later.'

wiki = graphlab.SFrame('C:\\Machine_Learning\\Cluster_wk2_1\\people_wiki.gl\\')
wiki

wiki['word_count'] = graphlab.text_analytics.count_words(wiki['text'])

model = graphlab.nearest_neighbors.create(wiki,
                                          label='name',
                                          features=['word_count'],
                                          method='brute_force',
                                          distance='euclidean')
bo = model.query(wiki[wiki['name'] == 'Barack Obama'], label='name', k=10)

def top_words(name):
    """
    Get a table of the most frequent words in the given person's wikipedia page.
    """
    row = wiki[wiki['name'] == name]
    word_count_table = row[['word_count']].stack('word_count',
                                                 new_column_name=['word', 'count'])
def create(observation_data,
           user_id='user_id',
           item_id='item_id',
           target=None,
           user_data=None,
           item_data=None,
           num_factors=32,
           regularization=1e-9,
           linear_regularization=1e-9,
           side_data_factorization=True,
           ranking_regularization=0.25,
           unobserved_rating_value=None,
           num_sampled_negative_examples=4,
           max_iterations=25,
           sgd_step_size=0,
           random_seed=0,
           binary_target=False,
           solver='auto',
           verbose=True,
           **kwargs):
    """Create a RankingFactorizationRecommender that learns latent factors for
    each user and item and uses them to make rating predictions.

    Parameters
    ----------
    observation_data : SFrame
        The dataset to use for training the model. It must contain a column of
        user ids and a column of item ids. Each row represents an observed
        interaction between the user and the item. The (user, item) pairs are
        stored with the model so that they can later be excluded from
        recommendations if desired. It can optionally contain a target ratings
        column. All other columns are interpreted by the underlying model as
        side features for the observations.

        The user id and item id columns must be of type 'int' or 'str'. The
        target column must be of type 'int' or 'float'.

    user_id : string, optional
        The name of the column in `observation_data` that corresponds to the
        user id.

    item_id : string, optional
        The name of the column in `observation_data` that corresponds to the
        item id.

    target : string, optional
        The `observation_data` can optionally contain a column of scores
        representing ratings given by the users. If present, the name of this
        column may be specified as `target`.

    user_data : SFrame, optional
        Side information for the users. This SFrame must have a column with
        the same name as what is specified by the `user_id` input parameter.
        `user_data` can provide any amount of additional user-specific
        information.

    item_data : SFrame, optional
        Side information for the items. This SFrame must have a column with
        the same name as what is specified by the `item_id` input parameter.
        `item_data` can provide any amount of additional item-specific
        information.

    num_factors : int, optional
        Number of latent factors.

    regularization : float, optional
        L2 regularization for interaction terms. Default: 1e-9; a typical
        range for this parameter is between 1e-12 and 1. Setting this to 0 may
        cause numerical issues.

    linear_regularization : float, optional
        L2 regularization for the linear term. Default: 1e-9; a typical range
        for this parameter is between 1e-12 and 1. Setting this to 0 may cause
        numerical issues.

    side_data_factorization : boolean, optional
        Use factorization for modeling any additional features beyond the user
        and item columns. If True, and side features or any additional columns
        are present, then a Factorization Machine model is trained. Otherwise,
        only the linear terms are fit to these features. See
        :class:`graphlab.recommender.ranking_factorization_recommender.RankingFactorizationRecommender`
        for more information. Default: True.

    ranking_regularization : float, optional
        Penalize the predicted value of user-item pairs not in the training
        set. Larger values increase this penalization. Suggested values: 0,
        0.1, 0.5, 1. NOTE: if no target column is present, this parameter is
        ignored.

    unobserved_rating_value : float, optional
        Penalize unobserved items with a larger predicted score than this
        value. By default, the estimated 5% quantile is used
        (mean - 1.96*std_dev).
    num_sampled_negative_examples : integer, optional
        For each (user, item) pair in the data, the ranking sgd solver
        evaluates this many randomly chosen unseen items for the negative
        example step. Increasing this can give better performance at the
        expense of speed, particularly when the number of items is large.
        Default is 4.

    binary_target : boolean, optional
        Assume the target column is composed of 0's and 1's. If True, use
        logistic loss to fit the model.

    max_iterations : int, optional
        The training algorithm will make at most this many iterations through
        the observed data. Default: 25.

    sgd_step_size : float, optional
        Step size for stochastic gradient descent. Smaller values generally
        lead to more accurate models that take more time to train. The default
        setting of 0 means that the step size is chosen by trying several
        options on a small subset of the data.

    random_seed : int, optional
        The random seed used to choose the initial starting point for model
        training. Note that some randomness in the training is unavoidable, so
        models trained with the same random seed may still differ. Default: 0.

    solver : string, optional
        Name of the solver to be used to solve the regression. See the
        references for more detail on each solver. The available solvers for
        this model are:

        - *auto (default)*: automatically chooses the best solver for the data
          and model parameters.
        - *ials*: Implicit Alternating Least Squares [1].
        - *adagrad*: Adaptive Gradient Stochastic Gradient Descent.
        - *sgd*: Stochastic Gradient Descent.

    verbose : bool, optional
        Enables verbose output.

    kwargs : optional
        Optional advanced keyword arguments passed in to the model
        optimization procedure. These parameters do not typically need to be
        changed. They, along with their default values, are given by
        get_default_options().

    Examples
    --------
    **Basic usage**

    When given just user and item pairs, one can create a
    RankingFactorizationRecommender as follows.

    >>> sf = graphlab.SFrame({'user_id': ["0", "0", "0", "1", "1", "2", "2", "2"],
    ...                       'item_id': ["a", "b", "c", "a", "b", "b", "c", "d"]})
    >>> from graphlab.recommender import ranking_factorization_recommender
    >>> m1 = ranking_factorization_recommender.create(sf)

    When a target column is present, one can include this to try and recommend
    items that are rated highly.

    >>> sf = graphlab.SFrame({'user_id': ["0", "0", "0", "1", "1", "2", "2", "2"],
    ...                       'item_id': ["a", "b", "c", "a", "b", "b", "c", "d"],
    ...                       'rating': [1, 3, 2, 5, 4, 1, 4, 3]})
    >>> m1 = ranking_factorization_recommender.create(sf, target='rating')

    **Including side features**

    >>> user_info = graphlab.SFrame({'user_id': ["0", "1", "2"],
    ...                              'name': ["Alice", "Bob", "Charlie"],
    ...                              'numeric_feature': [0.1, 12, 22]})
    >>> item_info = graphlab.SFrame({'item_id': ["a", "b", "c", "d"],
    ...                              'name': ["item1", "item2", "item3", "item4"],
    ...                              'dict_feature': [{'a': 23}, {'a': 13},
    ...                                               {'b': 1},
    ...                                               {'a': 23, 'b': 32}]})
    >>> m2 = ranking_factorization_recommender.create(sf, target='rating',
    ...                                               user_data=user_info,
    ...                                               item_data=item_info)

    **Customizing ranking regularization**

    Create a model that pushes predicted ratings of unobserved user-item pairs
    toward 1 or below.

    >>> m3 = ranking_factorization_recommender.create(sf, target='rating',
    ...                                               ranking_regularization=0.1,
    ...                                               unobserved_rating_value=1)

    **Using the implicit alternating least squares model**

    Ranking factorization also implements implicit alternating least squares
    [1] as an alternative solver. This is enabled using ``solver = 'ials'``.
    >>> m3 = ranking_factorization_recommender.create(sf, target='rating',
    ...                                               solver='ials')

    See Also
    --------
    :class:`graphlab.recommender.factorization_recommender.FactorizationRecommender`,
    :class:`graphlab.recommender.ranking_factorization_recommender.RankingFactorizationRecommender`

    References
    ----------
    [1] Hu, Y.; Koren, Y.; Volinsky, C. Collaborative Filtering for Implicit
        Feedback Datasets. IEEE International Conference on Data Mining
        (ICDM 2008), IEEE (2008).
    """
    _mt._get_metric_tracker().track(
        'toolkit.recsys.ranking_factorization_recommender.create')

    method = 'ranking_factorization_recommender'

    opts = {'model_name': method}
    response = _graphlab.toolkits._main.run("recsys_init", opts)
    model_proxy = response['model']

    if user_data is None:
        user_data = _graphlab.SFrame()
    if item_data is None:
        item_data = _graphlab.SFrame()
    if target is None:
        binary_target = True

    opts = {
        'dataset': observation_data,
        'user_id': user_id,
        'item_id': item_id,
        'target': target,
        'user_data': user_data,
        'item_data': item_data,
        'nearest_items': _graphlab.SFrame(),
        'model': model_proxy,
        'random_seed': random_seed,
        'num_factors': num_factors,
        'regularization': regularization,
        'linear_regularization': linear_regularization,
        'ranking_regularization': ranking_regularization,
        'binary_target': binary_target,
        'max_iterations': max_iterations,
        'side_data_factorization': side_data_factorization,
        'num_sampled_negative_examples': num_sampled_negative_examples,
        'solver': solver,
        # Has no effect here.
        # 'verbose': verbose,
        'sgd_step_size': sgd_step_size
    }

    if unobserved_rating_value is not None:
        opts["unobserved_rating_value"] = unobserved_rating_value

    if kwargs:
        try:
            possible_args = set(get_default_options()["name"])
        except (RuntimeError, KeyError):
            possible_args = set()

        bad_arguments = set(kwargs.keys()).difference(possible_args)
        if bad_arguments:
            raise TypeError("Bad Keyword Arguments: " + ', '.join(bad_arguments))

        opts.update(kwargs)

    response = _graphlab.toolkits._main.run('recsys_train', opts, verbose)

    return RankingFactorizationRecommender(response['model'])
# coding: utf-8

# # Week 6: Deep features for image classification & retrieval

# In[1]:

import graphlab as gl

# In[6]:

image_train = gl.SFrame('image_train_data/')

# In[28]:

image_test = gl.SFrame('image_test_data/')

# In[33]:

gl.canvas.set_target('ipynb')

# In[34]:

image_test[0:1]['image'].show()

# ## Creating category-specific image retrieval models

# In[19]:

image_train_bird = image_train[image_train['label'] == 'bird']

# In[20]:
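# A hedged sketch of the next cell (cut off above): a category-specific
# retrieval model built on the precomputed deep features. The 'deep_features'
# and 'id' column names are assumed from the course dataset and are not
# confirmed by the original notebook.
bird_model = gl.nearest_neighbors.create(image_train_bird,
                                         features=['deep_features'],
                                         label='id')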
print items.shape
items.head()

r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_base = pd.read_csv('ml-100k/ua.base', sep='\t', names=r_cols,
                           encoding='latin-1')
ratings_test = pd.read_csv('ml-100k/ua.test', sep='\t', names=r_cols,
                           encoding='latin-1')
ratings_base.shape, ratings_test.shape

import graphlab
train_data = graphlab.SFrame(ratings_base)
test_data = graphlab.SFrame(ratings_test)

popularity_model = graphlab.popularity_recommender.create(train_data,
                                                          user_id='user_id',
                                                          item_id='movie_id',
                                                          target='rating')

# Get recommendations for the first 5 users and print them.
# users = range(1, 6) specifies the user IDs of the first 5 users.
# k=5 asks for the top 5 recommendations per user.
popularity_recomm = popularity_model.recommend(users=range(1, 6), k=5)
popularity_recomm.print_rows(num_rows=25)

ratings_base.groupby(by='movie_id')['rating'].mean().sort_values(
    ascending=False).head(20)
# Download data if you haven't already
import graphlab as gl
import os

'''
"Topic models" are a class of statistical models for text data. These models
typically assume documents can be described by a small set of topics, and
there is a probability of any word occurring for a given "topic".
'''

if os.path.exists('wikipedia_w0'):
    docs = gl.SFrame('wikipedia_w0')
else:
    docs = gl.SFrame.read_csv('https://static.turi.com/datasets/wikipedia/raw/w0.csv',
                              header=False)
    docs.save('wikipedia_w0')

# Remove stopwords and convert to bag of words
docs = gl.text_analytics.count_words(docs['X1'])
docs = docs.dict_trim_by_keys(gl.text_analytics.stopwords(), exclude=True)

# Learn topic model
model = gl.topic_model.create(docs)

'''
You may get details on a subset of topics by supplying a list of topic names
or topic indices, as well as restrict the number of words returned per topic.
'''
print model.get_topics()

'''
If we just want to see the top words per topic, this code snippet will
rearrange the above SFrame to do that.
'''
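# A hedged sketch of the promised rearrangement (the original snippet is not
# shown): group the get_topics() output by topic and collect each topic's top
# words, assuming the default 'topic', 'word' and 'score' columns.
topics = model.get_topics(num_words=10)
top_words_per_topic = topics.groupby(
    key_columns='topic',
    operations={'top_words': gl.aggregate.CONCAT('word')})
print top_words_per_topic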
""" from __future__ import print_function import graphlab as gl import graphlab.aggregate as agg import os os.chdir('c:/users/admin/documents/github/uow/uow_mlf/week 4/') # GL Setup pk = open('c:/users/admin/documents/github/gl_product_key.txt','r').read() gl.product_key.set_product_key(pk) gl.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS',4) # Load wikipedia people data sf = gl.SFrame('people_wiki.gl/') # compute TF-IDF (term frequency / inverse document frequency) for the corpus (body of works) sf['word_count'] = gl.text_analytics.count_words(sf['text']) # Note: tfidf --> 0 for common words, and --> 1 for rare words tfidf = gl.text_analytics.tf_idf(sf['word_count']) # SFrame one-stop-shop sf['tfidf'] = tfidf ##======= Question 1 =======## # For Elton John, create word count and tfidf table and sorting on value descending elton = sf[sf['name'] == 'Elton John'] elton_wc_tbl = elton[['word_count']].stack('word_count',new_column_name=['word','count']).sort(['count'],ascending=False) elton_tfidf_tbl = elton[['tfidf']].stack('tfidf',new_column_name=['word','tfidf']).sort(['tfidf'],ascending=False)
df2 = df[1]
df3 = df1.append(df2, ignore_index=True)
print '{}{}{}{}{}{}'.format("BIDROW length ", len(df1), ' DF length ',
                            len(df2), ' DF3 ', len(df3))
print "Creating CSV and Updating..."
df3.to_csv(BASE_DATA_PATH + '/trainingset3.csv', sep=',', encoding='utf-8')
print("CSV Done !!!")

#########################################################################################################
# Quick analysis using the GraphLab classification library
# Getting an amazing result of 99.9
#########################################################################################################

# Shuffle the data
# https://dato.com/products/create/docs/generated/graphlab.toolkits.cross_validation.shuffle.html
data = gl.SFrame('/Users/abhishekchoudhary/Work/python/facebook/trainingset3.csv')
data = gl.cross_validation.shuffle(data)
folds = gl.cross_validation.KFold(data, 5)
for train, valid in folds:
    m = gl.boosted_trees_classifier.create(train, target='label')
    print m.evaluate(valid)

# Get the i'th fold
(train, test) = folds[4]

# Do a quick classification analysis on the dataset
# https://dato.com/products/create/docs/graphlab.toolkits.classifier.html
model = gl.classifier.create(train,
                             target='outcome',
                             features=['bidder_id', 'auction', 'merchandise',
                                       'device', 'time', 'country', 'ip', 'url'])

# https://dato.com/products/create/docs/generated/graphlab.toolkits.cross_validation.KFold.html#graphlab.toolkits.cross_validation.KFold
# After above K-fold
# https://dato.com/products/create/docs/generated/graphlab.boosted_trees_classifier.create.html
# -*- coding: utf-8 -*-
# @Author: GaNeShKuMaRm
# @Date:   2017-02-25 20:38:47
# @Last Modified by:   GaNeShKuMaRm
# @Last Modified time: 2017-02-25 20:51:27

import graphlab

baseURL = "Dataset_SFrame/image_data_with_deepfeatures/df-"
flag = True
for i in range(1, 4):
    temp_data = graphlab.SFrame(baseURL + str(i))
    if flag:
        image_data = temp_data
        flag = False
    else:
        image_data = image_data.append(temp_data)

image_data.save('Dataset_SFrame/image_data')
print "Completed!"
import sys

# gl.set_runtime_config("GRAPHLAB_CACHE_FILE_LOCATIONS", os.path.expanduser("~/data/tmp/"))

model_path = "/data/hoytak/diabetic/models/models"

train_sf = []
test_sf = []
feature_names = []
each_sf_feature_names = []

# for n in [0, "1b", '2b', 4]:
for n in [0, 1, "1b", 2, '2b', 3, 4]:
    try:
        print "Loading %s" % str(n)

        Xf_train = gl.SFrame(model_path + "/scores_train_%s" % str(n))
        Xf_test = gl.SFrame(model_path + "/scores_test_%s" % str(n))

        sf_feature_names = []

        for fn in Xf_train.column_names():
            if fn.startswith("scores"):
                key = fn
                idx = 0
                while key in feature_names:
                    key = fn + ".%d" % idx
                    idx += 1

                if key != fn:
import graphlab as gl

# Load the data
##data = gl.SFrame('https://static.turi.com/datasets/regression/yelp-data.csv')
data = gl.SFrame('/Users/wei/code/pycharm/yelp-data.csv')

# Restaurants with rating >= 3 are good
data['is_good'] = data['stars'] >= 3

# Make a train-test split
train_data, test_data = data.random_split(0.8)

# Automatically picks the right model based on your data.
model = gl.classifier.create(train_data,
                             target='is_good',
                             features=[
                                 'user_avg_stars', 'business_avg_stars',
                                 'user_review_count', 'business_review_count'
                             ])

# Generate predictions (class/probabilities etc.), contained in an SFrame.
predictions = model.classify(test_data)

# Evaluate the model, with the results stored in a dictionary.
results = model.evaluate(test_data)
import graphlab

sales = graphlab.SFrame('D:\Tasks\projects\python\casestudy\course1\home_data.gl/')
train_data, test_data = sales.random_split(.8, seed=0)

# linear_regression.create requires a target column; 'price' with the
# 'sqft_living' feature is assumed here, since the model is named sqft_model.
sqft_model = graphlab.linear_regression.create(train_data,
                                               target='price',
                                               features=['sqft_living'])
print(train_data.head(5))
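# A hedged follow-up (not in the original script): evaluate the fitted model on
# the held-out split and predict the price of the first test house.
print(sqft_model.evaluate(test_data))
print(sqft_model.predict(test_data[0:1]))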
#a = dict(a, **z)
try:
    del i['_id']
except:
    pass
z.append(i)

df = pd.DataFrame(data=z, dtype=object)
df['price'] = df['price'].astype(str)
df['name'] = df['name'].astype(str)
df['type'] = df['type'].astype(str)
df['description'] = df['description'].astype(str)
print(df.dtypes)

users = db.users.find({})
order = db.orders
user = []

menu = graphlab.SFrame(df)
menu['word_count'] = graphlab.text_analytics.count_words(menu['description'])
tfidf = graphlab.text_analytics.tf_idf(menu['word_count'])
menu['tfidf'] = tfidf

knn_model = graphlab.nearest_neighbors.create(menu, features=['tfidf'],
                                              label='name')

print type(users)
for i in users:
    user = i['email']
    print(user)
    orders = order.find({'email': user})
    items = []
    for i in orders:
        items += i['orders']
    print type(items)