def main():
    # create csvs to train models
    _load_data()
    is_df = gl.SFrame.read_csv(IS_DF_PATH)
    nmf_df = gl.SFrame.read_csv(NMF_DF_PATH)
    pop_df = gl.SFrame.read_csv(POP_DF_PATH)
    item_df = gl.SFrame(get.item_data(BEERS_PATH, BREWERS_PATH))

    # list of beers to exclude from recs, by user
    exclude_beers = pd.read_csv(FTISO_PATH)[['user_id', 'beer_id']]
    exclude_beers.columns = ['user_id', 'item_id']
    exclude_beers = gl.SFrame(exclude_beers)

    # build & save models
    build.is_model(is_df, IS_MODEL_PATH)
    build.nmf_model(nmf_df, item_df, NMF_MODEL_PATH)
    build.pop_model(pop_df, POP_MODEL_PATH)

    # load all models
    is_model = gl.load_model(IS_MODEL_PATH)
    nmf_model = gl.load_model(NMF_MODEL_PATH)
    pop_model = gl.load_model(POP_MODEL_PATH)

    # get recommendations & export as json
    users = pd.read_csv(USERS_PATH)['id'].to_frame(name='id')
    users = gl.SFrame(users)
    is_recs = is_model.recommend(users=users['id'], exclude=exclude_beers, diversity=3)
    nmf_recs = nmf_model.recommend(users=users['id'], exclude=exclude_beers, diversity=3)
    pop_recs = pop_model.recommend(users=users['id'], exclude=exclude_beers, diversity=3)

    # save recommendations
    is_recs.export_json(IS_REC_PATH, orient='records')
    nmf_recs.export_json(NMF_REC_PATH, orient='records')
    pop_recs.export_json(POP_REC_PATH, orient='records')
def load_models():
    popularity_model_path = os.path.join(MODEL_DIR, 'popularity', 'popularity_model')
    item_model_path = os.path.join(MODEL_DIR, 'item', 'item_model')
    popularity_model = graphlab.load_model(popularity_model_path)
    item_model = graphlab.load_model(item_model_path)
    user_pkl_path = os.path.join(MODEL_DIR, 'user.pkl')
    item_pkl_path = os.path.join(MODEL_DIR, 'item.pkl')
    users = pd.read_pickle(user_pkl_path)
    items = pd.read_pickle(item_pkl_path)
    return popularity_model, item_model, users, items
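
# A minimal usage sketch for load_models() above (not from the original source):
# it assumes the two saved models are GraphLab Create recommenders and that the
# unpickled `users` DataFrame has a 'user_id' column.
def _demo_load_models_usage():
    popularity_model, item_model, users, items = load_models()
    # ask the item-based model for 5 recommendations for the first few users
    recs = item_model.recommend(users=list(users['user_id'])[:10], k=5)
    print recs.head()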
def test_models():
    '''
    INPUT: None
    DESCRIPTION: Tests the model
    OUTPUT: None
    '''
    list_methods = ['factorization_recommender',
                    'factorization_recommender',
                    'ranking_factorization_recommender']
    model = Model(model=gl.load_model(fp + 'item_similarity_recommender'))
    print model.sample_recommendation(20, 10)
    for model_name in list_methods:
        model.model = gl.load_model(model_name)
        print model.sample_recommendation(20, 10)
def _get_gl_object_from_persistent_id(type_tag, gl_archive_abs_path):
    """
    Internal util to get a GLC object from a persistent ID in the pickle file.

    Parameters
    ----------
    type_tag : The name of the glc class as saved in the GLC pickler.

    gl_archive_abs_path: An absolute path to the GLC archive where the object
                         was saved.

    Returns
    ----------
    The GLC object.
    """
    if type_tag == "SFrame":
        obj = _gl.SFrame(gl_archive_abs_path)
    elif type_tag == "SGraph":
        obj = _gl.load_graph(gl_archive_abs_path)
    elif type_tag == "SArray":
        obj = _gl.SArray(gl_archive_abs_path)
    elif type_tag == "Model":
        obj = _gl.load_model(gl_archive_abs_path)
    else:
        raise _pickle.UnpicklingError(
            "GraphLab pickling Error: Unsupported object."
            " Only SFrames, SGraphs, SArrays, and Models are supported.")
    return obj
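
# Hedged sketch (not part of the original file) of where a hook like
# _get_gl_object_from_persistent_id typically plugs in: Python's pickle protocol
# lets an Unpickler resolve persistent IDs via persistent_load. The layout of the
# persistent id as a (type_tag, archive_path) pair is an assumption here.
import pickle as _pickle_std

class _GLUnpicklerSketch(_pickle_std.Unpickler):
    def persistent_load(self, pid):
        # pid is whatever the matching pickler stored as the persistent id
        type_tag, gl_archive_abs_path = pid
        return _get_gl_object_from_persistent_id(type_tag, gl_archive_abs_path)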
def test_exception(self):
    # load model from empty file
    with util.TempDirectory() as tmp_empty_file:
        with self.assertRaises(IOError):
            gl.load_model(tmp_empty_file)

    # load model from non-existing file
    if (os.path.exists('./tmp_model-%d' % temp_number)):
        shutil.rmtree('./tmp_model-%d' % temp_number)
    with self.assertRaises(IOError):
        gl.load_model('./tmp_model-%d' % temp_number)

    # save model to invalid url
    for url in ['http://test', '/root/tmp/testmodel']:
        with self.assertRaises(IOError):
            self.pr_model.save(url)
def evaluate_model(self, dataset):
    from graphlab import load_model

    model = None
    for folder in dataset.folders:
        model_file = self._get_model_file(dataset, folder)
        user_evaluation_file = self._get_evaluation_file(dataset, folder, evaluation_type='user')
        item_evaluation_file = self._get_evaluation_file(dataset, folder, evaluation_type='item')
        user = item = False
        if os.path.exists(user_evaluation_file):
            user = True
            print 'RecommendationModel ' + self.id + ' already evaluated by user in folder ' + folder.id + '.'
        if os.path.exists(item_evaluation_file):
            item = True
            print 'RecommendationModel ' + self.id + ' already evaluated by item in folder ' + folder.id + '.'
        if user and item:
            continue
        model = load_model(location=model_file)
        evaluation = model.evaluate(dataset=folder.train_sframe, metric='rmse')
        if not user:
            evaluation['rmse_by_user'].save(user_evaluation_file)
        if not item:
            evaluation['rmse_by_item'].save(item_evaluation_file)
def predict_options(options):
    """
    Run predictions on potential options
    :param options: array of dictionary, expected format [{"user": __, "content_id": __}]
    :return: an array with predicted scores for each option; None if invalid
    """
    # TODO - Need to format option in a way that makes sense for the predictor
    if os.path.exists(MODEL_LOCATION):
        model = gl.load_model(MODEL_LOCATION)
    else:
        logger.warn("couldn't load model, re-training", exc_info=True)
        model = train()

    if "user" in options[0] and "content_id" in options[0]:
        temp_users = []
        temp_content = []
        for option in options:
            temp_users.append(option["user"])
            temp_content.append(option["content_id"])
        users = gl.SArray(temp_users)
        content = gl.SArray(temp_content)
        frame = gl.SFrame({"user": users, "content_id": content}, format="dict")
        prediction = model.predict(frame)
        logger.info("prediction: %s", prediction)
    else:
        logger.error("options not in the correct format, expected key 'user' and key 'content_id'")
        prediction = None

    if prediction is None:
        return None
    return list(prediction)
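
# Illustrative call for predict_options(); the user and content_id values below
# are placeholders, not data from the original project. Each option dict needs
# 'user' and 'content_id' keys.
def _demo_predict_options():
    scores = predict_options([{"user": "alice", "content_id": 42},
                              {"user": "bob", "content_id": 17}])
    print scores  # list of predicted scores, or None if the keys were missing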
def __load_data_structure__(self, filepath):
    """Return data structure if can be loaded, otherwise returns None and logs warning"""
    # try to load different supported types, since don't know what type just try all and swallow exceptions
    obj = None
    try:
        obj = _gl.load_sframe(filepath)
        return obj
    except:
        pass

    try:
        obj = _gl.load_sgraph(filepath)
        return obj
    except:
        pass

    try:
        obj = _gl.load_model(filepath)
        return obj
    except:
        pass

    try:
        obj = _gl.SArray(data=filepath)
        return obj
    except:
        pass

    __LOGGER__.debug("Unable to load dependency, unsupported type at path: %s" % filepath)
    return None
def predict(modelname, jsonfile):
    filename, file_extension = os.path.splitext(modelname)
    fileList = filename.split('_')
    zipname = modelname + '.zip'
    download("models", zipname)
    zip_ref = zipfile.ZipFile(zipname, 'r')
    zip_ref.extractall(modelname)
    zip_ref.close()

    # predict
    # reqUserList = {"userList": ["A3SGXH7AUHU8GW"]}
    with open(jsonfile) as data_file:
        userDict = json.load(data_file)
    userList = userDict['userList']
    resultList = {}
    if os.path.exists(modelname):
        model = gl.load_model(modelname)
        recommendedItemList = model.recommend(users=userList)
        for user in userList:
            outRowList = recommendedItemList[recommendedItemList['user_id'] == user]
            resultList[user] = list(outRowList['item_id'])
        print resultList
        return json.dumps(resultList)
    else:
        raise Exception('model does not exist.')
    return
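
# Sketch of how predict() might be driven end to end, assuming a JSON request
# file shaped like the commented-out example inside predict(); the model and
# file names below are placeholders.
def _demo_predict():
    with open('request.json', 'w') as f:
        json.dump({"userList": ["A3SGXH7AUHU8GW"]}, f)
    print predict('recommender_model_v1', 'request.json')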
def __init__(self, env, create_model=False, data_set=None):
    self._env = env
    self._model = None
    if not create_model:
        self._model = gl.load_model('data/FMmodel')
        print "FM policy model loaded."
def recommend():
    # get user input
    input_boxes = ['beer_input1_id', 'beer_input2_id', 'beer_input3_id']
    user_input = []
    for box in input_boxes:
        submission = request.form[box]
        if submission != '':
            submission = int(submission)
            user_input.append(submission)
    user_input = pd.DataFrame({'user_id': [20000 for _ in user_input],
                               'item_id': user_input})

    # identify user-favored styles for additional filtering
    beers = pd.read_csv('../data/input/beers.csv')
    user_styles = user_input.merge(beers, left_on='item_id', right_on='id')['style']
    items = beers[beers['style'].isin(list(user_styles))]
    items = gl.SFrame(items)['id']

    # load model and generate recommendations
    model = gl.load_model('../models/item_similarity_model')
    user_input = gl.SFrame(user_input)
    pred = list(model.recommend(users=[20000], items=items, k=5,
                                new_observation_data=user_input, diversity=3)['item_id'])

    # format recommendations for output
    beer_recs = beers[beers['id'].isin(pred)]
    beer_recs = beer_recs[['name', 'brewery_name', 'style', 'score']]
    beer_recs.columns = ['brew', 'brewery', 'style', 'untappd score']
    beer_recs = beer_recs.to_html(columns=['brew', 'brewery', 'style', 'untappd score'], index=False)
    beer_recs = beer_recs.replace('border="1" class="dataframe"', 'class=table table-hover')
    return render_template('index.html', recommend=True, beer_recs=beer_recs)
def __init__(self, features, model='auto', output_column_prefix=None):
    """
    Parameters
    ----------
    """
    _raise_error_if_not_of_type(features, [str, list, type(None)])
    _raise_error_if_not_of_type(model, [str, _NeuralNetClassifier])
    _raise_error_if_not_of_type(output_column_prefix, [str, type(None)])

    if isinstance(features, str):
        features = [features]

    # Set the model.
    self._state = {}
    self._state["features"] = features
    if not output_column_prefix:
        output_column_prefix = "deep_features"
    self._state["output_column_prefix"] = output_column_prefix
    self._state['model'] = model
    if self._state["model"] == 'auto':
        model_path = \
            "https://static.turi.com/products/graphlab-create/resources/models/python2.7/imagenet_model_iter45"
        import graphlab as gl
        self._state['model'] = gl.load_model(model_path)
    if type(self._state['model']) is not _NeuralNetClassifier:
        raise ValueError(
            "Model parameters must be of type NeuralNetClassifier " +
            "or string literal 'auto'")
def applyMF(path, model_num, limit=np.Inf):
    logger = logging.getLogger('signature.aMF')
    logger.info('starting applyMF')

    # get data
    r_file = path + 'yelp_reviews_test_predictions.json'
    testReviews = list()
    for counter, line in enumerate(open(r_file, 'r')):
        if not counter % 1000:
            logger.debug('%d reviews loaded' % counter)
        if counter > limit:
            break
        testReviews.append(json.loads(line.strip()))

    # load model
    model_path = path + 'regularModels/matrixFactorization_%d.model' % model_num
    model = graphlab.load_model(model_path)

    # run function
    reviewsPrediction = applyMatrixFactorization(testReviews, model)

    # save result
    outfile = open(path + 'yelp_reviews_test_predictions.json', 'w')
    for review in reviewsPrediction:
        outfile.write(json.dumps(review).encode('utf8', 'ignore') + '\n')
    outfile.close()
def get_topics():
    lda = gl.load_model('../lda_25topics')
    topics = lda.get_topics(output_type='topic_words')
    topic_names = []
    for topic in topics:
        topic_names.append(' '.join(topic['words']))
    return topic_names
def test_model(self, dataset, type_prediction='test'):
    from graphlab import load_model

    for folder in dataset.folders:
        prediction_file = self._get_prediction_file(dataset, folder, type_prediction)
        model_file = self._get_model_file(dataset, folder)
        if os.path.exists(prediction_file):
            print 'RecommendationModel ' + self.id + ' already tested in folder ' + folder.id + '.'
            continue
        elif not os.path.exists(model_file):
            print 'Impossible testing this model. It should be trained first.'
            return
        else:
            print 'Starting to test_model model ' + self.id + '.'
            model = load_model(location=model_file)
            if type_prediction == 'test':
                predictions = model.predict(dataset=folder.test_sframe)
            else:
                predictions = model.predict(dataset=folder.train_sframe)
            predictions.save(filename=prediction_file)
            print 'RecommendationModel ' + self.id + ' tested.'
def LoadModel(self, train, type, modelName, dir=None):
    myDir = os.getcwd() if dir is None else dir
    myModelName = myDir + "/" + modelName
    myModel = graphlab.load_model(myModelName)
    if train not in ['train', 'full', 'elite']:
        raise ValueError("train is not valid")
    if modelName not in ['ranking', 'factorization']:
        raise ValueError("bad modelName")
    if (train, modelName) == ('train', 'ranking'):
        self.trained_model['ranking'] = myModel
    elif (train, modelName) == ('train', 'factorization'):
        self.trained_model['factorization'] = myModel
    elif (train, modelName) == ('full', 'ranking'):
        self.full_model['ranking'] = myModel
    elif (train, modelName) == ('full', 'factorization'):
        self.full_model['factorization'] = myModel
    elif (train, modelName) == ('elite', 'ranking'):
        self.elite_model['ranking'] = myModel
    elif (train, modelName) == ('elite', 'factorization'):
        self.elite_model['factorization'] = myModel
    print("the model loaded from %s" % (myModelName))
def __init__(self, feature, model='auto', output_column_name=None):
    """
    Parameters
    ----------
    """
    _raise_error_if_not_of_type(feature, [str])
    _raise_error_if_not_of_type(model, [str, _NeuralNetClassifier])
    _raise_error_if_not_of_type(output_column_name, [str, _NoneType])

    # Set the model.
    self._state = {}
    self._state["features"] = feature
    if not output_column_name:
        self._state["output_column_name"] = "deep_features_%s" % feature
    else:
        self._state["output_column_name"] = output_column_name
    self._state['model'] = model
    if self._state["model"] == 'auto':
        model_path = \
            "http://s3.amazonaws.com/dato-datasets/deeplearning/imagenet_model_iter45"
        import graphlab as gl
        self._state['model'] = gl.load_model(model_path)
    if type(self._state['model']) is not _NeuralNetClassifier:
        raise ValueError("Model parameters must be of type NeuralNetClassifier " +
                         "or string literal 'auto'")
def _set_model(self, model):
    '''Extract supported methods from the model.

    Each model needs to implement a class method called _get_queryable_methods()
    that tells this Predictive Object whether it expects an SFrame, SArray or
    other type as input; the 'query' method of this class will automatically
    convert to the appropriate SFrame or SArray that is needed. A model method
    can also accept either an SArray or an SFrame; for example, the
    recommender.recommend() method could expect its first parameter 'users' to
    be either a list of users or an SFrame with more information regarding the
    users.

    For example, a recommender model would return the following information:

        {'predict': {
            'dataset': 'sframe',
            'new_observation_data': 'sframe',
            'new_user_data': 'sframe',
            'new_item_data': 'sframe'
            },
         'recommend': {
            'users': ['sframe', 'sarray'],
            'items': ['sframe', 'sarray'],
            'new_observation_data': 'sframe',
            'new_user_data': 'sframe',
            'new_item_data': 'sframe',
            'exclude': 'sframe'}
        }
    '''
    if is_path(model):
        # This is a path, download the file and load it
        model = graphlab.load_model(model)

    self.model = model
    self._model_methods = model._get_queryable_methods()
    if type(self._model_methods) != dict:
        raise RuntimeError("_get_queryable_methods for model %s should return a "
                           "dictionary" % model.__class__)

    for (method, description) in self._model_methods.iteritems():
        if type(description) != dict:
            raise RuntimeError("model %s _get_queryable_methods should use dict as "
                               "method description." % model.__class__)

        for (param_name, param_types) in description.iteritems():
            # support either "sframe", "sarray" or ["sframe", "sarray"]
            if not isinstance(param_types, list):
                param_types = [param_types]

            for param_type in param_types:
                if (param_type not in ['sframe', 'sarray']):
                    raise RuntimeError("model %s _get_queryable_methods should only use "
                                       "'sframe' or 'sarray' type. %s is not supported"
                                       % (model.__class__, param_type))

            description.update({param_name: param_types})

        self._model_methods.update({method: description})
def mostPopular(self, topk):
    items = self.items
    model = gl.load_model('models/Popular')
    reco = model.recommend_from_interactions(
        items[items['Score'] > 4].remove_column('UserId'),
        k=topk,
        items=items[items['Score'] > 2].select_column('ProductId'))
    return self.getData(reco)
def load_model(model):
    """
    input: model name in str format
    output: loaded model
    """
    loaded_model = graphlab.load_model(model)
    return loaded_model
def load_topic_model(model_path):
    logger_9988.info('load_topic_model {} begin ...'.format(model_path))
    global model_instance
    if not model_instance:
        model_instance = TopicModel()
    model_instance.version = os.path.split(model_path)[-1]
    model_instance.model = gl.load_model(model_path)
    logger_9988.info('load_topic_model finished!')
def RandomForestfunc():
    data = gl.SFrame.read_csv('Phase2_data/mergedFeaturesMod.csv')
    model = gl.load_model('Phase2_codes/Random_Forest_Model')
    predictions = model.predict(data)
    results = model.evaluate(data)
    print results
    predictions.save('Phase2_data/ItemsBought.csv', format='csv')
def load_models(models_dir):
    global g_channel_kmeans_model_dict, model_v
    import os
    model_v = os.path.split(models_dir)[1]
    if len(g_channel_kmeans_model_dict) != 0:
        g_channel_kmeans_model_dict.clear()
    models_files = os.listdir(models_dir)
    for mf in models_files:
        g_channel_kmeans_model_dict[mf] = gl.load_model(models_dir + '/' + mf)
def whatsTrending(self, topk):
    trends = self.trends[:5000]
    model = gl.load_model('models/Trending')
    reco = model.recommend_from_interactions(
        trends[trends['Score'] > 4][:10].remove_column('UserId'),
        k=topk,
        items=trends[trends['Score'] > 3][100:1100].select_column('ProductId'))
    return self.getData(reco)
def load_model(self, model, model_cols, user_col, item_col, listen_col=None):
    if type(model) == str:
        self.model = gl.load_model(model)
    else:
        self.model = model
    self.model_cols = model_cols
    self.user_col = user_col
    self.item_col = item_col
    self.listen_col = listen_col
def build_prediction_results(topic_count, model_file_name):
    """
    Writes the results from the model build phase into html files.
    Creates a file for each topic with its paragraphs and their probabilities,
    and a file for each topic with its belonging words.
    :param topic_count: number of topics
    :param model_file_name: the file name of the model for loading it
    :return: None
    """
    model = gl.load_model(model_file_name)
    root_results_dir = 'my-privacypolicy-thesis/results{}'.format(topic_count)
    if os.path.exists(root_results_dir):
        shutil.rmtree(root_results_dir)
    os.makedirs(root_results_dir)
    results_html_file = open(root_results_dir + "/results.html", "w+")
    results_html_file.write(
        "<html><table border='1'><tr><td>Topic Number</td><td>Words</td><td>Paragraphs</td></tr>")

    # Creates html file for each topic and its belonging words
    print('started phase 1 of build predictions results')
    paragraphs_html_list = []
    for i in range(topic_count):
        paragraphs_html_list.append("<html><table border='1'>")
        words_list = model.get_topics(num_words=20, output_type='topic_words')['words'][i]
        print_words_list = ', '.join(words_list)
        paragraphs_url = "<a href='./paragraphs_topic_{}.html'>paragraphs</a>".format(i)
        results_html_file.write(
            "<tr><td>{}</td><td>{}</td><td>{}</td></tr>".format(
                i, print_words_list, paragraphs_url))
        print('{} out of {}'.format(i + 1, topic_count))
    results_html_file.write("</table></html>")
    results_html_file.close()

    # Creating html files for each topic and its paragraphs and their probabilities
    print('started phase 2 of build predictions results')
    for topic_id in range(0, topic_count):
        results_records = db_utils.db_select(
            "select probability,paragraph from privacy_policy_paragraphs_prediction "
            "where running_id= {} and topic_id={} "
            "group by paragraph,probability "
            "order by probability desc".format(topic_count, topic_id))
        topic_html = "<html><table border='1'><tr><td>Probability</td><td>Paragraph</td></tr>"
        for results_record in results_records:
            topic_html += "<tr><td>{:.4f}</td><td>{}</td></tr>".format(
                results_record.get('probability'), results_record.get('paragraph'))
        topic_html += "</table></html>"
        paragraphs_html_file = open(
            root_results_dir + "/paragraphs_topic_{}.html".format(topic_id), "w+")
        paragraphs_html_file.write(topic_html)
        print('{} out of {}'.format(topic_id + 1, topic_count))
    print("done")
def comments_sentimenting(book_id):
    comments_data = graphlab.load_sframe('helper/coeffi_comments_data')
    sentiment_model = graphlab.load_model('helper/books_comments_sentiment_model')
    commentsFromABook = comments_data[comments_data['book_id'] == int(book_id)]
    commentsFromABook['predicted_sentiment'] = sentiment_model.predict(
        commentsFromABook, output_type='probability')
    # comments_data['predicted_sentiment'] = sentiment_model.predict(comments_data, output_type='probability')
    return commentsFromABook.sort('created_time', ascending=True)
def __init__(self, filePath='../../data/ranking_factorization_recommender', model=None):
    '''
    INPUT: String
    DESCRIPTION: Loads and saves the model given a filepath
    OUTPUT: None
    '''
    if model is None:
        self.model = gl.load_model(filePath)
    else:
        self.model = model
def recommend(userid=None):
    if request.method == 'POST':
        userid = request.args.get('userid')
        model = gl.load_model('../models/pickled_models/mf_model')
        recs = model.recommend(users=[str(userid)], k=5)
        perfume_id = [str(i) for i in recs['perfume_id']]
        rec_perfumes = list(collection.find(
            {'perfume_id': {'$in': perfume_id}},
            {'item_name': 1, 'brand': 1, 'gender': 1, 'note': 1,
             'tags': 1, 'theme': 1, '_id': 0}))
        return render_template('recommend.html', rec_perfumes=rec_perfumes)
def _load_graphlab_object(cls, obj_type, obj_path):
    if obj_type == 'model':
        return graphlab.load_model(obj_path)
    elif obj_type == 'sarray':
        return graphlab.SArray(obj_path)
    elif obj_type == 'sframe':
        return graphlab.load_sframe(obj_path)
    elif obj_type == 'sgraph':
        return graphlab.load_sgraph(obj_path)
    else:
        raise RuntimeError(str(obj_type) + ' is not supported')
def load_model(location):
    if not os.path.exists(location):
        raise IOError(location + ' does not exist')
    with open(location + "/data.json", "r") as f:
        data = json.load(f)
    lst = [gl.load_model(location + "/" + f)
           for f in os.listdir(location) if f != 'data.json']
    return Ensemble(lst, weights=data['weights'], vote_fn=data['vote_fn'])
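
# The loader above implies an on-disk layout roughly like the following (an
# assumed convention inferred from the code, not documented by it): one saved
# GraphLab model per subdirectory plus a data.json with the ensemble metadata.
#
#     my_ensemble/
#         data.json        # {"weights": [...], "vote_fn": ...}
#         model_0/         # each subdirectory is the output of model.save()
#         model_1/
#
# so loading is simply:
#
#     ensemble = load_model('my_ensemble')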
def test_exception(self):
    self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("/root/tmp"))
    self.assertRaises(IOError, lambda: glconnect.get_unity().__write__("/root/tmp", '.....'))
    self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("/root/tmp"))
    self.assertRaises(IOError, lambda: glconnect.get_unity().__write__("/root/tmp", '.....'))
    self.assertRaises(IOError, lambda: self.graph.save("/root/tmp.graph"))
    self.assertRaises(IOError, lambda: self.sframe.save("/root/tmp.frame_idx"))
    self.assertRaises(IOError, lambda: self.model.save("/root/tmp.model"))
    self.assertRaises(IOError, lambda: graphlab.load_graph("/root/tmp.graph"))
    self.assertRaises(IOError, lambda: graphlab.load_sframe("/root/tmp.frame_idx"))
    self.assertRaises(IOError, lambda: graphlab.load_model("/root/tmp.model"))
def popular(cur):
    import graphlab as gl
    model = gl.load_model("mymodel1")
    l = []
    nn = model.recommend_from_interactions([5000], k=10)
    for i in nn:
        rows = cur.execute("select name from anime where anime_id = (?)",
                           [i['anime_id']]).fetchone()
        l.append((rows[0], i['rank']))
    return l
def main():
    # load selected photos
    photos = graphlab.SFrame('photos_las_vegas_food_drinks.gl')

    # load AlexNet model pre-trained and provided by Dato
    alexnet_model = graphlab.load_model('imagenet_model_iter45.gl')

    # extract and save deep features of selected photos
    photos['deep_features'] = alexnet_model.extract_features(photos)
    photos.save('photos_deep_features.gl')
def main():
    # load and save photos in graphlab format
    photos = graphlab.image_analysis.load_images('photos_las_vegas_food_drinks')
    photos.save('photos_las_vegas_food_drinks.gl')

    # load and save AlexNet model pre-trained and provided by Dato
    deep_learning_model = graphlab.load_model(
        'http://s3.amazonaws.com/GraphLab-Datasets/deeplearning/imagenet_model_iter45')
    deep_learning_model.save('imagenet_model_iter45.gl')
def test_exception(self):
    self.assertRaises(ValueError, lambda: self._test_read_write_helper(self.tempfile, 'hello world'))
    self.assertRaises(ValueError, lambda: self._test_read_write_helper("local://" + self.tempfile + ".csv.gz", 'hello,world,woof'))
    self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("remote:///root/tmp"))
    self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("remote:///root/tmp"))
    self.assertRaises(IOError, lambda: glconnect.get_unity().__write__("remote:///root/tmp", '.....'))
    self.assertRaises(IOError, lambda: self.graph.save("remote:///root/tmp.graph"))
    self.assertRaises(IOError, lambda: self.sframe.save("remote:///root/tmp.frame_idx"))
    self.assertRaises(IOError, lambda: self.model.save("remote:///root/tmp.model"))
    self.assertRaises(IOError, lambda: graphlab.load_graph("remote:///root/tmp.graph"))
    self.assertRaises(IOError, lambda: graphlab.load_sframe("remote:///root/tmp.frame_idx"))
    self.assertRaises(IOError, lambda: graphlab.load_model("remote:///root/tmp.model"))
def fetch_mainmodel():
    '''
    INPUT:
        - None: None
    OUTPUT:
        - None: None

    Graphlab allows you to store your model, similar to storing a model as a
    pickle. This function will grab and load the chosen model.
    '''
    return gl.load_model('game_recommender')
def test_basic_save_load(self):
    # save and load the pagerank model
    with util.TempDirectory() as tmp_pr_model_file:
        self.pr_model.save(tmp_pr_model_file)
        pr_model2 = gl.load_model(tmp_pr_model_file)
        self.__assert_model_equals__(self.pr_model, pr_model2)

    # save and load the connected_component model
    with util.TempDirectory() as tmp_cc_model_file:
        self.cc_model.save(tmp_cc_model_file)
        cc_model2 = gl.load_model(tmp_cc_model_file)
        self.__assert_model_equals__(self.cc_model, cc_model2)

    # handle different types of urls.
    # TODO: test hdfs and s3 urls.
    for url in ['./tmp_model-%d' % temp_number,
                '/tmp/tmp_model-%d' % temp_number,
                'remote:///tmp/tmp_model2-%d' % temp_number]:
        self.pr_model.save(url)
        self.__assert_model_equals__(self.pr_model, gl.load_model(url))
def main():
    model_path = 'dato_model/neuralnet.model'
    model = gl.load_model(model_path)
    mnist_path = os.path.expanduser("~/model-serving/data/mnist_data")
    X, y = load_digits(mnist_path, "test-mnist-dense-with-labels.data")
    first_x = X[1]
    data = gl.SFrame(first_x)
    data['image'] = graphlab.image_analysis.resize(data['image'], 256, 256, 3)
    # extract deep features for the prepared frame
    fs = model.extract_features(data)
    print fs
def visualize(self):
    # Start at your instinct
    self.items.show()
    training_data, test_data = self.items.random_split(0.8, seed=0)
    model = gl.load_model("item-model1")
    pred = model.predict(test_data)
    results = model.evaluate(test_data)
    print(results)
    view = model.views.overview(validation_set=test_data)
    view.show()
    gl.evaluation.rmse(self.validation_data)
    view = model.views.overview(validation_set=self.validation_data)
    view.show()
def extract_image_features():
    # Used graphlab.neuralnet_classifier.NeuralNetClassifier.extract_features, which takes an
    # input dataset, propagates each example through the network, and returns an SArray of
    # dense feature vectors, each of which is the concatenation of all the hidden unit values
    # at layer[layer_id].
    # Used a pre-trained model for ImageNet, as described by Alex Krizhevsky et al. It is
    # located at http://s3.amazonaws.com/dato-datasets/deeplearning/imagenet_model_iter45
    items = graphlab.SFrame.read_json('../data/items.json')

    # Remove duplicate rows of the SFrame
    items = items.unique()
    items.remove_column('image')

    # Split data by category
    phones = items.filter_by(['Cell Phones'], 'category_name')
    apparel = items.filter_by(['Baby & Kids', 'Clothing & Shoes'], 'category_name')
    home = items.filter_by(['Furniture', 'Household', 'Home & Garden'], 'category_name')

    # Load images
    phone_images = graphlab.image_analysis.load_images('data/images_by_category/Cell Phones', "auto", with_path=True, recursive=True)
    baby_images = graphlab.image_analysis.load_images('data/images_by_category/Baby & Kids', "auto", with_path=True, recursive=True)
    clothing_images = graphlab.image_analysis.load_images('data/images_by_category/Clothing & Shoes', "auto", with_path=True, recursive=True)
    furniture_images = graphlab.image_analysis.load_images('data/images_by_category/Furniture', "auto", with_path=True, recursive=True)
    household_images = graphlab.image_analysis.load_images('data/images_by_category/Household', "auto", with_path=True, recursive=True)
    home_garden_images = graphlab.image_analysis.load_images('data/images_by_category/Home & Garden', "auto", with_path=True, recursive=True)
    apparel_images = baby_images.append(clothing_images)
    home_images = furniture_images.append(household_images).append(home_garden_images)
    phone_images['id'] = phone_images['path'].apply(get_id)
    apparel_images['id'] = apparel_images['path'].apply(get_id)
    home_images['id'] = home_images['path'].apply(get_id)
    phones_with_images = phones.join(phone_images, on='id', how='inner')
    apparel_with_images = apparel.join(apparel_images, on='id', how='inner')
    home_with_images = home.join(home_images, on='id', how='inner')

    # Split data into train and test sets
    phones_train, phones_test = phones_with_images.random_split(.8, seed=0)
    apparel_train, apparel_test = apparel_with_images.random_split(.8, seed=0)
    home_train, home_test = home_with_images.random_split(.8, seed=0)

    # Used the neural network trained on the 1.2 million images of the ImageNet Challenge.
    deep_learning_model = graphlab.load_model('../data/imagenet_model')
    phones_train['deep_features'] = deep_learning_model.extract_features(phones_train)
    apparel_train['deep_features'] = deep_learning_model.extract_features(apparel_train)
    home_train['deep_features'] = deep_learning_model.extract_features(home_train)
    phones_test['deep_features'] = deep_learning_model.extract_features(phones_test)
    apparel_test['deep_features'] = deep_learning_model.extract_features(apparel_test)
    home_test['deep_features'] = deep_learning_model.extract_features(home_test)

    # Store into data folder
    phones_train.save('data/phones_train')
def train_model(filename):
    # load already prepared data in form of an SFrame
    image_train = graphlab.SFrame(filename)

    # load the pre-trained model
    loaded_model = graphlab.load_model('model/')

    # extract features of the model on the given pictures
    image_train['deep_features'] = loaded_model.extract_features(SFrame(image_train))

    # add ids to the SFrame to be able to find the closest images
    ids = SArray(list(range(0, len(image_train))))
    image_train.add_column(ids, name='id')
    # print image_train.head()

    # train the NN model on the extracted features
    knn_model = graphlab.nearest_neighbors.create(image_train,
                                                  features=['deep_features'],
                                                  label='id')
    return knn_model, image_train
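
# Brief usage sketch for train_model() (the path below is a placeholder, not the
# original project's data): GraphLab nearest-neighbors models expose query(), so
# the trained model can be asked for the images most similar to a given one.
def _demo_train_model_usage():
    knn_model, image_train = train_model('images_sframe/')
    neighbors = knn_model.query(image_train[0:1], k=5)
    print neighbors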
def _get_recommened_movie_ids():
    if not request.user.is_authenticated():
        avg_scores = Rating.objects. \
            exclude(item__in=excluded_items).values('item'). \
            annotate(average_rating=Avg('rating'))
        top_items = avg_scores. \
            order_by('-average_rating', 'item')[:cnt]
        return [item['item'] for item in top_items]
    else:
        cf_model = graphlab.load_model('cf_model')
        recomm = cf_model.recommend(
            users=[request.user.id],
            k=int(cnt),
            items=list(set(all_items) - set(excluded_items)))
        return recomm['item']
def test_exception(self):
    bad_url = "hdfs:///root/"
    if self.has_hdfs:
        self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("hdfs:///"))
        self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("hdfs:///tmp"))
        self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("hdfs://" + self.tempfile))
        self.assertRaises(IOError, lambda: glconnect.get_unity().__write__(bad_url + "/tmp", "somerandomcontent"))
        self.assertRaises(IOError, lambda: self.graph.save(bad_url + "x.graph"))
        self.assertRaises(IOError, lambda: self.sframe.save(bad_url + "x.frame_idx"))
        self.assertRaises(IOError, lambda: self.model.save(bad_url + "x.model"))
        self.assertRaises(IOError, lambda: graphlab.load_graph(bad_url + "mygraph"))
        self.assertRaises(IOError, lambda: graphlab.load_sframe(bad_url + "x.frame_idx"))
        self.assertRaises(IOError, lambda: graphlab.load_model(bad_url + "x.model"))
    else:
        logging.getLogger(__name__).info("No hdfs available. Test pass.")
def get_wines_for_movie(movie):
    path_to_wine = '/home/cully/Documents/capstone/data/gridsearch_sf'
    path_to_movies = '/home/cully/Documents/capstone/data/flask_movies_sf'
    wine_rec = gl.load_model(path_to_wine)
    movies_sf = gl.load_sframe(path_to_movies)
    cols = movies_sf.column_names()
    movies_df = movies_sf.to_dataframe()
    ids = [i for i in movies_df.index]
    movies_df.insert(0, 'id', ids)
    value_vars = [x for x in movies_df.columns if x != 'id']
    movies_melted = gl.SFrame(pd.melt(movies_df, id_vars='id', value_vars=value_vars)).dropna()
    movies_rec = gl.factorization_recommender.create(movies_melted, user_id='id',
                                                     item_id='variable', target='value')
    movie_pos = movie_order_dict[movie]
    sims = pairwise_distances(
        np.array(movies_rec.coefficients['variable']['factors'])[movie_pos].reshape(1, -1),
        np.array(wine_rec.coefficients['wine_name']['factors'])[:, :8],
        metric='cosine')
    wine_names = np.array(wine_rec.coefficients['wine_name']['wine_name'])
    return wine_names[np.argsort(sims[0])[::-1]][:5]
def test_exception(self):
    if self.has_s3:
        bad_bucket = "i_am_a_bad_bucket"
        prefix = "s3://" + bad_bucket
        self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("s3:///"))
        self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("s3://" + self.standard_bucket + "/somerandomfile"))
        self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("s3://" + "/somerandomfile"))
        self.assertRaises(IOError, lambda: glconnect.get_unity().__write__("s3://" + "/somerandomfile", "somerandomcontent"))
        self.assertRaises(IOError, lambda: glconnect.get_unity().__write__("s3://" + self.standard_bucket + "I'amABadUrl/", "somerandomcontent"))
        self.assertRaises(IOError, lambda: self.graph.save(prefix + "/x.graph"))
        self.assertRaises(IOError, lambda: self.sframe.save(prefix + "/x.frame_idx"))
        self.assertRaises(IOError, lambda: self.model.save(prefix + "/x.model"))
        self.assertRaises(IOError, lambda: graphlab.load_graph(prefix + "/x.graph"))
        self.assertRaises(IOError, lambda: graphlab.load_sframe(prefix + "/x.frame_idx"))
        self.assertRaises(IOError, lambda: graphlab.load_model(prefix + "/x.model"))
    else:
        logging.getLogger(__name__).info("No s3 bucket available. Test pass.")
def main():
    # Cursor to extract userId and business from all reviews
    reviews_cursor = tags_collection.find()
    reviews_cursor.batch_size(1000)

    # Create userList and businessList to store unique User Id
    userList = []
    businessList = []

    # Create an LDA model from graphlab
    if os.path.exists("ldamodel"):
        print "reading from the file"
        lda = gl.load_model("ldamodel")
    else:
        lda = create()

    # Go through all reviews in the Collection and Add userId and business to userList
    counter = 0
    for review in reviews_cursor:
        userList.append(review["userId"])
        businessList.append(review["business"])
        counter = counter + 1
        if counter % 100000 == 0:
            print str(counter) + 'Record Read from reviews collection'

    # Remove duplicate business and userId from the List
    userList = Set(userList)
    userList = list(userList)
    businessList = Set(businessList)
    businessList = list(businessList)
    print 'Number of User in Dataset' + str(len(userList))
    print 'Number of Business in Dataset' + str(len(businessList))

    # Process User Review to create User Profile
    count = len(userList)
    worker(1, 0, count, userList, "userId", lda)

    # Process Business Review to create Business Profile
    count = len(businessList)
    worker(1, 0, count, businessList, "business", lda)
def _test_save_load_object_helper(testcase, obj, url):
    """
    Helper function to test save and load a server side object to a given url.
    """
    def cleanup(url):
        """
        Remove the saved file from temp directory.
        """
        protocol = None
        path = None
        splits = url.split("://")
        if len(splits) > 1:
            protocol = splits[0]
            path = splits[1]
        else:
            path = url
        if not protocol or protocol == "local" or protocol == "remote":
            tempdir = tempfile.gettempdir()
            pattern = path + ".*"
            for f in os.listdir(tempdir):
                if re.search(pattern, f):
                    os.remove(os.path.join(tempdir, f))

    if isinstance(obj, graphlab.SGraph):
        obj.save(url + ".graph")
        newobj = graphlab.load_graph(url + ".graph")
        testcase.assertItemsEqual(obj.get_fields(), newobj.get_fields())
        testcase.assertDictEqual(obj.summary(), newobj.summary())
    elif isinstance(obj, graphlab.Model):
        obj.save(url + ".model")
        newobj = graphlab.load_model(url + ".model")
        testcase.assertItemsEqual(obj.list_fields(), newobj.list_fields())
        testcase.assertEqual(type(obj), type(newobj))
    elif isinstance(obj, graphlab.SFrame):
        obj.save(url + ".frame_idx")
        newobj = graphlab.load_sframe(url + ".frame_idx")
        testcase.assertEqual(obj.shape, newobj.shape)
        testcase.assertEqual(obj.column_names(), newobj.column_names())
        testcase.assertEqual(obj.column_types(), newobj.column_types())
        assert_frame_equal(obj.head(obj.num_rows()).to_dataframe(),
                           newobj.head(newobj.num_rows()).to_dataframe())
    else:
        raise TypeError
    cleanup(url)
def get_wine_recs(ratings):
    path_to_movies = '/home/cully/Documents/capstone/data/flask_movies_sf'
    path_to_wine = '/home/cully/Documents/capstone/data/gridsearch_sf'
    wine_rec = gl.load_model(path_to_wine)
    movies_sf = gl.load_sframe(path_to_movies)
    movies_df = movies_sf.to_dataframe()
    value_vars = [x for x in movies_df.columns if x != 'id']
    new_ratings = {movie_dict[name]: int(ratings[name]) for name in ratings}
    new_df = pd.DataFrame.from_dict([new_ratings], orient='columns').replace(-1, np.nan)
    movies_df = pd.concat([movies_df, new_df]).reset_index(drop=True)
    ids = [i for i in movies_df.index]
    movies_df.insert(0, 'id', ids)
    movies_melted = gl.SFrame(pd.melt(movies_df, id_vars='id', value_vars=value_vars)).dropna()
    movies_rec = gl.factorization_recommender.create(movies_melted, user_id='id',
                                                     item_id='variable', target='value')
    movies_user_intercept, movies_user_factors, _, _, movies_intercept = get_rec_coeffs(movies_rec)
    wine_item_factors = np.array(wine_rec.coefficients['wine_name']['factors'])[:, :8]
    wine_names = np.array(wine_rec.coefficients['wine_name']['wine_name'])
    comb = np.dot(np.array(movies_user_factors[-1]), wine_item_factors.T)
    return wine_names[np.argsort(comb)[::-1]]
def main():
    parser = argparse.ArgumentParser(description="Classifies given dataset and saves the results.")
    parser.add_argument("--dataset_dir", required=False, default=None, type=str,
                        help="Dataset directory ex: my_dataset_test or my_dataset ")
    parser.add_argument("--classified_dir", required=True, default=None, type=str,
                        help="Directory for dataset after classification ex: result_dataset")
    parser.add_argument("--print", required=False, action='store_true', dest='print_results',
                        help="")
    args = parser.parse_args()

    if args.dataset_dir:
        vec_model = word2vec.Word2Vec.load_word2vec_format('word2vec_model.txt', binary=False)
        cls = gl.load_model("graphlab/my_classifier")
        dataset = gl.load_sframe(args.dataset_dir)
        result171_dataset = test_classifier(cls, dataset, vec_model)
        dataset.add_column(result171_dataset.select_column("class"), "class")
        dataset.add_column(result171_dataset.select_column("probability"), "probability")
        dataset.save(args.classified_dir)
    elif args.classified_dir:
        result171_dataset = gl.load_sframe(args.classified_dir)

    if args.print_results:
        print_positives_and_confidence(result171_dataset, result171_dataset)