def loadNeigh_2():
    print 'loadNeighbor 2'
    zhima_usr = loadZhima()
    usr_id = zhima_usr['snwb']
    # load the 1-hop subgraph saved by loadNeigh_1
    subgraph_edges = gl.load_sframe(
        os.path.join(resultDataFolder, 'subgraph_zhima_1'))
    # add 2-hop neighbors to the subgraph: the vertex set is the seed
    # users plus every destination reached in hop 1
    neighbor_1 = subgraph_edges['dst']
    sub_vertices = usr_id.append(neighbor_1)
    sframeFiles = os.listdir(sFrameFolder)
    for sf in sframeFiles:
        edgesData = gl.load_sframe(os.path.join(sFrameFolder, sf))
        edgesData.rename({'X1': 'src', 'X2': 'dst'})
        # keep edges with both endpoints in the vertex set; SFrame columns
        # do not support the `in` operator, so chain two filter_by calls
        zhima_neighbors_2 = edgesData.filter_by(sub_vertices, 'src') \
                                     .filter_by(sub_vertices, 'dst')
        # SFrame.append returns a new SFrame, so reassign to accumulate
        subgraph_edges = subgraph_edges.append(zhima_neighbors_2)
        print sf
    print 'save subgraph'
    subgraph_edges.save(os.path.join(resultDataFolder, 'subgraph_zhima_2'))
def get_song_recs(ratings, n_features):
    '''Takes new movie ratings from a website user and returns recommended song titles'''
    path_to_songs_sf = '/home/cully/Documents/capstone/data/flask_songs_sf'
    path_to_movies_sf = '/home/cully/Documents/capstone/data/flask_movies_sf'
    songs_sf = gl.load_sframe(path_to_songs_sf)
    songs_df = songs_sf.to_dataframe()
    value_vars = [x for x in songs_df.columns if x != 'id']
    ids = [x for x in songs_df.index]
    if 'id' not in songs_df.columns:
        songs_df.insert(0, 'id', ids)
    songs_melted = gl.SFrame(pd.melt(songs_df, id_vars='id', value_vars=value_vars))
    songs_rec = gl.factorization_recommender.create(songs_melted, user_id='id',
                                                    item_id='variable', target='value',
                                                    num_factors=n_features)
    _, _, songs_item_intercept, songs_item_factors, songs_intercept = get_rec_coeffs(songs_rec)
    movies_sf = gl.load_sframe(path_to_movies_sf)
    movies_df = movies_sf.to_dataframe()
    value_vars = [x for x in movies_df.columns if x != 'id']
    new_ratings = {movie_dict[name]: int(ratings[name]) for name in ratings}
    new_df = pd.DataFrame.from_dict([new_ratings], orient='columns').replace(-1, np.nan)
    movies_df = pd.concat([movies_df, new_df]).reset_index(drop=True)
    ids = [str(i) for i in movies_df.index]
    movies_df.insert(0, 'id', ids)
    movies_melted = gl.SFrame(pd.melt(movies_df, id_vars='id', value_vars=value_vars)).dropna()
    movies_rec = gl.factorization_recommender.create(movies_melted, user_id='id',
                                                     item_id='variable', target='value',
                                                     num_factors=n_features)
    movies_user_intercept, movies_user_factors, _, _, movies_intercept = get_rec_coeffs(movies_rec)
    # score songs for the new user (the last row of the user-factor matrix)
    comb = np.dot(np.array(movies_user_factors)[-1], np.array(songs_item_factors).T)
    return songs_df.columns[1:][np.argsort(comb)[::-1]]
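# Illustrative sketch (not from the original code): the cross-domain step
# above scores songs by dotting the new user's movie-space factors with the
# song item factors, which assumes both recommenders share num_factors.
# Toy, hypothetical factor matrices:
import numpy as np

user_factors = np.array([0.2, -0.5, 0.9])         # new user, width 3
song_factors = np.array([[0.1, 0.0, 1.0],         # one row per song
                         [0.8, -0.4, 0.2],
                         [-0.3, 0.6, 0.1],
                         [0.5, -0.5, 0.5]])

scores = song_factors.dot(user_factors)           # one score per song
ranked = np.argsort(scores)[::-1]                 # best-scoring songs first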
def read_data(): # Load image analysis datasets. # Data was reduced to 6 categories in 3 groups: phones, home # (Furniture, Household, Home & Garden), apparel # (Baby & Kids, Clothing & Shoes). phones_set = graphlab.load_sframe(DATA_PATH + 'phones_with_ids') home_set = graphlab.load_sframe(DATA_PATH + 'home_with_ids') apparel_set = graphlab.load_sframe(DATA_PATH + 'apparel_with_ids') return phones_set, home_set, apparel_set
def __init__(self, version, session_id, diff):
    if int(version) == 2:
        sf = gl.load_sframe("data/py2_session_clean")
    else:
        sf = gl.load_sframe("data/py3_session_clean")
    self.session = sf.filter_by([int(session_id)], "session_id")
    self.python_version = version
    self.session_id = session_id
    # show diffs unless the "Full" view was requested
    self.diffs = (diff != "Full")
def pagerank(): sf = graphlab.load_sframe('/users/erwin/work/ml_datasets/freebase_performances.csv') print sf g = graphlab.SGraph() g = g.add_edges(sf, 'actor_name', 'film_name') pr = graphlab.pagerank.create(g) print(pr.get('pagerank').topk(column_name='pagerank'))
def load_data():
    data = None
    if os.path.exists(DATA_SFRAME):
        print "\n----- Loading SFrame -----"
        print " filename: ", DATA_SFRAME
        print "--------------------------\n"
        data = gl.load_sframe(DATA_SFRAME)
    else:
        print "\n----- Creating SFrame -----"
        print " filename: ", DATA_FILE
        print "---------------------------\n"
        raw_data = {'user_id': [], 'item_id': [], 'rating': []}
        with open(DATA_FILE, 'r') as f:
            # the raw file uses bare '\r' line endings
            lines = f.readlines()[0].split('\r')
            for line in lines:
                raw_data['user_id'].append(int(line.split('\t')[0]))
                raw_data['item_id'].append(int(line.split('\t')[1]))
                raw_data['rating'].append(int(line.split('\t')[2]))
        data = gl.SFrame(raw_data)
        print "\n----- Saving SFrame -----"
        print " filename: ", DATA_SFRAME
        print "-------------------------\n"
        data.save(DATA_SFRAME)
    return data
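# Sketch: if DATA_FILE used ordinary newlines instead of bare '\r', the
# same SFrame could likely be built in a single call (assumes tab-separated
# user_id, item_id, rating columns with no header):
# data = gl.SFrame.read_csv(DATA_FILE, delimiter='\t', header=False,
#                           column_type_hints=int)
# data.rename({'X1': 'user_id', 'X2': 'item_id', 'X3': 'rating'})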
def predict(self, ratings, user, n=5):
    book_data = gl.load_sframe("./book_data_clean/")
    ids_ratings = self.getRecommendations(ratings, user, n + 50)
    # lists storing details and ids of recommended books
    list_of_books = []
    list_of_ids = []
    # Search for each book by its id in book_data and append its details,
    # along with the rating, to list_of_books
    count = 0
    for item in ids_ratings:
        if count == n:
            break
        # if the book's details are not in book_data, skip to the next item
        # until n books have been appended to the list
        if item[0] not in book_data["book_id"]:
            continue
        count += 1
        book = book_data[book_data["book_id"] == item[0]][0]
        if item[1] > 10:
            book["rating"] = 10
        else:
            book["rating"] = item[1]
        # append the id to a separate list, then drop the id and rating
        # keys from the dictionary before storing it
        list_of_ids.append(book["book_id"])
        del book["book_id"]
        del book["rating"]
        list_of_books.append(book)
    return list_of_books[0:n], list_of_ids[0:n]
def __load_data_structure__(self, filepath): """Return data structure if can be loaded, otherwise returns None and logs warning""" # try to load different supported types, since don't know what type just try all and swallow exceptions obj = None try: obj = _gl.load_sframe(filepath) return obj except: pass try: obj = _gl.load_sgraph(filepath) return obj except: pass try: obj = _gl.load_model(filepath) return obj except: pass try: obj = _gl.SArray(data=filepath) return obj except: pass __LOGGER__.debug("Unable to load dependency, unsupported type at path: %s" % filepath) return None
def main(args):
    # Load the dataset.
    sf = gl.load_sframe(args.dataset)
    # Get the pos_tag count column names.
    tag_cols = [i for i in sf.column_names() if i.startswith('pos_count')]
    # Set up some distance metrics
    dists = [[('unigrams', 'bigrams'), 'jaccard', 1],
             [('pos_bigrams',), 'weighted_jaccard', 1],
             [('doc_vecs',), 'cosine', 1],
             [tuple(['time'] + tag_cols), 'euclidean', 1]
             ]
    feats = []
    for d in dists:
        feats.extend(list(d[0]))
    # If a valid sample size was provided
    # then replace the full dataset with a sample.
    if 0. < args.sample_size < 1.:
        sf = sf.sample(args.sample_size)
    # Create and fit the model.
    nnh = NNGraphHierarchy()
    nnh.fit(
        sf,
        label=args.label,
        features=feats,
        dist=dists,
        split_column=args.split_column,
        window_size=args.win_size,
        window_offset=args.win_offset,
        path=args.output,
        quantile=args.quantile,
        k=args.num_neighbors,
        radius=args.radius,
    )
    # Save the results.
    nnh.sf.save(args.output)
    nnh.g.save(args.output + '.graph')
    # If a path to rumor-related tweets was provided
    # then run an analysis of rumor-tweet distribution
    # across top-level components.
    if args.rel_path:
        # Load the list of related tweet ids for each rumor.
        related = gl.SFrame.read_csv(args.rel_path)
        rumor_report = rumor_component_distribution(
            nnh.sf,
            related,
        )
        rumor_report.save(args.output + 'rumor_report.csv', format='csv')
    # Save a report containing various information about
    # the top-level components.
    #hier_report = top_level_report(nnh.sf)
    #hier_report.save(args.output + '_hier_report.csv')
    print 'Success!'
    exit()
def loadData():
    edgesData = gl.load_sframe(sframeDataFolder)
    print 'num_rows:%d ' % edgesData.num_rows()
    #create graph
    G = gl.SGraph()
    G = G.add_edges(edges=edgesData, src_field='src', dst_field='dst')
    print 'create graph done!'
    return G
def load_all_sframes(repos_path, edges_path, nn_items_path=None , nn_text_path=None): """ uploads all the precomputed sframes, which are the following: repos: sframe containing repo_name as unique id, readme, language, watchers, etc... edges: sframe containing the edges between repos and watchers, along with the weights nn_items: precomputed nearest neighbors for all the repos """ repos = gl.load_sframe(repos_path) edges = gl.load_sframe(edges_path) nn_items = None nn_text = None if nn_items_path: nn_items = gl.load_sframe( nn_items_path ) if nn_text_path: nn_text = gl.load_sframe( nn_text_path ) return repos, edges, nn_items, nn_text
def build_docs_for_modeling(in_docs, sframe_raw_filename): # Remove stop words and convert to bag of words in_docs = gl.text_analytics.count_words(in_docs['X1']) in_docs = in_docs.dict_trim_by_keys(gl.text_analytics.stopwords(), exclude=True) freq_words = get_freq_words(gl.load_sframe(sframe_raw_filename)) in_docs = in_docs.dict_trim_by_keys(freq_words['word'], exclude=False) in_docs = in_docs.dict_trim_by_keys(['information', 'data', 'privacy'], exclude=True) return in_docs
def main(): loaded_frame = graphlab.load_sframe("py2_session_clean") check = [25] test = loaded_frame.filter_by(check, 'session_id') for e in test: print e
def comments_sentimenting(book_id): comments_data = graphlab.load_sframe('helper/coeffi_comments_data') sentiment_model = graphlab.load_model( 'helper/books_comments_sentiment_model') commentsFromABook = comments_data[comments_data['book_id'] == int(book_id)] commentsFromABook['predicted_sentiment'] = sentiment_model.predict( commentsFromABook, output_type='probability') # comments_data['predicted_sentiment'] = sentiment_model.predict(comments_data, output_type='probability') return commentsFromABook.sort('created_time', ascending=True)
def from_previous_reduction(cls, input_dir): parent = gl.load_sgraph(input_dir+'parent') verticy_descriptions = gl.load_sframe(input_dir+'verticy_descriptions') child = gl.load_sgraph(input_dir+'child') gw = cls() gw.g = parent gw.verticy_descriptions = verticy_descriptions gw.child = cls() gw.child.g = child return gw
def test_exception(self): self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("/root/tmp")) self.assertRaises(IOError, lambda: glconnect.get_unity().__write__("/root/tmp", '.....')) self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("/root/tmp")) self.assertRaises(IOError, lambda: glconnect.get_unity().__write__("/root/tmp", '.....')) self.assertRaises(IOError, lambda: self.graph.save("/root/tmp.graph")) self.assertRaises(IOError, lambda: self.sframe.save("/root/tmp.frame_idx")) self.assertRaises(IOError, lambda: self.model.save("/root/tmp.model")) self.assertRaises(IOError, lambda: graphlab.load_graph("/root/tmp.graph")) self.assertRaises(IOError, lambda: graphlab.load_sframe("/root/tmp.frame_idx")) self.assertRaises(IOError, lambda: graphlab.load_model("/root/tmp.model"))
def save_positive_results_with_event_type_and_date(result_dataset):
    csvfile = "classification/data/extraction_fields.tsv"
    with codecs.open(csvfile, "r", encoding="utf8") as infile:
        lines = infile.readlines()
    #types = set()
    #for line in lines[5:]:
    #    types.add(line.split(",")[2].strip().lower())
    sf = gl.load_sframe("graphlab/my_training_dataset")
    # lines[-1].split("\t") = [u'620', u'E1', u'protest', u'', u'', u'T2', u'NGO' ...]
    size = int(lines[-1].split("\t")[0]) + 1  # latest news index is 620; indices start at 0, so size is 620 + 1
    labels = [0] * size
    for line in lines:
        fields = line.split("\t")
        key = fields[2].strip().lower()
        if key:
            ind = int(fields[0].strip())
            # `types` maps event-type names to integer labels (defined elsewhere)
            labels[ind] = types[key]
    #rel_folder="classification/data/v6/class_rel/"
    ef = sf.filter_by([1], "rel")  # add_arguments(None,rel_folder,1,vec_model)
    ef['event_type'] = ef['filenames'].apply(lambda p: labels[int(p[1:5])])
    # event type classifier
    event_type_cls = gl.classifier.create(
        ef, target="event_type", features=['vectors', '1gram features'])
    pos_results = result_dataset.filter_by([1], "class")
    pos_res_res = event_type_cls.classify(pos_results)
    pos_results.add_column(pos_res_res.select_column("class"), "event_type")
    pos_results.add_column(pos_res_res.select_column("probability"), "et_probability")
    # filter_by returns a new SFrame, so reassign to keep only event type 5
    pos_results = pos_results.filter_by([5], "event_type")
    pos_results['date'] = pos_results['filenames'].apply(
        lambda x: x[:-5].split('_'))
    pos_results = pos_results.unpack('date')
    pos_results.rename({
        'date.0': 'year',
        'date.1': 'month',
        'date.2': 'day',
        'date.3': 'index'
    })
    pos_results['year'] = pos_results['year'].apply(
        lambda year_str: int(year_str))
    pos_results['month'] = pos_results['month'].apply(lambda m_str: int(m_str))
    pos_results.save("graphlab/pos_results")  ##_2005")
def from_previous_reduction(cls, input_dir): parent = gl.load_sgraph(input_dir + 'parent') verticy_descriptions = gl.load_sframe(input_dir + 'verticy_descriptions') child = gl.load_sgraph(input_dir + 'child') gw = cls() gw.g = parent gw.verticy_descriptions = verticy_descriptions gw.child = cls() gw.child.g = child return gw
def process_frame(filename):
    sf = gl.load_sframe(filename)
    output_frame = SFrame()  # setup our output frame

    id = []
    ip = []
    sub_count = []
    error_count = []
    time_count = []
    error_sequence_raw = []
    error_sequence = []

    # How many session IDs do we have?
    sa = sf['session_id']
    test = sa.unique()
    limit = len(test)

    # Start grabbing each session (assumes session ids run 1..limit)
    for i in range(1, limit):
        # Test output: stop early while debugging
        if (i % 100 == 0):
            break
        # Get the session and sort it by the date time
        session_frame = sf.filter_by(i, "session_id")
        #sorted_session = session_frame.sort("dt")
        row = session_frame[0]  # first row of this session
        id += [i]
        ip += [row['ip']]
        sub_count += [len(session_frame)]  # submissions (rows) in this session
        #time_count += [fn_time_count(sorted_session)]
        #error_count += [fn_error_count(sorted_session)]
        #error_sequence_raw += [fn_error_sequence_raw(sorted_session)]

    print len(id)
    print len(ip)
    print len(sub_count)
    #print len(time_count)

    output_frame = output_frame.add_column(SArray(id), name='id')
    output_frame.add_column(SArray(ip), name='ip')
    output_frame.add_column(SArray(sub_count), name='sub_count')
    #output_frame.add_column(SArray(time_count), name='sub_length')
    #output_frame.add_column(SArray(error_count), name='error_count')
    #output_frame.add_column(SArray(error_sequence_raw, dtype=str), name='err_seq_raw')
    output_frame.save('py2_session_analysis')
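# Sketch (assumes the same column layout as process_frame above): the
# per-session id/ip/count columns could likely be computed in one pass
# with SFrame.groupby instead of a filter_by scan per session.
def process_frame_groupby(filename):
    sf = gl.load_sframe(filename)
    agg = sf.groupby('session_id', {
        'ip': gl.aggregate.SELECT_ONE('ip'),   # a representative ip per session
        'sub_count': gl.aggregate.COUNT(),     # submissions per session
    })
    agg.save('py2_session_analysis')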
def _load_graphlab_object(cls, obj_type, obj_path): if obj_type == 'model': return graphlab.load_model(obj_path) elif obj_type == 'sarray': return graphlab.SArray(obj_path) elif obj_type == 'sframe': return graphlab.load_sframe(obj_path) elif obj_type == 'sgraph': return graphlab.load_sgraph(obj_path) else: raise RuntimeError(str(obj_type) + ' is not supported')
def load(infile): ''' Reads a binary format SFrame from GL_DATA/ args: infile - name of a graphlab binary to read from GL_DATA/ return: the SFrame stored at GL_DATA/infile ''' path = os.path.join(GL_DATA, infile) return gl.load_sframe(path)
def test_exception(self): self.assertRaises(ValueError, lambda: self._test_read_write_helper(self.tempfile, 'hello world')) self.assertRaises(ValueError, lambda: self._test_read_write_helper("local://" + self.tempfile + ".csv.gz", 'hello,world,woof')) self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("remote:///root/tmp")) self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("remote:///root/tmp")) self.assertRaises(IOError, lambda: glconnect.get_unity().__write__("remote:///root/tmp", '.....')) self.assertRaises(IOError, lambda: self.graph.save("remote:///root/tmp.graph")) self.assertRaises(IOError, lambda: self.sframe.save("remote:///root/tmp.frame_idx")) self.assertRaises(IOError, lambda: self.model.save("remote:///root/tmp.model")) self.assertRaises(IOError, lambda: graphlab.load_graph("remote:///root/tmp.graph")) self.assertRaises(IOError, lambda: graphlab.load_sframe("remote:///root/tmp.frame_idx")) self.assertRaises(IOError, lambda: graphlab.load_model("remote:///root/tmp.model"))
def loadSubData():
    sframeFiles = os.listdir('/home/lt/sframe')
    edgesData = gl.SFrame()
    for sf in sframeFiles:
        # append returns a new SFrame, so reassign to accumulate
        edgesData = edgesData.append(
            gl.load_sframe(os.path.join('/home/lt/sframe', sf)))
    edgesData.rename({'X1': 'src', 'X2': 'dst'})
    #create graph
    G = gl.SGraph()
    G = G.add_edges(edges=edgesData, src_field='src', dst_field='dst')
    print 'create graph done!'
    return G
def main(): parser = argparse.ArgumentParser(description = "Classifies given dataset and saves the results.") parser.add_argument("--dataset_dir", required = False, default=None ,type=str, help = "Dataset directory ex: my_dataset_test or my_dataset ") parser.add_argument("--classified_dir", required = True, default=None ,type=str, help = "Directory for dataset after classification ex: result_dataset") parser.add_argument("--print", required = False ,action='store_true', dest='print_results', help = "") args = parser.parse_args() if args.dataset_dir: vec_model = word2vec.Word2Vec.load_word2vec_format('word2vec_model.txt',binary=False) cls = gl.load_model("graphlab/my_classifier") dataset = gl.load_sframe(args.dataset_dir) result171_dataset = test_classifier(cls,dataset,vec_model) dataset.add_column(result171_dataset.select_column("class"),"class") dataset.add_column(result171_dataset.select_column("probability"),"probability") dataset.save(args.classified_dir) elif args.classified_dir: result171_dataset = gl.load_sframe(args.classified_dir) if args.print_results: print_positives_and_confidence(result171_dataset,result171_dataset)
def main(): parser = argparse.ArgumentParser(description = "Classifies given dataset and saves the results.") parser.add_argument("--classified_dir", required = False, default=None ,type=str, help = "Directory for dataset after classification ex: result_dataset") parser.add_argument("--print", required = False ,action='store_true',dest="print_results",help = "") parser.add_argument("--pprint", required = False , default=10, type=int ,dest="print_pretty",help = "") args = parser.parse_args() if args.classified_dir: result_dataset = gl.load_sframe(args.classified_dir) save_positive_results_with_event_type_and_date(result_dataset) if args.print_results: pos_results = gl.load_sframe("graphlab/pos_results") sframe = count_monthly(pos_results) sframe.print_rows(sframe.shape[0]) if args.print_pretty < 10: pos_results = gl.load_sframe("graphlab/pos_results") my_dict = get_count_dict(pos_results) if args.print_pretty == 0: print("\n".join(["%d-%d %s" %(year,month, " ".join([str(l) for l in my_dict[year][month]]) ) for year in pos_results['year'].unique().sort() for month in range(1,13) ])) elif args.print_pretty == 1: count_dict = get_norm_dict(pos_results,events = [3,5]) print("\n".join(["%d-%d %.4f %.4f %.4f" %(year,month, my_dict[year][month][3]/count_dict[3][year], my_dict[year][month][5]/count_dict[5][year], sum(my_dict[year][month])/count_dict['total'][year]) for month in range(1,13) for year in pos_results['year'].unique()])) elif args.print_pretty == 2: count_dict = get_norm_dict(pos_results) print("\n".join(["%d-%d %.4f %.4f %.4f %.4f %.4f %.4f %.4f" %(year,month, my_dict[year][month][0]/count_dict[0][year], my_dict[year][month][1]/count_dict[1][year], my_dict[year][month][2]/count_dict[2][year], my_dict[year][month][3]/count_dict[3][year], my_dict[year][month][4]/count_dict[4][year], my_dict[year][month][5]/count_dict[5][year], sum(my_dict[year][month])/count_dict['total'][year]) for month in range(1,13) for year in pos_results['year'].unique().sort()]))
def predict(self, location, age, search_over, n=3):
    # Load required models and data
    regression_model = gl.load_model("./regression_model_file/")
    book_data = gl.load_sframe("./book_data_clean/")
    implicit_data = gl.load_sframe("./implicit_rating_data/")
    # filter_by returns a new SFrame, so reassign
    book_data = book_data.filter_by(implicit_data["book_id"], "book_id")

    # Select approximately (search_over) books by splitting data RANDOMLY
    split = search_over / 45000.0
    book_data, other_data = book_data.random_split(split)

    predicted_ratings = []
    count = 0
    for book in book_data:
        if count == search_over:
            break
        count += 1
        book["location"] = location
        book["age"] = age
        rating = regression_model.predict(book)[0]
        if rating >= 8.0:
            predicted_ratings.append((book["book_id"], rating))
    predicted_ratings = sorted(predicted_ratings, key=itemgetter(1), reverse=True)

    # Recommended books in decreasing order of rating
    recommended_books_id = []
    for i in range(min(5, len(predicted_ratings))):
        recommended_books_id.append(predicted_ratings[i][0])
    recommended_books = []
    for book in recommended_books_id:
        for item in book_data:
            if book == item["book_id"]:
                del item["book_id"]
                recommended_books.append(item)
                break
    return recommended_books[0:n], recommended_books_id[0:n]
def __init__(self, sf_path=None, g_path=None, cache_max=0.75): self.sf = None self.label = None self.bin_sfs = None self.reps = gl.SArray(dtype=str) self.hier_graph = None self.num_bins = 0 self.features = None self.distance = None self.cache_max = cache_max if g_path: self.g = gl.load_sgraph(g_path) self.sf = self.g.vertices elif sf_path: self.sf = gl.load_sframe(sf_path)
def test_exception(self):
    bad_url = "hdfs:///root/"
    if self.has_hdfs:
        self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("hdfs:///"))
        self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("hdfs:///tmp"))
        self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("hdfs://" + self.tempfile))
        self.assertRaises(IOError, lambda: glconnect.get_unity().__write__(bad_url + "/tmp", "somerandomcontent"))
        self.assertRaises(IOError, lambda: self.graph.save(bad_url + "x.graph"))
        self.assertRaises(IOError, lambda: self.sframe.save(bad_url + "x.frame_idx"))
        self.assertRaises(IOError, lambda: self.model.save(bad_url + "x.model"))
        self.assertRaises(IOError, lambda: graphlab.load_graph(bad_url + "mygraph"))
        self.assertRaises(IOError, lambda: graphlab.load_sframe(bad_url + "x.frame_idx"))
        self.assertRaises(IOError, lambda: graphlab.load_model(bad_url + "x.model"))
    else:
        logging.getLogger(__name__).info("No hdfs available. Test passed.")
def test():
    sf = gl.load_sframe('sydney_processed')
    label = 'mongo_id'

    # Use 50% of the data.
    sf = sf.sample(0.5)

    # Run the algorithm
    nnh = NNGraphHierarchy()
    radius = nnh.find_radius(sf, label=label, z_val=1.)
    nnh.radius = radius
    nnh.fit(sf, label=label, split_column='time', num_bins=150)
    nnh.sf.save('final/final_results')
    accuracy_report(nnh.sf)
    print 'Success!'
    exit()
def get_wines_for_movie(movie): path_to_wine = '/home/cully/Documents/capstone/data/gridsearch_sf' path_to_movies = '/home/cully/Documents/capstone/data/flask_movies_sf' wine_rec = gl.load_model(path_to_wine) movies_sf = gl.load_sframe(path_to_movies) cols = movies_sf.column_names() movies_df = movies_sf.to_dataframe() ids = [i for i in movies_df.index] movies_df.insert(0, 'id', ids) value_vars = [x for x in movies_df.columns if x != 'id'] movies_melted = gl.SFrame(pd.melt(movies_df, id_vars='id', value_vars=value_vars)).dropna() movies_rec = gl.factorization_recommender.create(movies_melted, user_id='id', item_id='variable', target='value') movie_pos = movie_order_dict[movie] sims = pairwise_distances(np.array(movies_rec.coefficients['variable']['factors'])[movie_pos].reshape(1,-1), np.array(wine_rec.coefficients['wine_name']['factors'])[:,:8], metric='cosine') wine_names = np.array(wine_rec.coefficients['wine_name']['wine_name']) return wine_names[np.argsort(sims[0])[::-1]][:5]
def test_exception(self):
    if self.has_s3:
        bad_bucket = "i_am_a_bad_bucket"
        prefix = "s3://" + bad_bucket
        self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("s3:///"))
        self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("s3://" + self.standard_bucket + "/somerandomfile"))
        self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("s3://" + "/somerandomfile"))
        self.assertRaises(IOError, lambda: glconnect.get_unity().__write__("s3://" + "/somerandomfile", "somerandomcontent"))
        self.assertRaises(IOError, lambda: glconnect.get_unity().__write__("s3://" + self.standard_bucket + "I'amABadUrl/", "somerandomcontent"))
        self.assertRaises(IOError, lambda: self.graph.save(prefix + "/x.graph"))
        self.assertRaises(IOError, lambda: self.sframe.save(prefix + "/x.frame_idx"))
        self.assertRaises(IOError, lambda: self.model.save(prefix + "/x.model"))
        self.assertRaises(IOError, lambda: graphlab.load_graph(prefix + "/x.graph"))
        self.assertRaises(IOError, lambda: graphlab.load_sframe(prefix + "/x.frame_idx"))
        self.assertRaises(IOError, lambda: graphlab.load_model(prefix + "/x.model"))
    else:
        logging.getLogger(__name__).info("No s3 bucket available. Test passed.")
def loadNeigh_1():
    print 'loadNeighbor 1'
    zhima_usr = loadZhima()
    usr_id = zhima_usr['snwb']
    #add zhima 1_neighbor for subgraph
    subgraph_edges = gl.SFrame()
    sframeFiles = os.listdir(sFrameFolder)
    for sf in sframeFiles:
        edgesData = gl.load_sframe(os.path.join(sFrameFolder, sf))
        edgesData.rename({'X1': 'src', 'X2': 'dst'})
        zhima_neighbors = edgesData.filter_by(usr_id, 'src')
        # append returns a new SFrame, so reassign to accumulate
        subgraph_edges = subgraph_edges.append(zhima_neighbors)
        print sf
    #save subgraph
    subgraph_edges.save(os.path.join(resultDataFolder, 'subgraph_zhima_1'))
def createGraph():
    zhima_usr = loadZhima()
    # SGraph vertex SFrames use '__id' as the id column
    zhima_usr.rename({'snwb': '__id'})
    subgraph_edges = gl.load_sframe(
        os.path.join(resultDataFolder, 'subgraph_zhima_2'))
    #create graph
    sub_G = gl.SGraph()
    sub_G = sub_G.add_edges(edges=subgraph_edges, src_field='src', dst_field='dst')
    #join labels to vertices; join returns a new SFrame rather than
    #modifying the graph in place, so rebuild the graph from the result
    labeled_vertices = sub_G.vertices.join(zhima_usr, on='__id', how='left')
    sub_G = gl.SGraph(vertices=labeled_vertices, edges=subgraph_edges,
                      vid_field='__id', src_field='src', dst_field='dst')
    # sub_G.vertices.head(5)
    print 'save graph'
    sub_G.save(os.path.join(resultDataFolder, 'subgraph_zhima'))
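# The zhima snippets above form a two-hop subgraph pipeline. A minimal
# driver sketch (assumes sFrameFolder, resultDataFolder, and loadZhima
# are defined as in those snippets):
def build_zhima_subgraph():
    loadNeigh_1()   # edges leaving the seed users     -> subgraph_zhima_1
    loadNeigh_2()   # edges among seeds + 1-hop nodes  -> subgraph_zhima_2
    createGraph()   # build and label the SGraph       -> subgraph_zhima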
def _test_save_load_object_helper(testcase, obj, url):
    """
    Helper function to test save and load a server side object to a given url.
    """
    def cleanup(url):
        """
        Remove the saved file from temp directory.
        """
        protocol = None
        path = None
        splits = url.split("://")
        if len(splits) > 1:
            protocol = splits[0]
            path = splits[1]
        else:
            path = url
        # compare strings with ==; `is` tests identity, not equality
        if not protocol or protocol == "local" or protocol == "remote":
            tempdir = tempfile.gettempdir()
            pattern = path + ".*"
            for f in os.listdir(tempdir):
                if re.search(pattern, f):
                    os.remove(os.path.join(tempdir, f))

    if isinstance(obj, graphlab.SGraph):
        obj.save(url + ".graph")
        newobj = graphlab.load_graph(url + ".graph")
        testcase.assertItemsEqual(obj.get_fields(), newobj.get_fields())
        testcase.assertDictEqual(obj.summary(), newobj.summary())
    elif isinstance(obj, graphlab.Model):
        obj.save(url + ".model")
        newobj = graphlab.load_model(url + ".model")
        testcase.assertItemsEqual(obj.list_fields(), newobj.list_fields())
        testcase.assertEqual(type(obj), type(newobj))
    elif isinstance(obj, graphlab.SFrame):
        obj.save(url + ".frame_idx")
        newobj = graphlab.load_sframe(url + ".frame_idx")
        testcase.assertEqual(obj.shape, newobj.shape)
        testcase.assertEqual(obj.column_names(), newobj.column_names())
        testcase.assertEqual(obj.column_types(), newobj.column_types())
        assert_frame_equal(obj.head(obj.num_rows()).to_dataframe(),
                           newobj.head(newobj.num_rows()).to_dataframe())
    else:
        raise TypeError
    cleanup(url)
def _get_frame(self, fname, url):
    if os.path.isdir(self.folder + fname + '.gl'):
        return gl.load_sframe(self.folder + fname + '.gl')
    else:
        if fname.endswith('.gz') and os.path.isfile(self.folder + fname[:-3]):
            frame = gl.SFrame.read_csv(self.folder + fname[:-3], delimiter='\t')
        elif os.path.isfile(self.folder + fname):
            frame = gl.SFrame.read_csv(self.folder + fname, delimiter='\t')
        else:
            print 'Downloading data from STITCH:', fname
            # fetch the file once and write it to disk
            with file(self.folder + fname, 'wb') as f:
                f.write(urllib2.urlopen(url).read())
            frame = gl.SFrame.read_csv(self.folder + fname, delimiter='\t')
            os.remove(self.folder + fname)
        frame.save(self.folder + fname + '.gl')
        return frame
def get_wine_recs(ratings): path_to_movies = '/home/cully/Documents/capstone/data/flask_movies_sf' path_to_wine = '/home/cully/Documents/capstone/data/gridsearch_sf' wine_rec = gl.load_model(path_to_wine) movies_sf = gl.load_sframe(path_to_movies) movies_df = movies_sf.to_dataframe() value_vars = [x for x in movies_df.columns if x != 'id'] new_ratings = {movie_dict[name]:int(ratings[name]) for name in ratings} new_df = pd.DataFrame.from_dict([new_ratings], orient='columns').replace(-1, np.nan) movies_df = pd.concat([movies_df, new_df]).reset_index(drop=True) ids = [i for i in movies_df.index] movies_df.insert(0, 'id', ids) movies_melted = gl.SFrame(pd.melt(movies_df, id_vars='id', value_vars=value_vars)).dropna() movies_rec = gl.factorization_recommender.create(movies_melted, user_id = 'id', item_id='variable', target='value') movies_user_intercept, movies_user_factors, _, _, movies_intercept = get_rec_coeffs(movies_rec) wine_item_factors = np.array(wine_rec.coefficients['wine_name']['factors'])[:,:8] wine_names = np.array(wine_rec.coefficients['wine_name']['wine_name']) comb = np.dot(np.array(movies_user_factors[-1]), wine_item_factors.T) return wine_names[np.argsort(comb)[::-1]]
def test_graphlab_classifier(self):
    """
    Test the GraphLab classifier loaded from file
    :return:
    """
    this_dir, _ = os.path.split(os.path.abspath(__file__))
    this_dir = os.path.abspath(this_dir)
    model_path = os.path.join(this_dir, 'data', 'gl_mdl')
    model = GraphLabClassifierFromFile(model_path)
    self.assertEqual(model._model.name(), 'NeuralNetClassifier')
    x = gl.load_sframe(os.path.join(this_dir, 'data', 'img_10'))
    pred = model.predict(x)
    self.assertEqual(len(pred), 10)
    pred_prob = model.predict_proba(x)
    rows, cols = pred_prob.shape
    self.assertEqual(rows, 10 * 7)
    self.assertEqual(cols, 3)
def load_gl_object(filename):
    """
    Load a GLC datastructure from a filename.

    Parameters
    ----------
    filename : Filename for the archive

    Returns
    ----------
    The GLC object.
    """
    obj = None
    if not os.path.exists(filename):
        # raising a bare string is not valid Python; raise an exception instead
        raise IOError("Loading error: %s is not a valid filename." % filename)

    try:
        obj = _gl.load_sframe(filename)
        return obj
    except:
        pass

    try:
        obj = _gl.load_sgraph(filename)
        return obj
    except:
        pass

    try:
        obj = _gl.load_model(filename)
        return obj
    except:
        pass

    try:
        obj = _gl.SArray(data=filename)
        return obj
    except:
        pass
    return obj
def __init__(self, dataframe):
    '''Must be run in a python 2 environment.
    INPUT: Cleaned and preprocessed pandas dataframe'''
    self.dataframe = dataframe
    # Graphlab LDA needs a bag-of-words dictionary for each document in the dataset.
    self.dataframe['bow'] = dataframe.ttl_ctxt.apply(
        lambda x: dict(Counter(x.split())))
    # Graphlab also requires an SFrame; build it from the dataframe
    # (load_sframe expects a saved-SFrame path, not a pandas object)
    self.sframe = gl.SFrame(self.dataframe)
    self.bow = self.sframe['bow']

def topic_modelling(self, n_topics, n_iterations):
    # Train Graphlab topic model
    topic_model = gl.topic_model.create(self.bow, num_topics=n_topics,
                                        num_iterations=n_iterations)
    return topic_model

def lda_vis(self, topic_model):
    # Visualize graphlab topic model
    plt.figure()
    pyLDAvis.graphlab.prepare(topic_model, self.bow)
    plt.show()
if not isinstance(s3_bucket, boto.s3.bucket.Bucket): s3_bucket = S3Connection().get_bucket(s3_bucket) for _id, url in id_url_pairs: results.append(get_source(s3_bucket, s3_save_path, _id, url)) return results # Divvy up a list of items as evenly as possible into n lists def divvy(items, n): q, r = divmod(len(items), n) indices = [q * i + min(i, r) for i in xrange(n + 1)] return [items[indices[i]:indices[i + 1]] for i in xrange(n)] # Load Hacker News metadata SFrame from S3 stories_sf = gl.load_sframe("s3://dato-datasets/hacker_news/stories.sframe") # Get a list of ID, URL pairs from SFrame id_url_pairs = [(x["id"], x["url"]) for x in stories_sf if x["url"]] # Divvy the list of ID, URL pairs from above and pass to n=4 workers chunks = divvy(id_url_pairs, 4) # The S3 bucket and path to where source articles are to be stored in S3 # Set this to a bucket to which you have write access s3_bucket = "my-bucket" s3_save_path = "hacker_news/source_html" # Specify EC2 execution environment # The 2nd parameter should be set to an S3 bucket to which you will write logs ec2 = gl.deploy.environment.EC2("ec2", "s3://my-bucket/logs")
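# Worked example of divvy's even split, computed from its definition:
# divmod(10, 4) -> q=2, r=2, so indices = [0, 3, 6, 8, 10] and the
# remainder is spread over the leading chunks.
chunks = divvy(list(range(10)), 4)
assert chunks == [[0, 1, 2], [3, 4, 5], [6, 7], [8, 9]]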
cursor = conn.cursor() pcursor = conn2.cursor() # # # cursor.execute("select owner_id,name,language,forked_from from projects where id=5") # # r=cursor.fetchone() # print r[0] # sf = graphlab.SFrame({'user_id': ["0", "0", "0", "1", "1", "2", "2", "2"],'item_id': ["a", "b", "c", "a", "b", "b", "c", "d"],'rating': [1, 3, 2, 5, 4, 1, 4, 3]}) # m = graphlab.recommender.create(sf, target='rating') # model=gl.load_model('models/watcher_item') # print 'loaded model',time() sf = gl.load_sframe('csv/data') pdata = gl.load_sframe('csv/project_data') pdata = gl.SFrame({ 'item_id': pdata['item_id'], 'language': pdata['language'], 'name': pdata['name'], 'description': pdata['description'] }) # print sf['user_id'] model = gl.recommender.ranking_factorization_recommender.create( sf, target='rating', item_data=pdata) model.save('models/pdata_max') # model=gl.load_model('models/rfm_rating') while True: try:
def __init__(self): self.dbCon = None self.department_dist = {"departments": [], "orders": []} self.item_similarity_top_k = gl.load_sframe( '/home/ec2-user/insta/data/item_similarity_top_5_model')
def load(name): return gl.load_sframe('data/%s_train.sframe' % name), \ gl.load_sframe('data/%s_test.sframe' % name)
import graphlab
import pandas as pd

binary_sf = graphlab.load_sframe('data/presence_absence.sframe/')
proportional_sf = graphlab.load_sframe('data/survey_proportion.sframe/')

proportional_df = proportional_sf.to_dataframe()
proportional_lookup_table = proportional_df.pivot_table(
    values='%present', index='site', columns='taxa', fill_value=0.)

binary_df = binary_sf.to_dataframe()
binary_lookup_table = binary_df.pivot_table(
    values='present', index='site', columns='taxa', fill_value=0)
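# Illustrative toy example (hypothetical data) of the pivot used above:
# long-format survey rows become a site x taxa lookup table, 0-filled.
toy_df = pd.DataFrame({'site': ['A', 'A', 'B'],
                       'taxa': ['oak', 'pine', 'oak'],
                       'present': [1, 1, 1]})
toy_table = toy_df.pivot_table(values='present', index='site',
                               columns='taxa', fill_value=0)
# taxa  oak  pine
# site
# A       1     1
# B       1     0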
# else: # dtypes.pop() # chunk = gl.SFrame.read_csv(file_name, header=True, verbose=True, column_type_hints=dtypes) # # whole = gl.SFrame(chunk) # else: # chunk = gl.SFrame.read_csv(file_name, header=True, verbose=True, column_type_hints=dtypes) # whole = whole.append(chunk) # print whole.shape # # whole.save('~/datasets/Springleaf/' + prefix +'_binary') # unmatches = [i for i, j in zip(dtypes, tocheck) if i != j] #' '.join(data.column_names()) data = gl.load_sframe("~/datasets/Springleaf/train_binary") test = gl.load_sframe("~/datasets/Springleaf/test_binary") # row & column count print "training data has " + str(data.shape[0]) + " rows and " + str(data.shape[1]) + " columns" print "test data has " + str(test.shape[0]) + " rows and " + str(test.shape[1]) + " columns" # Pre-process data ## add the target columns to test data as 9999 # test['target'] = 9999 # ## combine the training and test datasets for data preprocessing # combined = data.append(test) counter = 0 for i in data.column_names():
import graphlab as gl from graphlab import mxnet as mx import os ##mx.pretrained_model.download_model('https://static.turi.com/models/mxnet_models/release/image_classifier/imagenet1k_inception_bn-1.0.tar.gz') mx.pretrained_model.list_models() image_classifier = mx.pretrained_model.load_model('imagenet1k_inception_bn', ctx=mx.gpu(0)) # Load image data into SFrame data_file = 'cats_dogs_sf' if os.path.exists(data_file): sf = gl.load_sframe(data_file) else: url = 'https://static.turi.com/datasets/' + data_file sf = gl.load_sframe(url) sf.save(data_file) # Predict using the pretrained image classifier prediction = image_classifier.predict_topk(sf['image'], k=1) # Extract features from images features = image_classifier.extract_features(sf['image'])
dog_info['dpt'] = dog_data[i]['refuge_name'][:2] dog_info['image_filename'] = dog_data[i]['image_filename'] dog_info['name'] = dog_data[i]['name'] dog_info['url'] = dog_data[i]['url'] dog_json['dog' + str(i)] = dog_info response['response'] = dog_json return response if __name__ == "__main__": parser = argparse.ArgumentParser(description='Name of database') parser.add_argument('db_name') args = parser.parse_args() images = graphlab.load_sframe(path + 'my_images') model = graphlab.load_model(path + 'my_model') client = MongoClient("localhost") db = client.dogos db.dogos_temp.drop() # Issue the serverStatus command and print the results # serverStatusResult=db.command("serverStatus") # pprint(serverStatusResult) num_images = len(images) for dog_id in xrange(num_images): print "Query %d in %d" % (dog_id, num_images) dogo = images[dog_id:dog_id + 1] neighbours = query_model(dogo, model, images)
def main(): #sf = create_frame_from_file('../../Data/data_file_modified.txt') x = gl.load_sframe('py2_ready_for_session') sessions = create_sessions(x) sessions.save('session')