def showPath(self, highlight=None):
    # with open(self.verticesFn, 'a') as Vwr:
    #     with open(self.edgesFn, 'a') as Ewr:
    #         for i in range(8):
    #             Vwr.write('\nc0_' + `i` + ', ')
    #             Ewr.write('\np8_0_t,' + 'c0_' + `i` + ',c')
    #             highlight['c0_' + `i`] = [0.69, 0.0, 0.498]
    # start = datetime.datetime.now()
    edge_data = SFrame.read_csv(self.edgesFn)
    vertex_data = SFrame.read_csv(self.verticesFn)
    g = SGraph(vertices=vertex_data, edges=edge_data, vid_field='name',
               src_field='src', dst_field='dst')
    # end = datetime.datetime.now()
    # print (end - start)
    # g.show(vlabel='attributes', elabel='relation', h_offset=0.3, v_offset=-0.025,
    #        highlight=highlight, arrows=True)
    g.show(vlabel='attributes', vlabel_hover=False, elabel='relation',
           highlight=highlight, arrows=True)
    sleep(20)
def create_frame_from_file(file_name):
    n_total_lines = 220000
    sf = SFrame()
    with open(file_name) as data:
        dt = []
        ip = []
        py = []
        script = []
        id = []
        for i, line in enumerate(data):
            jo = json.loads(line)
            dt += jo['dt']
            ip += jo['ip']
            py += jo['py']
            id += [i]
            script += jo['user_script']
            if i % 100 == 0:
                print float(i) / n_total_lines
    sf = sf.add_column(SArray(id), name='id')
    sf.add_column(SArray(dt), name='dt')
    sf.add_column(SArray(ip), name='ip')
    sf.add_column(SArray(py, dtype=str), name='py')
    sf.add_column(SArray(script), name='user_script')
    sf.save('python_tutor')
    return sf
def get_rating_sf(self, samples, save_to=None):
    sf = SFrame(self.ratings.ix[samples])
    sf['userId'] = sf['userId'].apply(lambda uid: self.user_dict[uid])
    sf['movieId'] = sf['movieId'].apply(lambda mid: self.movie_dict[mid])
    if save_to is not None:
        print "saving sframe to", save_to
        sf.save(save_to)
    return sf
def showPath(self, highlight=None):
    edge_data = SFrame.read_csv(self.edgesFn)
    vertex_data = SFrame.read_csv(self.verticesFn)
    g = SGraph(vertices=vertex_data, edges=edge_data, vid_field='name',
               src_field='src', dst_field='dst')
    g.show(vlabel='id', elabel='relation', highlight=highlight, arrows=True)
    sleep(10)
def showPath(self, highlight=None):
    start = datetime.datetime.now()
    edge_data = SFrame.read_csv(self.edgesFn)
    vertex_data = SFrame.read_csv(self.verticesFn)
    g = SGraph(vertices=vertex_data, edges=edge_data, vid_field='name',
               src_field='src', dst_field='dst')
    end = datetime.datetime.now()
    print(end - start)
    # g.show(vlabel='attributes', elabel='relation', highlight=highlight, arrows=True)
    # sleep(40)
def download(symbol, start_date, end_date):
    stock = Share(symbol)
    # ^GSPC is the Yahoo Finance symbol for the S&P 500 index;
    # gather historical quotes from start_date up to end_date
    hist_quotes = stock.get_historical(start_date, end_date)
    l_date = []
    l_open = []
    l_high = []
    l_low = []
    l_close = []
    l_volume = []
    # reverse the list so the quotes run in chronological order
    hist_quotes.reverse()
    for quotes in hist_quotes:
        l_date.append(quotes['Date'])
        l_open.append(float(quotes['Open']))
        l_high.append(float(quotes['High']))
        l_low.append(float(quotes['Low']))
        l_close.append(float(quotes['Adj_Close']))
        l_volume.append(int(quotes['Volume']))
    sf = SFrame({
        'datetime': l_date,
        'open': l_open,
        'high': l_high,
        'low': l_low,
        'close': l_close,
        'volume': l_volume
    })
    # datetime is a string, so convert it into a datetime object
    sf['datetime'] = sf['datetime'].apply(
        lambda x: datetime.strptime(x, '%Y-%m-%d'))
    return sf
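# A hedged usage sketch for download() above. It assumes the module-level
# imports `from yahoo_finance import Share`, `from graphlab import SFrame`,
# and `from datetime import datetime`, plus network access; the symbol and
# date range are illustrative only.
if __name__ == '__main__':
    sp500 = download('^GSPC', '2001-01-01', '2017-01-01')
    print sp500.head()  # SFrame with datetime/open/high/low/close/volume columns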
def __init__(self):
    rec_data = "../data/train.csv"
    data = ProcessData()
    df_rec = data.get_data(rec_data)
    df_rec = data.clean_data(df_rec)
    df_rec = df_rec[df_rec.record_type == 1]
    sf = SFrame(data=df_rec)
    del df_rec  # memory optimization
    self.modelA = recommender.create(sf, user_column="customer_ID", item_column="A")
    self.modelB = recommender.create(sf, user_column="customer_ID", item_column="B")
    self.modelC = recommender.create(sf, user_column="customer_ID", item_column="C")
    self.modelD = recommender.create(sf, user_column="customer_ID", item_column="D")
    self.modelE = recommender.create(sf, user_column="customer_ID", item_column="E")
    self.modelF = recommender.create(sf, user_column="customer_ID", item_column="F")
def resize_images(filename):
    images = graphlab.image_analysis.load_images(filename, format='auto',
                                                 with_path=False, recursive=False,
                                                 ignore_failure=True,
                                                 random_order=True)
    # firstImages = images[0:9]['image']
    new_images = list()
    new_images.append(graphlab.image_analysis.resize(images['image'], 32, 32,
                                                     channels=4, decode=True))
    frame = SFrame(new_images)
    frame.save('mini')
def load_graph(self, graph_path, direction=1, start_line=0, limit=None,
               blacklist=set(), delimiter=','):
    json_object = utils.is_json(graph_path)
    if json_object is not False:
        # print json_object
        graph_path = SFrame(SArray(json_object).unpack())
        graph_path.rename({'X.0': 'X1', 'X.1': 'X2', 'X.2': 'Weight'})
    else:
        # load_sgraph()
        graph_path = SFrame.read_csv(graph_path, delimiter=delimiter,
                                     header=False,
                                     column_type_hints={'X1': str, 'X2': str},
                                     nrows=limit, skiprows=start_line)
        if self._weight_field != "":
            graph_path.rename({'X3': 'Weight'})
    # print graph_data
    self._graph = self._graph.add_edges(graph_path, src_field='X1', dst_field='X2')
    if not self.is_directed:
        self.to_undirected()
def query_model(dogo, model, images):
    neighbours = get_images_from_ids(model.query(dogo, k=20), images)
    image_list = SFrame(data=None)
    shown_dogs = {dogo['images'][0][0]}
    for i in range(0, len(neighbours)):
        if len(shown_dogs) < 6:
            if neighbours[i]['images'][0] not in shown_dogs:
                # neighbours[i]['image'].show()
                dogo_clone = neighbours[i:i + 1].copy()
                image_list = image_list.append(SFrame(dogo_clone))
                shown_dogs.add(neighbours[i]['images'][0])
        else:
            break
    return image_list
def CC():
    url = '/home/gengl/Datasets/CC/BerkStan/edge.txt'
    data = SFrame.read_csv(url, delimiter='\t', header=False,
                           column_type_hints=[int, int])
    graph = SGraph().add_edges(data, src_field='X1', dst_field='X2')
    cc_model = connected_components.create(graph, verbose=True)
    cc_model.summary()
def SSSP():
    url = '/home/gengl/Datasets/SSSP/BerkStan/edge.txt'
    data = SFrame.read_csv(url, delimiter='\t', header=False,
                           column_type_hints=[int, int, int])
    graph = SGraph().add_edges(data, src_field='X1', dst_field='X2')
    sp_model = shortest_path.create(graph, source_vid=0, weight_field='X3')
    sp_model.summary()
def process_frame(filename):
    sf = gl.load_sframe(filename)
    output_frame = SFrame()  # Set up our output frame
    id = []
    ip = []
    sub_count = []
    error_count = []
    time_count = []
    error_sequence_raw = []
    error_sequence = []
    # How many session IDs do we have?
    sa = gl.SArray()
    sa = sf['session_id']
    test = sa.unique()
    limit = len(test)
    # Start grabbing each session
    for i in range(1, limit):
        # Test output
        if (i % 100 == 0):
            break
        # Get the session and sort it by the date/time
        session_frame = sf.filter_by(i, "session_id")
        # sorted_session = session_frame.sort("dt")
        row = sf[0]
        id += [i]
        ip += [row['ip']]
        sub_count += [len(row)]
        # time_count += [fn_time_count(sorted_session)]
        # error_count += [fn_error_count(sorted_session)]
        # error_sequence_raw += [fn_error_sequence_raw(sorted_session)]
    print len(id)
    print len(ip)
    print len(sub_count)
    # print len(time_count)
    output_frame = output_frame.add_column(SArray(id), name='id')
    output_frame.add_column(SArray(ip), name='ip')
    output_frame.add_column(SArray(sub_count), name='sub_count')
    # output_frame.add_column(SArray(time_count), name='sub_length')
    # output_frame.add_column(SArray(error_count), name='error_count')
    # output_frame.add_column(SArray(error_sequence_raw, dtype=str), name='err_seq_raw')
    output_frame.save('py2_session_analysis')
def showPath(self, highlight=None):
    # start = datetime.datetime.now()
    edge_data = SFrame.read_csv(self.edgesFn)
    vertex_data = SFrame.read_csv(self.verticesFn)
    g = SGraph(vertices=vertex_data, edges=edge_data, vid_field='name',
               src_field='src', dst_field='dst')
    # end = datetime.datetime.now()
    # print (end - start)
    # g.show(vlabel='attributes', vlabel_hover=True, elabel='relation',
    #        h_offset=0.3, v_offset=-0.025, highlight=highlight, arrows=True)
    g.show(vlabel='id', elabel='relation', vlabel_hover=True,
           highlight=highlight, arrows=True)
    sleep(30)
def get_vertices(k, factor0=1):
    return SFrame({
        # Movies
        '__id': mids,
        'factors': map(lambda _: rand(k) * factor0, movie_ids),
        'w': map(lambda _: np.zeros(ng + nht), movie_ids),
        'b': map(lambda _: 0, movie_ids),
        'features': movies_features,
        'user': map(lambda _: 0, movie_ids)
    }).append(SFrame({
        # Users
        '__id': uids,
        'factors': map(lambda _: rand(k) * factor0, user_ids),
        'w': map(lambda _: np.zeros(ng + nht), user_ids),
        'b': map(lambda _: 0, user_ids),
        'features': map(lambda _: {}, user_ids),
        'user': map(lambda _: 1, user_ids)
    }))
def PageRank():
    url = '/clueweb/PageRank/clueweb_20M/edge_pair.txt'
    data = SFrame.read_csv(url, delimiter='\t', header=False,
                           column_type_hints=[int, int])
    graph = SGraph().add_edges(data, src_field='X1', dst_field='X2')
    pr_model = pagerank.create(graph, reset_probability=0.2,
                               threshold=0.000000000001, max_iterations=42,
                               _distributed=True)
    pr_model.summary()
def PageRank():
    url = '/home/gengl/Datasets/PageRank/BerkStan/edge.txt'
    data = SFrame.read_csv(url, delimiter='\t', header=False,
                           column_type_hints=[int, int])
    graph = SGraph().add_edges(data, src_field='X1', dst_field='X2')
    pr_model = pagerank.create(graph, reset_probability=0.2, threshold=0.0001,
                               max_iterations=1000, _distributed=True)
    pr_model.summary()
def fn_time_count(frame):
    sf = frame
    sa = sf['dt']
    num = sf.num_rows()
    fmt = '%Y-%m-%d %H:%M:%S'
    # Convert to datetime objects
    start_dt = datetime.strptime(sa[0], fmt)
    end_dt = datetime.strptime(sa[num - 1], fmt)
    # Convert to Unix timestamps
    start_timestamp = time.mktime(start_dt.timetuple())
    end_timestamp = time.mktime(end_dt.timetuple())
    # Calculate the difference in minutes
    difference = (end_timestamp - start_timestamp) / 60
    return difference
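# A hedged usage sketch for fn_time_count() above, assuming `from graphlab
# import SFrame`, `from datetime import datetime`, and `import time` at module
# level; the session timestamps below are illustrative only.
session = SFrame({'dt': ['2015-03-01 10:00:00', '2015-03-01 10:12:30']})
print fn_time_count(session)  # 12.5 minutes between first and last submission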
def train_model(filename):
    # load the already prepared data in the form of an SFrame
    image_train = graphlab.SFrame(filename)
    # load the pre-trained model
    loaded_model = graphlab.load_model('model/')
    # extract features of the model on the given pictures
    image_train['deep_features'] = loaded_model.extract_features(SFrame(image_train))
    # add ids to the SFrame to be able to find the closest images
    ids = SArray(list(range(0, len(image_train))))
    image_train.add_column(ids, name='id')
    # print image_train.head()
    # train the nearest-neighbours model on the extracted features
    knn_model = graphlab.nearest_neighbors.create(image_train,
                                                  features=['deep_features'],
                                                  label='id')
    return knn_model, image_train
def fit_model(play_matrix):
    """Take a matrix of play counts and return a GraphLab recommender model
    fitted to these data.

    INPUT: play_matrix (numpy array): Array of features to fit to
    OUTPUT: mod (GraphLab model): GraphLab matrix factorisation recommender model
    """
    plays_df = pd.DataFrame(play_matrix)
    plays_sf = SFrame(plays_df)
    agg_sf = plays_sf.groupby(
        'user_id',
        operations={'mean_plays': agg.MEAN('play_count'),
                    'sd_plays': agg.STDV('play_count'),
                    'play_quantile': agg.QUANTILE('play_count',
                                                  [0.2, 0.4, 0.6, 0.8, 1])})
    plays_sf = plays_sf.join(agg_sf, on='user_id', how='inner')
    play_quantiles = np.array(plays_sf['play_quantile'])
    play_counts = np.array(plays_sf['play_count'])
    play_counts = play_counts.reshape(play_counts.shape[0], 1)
    # rating = how many of the user's quantile boundaries the play count
    # falls at or below
    plays_sf['rating'] = np.sum(play_counts <= play_quantiles, axis=1)
    mod = graphlab.recommender.create(plays_sf, user_id='user_id',
                                      item_id='song_title', target='rating',
                                      ranking=True)
    return mod
def SSSP():
    url = '/home/gengl/Datasets/SSSP/Google/edge.txt'
    data = SFrame.read_csv(url, delimiter='\t', header=False,
                           column_type_hints=[int, int, int])
    graph = SGraph().add_edges(data, src_field='X1', dst_field='X2')
    sp_model = shortest_path.create(graph, source_vid=0, weight_field='X3')
    sp_model.summary()
    with open('/home/gengl/sssp_graphlab', 'w') as fo:
        for vid in range(0, 875713):
            try:
                result_pair = sp_model.get_path(vid)
                fo.write(str(result_pair[-1]) + '\n')
            except:
                pass
def graph_lab(url, format='auto', flip_img=False, zoom=False):
    """Extract the GraphLab image features."""
    if format == 'auto':
        extension = url.split('.')[-1]
    img = preprocess(url)
    if flip_img:
        img = flip(img)
    if zoom:
        img = middle(img)
    h, w, _ = img.shape
    img_bytes = bytearray(img)
    image_data_size = len(img_bytes)
    img = graphlab.Image(_image_data=img_bytes, _width=w, _height=h,
                         _channels=3, _format_enum=2,
                         _image_data_size=image_data_size)
    return SFrame({'image': [img]})
def test_graphlab(num_factors=10, reg=0.01, niter=50):
    '''Test the graphlab install.'''
    url = 'http://s3.amazonaws.com/GraphLab-Datasets/movie_ratings/training_data.csv'
    data = SFrame(url)
    mfac = recommender.matrix_factorization.create(data, 'user', 'movie',
                                                   'rating', num_factors,
                                                   reg=reg, nmf=True,
                                                   use_bias=True,
                                                   holdout_probability=0.2,
                                                   niter=niter, random_seed=42)
    print mfac.summary
    return mfac
def test_graphlab2(num_factors=10, reg=0.1, niter=50):
    '''Test the graphlab install with our data.'''
    infile = PARS['data_dir'] + 'subset_partial.csv'
    data = SFrame(infile)
    mfac = recommender.matrix_factorization.create(data, 'id', 'brand',
                                                   'purchasequantity',
                                                   num_factors, reg=reg,
                                                   nmf=True, use_bias=True,
                                                   holdout_probability=0.2,
                                                   niter=niter, random_seed=42)
    print mfac.summary
    return mfac
def append_images(json_file):
    # fill an SFrame with all the given metadata of the dogs
    meta = SFrame.read_json(json_file, orient='records')
    # this is the SFrame that we will fill with the metadata plus the images,
    # and which will be saved in the final file
    image_list = SFrame(data=None)
    # for each image in the 'images' column of the meta SFrame, we add one row
    # to the final SFrame, with one image per row
    for i in range(0, len(meta) - 1):
        dogo = meta[i:i + 1]
        for image in dogo['images'][0]:
            # print image
            dogo_clone = dogo.copy()
            dogo_clone.add_column(SArray([graphlab.Image(images_path + image)]),
                                  name='image')
            dogo_clone.add_column(SArray([image]), name='image_filename')
            image_list = image_list.append(SFrame(dogo_clone))
    image_list.save(filename='prepared_data/')
def data_frame_with_target(self, data_frame):
    """
    :param data_frame:
    :type data_frame: DataFrame
    :return:
    :rtype: SFrame
    """
    data_sframe = SFrame(data_frame.toPandas())
    sentiment_array = data_sframe.select_column('sentiment')
    target_array = []
    for x in sentiment_array:
        try:
            target_array.append(self.convert_func(x))
        except Exception as ex:
            print len(target_array), 'get_sentiments', x
            target_array.append(3)
            print ex
    data_sframe.add_column(SArray(target_array, dtype=int), name='target')
    print data_sframe
    return data_sframe.dropna()
import graphlab as gl
import datetime

# Create the Hadoop cluster
c = gl.deploy.hadoop_cluster.create(
    name='test-cluster',
    dato_dist_path='hdfs://ec2-54-215-136-187.us-west-1.compute.amazonaws.com:9000/dato/tmp',
    hadoop_conf_dir='/usr/local/hadoop/etc/hadoop',
    num_containers=3)
print c

from graphlab import SFrame, SGraph

url = 'hdfs://ec2-54-215-136-187.us-west-1.compute.amazonaws.com:9000/data/pokec.txt'
data = SFrame.read_csv(url, delimiter='\t', header=False)
g = SGraph().add_edges(data, src_field='X2', dst_field='X1')

# Triangle counting
from graphlab import triangle_counting
tc = triangle_counting.create(g)
tc_out = tc['triangle_count']

# PageRank
from graphlab import pagerank
datetime.datetime.now()
pr = pagerank.create(g, threshold=0.001)
datetime.datetime.now()

# Connected components
from graphlab import connected_components
datetime.datetime.now()
cc = connected_components.create(g)
datetime.datetime.now()
# g = SGraph(vertices=vertex_data, edges=edge_data, vid_field='name',
#            src_field='src', dst_field='dst')
# targets = ['James Bond', 'Moneypenny']
# subgraph = g.get_neighborhood(ids=targets, radius=1, full_subgraph=True)
# subgraph.show(vlabel='id', highlight=['James Bond', 'Moneypenny'], arrows=True)

# from graphlab import SGraph, Vertex, Edge
# g = SGraph()
# verts = [Vertex(0, attr={'breed': 'labrador'}),
#          Vertex(1, attr={'breed': 'labrador'}),
#          Vertex(2, attr={'breed': 'vizsla'})]
# g = g.add_vertices(verts)
# g = g.add_edges(Edge(1, 2))
# print g

from graphlab import SFrame, SGraph

edge_data = SFrame.read_csv('http://s3.amazonaws.com/dato-datasets/bond/bond_edges.csv')
vertex_data = SFrame.read_csv('http://s3.amazonaws.com/dato-datasets/bond/bond_vertices.csv')
g = SGraph(vertices=vertex_data, edges=edge_data, vid_field='name',
           src_field='src', dst_field='dst')
# print g
g.show()
def create_sessions(sf=SFrame()):
    assert type(sf) == SFrame
    ip = []
    user_script = []
    err_msg = []
    compile_err = []
    of_interest = []
    ignored = 0
    for i in xrange(len(sf)):
        count = sf['count'][i]
        if count != len(sf['id'][i]):
            ignored += 1
            continue
        tip = sf['ip'][i]
        chunk_user_script = cut_dict_by_dt(sf['user_script'][i])
        user_script += chunk_user_script
        ip += [tip] * len(chunk_user_script)
        err_msg += cut_dict_by_dt(sf['err_msg'][i])
        chunk_compile_err = cut_dict_by_dt(sf['compile_err'][i])
        compile_err += chunk_compile_err
        of_interest += set_of_interest_bit(chunk_compile_err)
    print "DEBUG:", "ignored:", ignored
    rst = SFrame()
    rst.add_column(SArray(ip, dtype=str), name='ip')
    rst.add_column(SArray(user_script, dtype=dict), name='user_script')
    rst.add_column(SArray(err_msg, dtype=dict), name='err_msg')
    rst.add_column(SArray(compile_err, dtype=dict), name='compile_err')
    rst.add_column(SArray(of_interest, dtype=int), name='of_interest')
    return rst
def main():
    with open('../../Data/data_file_modified.txt') as data:
        sf = SFrame()
        # Data model format:
        # RecordID | Date/Time | IP Address | Python Version |
        # User Script | Compile Flag | Compile Message
        id = []
        dt = []
        ip = []
        py = []
        script = []
        error = []
        error_msg = []
        for i, line in enumerate(data):
            jo = json.loads(line)
            # Two different versions of Python scripts
            # need to be compiled on different interpreters
            if (jo['py'][0] == 3):
                # Set up the data model we're using
                id += [i]
                dt += jo['dt']
                ip += jo['ip']
                py += jo['py']
                script += jo['user_script']
                # Run the script through the compile method
                # and capture any error message
                flag = False
                msg = ""
                pattern = "is local and global"
                try:
                    compile(jo['user_script'][0], '<string>', 'exec')
                except SyntaxError as e:
                    if (re.search(pattern, str(e))):
                        msg = "Variable is Local and Global"
                    else:
                        msg = str(e)
                    flag = True
                if (flag):
                    error += [1]
                else:
                    error += [0]
                # We need to chop off the error type and remove the
                # (filename, line number) for the message to have
                # any meaning here.
                fix_msg = msg.partition('(')[0]
                error_msg += [fix_msg.strip()]
    sf = sf.add_column(SArray(id), name='id')
    sf.add_column(SArray(dt), name='dt')
    sf.add_column(SArray(ip), name='ip')
    sf.add_column(SArray(py, dtype=str), name='py')
    sf.add_column(SArray(script), name='user_script')
    sf.add_column(SArray(error), name='compile_err')
    sf.add_column(SArray(error_msg), name='err_msg')
    sf.save('py3_error_frame_clean')
verbose = False
vertexFiles = [
    "City", "Country", "Region", "Advisor", "Category", "Founder",
    "FundingRound", "HQ", "keywords", "Member", "Office", "organizations",
    "PrimaryImage", "TeamMember", "Website", "companies_acquired_by_sap"
]
edgesFiles = [
    "GeoInformation", "acquisitions", "categories_keywords_edges",
    "investments", "keywords_descriptions_edges", "keywords_webpages_edges",
    "relationships", "companies_acquired_by_sap_edges"
]

g = SGraph()

for f in vertexFiles:
    content = SFrame.read_csv(path + f + '.csv', na_values='null',
                              verbose=verbose)
    if 'path' in content.column_names():
        g = g.add_vertices(content, vid_field='path')
    elif 'url' in content.column_names():
        g = g.add_vertices(content, vid_field='url')
    else:
        print "Unknown vid field: ", content.column_names()
        sys.exit()

for f in edgesFiles:
    content = SFrame.read_csv(path + f + '.csv', na_values='null',
                              verbose=verbose)
    if 'src' in content.column_names() and 'dst' in content.column_names():
        g = g.add_edges(content, src_field='src', dst_field='dst')
# -------
import pandas as pd
from graphlab import SFrame
from graphlab import popularity_recommender as pr

# -----------
# Data Import
# -----------
col_names = ['user_id', 'item_id', 'rating', 'timestamp']
data = pd.read_table('u.data', names=col_names)
data = data.drop('timestamp', 1)

# ----------
# Data Split
# ----------
sf = SFrame(data=data, format='auto')
train, test = sf.random_split(.7)
print(len(train))
print(len(test))

# ----------------------
# Popularity Recommender
# ----------------------
recommender = pr.create(train, target='rating')
eval = recommender.evaluate(test)
# ('\nOverall RMSE: ', 1.0262364153594763)
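# -------------------------------
# Example: top-N recommendations
# -------------------------------
# A hedged follow-up sketch: GraphLab recommender models expose recommend();
# the user ids and k below are illustrative only.
top_items = recommender.recommend(users=[1, 2, 3], k=5)
print(top_items)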
def read_columns_to_sframe(self, column_map):
    from graphlab import SFrame
    sf = SFrame.read_csv(csv_datasource_path(self.name))
    return sf.rename(invert_dict(column_map))
def get_sf_from_coo(coo, save_to):
    sf = SFrame({'userId': coo.row, 'movieId': coo.col, 'rating': coo.data})
    if save_to is not None:
        print "saving sframe to", save_to
        sf.save(save_to)
    return sf
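# A hedged usage sketch for get_sf_from_coo() above, assuming the ratings are
# held in a scipy.sparse COO matrix; the shape and values are illustrative only.
import numpy as np
from scipy.sparse import coo_matrix

ratings_coo = coo_matrix((np.array([4.0, 3.5]),      # rating data
                          (np.array([0, 1]),          # userId (row indices)
                           np.array([10, 20]))),      # movieId (column indices)
                         shape=(2, 21))
sf = get_sf_from_coo(ratings_coo, save_to=None)  # None skips the save step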
        song_3a=feat_mat['title'][feat_mat['song_id'] == song_pairs[2][0]][0],
        song_3b=feat_mat['title'][feat_mat['song_id'] == song_pairs[2][1]][0],
        artist_1a=feat_mat['artist_name'][feat_mat['song_id'] == song_pairs[0][0]][0],
        artist_1b=feat_mat['artist_name'][feat_mat['song_id'] == song_pairs[0][1]][0],
        artist_2a=feat_mat['artist_name'][feat_mat['song_id'] == song_pairs[1][0]][0],
        artist_2b=feat_mat['artist_name'][feat_mat['song_id'] == song_pairs[1][1]][0],
        artist_3a=feat_mat['artist_name'][feat_mat['song_id'] == song_pairs[2][0]][0],
        artist_3b=feat_mat['artist_name'][feat_mat['song_id'] == song_pairs[2][1]][0])
    # Pass song names/titles to choose from to the HTML template


if __name__ == '__main__':
    play_count_path = 'train_triplets.txt'
    feat_path = 'trimmed_tempos.csv'
    model_path = 'full_two_hour_mod_directory'
    feat_mat = SFrame.read_csv(feat_path)
    model = graphlab.load_model(model_path)
    song_pairs = generate_pairs(model, np.array(feat_mat['song_id']), k=3)
    pref_songs = get_user_prefs(feat_mat, song_pairs)
    playlist = get_playlist(model, pref_songs, feat_mat, desired_tempo=160,
                            tempo_margin=10, playlist_length=10)
    songs = playlist['song_name'].values
    artists = playlist['artist'].values
    tempoxs = playlist['tempo_multiplier'].values
    cadences = playlist['effective_tempo'].values
    pl_length = 10
    timeline_obj = ""
# import pandas as pd
import numpy as np
import os
import graphlab as gl
from graphlab import SFrame
import pandas as pd


def merge_data(df):
    return ''.join([str(df["store_nbr"]), "_", str(df["item_nbr"]), "_",
                    df["date"]])


ind = True
weather = SFrame.read_csv(os.path.join('..', "data", "weather_modified_3.csv"))
if ind:
    test = SFrame.read_csv(os.path.join('..', "data", "test.csv"))
train = SFrame.read_csv(os.path.join('..', "data", "train.csv"))
key = SFrame.read_csv(os.path.join('..', "data", "key.csv"))
zero_items = SFrame.read_csv(os.path.join('..', 'data', 'zero_items_solid_new.csv'))
train_new = train.join(zero_items)
if ind:
    test_new = test.join(zero_items)
        [0]][0],
        artist_2b=feat_mat['artist_name'][feat_mat['song_id'] == song_pairs[1][1]][0],
        artist_3a=feat_mat['artist_name'][feat_mat['song_id'] == song_pairs[2][0]][0],
        artist_3b=feat_mat['artist_name'][feat_mat['song_id'] == song_pairs[2][1]][0])
    # Pass song names/titles to choose from to the HTML template


if __name__ == '__main__':
    play_count_path = 'train_triplets.txt'
    feat_path = 'trimmed_tempos.csv'
    model_path = 'full_two_hour_mod_directory'
    feat_mat = SFrame.read_csv(feat_path)
    model = graphlab.load_model(model_path)
    song_pairs = generate_pairs(model, np.array(feat_mat['song_id']), k=3)
    pref_songs = get_user_prefs(feat_mat, song_pairs)
    playlist = get_playlist(model, pref_songs, feat_mat, desired_tempo=160,
                            tempo_margin=10, playlist_length=10)
    songs = playlist['song_name'].values
    artists = playlist['artist'].values
    tempoxs = playlist['tempo_multiplier'].values
    cadences = playlist['effective_tempo'].values
    pl_length = 10
def build_data_graph():
    file_path = "/Users/blahiri/healthcare/documents/recommendation_system/"
    beneficiaries = SFrame.read_csv(file_path + "beneficiary_summary_2008_2009.csv")
    bene_packed = beneficiaries.pack_columns(column_prefix='chron_', dtype=dict,
                                             new_column_name='chronic_conditions',
                                             remove_prefix=False)
    # x is a row of bene_packed in the following lambda. We insert the desynpuf_id
    # into the (key, value) tuple, convert the tuple to a list by calling list(),
    # and the outer [] makes sure we emit a list of lists.
    bene_chrons = bene_packed.flat_map(
        ["chronic_condition_name", "chronic_condition_value", "desynpuf_id"],
        lambda x: [list(k + (x['desynpuf_id'],))
                   for k in x['chronic_conditions'].iteritems()])
    bene_chrons = bene_chrons[bene_chrons['chronic_condition_value'] == 1]
    del bene_chrons['chronic_condition_value']
    bene_chrons.rename({'chronic_condition_name': 'chronic_condition'})

    g = SGraph()
    bene_chrons['relation'] = 'had_chronic'
    g = g.add_edges(bene_chrons, src_field='desynpuf_id',
                    dst_field='chronic_condition')
    print g.summary()

    # Take out the distinct IDs of patients with chronic conditions
    # to avoid repetition in the query
    bene_with_chrons = SFrame(None)
    bene_with_chrons.add_column(bene_chrons['desynpuf_id'].unique(), 'desynpuf_id')

    # Add edges to the graph indicating which patient had which diagnosed condition
    tcdc = SFrame.read_csv(file_path + "transformed_claim_diagnosis_codes.csv")
    cols_to_drop = ['clm_id', 'clm_from_dt', 'clm_thru_dt', 'claim_type',
                    'clm_thru_year']
    for column in cols_to_drop:
        del tcdc[column]
    # The same patient can be diagnosed with the same condition multiple times
    # a year, so take distinct rows
    tcdc = tcdc.unique()
    # Take diagnosed conditions only for those patients who had some chronic
    # condition in 2008 or 2009. It is possible that such a patient had no
    # diagnosed condition, however.
    bene_chrons_tcdc = bene_with_chrons.join(tcdc)
    bene_chrons_tcdc['relation'] = 'diagnosed_with'
    g = g.add_edges(bene_chrons_tcdc, src_field='desynpuf_id', dst_field='dgns_cd')
    print g.summary()

    # Add edges to the graph indicating which patient underwent which procedure
    tcpc = SFrame.read_csv(file_path + "transformed_claim_prcdr_codes.csv",
                           column_type_hints={'prcdr_cd': str})
    cols_to_drop = ['clm_id', 'clm_from_dt', 'clm_thru_dt', 'claim_type',
                    'clm_thru_year']
    for column in cols_to_drop:
        del tcpc[column]
    tcpc = tcpc.unique()
    # Take procedures only for those patients who had some chronic condition
    # in 2008 or 2009. It is possible that such a patient had no procedure, however.
    bene_chrons_tcpc = bene_with_chrons.join(tcpc)
    bene_chrons_tcpc['relation'] = 'underwent'
    g = g.add_edges(bene_chrons_tcpc, src_field='desynpuf_id', dst_field='prcdr_cd')
    print g.summary()

    # Add edges to the graph indicating which patient had which medicine
    pde = SFrame.read_csv(file_path + "prescribed_drugs.csv")
    pde = pde.unique()
    # Take medicines only for those patients who had some chronic condition
    # in 2008 or 2009. It is possible that such a patient had no medicine, however.
    bene_chrons_pde = bene_with_chrons.join(pde)
    bene_chrons_pde['relation'] = 'had_drug'
    g = g.add_edges(bene_chrons_pde, src_field='desynpuf_id',
                    dst_field='substancename')
    print g.summary()
    return g
outputPath = os.environ.get("OUTPUT_PATH")
startScale = int(os.environ.get("START_SCALE"))
tagFile = './tmp'

with open(tagFile, 'r') as f:
    infor = f.readline().strip().split(",")
    maxScale = int(infor[1])
    realEndScale = int(infor[2])

scaleRange = range(startScale, realEndScale + 1)

for scale in scaleRange:
    inputPath = os.path.join(outputPath, 'tmp', 'AdjacentRelationships', str(scale))
    url = inputPath
    data = SFrame.read_csv(url, header=False)
    if data.num_rows() == 0:
        cc_ids = SFrame({"__id": [], "component_id": []})
    else:
        g = SGraph().add_edges(data, src_field=data.column_names()[0],
                               dst_field=data.column_names()[1])
        cc = connected_components.create(g)
        cc_ids = cc.get('component_id')
    path = os.path.join(outputPath, 'tmp', 'ConnectedComponents', str(scale))
    if not os.path.exists(path):
        os.makedirs(path)
    SFrame.export_csv(cc_ids, os.path.join(path))
import graphlab
from graphlab import SFrame

train_input = graphlab.image_analysis.load_images('train_images/', "auto",
                                                   with_path=False,
                                                   random_order=False)
train_output = SFrame.read_csv('train_outputs.csv', delimiter=',', header=True,
                               column_type_hints=[int, int])
train_output.rename({'Prediction': 'label'})
train_output.remove_column('Id')
train_output.add_column(train_input.select_column('image'), name='image')

training_data, validation_data = train_output.random_split(0.8)
training_data['image'] = graphlab.image_analysis.resize(training_data['image'],
                                                        28, 28, 1)
validation_data['image'] = graphlab.image_analysis.resize(validation_data['image'],
                                                          28, 28, 1)

mnist_net = graphlab.deeplearning.get_builtin_neuralnet('mnist')
# net = graphlab.deeplearning.create(sf, target='Prediction')
m = graphlab.neuralnet_classifier.create(training_data, target='label',
                                         network=mnist_net,
                                         validation_set=validation_data,
                                         max_iterations=200)
# test_data = graphlab.image_analysis.load_images('test_images/', "auto",
#                                                 with_path=False, random_order=False)
# pred = m.classify(test_data)
def process_frame(frame_name):
    # Set up columns for the new frame
    session_id = []
    ip_address = []
    python_version = []
    interest = []
    submissions = []
    # Load in the frame we're processing
    frame = gl.load_sframe(frame_name)
    # Sort the frame by IP and then DT, ascending
    sorted_frame = frame.sort(['ip', 'dt'])
    # Previous IP, to see whether we're looking at a new IP address
    previous_ip = 0
    previous_py = 0
    # Counters (for keys)
    record_counter = 1
    submission_counter = 1
    # Dictionary to hold submissions
    submissions_collection = {}
    # Loop through all records to break this up into
    # IP address and then 'session' chunks
    for i in xrange(len(sorted_frame)):
        if (i == 1):
            print sorted_frame['ip'][i]
            break
        if (i % 100 == 0):
            print "processing record:" + str(i)
        if (sorted_frame['ip'][i] != previous_ip):
            if (previous_ip != 0):
                # Add the record to the frame
                session_id.append(str(record_counter))
                ip_address.append(str(previous_ip))
                python_version.append(str(previous_py))
                interest.append(str(is_interesting(submissions_collection)))
                submissions.append(submissions_collection)
                # Reset all values
                submissions_collection = {}
                previous_ip = sorted_frame['ip'][i]
                previous_py = sorted_frame['py'][i]
                record_counter = record_counter + 1
                submission_counter = 1
                # Create and append the submission
                d = {}
                d['date-time'] = sorted_frame['dt'][i]
                d['code_segment'] = sorted_frame['user_script'][i]
                d['error_message'] = sorted_frame['err_msg'][i]
                d['error_flag'] = sorted_frame['compile_err'][i]
                submissions_collection[str(submission_counter)] = d
                submission_counter = submission_counter + 1
            else:
                # Handle the very first record
                previous_ip = sorted_frame['ip'][i]
                previous_py = sorted_frame['py'][i]
                # Create and append the submission
                d = {}
                d['date-time'] = sorted_frame['dt'][i]
                d['code_segment'] = sorted_frame['user_script'][i]
                d['error_message'] = sorted_frame['err_msg'][i]
                d['error_flag'] = sorted_frame['compile_err'][i]
                submissions_collection[str(submission_counter)] = d
                submission_counter = submission_counter + 1
        else:
            # Create and append the submission
            d = {}
            d['date-time'] = sorted_frame['dt'][i]
            d['code_segment'] = sorted_frame['user_script'][i]
            d['error_message'] = sorted_frame['err_msg'][i]
            d['error_flag'] = sorted_frame['compile_err'][i]
            submissions_collection[str(submission_counter)] = d
            submission_counter = submission_counter + 1
    # Finally, create the frame and save it
    print ip_address
    print len(session_id)
    print len(ip_address)
    print len(python_version)
    print len(submissions)
    rst = SFrame()
    rst.add_column(SArray(session_id, dtype=str), name='session_id')
    rst.add_column(SArray(ip_address, dtype=str), name='ip_address')
    rst.add_column(SArray(python_version, dtype=str), name='python_version')
    rst.add_column(SArray(submissions, dtype=dict), name='submissions')
    rst.save("test_frame")
# targets = ['James Bond', 'Moneypenny']
# subgraph = g.get_neighborhood(ids=targets, radius=1, full_subgraph=True)
# subgraph.show(vlabel='id', highlight=['James Bond', 'Moneypenny'], arrows=True)

# from graphlab import SGraph, Vertex, Edge
# g = SGraph()
# verts = [Vertex(0, attr={'breed': 'labrador'}),
#          Vertex(1, attr={'breed': 'labrador'}),
#          Vertex(2, attr={'breed': 'vizsla'})]
# g = g.add_vertices(verts)
# g = g.add_edges(Edge(1, 2))
# print g

from graphlab import SFrame, SGraph

edge_data = SFrame.read_csv(
    'http://s3.amazonaws.com/dato-datasets/bond/bond_edges.csv')
vertex_data = SFrame.read_csv(
    'http://s3.amazonaws.com/dato-datasets/bond/bond_vertices.csv')
g = SGraph(vertices=vertex_data, edges=edge_data, vid_field='name',
           src_field='src', dst_field='dst')
# print g
g.show()