def __init__(self):
    rec_data = "../data/train.csv"
    data = ProcessData()
    df_rec = data.get_data(rec_data)
    df_rec = data.clean_data(df_rec)
    df_rec = df_rec[df_rec.record_type == 1]
    sf = SFrame(data=df_rec)
    del df_rec  # memory optimization
    self.modelA = recommender.create(sf, user_column="customer_ID", item_column="A")
    self.modelB = recommender.create(sf, user_column="customer_ID", item_column="B")
    self.modelC = recommender.create(sf, user_column="customer_ID", item_column="C")
    self.modelD = recommender.create(sf, user_column="customer_ID", item_column="D")
    self.modelE = recommender.create(sf, user_column="customer_ID", item_column="E")
    self.modelF = recommender.create(sf, user_column="customer_ID", item_column="F")
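# Usage sketch for the per-option models built in __init__ above. The class
# name `Recommenders` and the sample customer_ID are assumptions, and
# recommend(users=..., k=...) is the usual GraphLab recommender call; the
# older releases that take user_column/item_column may differ slightly.
recs = Recommenders()
# top 3 suggested values of option A for one customer
print recs.modelA.recommend(users=['10000001'], k=3)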
def load_graph(self, graph_path, direction=1, start_line=0, limit=None, blacklist=set(), delimiter=','):
    json_object = utils.is_json(graph_path)
    if json_object is not False:
        # the input is an in-memory JSON edge list rather than a file path
        edges = SFrame(SArray(json_object).unpack())
        edges.rename({'X.0': 'X1', 'X.1': 'X2', 'X.2': 'Weight'})
    else:
        edges = SFrame.read_csv(graph_path,
                                delimiter=delimiter,
                                header=False,
                                column_type_hints={'X1': str, 'X2': str},
                                nrows=limit,
                                skiprows=start_line)
        if self._weight_field != "":
            edges.rename({'X3': 'Weight'})
    self._graph = self._graph.add_edges(edges, src_field='X1', dst_field='X2')
    if not self.is_directed:
        self.to_undirected()
def download(symbol, start_date, end_date):
    stock = Share(symbol)
    # ^GSPC is the Yahoo Finance symbol for the S&P 500 index
    # gather historical quotes from start_date up to end_date
    hist_quotes = stock.get_historical(start_date, end_date)
    l_date = []
    l_open = []
    l_high = []
    l_low = []
    l_close = []
    l_volume = []
    # quotes arrive newest-first, so reverse into chronological order
    hist_quotes.reverse()
    for quotes in hist_quotes:
        l_date.append(quotes['Date'])
        l_open.append(float(quotes['Open']))
        l_high.append(float(quotes['High']))
        l_low.append(float(quotes['Low']))
        l_close.append(float(quotes['Adj_Close']))
        l_volume.append(int(quotes['Volume']))
    sf = SFrame({'datetime': l_date,
                 'open': l_open,
                 'high': l_high,
                 'low': l_low,
                 'close': l_close,
                 'volume': l_volume})
    # datetime is a string, so convert it into a datetime object
    sf['datetime'] = sf['datetime'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
    return sf
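# Usage sketch for download(), with the imports the function itself relies on
# (the yahoo_finance package, datetime, and GraphLab's SFrame). The symbol and
# date range are examples only.
from yahoo_finance import Share
from datetime import datetime
from graphlab import SFrame

sp500 = download('^GSPC', '2001-01-01', '2017-01-01')
print sp500.head()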
def get_rating_sf(self, samples, save_to=None):
    sf = SFrame(self.ratings.ix[samples])
    sf['userId'] = sf['userId'].apply(lambda uid: self.user_dict[uid])
    sf['movieId'] = sf['movieId'].apply(lambda mid: self.movie_dict[mid])
    if save_to is not None:
        print "saving sframe to", save_to
        sf.save(save_to)
    return sf
def query_model(dogo, model, images):
    neighbours = get_images_from_ids(model.query(dogo, k=20), images)
    image_list = SFrame(data=None)
    # the query dog itself counts as shown, so at most five neighbours are kept
    shown_dogs = {dogo['images'][0][0]}
    for i in range(0, len(neighbours)):
        if len(shown_dogs) < 6:
            if neighbours[i]['images'][0] not in shown_dogs:
                dogo_clone = neighbours[i:i + 1].copy()
                image_list = image_list.append(SFrame(dogo_clone))
                shown_dogs.add(neighbours[i]['images'][0])
        else:
            break
    return image_list
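# Usage sketch for query_model(), assuming the SFrame saved by append_images()
# below and a previously saved nearest-neighbours model; both paths and the
# helper get_images_from_ids() are assumptions from this module's context.
import graphlab
from graphlab import SFrame

images = SFrame('prepared_data/')
knn_model = graphlab.load_model('dogo_knn_model/')
dogo = images[0:1]
similar = query_model(dogo, knn_model, images)
print similar['image_filename']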
def append_images(json_file):
    # fill an SFrame with all the given metadata of the dogs
    meta = SFrame.read_json(json_file, orient='records')
    # this SFrame will hold the metadata plus the image, one image per row,
    # and is saved to the final file
    image_list = SFrame(data=None)
    # for each image in the images column of the meta SFrame, add one row to the final SFrame
    for i in range(len(meta)):
        dogo = meta[i:i + 1]
        for image in dogo['images'][0]:
            dogo_clone = dogo.copy()
            dogo_clone.add_column(SArray([graphlab.Image(images_path + image)]), name='image')
            dogo_clone.add_column(SArray([image]), name='image_filename')
            image_list = image_list.append(SFrame(dogo_clone))
    image_list.save(filename='prepared_data/')
def get_vertices(k, factor0=1):
    return SFrame({
        # Movies
        '__id': mids,
        'factors': map(lambda _: rand(k) * factor0, movie_ids),
        'w': map(lambda _: np.zeros(ng + nht), movie_ids),
        'b': map(lambda _: 0, movie_ids),
        'features': movies_features,
        'user': map(lambda _: 0, movie_ids)
    }).append(
        SFrame({
            # Users
            '__id': uids,
            'factors': map(lambda _: rand(k) * factor0, user_ids),
            'w': map(lambda _: np.zeros(ng + nht), user_ids),
            'b': map(lambda _: 0, user_ids),
            'features': map(lambda _: {}, user_ids),
            'user': map(lambda _: 1, user_ids)
        }))
def train_model(filename):
    # load the already-prepared data in the form of an SFrame
    image_train = graphlab.SFrame(filename)
    # load the pre-trained model
    loaded_model = graphlab.load_model('model/')
    # extract features of the model on the given pictures
    image_train['deep_features'] = loaded_model.extract_features(image_train)
    # add ids to the SFrame to be able to find the closest images
    ids = SArray(list(range(0, len(image_train))))
    image_train.add_column(ids, name='id')
    # train the nearest-neighbours model on the extracted features
    knn_model = graphlab.nearest_neighbors.create(image_train,
                                                  features=['deep_features'],
                                                  label='id')
    return knn_model, image_train
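# Usage sketch: train on the prepared SFrame, then look up the five nearest
# neighbours of the first image. The path is an assumption; query() is the
# standard graphlab.nearest_neighbors model call.
knn_model, image_train = train_model('prepared_data/')
neighbours = knn_model.query(image_train[0:1], k=5)
print neighbours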
def test_graphlab(num_factors=10, reg=0.01, niter=50):
    '''Test the GraphLab install.'''
    url = 'http://s3.amazonaws.com/GraphLab-Datasets/movie_ratings/training_data.csv'
    data = SFrame(url)
    mfac = recommender.matrix_factorization.create(data, 'user', 'movie', 'rating',
                                                   num_factors,
                                                   reg=reg,
                                                   nmf=True,
                                                   use_bias=True,
                                                   holdout_probability=0.2,
                                                   niter=niter,
                                                   random_seed=42)
    print mfac.summary
    return mfac
def graph_lab(url, format='auto', flip_img=False, zoom=False):
    """Extracts the graphlab features"""
    if format == 'auto':
        extension = url.split('.')[-1]
    img = preprocess(url)
    if flip_img:
        img = flip(img)
    if zoom:
        img = middle(img)
    h, w, _ = img.shape
    img_bytes = bytearray(img)
    image_data_size = len(img_bytes)
    img = graphlab.Image(_image_data=img_bytes,
                         _width=w,
                         _height=h,
                         _channels=3,
                         _format_enum=2,
                         _image_data_size=image_data_size)
    return SFrame({'image': [img]})
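# Usage sketch for graph_lab(): build a one-row image SFrame from a file,
# mirrored horizontally. The file name is an example; preprocess(), flip()
# and middle() are helpers assumed to exist in this module.
sf = graph_lab('dog.jpg', flip_img=True)
print sf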
def resize_images(filename):
    images = graphlab.image_analysis.load_images(filename,
                                                 format='auto',
                                                 with_path=False,
                                                 recursive=False,
                                                 ignore_failure=True,
                                                 random_order=True)
    # resize every image to 32x32 RGBA and decode it into raw pixels
    resized = graphlab.image_analysis.resize(images['image'], 32, 32,
                                             channels=4, decode=True)
    frame = SFrame({'image': resized})
    frame.save('mini')
def test_graphlab2(num_factors=10, reg=0.1, niter=50):
    '''Test the GraphLab install with our data.'''
    infile = PARS['data_dir'] + 'subset_partial.csv'
    data = SFrame(infile)
    mfac = recommender.matrix_factorization.create(data, 'id', 'brand', 'purchasequantity',
                                                   num_factors,
                                                   reg=reg,
                                                   nmf=True,
                                                   use_bias=True,
                                                   holdout_probability=0.2,
                                                   niter=niter,
                                                   random_seed=42)
    print mfac.summary
    return mfac
def data_frame_with_target(self, data_frame):
    """
    :param data_frame: Spark DataFrame with a 'sentiment' column
    :type data_frame: DataFrame
    :return: the same data with an integer 'target' column appended
    :rtype: SFrame
    """
    data_sframe = SFrame(data_frame.toPandas())
    sentiment_array = data_sframe.select_column('sentiment')
    target_array = []
    for x in sentiment_array:
        try:
            target_array.append(self.convert_func(x))
        except Exception as ex:
            # fall back to a default label of 3 when conversion fails
            print len(target_array), 'get_sentiments', x
            target_array.append(3)
            print ex
    data_sframe.add_column(SArray(target_array, dtype=int), name='target')
    print data_sframe
    return data_sframe.dropna()
# -------
import pandas as pd
from graphlab import SFrame
from graphlab import popularity_recommender as pr

# -----------
# Data Import
# -----------
col_names = ['user_id', 'item_id', 'rating', 'timestamp']
data = pd.read_table('u.data', names=col_names)
data = data.drop('timestamp', axis=1)

# ----------
# Data Split
# ----------
sf = SFrame(data=data, format='auto')
train, test = sf.random_split(.7)
print(len(train))
print(len(test))

# ----------------------
# Popularity Recommender
# ----------------------
recommender = pr.create(train, target='rating')
evaluation = recommender.evaluate(test)
# ('\nOverall RMSE: ', 1.0262364153594763)
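# -------------------------------
# Top-N Recommendations (sketch)
# -------------------------------
# A hedged extension of the script above: the trained popularity model can
# also emit per-user top-N lists; k=5 is an arbitrary choice here.
top5 = recommender.recommend(users=test['user_id'].unique(), k=5)
print(top5.head())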
import os
from graphlab import SFrame, SGraph, connected_components

outputPath = os.environ.get("OUTPUT_PATH")
startScale = int(os.environ.get("START_SCALE"))
tagFile = './tmp'
with open(tagFile, 'r') as f:
    # the tag file holds one comma-separated line whose second and third
    # fields are maxScale and realEndScale
    infor = f.readline().strip().split(",")
    maxScale = int(infor[1])
    realEndScale = int(infor[2])
scaleRange = range(startScale, realEndScale + 1)
for scale in scaleRange:
    inputPath = os.path.join(outputPath, 'tmp', 'AdjacentRelationships', str(scale))
    data = SFrame.read_csv(inputPath, header=False)
    if data.num_rows() == 0:
        cc_ids = SFrame({"__id": [], "component_id": []})
    else:
        g = SGraph().add_edges(data,
                               src_field=data.column_names()[0],
                               dst_field=data.column_names()[1])
        cc = connected_components.create(g)
        cc_ids = cc.get('component_id')
    path = os.path.join(outputPath, 'tmp', 'ConnectedComponents', str(scale))
    if not os.path.exists(path):
        os.makedirs(path)
    cc_ids.export_csv(path)
def build_data_graph():
    file_path = "/Users/blahiri/healthcare/documents/recommendation_system/"
    beneficiaries = SFrame.read_csv(file_path + "beneficiary_summary_2008_2009.csv")
    bene_packed = beneficiaries.pack_columns(column_prefix='chron_',
                                             dtype=dict,
                                             new_column_name='chronic_conditions',
                                             remove_prefix=False)
    # x is a row of bene_packed in the following lambda. We insert the desynpuf_id
    # into the (key, value) tuple, convert the tuple to a list by calling list(),
    # and the outer [] makes sure we emit a list of lists.
    bene_chrons = bene_packed.flat_map(
        ["chronic_condition_name", "chronic_condition_value", "desynpuf_id"],
        lambda x: [list(k + (x['desynpuf_id'],)) for k in x['chronic_conditions'].iteritems()])

    bene_chrons = bene_chrons[bene_chrons['chronic_condition_value'] == 1]
    del bene_chrons['chronic_condition_value']
    bene_chrons.rename({'chronic_condition_name': 'chronic_condition'})

    g = SGraph()
    bene_chrons['relation'] = 'had_chronic'
    g = g.add_edges(bene_chrons, src_field='desynpuf_id', dst_field='chronic_condition')
    print g.summary()

    # Take the distinct IDs of patients with chronic conditions to avoid repetition in queries
    bene_with_chrons = SFrame(None)
    bene_with_chrons.add_column(bene_chrons['desynpuf_id'].unique(), 'desynpuf_id')

    # Add edges to the graph indicating which patient was diagnosed with which condition
    tcdc = SFrame.read_csv(file_path + "transformed_claim_diagnosis_codes.csv")
    cols_to_drop = ['clm_id', 'clm_from_dt', 'clm_thru_dt', 'claim_type', 'clm_thru_year']
    for column in cols_to_drop:
        del tcdc[column]
    # The same patient can be diagnosed with the same condition multiple times a year,
    # so take distinct rows
    tcdc = tcdc.unique()
    # Keep diagnosed conditions only for patients who had some chronic condition in
    # 2008 or 2009. Such a patient may still have no diagnosed condition.
    bene_chrons_tcdc = bene_with_chrons.join(tcdc)
    bene_chrons_tcdc['relation'] = 'diagnosed_with'
    g = g.add_edges(bene_chrons_tcdc, src_field='desynpuf_id', dst_field='dgns_cd')
    print g.summary()

    # Add edges to the graph indicating which patient underwent which procedure
    tcpc = SFrame.read_csv(file_path + "transformed_claim_prcdr_codes.csv",
                           column_type_hints={'prcdr_cd': str})
    cols_to_drop = ['clm_id', 'clm_from_dt', 'clm_thru_dt', 'claim_type', 'clm_thru_year']
    for column in cols_to_drop:
        del tcpc[column]
    tcpc = tcpc.unique()
    # Keep procedures only for patients who had some chronic condition in 2008 or 2009.
    # Such a patient may still have no procedure.
    bene_chrons_tcpc = bene_with_chrons.join(tcpc)
    bene_chrons_tcpc['relation'] = 'underwent'
    g = g.add_edges(bene_chrons_tcpc, src_field='desynpuf_id', dst_field='prcdr_cd')
    print g.summary()

    # Add edges to the graph indicating which patient had which medicine
    pde = SFrame.read_csv(file_path + "prescribed_drugs.csv")
    pde = pde.unique()
    # Keep medicines only for patients who had some chronic condition in 2008 or 2009.
    # Such a patient may still have no medicine.
    bene_chrons_pde = bene_with_chrons.join(pde)
    bene_chrons_pde['relation'] = 'had_drug'
    g = g.add_edges(bene_chrons_pde, src_field='desynpuf_id', dst_field='substancename')
    print g.summary()

    return g
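# Usage sketch for build_data_graph(): edges carry the 'relation' attribute
# set above, so one relation type can be pulled out with get_edges(), the
# standard SGraph accessor.
g = build_data_graph()
diagnosed = g.get_edges(fields={'relation': 'diagnosed_with'})
print diagnosed.head()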
def get_sf_from_coo(coo, save_to):
    sf = SFrame({'userId': coo.row, 'movieId': coo.col, 'rating': coo.data})
    if save_to is not None:
        print "saving sframe to", save_to
        sf.save(save_to)
    return sf
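# Usage sketch for get_sf_from_coo(), assuming a scipy.sparse COO ratings
# matrix (row = user index, col = movie index); the tiny matrix is made up.
import numpy as np
from scipy.sparse import coo_matrix

ratings = np.array([4.0, 3.5])
users = np.array([0, 1])
movies = np.array([10, 20])
coo = coo_matrix((ratings, (users, movies)))
ratings_sf = get_sf_from_coo(coo, save_to=None)
print ratings_sf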