def get_subgraph(self, ids, radius=1, full_subgraph=True):
    verts = ids

    # find the vertices within radius (and the path edges)
    for i in range(radius):
        edges_out = self._graph.get_edges(src_ids=verts)
        # edges_in = self._graph.get_edges(dst_ids=verts)
        verts = list(edges_out['__src_id']) + list(edges_out['__dst_id'])
        verts = list(set(verts))

    # make a new graph to return and add the vertices
    g = SGraph()
    g = g.add_vertices(self._graph.get_vertices(verts), vid_field='__id')

    # add the requested edge set
    if full_subgraph is True:
        df_induced = self._graph.get_edges(src_ids=verts)
        # induced_edge_in = self._graph.get_edges(dst_ids=verts)
        # df_induced = induced_edge_out.append(induced_edge_in)
        df_induced = df_induced.groupby(df_induced.column_names(), {})
        verts_sa = SArray(list(ids))
        edges = df_induced.filter_by(verts_sa, "__src_id")
        # SFrame.append returns a new SFrame, so capture the result
        edges = edges.append(df_induced.filter_by(verts_sa, "__dst_id"))
        g = g.add_edges(edges, src_field='__src_id', dst_field='__dst_id')

    return GlGraph(is_directed=self.is_directed, graph_obj=g)
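# Hedged usage sketch: `gl_graph` is assumed to be an already-loaded GlGraph
# wrapping an SGraph, and the vertex ids are illustrative; neither comes from
# the original code.
# sub = gl_graph.get_subgraph(['42', '17'], radius=2, full_subgraph=True)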
def load_graph(self, graph_path, direction=1, start_line=0, limit=None, blacklist=set(), delimiter=','):
    json_object = utils.is_json(graph_path)
    if json_object is not False:
        # print json_object
        graph_path = SFrame(SArray(json_object).unpack())
        graph_path.rename({'X.0': 'X1', 'X.1': 'X2', 'X.2': 'Weight'})
    else:
        # load_sgraph()
        graph_path = SFrame.read_csv(graph_path, delimiter=delimiter, header=False,
                                     column_type_hints={'X1': str, 'X2': str},
                                     nrows=limit, skiprows=start_line)
        if self._weight_field != "":
            graph_path.rename({'X3': 'Weight'})
    # print graph_data
    self._graph = self._graph.add_edges(graph_path, src_field='X1', dst_field='X2')
    if not self.is_directed:
        self.to_undirected()
def append_images(json_file):
    # we fill an SFrame with all the given metadata of the dogs
    meta = SFrame.read_json(json_file, orient='records')

    # this is the SFrame that we will fill with the data plus the image,
    # which will be saved in the final file
    image_list = SFrame(data=None)

    # for each image in the images column in the meta SFrame, we add one line
    # in the final SFrame with one image per line
    for i in range(len(meta)):
        dogo = meta[i:i + 1]
        for image in dogo['images'][0]:
            # print image
            dogo_clone = dogo.copy()
            dogo_clone.add_column(SArray([graphlab.Image(images_path + image)]), name='image')
            dogo_clone.add_column(SArray([image]), name='image_filename')
            image_list = image_list.append(SFrame(dogo_clone))

    image_list.save(filename='prepared_data/')
def initVertex(g):
    g.vertices['dist'] = 8888
    g.vertices['sent'] = 0
    #g.vertices['from_last_art'] = 0
    #g.vertices['count'] = 0
    g.vertices['isDead'] = 0
    #g.vertices['vid_set'] = SArray.from_const({}, g.summary()['num_vertices'])
    # 'seen' serves two purposes: for a category vertex it remembers the articles,
    # and for an article vertex it acts as the vid_set.
    # In fact, it is a dict of the form {'id': [dist, from_last_art]}
    g.vertices['seen'] = SArray.from_const({}, g.summary()['num_vertices'])
def train_model(filename):
    # load already prepared data in the form of an SFrame
    image_train = graphlab.SFrame(filename)

    # load the pre-trained model
    loaded_model = graphlab.load_model('model/')

    # extract features of the model on the given pictures
    image_train['deep_features'] = loaded_model.extract_features(SFrame(image_train))

    # add ids to the SFrame to be able to find the closest images
    ids = SArray(list(range(0, len(image_train))))
    image_train.add_column(ids, name='id')
    # print image_train.head()

    # train the NN model on the extracted features
    knn_model = graphlab.nearest_neighbors.create(image_train, features=['deep_features'], label='id')
    return knn_model, image_train
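# Hedged usage sketch: the returned nearest-neighbors model can be queried for
# the images closest to a reference row. The filename, k value and reference
# row below are illustrative assumptions, not part of the original code.
# knn_model, image_train = train_model('prepared_data/')
# neighbours = knn_model.query(image_train[0:1], k=5)
# print neighbours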
def generate(self, sf):
    Feature.validateData(sf, self.price)

    # two simple moving averages over the price column
    sma1 = sf[self.price].rolling_mean(-1 * self.MA1, 0)
    sma2 = sf[self.price].rolling_mean(-1 * self.MA2, 0)
    diff = sma1 - sma2

    # crossover signal: +1 when sma1 crosses above sma2, -1 when it crosses
    # below, 0 while there is no crossover
    signals = [None] * len(sma1)
    for i in xrange(1, len(diff)):
        if diff[i] and diff[i - 1]:
            if diff[i] > 0 and diff[i - 1] < 0:
                signals[i] = 1
            elif diff[i] < 0 and diff[i - 1] > 0:
                signals[i] = -1
            else:
                signals[i] = 0

    sf.add_column(SArray(signals), name=self.name)
    return self.name
def crossproduct(d):
    """
    Create an SFrame containing the crossproduct of all provided options.

    Parameters
    ----------
    d : dict
        Each key is the name of an option, and each value is a list
        of the possible values for that option.

    Returns
    -------
    out : SFrame
        There will be a column for each key in the provided dictionary,
        and a row for each unique combination of all values.

    Example
    -------
    settings = {'argument_1': [0, 1], 'argument_2': ['a', 'b', 'c']}
    print crossproduct(settings)
    +------------+------------+
    | argument_2 | argument_1 |
    +------------+------------+
    |     a      |     0      |
    |     a      |     1      |
    |     b      |     0      |
    |     b      |     1      |
    |     c      |     0      |
    |     c      |     1      |
    +------------+------------+
    [6 rows x 2 columns]
    """
    _mt._get_metric_tracker().track('util.crossproduct')
    from graphlab import SArray
    d = [zip(d.keys(), x) for x in _itertools.product(*d.values())]
    sa = [{k: v for (k, v) in x} for x in d]
    return SArray(sa).unpack(column_name_prefix='')
def compute(cls, sf, period, column):
    # RSI over a sliding window of `period` price changes:
    # RSI = 100 - 100 / (1 + gain / loss)
    gain, loss = 0, 0
    rsi = [None] * len(sf[column])
    j = 0
    for i in xrange(len(sf[column])):
        v = sf[column][i]
        if v > 0:
            gain += v
        else:
            loss += -1 * v
        if i >= period - 1:
            if loss == 0 and gain == 0:
                rsi[i] = 50.
            elif loss == 0:
                rsi[i] = 100.
            else:
                rsi[i] = 100. - 100. / (1. + gain / float(loss))
            # drop the oldest value so the window keeps a fixed length
            u = sf[column][j]
            gain -= u if u > 0 else 0
            loss += u if u < 0 else 0
            j += 1
    return SArray(rsi)
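# Hedged usage sketch: `compute` is written as a classmethod of a feature class
# whose name is not shown here, so `RSIFeature` below is an assumed placeholder.
# The column is expected to hold price changes, and 14 is the conventional period.
# sf['rsi'] = RSIFeature.compute(sf, 14, 'price_change')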
def data_frame_with_target(self, data_frame):
    """
    :param data_frame:
    :type data_frame: DataFrame
    :return:
    :rtype: SFrame
    """
    data_sframe = SFrame(data_frame.toPandas())
    sentiment_array = data_sframe.select_column('sentiment')
    target_array = []
    for x in sentiment_array:
        try:
            target_array.append(self.convert_func(x))
        except Exception as ex:
            print len(target_array), 'get_sentiments', x
            target_array.append(3)
            print ex
    data_sframe.add_column(SArray(target_array, dtype=int), name='target')
    print data_sframe
    return data_sframe.dropna()
def createSArray(arr):
    return SArray(arr)
# For each customer, get his personal recommendations and the item-item
# similarity of what he is buying
for x in customerIDs:
    cld2 = products_2[products_2['customerID'] == x]
    cld2['rating'] = cld2.groupby('product_code')['product_code'].transform('count')
    sf = graphlab.SFrame(data=cld2)

    # based on his own purchase history
    mf = graphlab.factorization_recommender.create(sf, user_id='customerID',
                                                   item_id='product_code', target='rating')

    # select the top 3 from his personal recommendations and from the overall model
    personalRecommdation = mf.recommend_from_interactions([], k=3)
    cluster_Personal = personalRecommdation['product_code']
    # print (cluster_Personal)
    sa = SArray(data=cluster_Personal)
    totalRecommdation = m.get_similar_items(items=sa, k=3, verbose=False)
    # totalRecommdation = m.recommend([x], k=3)
    cluster_Recommdation = totalRecommdation['similar']
    print(cluster_Personal)
    print(cluster_Recommdation)
    # print (cluster_Recommdation)

    empty = []
    empty.append(cluster_Recommdation)
    empty.append(cluster_Personal)
    for s in range(0, len(empty)):
        defaultValueList[s] = empty[s]
    makeitastring = ",".join(map(str, defaultValueList))
    print(makeitastring)
    d1 = {'customerID': x, 'products': makeitastring}
aggregatedData = aggregateData(data_matrix)
# print(aggregatedData)
aggregatedData.save('../data/musicbrainz/aggregatedData')
aggregatedData.save('../data/musicbrainz/aggregatedData.csv', format='csv')

# sed 's/[^a-zA-Z0-9,-.]/ /g' aggregatedData.csv > removePunctuation.csv
# temp1 = pd.read_csv("data/musicbrainz/removePunctuation.csv", skipinitialspace=True)
# temp1 = np.genfromtxt("../data/musicbrainz/removePunctuation.csv", delimiter=",", filling_values=None)
temp = SArray('../data/musicbrainz/removePunctuation.csv')
docs_tfidf = gl.text_analytics.tf_idf(temp)
print(docs_tfidf)
docs_tfidf.save('../data/musicbrainz/docs_tfidf')
docs_tfidf.save('../data/musicbrainz/docs_tfidf.csv', format='csv')

########################################################################################################################
# print ("starting to load the data...\n")
#
# df1 = pd.read_csv("data/musicbrainz/track_artist_tags.csv", sep=';')
#
class KMergeCorpus(object):

    def __init__(self, groups, reference, hashf, ks, ke, filter_file=None):
        self.groups = groups
        self.reference = reference
        self.hash = hashf
        self.ks = ks
        self.ke = ke
        self.selection = None
        self.filter = None
        if filter_file or isfile("hashes.k%s.npz" % ks):
            self.filter = LabelEncoder()
            self.filter.fit(np.load(filter_file)["hashes"])
            pickle.dump(self.filter, open("encoder.k%s.pkl" % self.ks, "wb"))
            #self.filter = {h: i for i, h in enumerate(np.load(filter_file)["hashes"])}

    def __len__(self):
        return len(self.selection)

    def set_selection(self, s):
        self.selection = s

    def store(self):
        # build as sparse matrix first
        self.sa = SArray(dtype=dict)
        for i, group in enumerate(self.selection):
            #if self.selection and i not in self.selection:
            #if self.selection and group not in self.selection:
            #    continue
            counts_file = "%s/%s_%s_%s/%s.counts.bin" % (self.reference, self.hash, self.ks, self.ke, group)
            hashes_file = "%s/%s_%s_%s/%s.hashes.bin" % (self.reference, self.hash, self.ks, self.ke, group)
            c = loader.create_list(counts_file, False)
            h = loader.create_list(hashes_file, True)
            print "%s, %s, %s" % (group, len(h), len(c))
            values = {}
            if self.filter is not None:
                mask = np.in1d(h, self.filter.classes_)
                values = dict(zip(self.filter.transform(np.array(h, dtype=np.uint32)[mask]),
                                  np.array(c, dtype=np.uint32)[mask]))
            else:
                values = dict(zip(h, c))
            if len(values) == 0:
                print "No data for %s" % group
                continue
            sa = SArray([values])
            self.sa = self.sa.append(sa)

    def __iter__(self):
        for i, group in enumerate(self.selection):
            #if self.selection and i not in self.selection:
            #if self.selection and group not in self.selection:
            #    continue
            counts_file = "%s/%s_%s_%s/%s.counts.bin" % (self.reference, self.hash, self.ks, self.ke, group)
            hashes_file = "%s/%s_%s_%s/%s.hashes.bin" % (self.reference, self.hash, self.ks, self.ke, group)
            c = loader.create_list(counts_file, False)
            h = loader.create_list(hashes_file, True)
            values = []
            if self.filter is not None:
                mask = np.in1d(h, self.filter.classes_)
                values = zip(self.filter.transform(np.array(h)[mask]), np.array(c)[mask])
            else:
                values = zip(h, c)
            if len(values) == 0:
                print "No data for %s" % group
            yield values