def save_data(image_test):
    """Save image_test data to csv"""
    # inspect the images in the data set
    print "*********************"
    print "expecting 4000 images:", len(image_test)
    print "*********************"
    # [column_names]
    # 0000=id
    # 0001=image
    # 0002=label
    # 0003=deep_features
    # 0004=image_array
    print "*********************"
    print "save image labels to csv"
    all_image_labels = image_test['label']
    image_labels = gl.SArray(all_image_labels)
    all_image_labels.save('all_image_labels.csv')
    print "*********************"
    all_image_ids = image_test['id'].astype(int)
    image_id = gl.SArray(all_image_ids)
    print "*********************"
    print "create a csv file concatenated with the id and label"
    image_id_and_label = gl.SFrame({'ids': image_id, 'label': image_labels})
    image_id_and_label.save('image_id_and_label.csv')
    print "*********************"
def evaluate_recommendation(self, recom, selected_authors=None,
                            n_recommendations=10):
    all_ratios = []
    if not selected_authors:
        selected_authors = self.selected_authors
    existing_collabs_list = []
    for a in selected_authors:
        n_existing_collabs = 0
        a_papers = self.topic_model.corpus.documents_by_author(a[0])
        a_collabs = [self.topic_model.corpus.authors(b) for b in a_papers]
        try:
            a_recoms = recom.get_similar_users(
                gl.SArray([a[0]]), 1000).sort("distance")[:n_recommendations]
        except Exception:
            # fall back to item similarity for models without user similarity
            a_recoms = recom.get_similar_items(gl.SArray([a[0]]),
                                               n_recommendations)
        a_similar = a_recoms["similar"]
        for sim in a_similar:
            for collabs in a_collabs:
                if sim in collabs:
                    n_existing_collabs += 1
                    if ([a[0], sim] not in existing_collabs_list and
                            [sim, a[0]] not in existing_collabs_list):
                        existing_collabs_list.append(sorted([a[0], sim]))
                    break
        all_ratios.append(
            float(n_existing_collabs) / (len(a_recoms) + .00001))
    return all_ratios, existing_collabs_list
def _array_to_sframe(self, data, targets=None):
    d = dict()
    for i in xrange(data.shape[1]):
        d['feat_%d' % (i + 1)] = gl.SArray(data[:, i])
    if targets is not None:
        d['target'] = gl.SArray(targets)
    return gl.SFrame(d)
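# Usage sketch for _array_to_sframe (illustrative only; assumes numpy and
# graphlab are importable, and that the method sits on some instance named
# 'model' -- the name and the data below are made up):
# import numpy as np
# X = np.random.rand(100, 3)          # 100 rows, 3 features
# y = np.random.randint(0, 2, 100)    # binary targets
# sf = model._array_to_sframe(X, targets=y)
# sf.column_names() == ['feat_1', 'feat_2', 'feat_3', 'target']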
def predict_options(options):
    """
    Run predictions on potential options

    :param options: array of dictionaries, expected format
        [{"user": __, "content_id": __}]
    :return: a list with predicted scores for each option; None if invalid
    """
    # TODO - Need to format option in a way that makes sense for the predictor
    if os.path.exists(MODEL_LOCATION):
        model = gl.load_model(MODEL_LOCATION)
    else:
        logger.warn("couldn't load model, re-training")
        model = train()
    if "user" in options[0] and "content_id" in options[0]:
        temp_users = []
        temp_content = []
        for option in options:
            temp_users.append(option["user"])
            temp_content.append(option["content_id"])
        users = gl.SArray(temp_users)
        content = gl.SArray(temp_content)
        frame = gl.SFrame({"user": users, "content_id": content})
        prediction = model.predict(frame)
        logger.info("prediction: %s", prediction)
        return list(prediction)
    logger.error("options not in the correct format, "
                 "expected key 'user' and key 'content_id'")
    return None
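# Usage sketch (hypothetical inputs; assumes MODEL_LOCATION points at a saved
# GraphLab model whose predict() accepts a 'user' and a 'content_id' column):
# scores = predict_options([
#     {"user": "alice", "content_id": 42},
#     {"user": "bob", "content_id": 7},
# ])
# scores is e.g. [3.8, 2.1] -- one predicted score per option, or None
# if the input dictionaries are missing the expected keys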
def classify(self, path, h=48, w=48, channels=1):
    """Classify the image

    :param path: path to image file
    :param h: image height
    :param w: image width
    :param channels: number of channels for the image
    :return: tuple of (processed image, classified gl.Image,
        classifications); None if no face is found
    """
    image = gl.Image(path)
    data = image.pixel_data.copy()
    image, face = self._image_processor.process_image(data)
    if face is None:
        return None
    # standardize the face pixels, then rescale them into [0, 255]
    face = face.flatten()
    face = face - np.mean(face)
    face /= np.std(face)
    fmin = np.min(face)
    fmax = np.max(face)
    face = np.floor(255 * (face - fmin) / (fmax - fmin))
    face_arr = gl.SArray([face.tolist()])
    clf_image = face_arr.pixel_array_to_image(h, w, channels,
                                              allow_rounding=True)
    x = gl.SFrame({'images': clf_image})
    classifications = self._classifier(x)
    return image, clf_image[0], classifications
def _convert_to_SArray(cls, value):
    '''
    Convert an input value to SArray, the logic is:
        list  => an SArray of len(list) rows
        other => an SArray of one row

    Parameters
    ----------
    value : any type
        The value to be converted

    Returns
    -------
    (success, converted) : pair(bool, SArray | value)
        'success' indicates if the conversion is successful; if successful,
        'converted' contains the converted value, otherwise 'converted' is
        the original value
    '''
    converted = value
    if not isinstance(converted, list):
        converted = [converted]

    # create an SArray now
    try:
        return (True, graphlab.SArray(converted))
    except Exception as e:
        logging.info(
            "Hit exception trying to convert input %s to SArray. Error: %s"
            % (value, e.message))
        return (False, value)
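# Usage sketch (the owning class name 'InputChecker' is hypothetical):
# ok, sa = InputChecker._convert_to_SArray([1, 2, 3])  # ok=True, 3-row SArray
# ok, sa = InputChecker._convert_to_SArray(42)         # ok=True, 1-row SArray
# ok, sa = InputChecker._convert_to_SArray(object())   # ok=False, original value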
def clean_up(self):
    """Clean up after video image capturing

    This is where the actual classification is done
    """
    if self._capture:
        self._capture.release()
    self.transformed_images = None
    self.original_images = None
    if self.images is not None:
        count = 0
        self.transformed_images = []
        self.original_images = []
        images_to_gl = []
        for im in self.images:
            if im[0] is None or im[1] is None:
                continue
            self.original_images.append(im[0])
            self.transformed_images.append(im[1])
            images_to_gl.append(im[1].flatten().tolist())
            count += 1
        x = gl.SArray(images_to_gl)
        # pixel_array_to_image returns a new SArray, so keep the result
        x = x.pixel_array_to_image(self._w, self._h, self._d)
        x = gl.SFrame({'images': x})
        if self.classifier is not None:
            self._classifications = self.classifier(x)
def between(row):
    # n_dimensions is taken from the enclosing scope
    if len(row['distances']) != n_dimensions:
        return None
    x = gl.SArray(row['distances'])
    if x.std() > .15:
        return None
    return x.mean() + x.std()
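# Usage sketch: between() is written as a row function for SFrame.apply,
# with n_dimensions bound in the enclosing scope. The SFrame below is made
# up for illustration:
# n_dimensions = 3
# sf = gl.SFrame({'distances': [[0.1, 0.2, 0.15], [0.9, 0.1, 0.5]]})
# sf['score'] = sf.apply(between)   # None where the row is rejected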
def bipartition(cluster, maxiter=400, num_runs=4, seed=None):
    '''cluster: should be a dictionary containing the following keys
         * dataframe: original dataframe
         * matrix:    same data, in matrix format
         * centroid:  centroid for this particular cluster'''

    data_matrix = cluster['matrix']
    dataframe = cluster['dataframe']

    # Run k-means on the data matrix with k=2. We use scikit-learn here to
    # simplify the workflow.
    kmeans_model = KMeans(n_clusters=2, max_iter=maxiter, n_init=num_runs,
                          random_state=seed, n_jobs=1)
    kmeans_model.fit(data_matrix)
    centroids = kmeans_model.cluster_centers_
    cluster_assignment = kmeans_model.labels_

    # Divide the data matrix into two parts using the cluster assignments.
    data_matrix_left_child = data_matrix[cluster_assignment == 0]
    data_matrix_right_child = data_matrix[cluster_assignment == 1]

    # Divide the dataframe into two parts, again using the cluster assignments.
    cluster_assignment_sa = graphlab.SArray(cluster_assignment)  # minor format conversion
    dataframe_left_child = dataframe[cluster_assignment_sa == 0]
    dataframe_right_child = dataframe[cluster_assignment_sa == 1]

    # Package relevant variables for the child clusters
    cluster_left_child = {'matrix': data_matrix_left_child,
                          'dataframe': dataframe_left_child,
                          'centroid': centroids[0]}
    cluster_right_child = {'matrix': data_matrix_right_child,
                           'dataframe': dataframe_right_child,
                           'centroid': centroids[1]}

    return (cluster_left_child, cluster_right_child)
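# Usage sketch (illustrative; 'wiki' and 'tf_idf_matrix' stand in for
# whatever SFrame and matrix the caller actually has):
# root = {'matrix': tf_idf_matrix, 'dataframe': wiki, 'centroid': None}
# left, right = bipartition(root, maxiter=100, num_runs=6, seed=1)
# every row ends up in exactly one child:
# left['matrix'].shape[0] + right['matrix'].shape[0] == tf_idf_matrix.shape[0]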
def triple_apply_knn(features):
    def graph_JSD(src, edge, dst):
        # Jensen-Shannon divergence between the two vertices' distributions
        P = src['X1']
        Q = dst['X1']
        _P = P / norm(P, ord=1)
        _Q = Q / norm(Q, ord=1)
        _M = 0.5 * (_P + _Q)
        edge['distance'] = 0.5 * (entropy(_P, _M) + entropy(_Q, _M))
        return (src, edge, dst)

    # N (the neighborhood size) comes from the enclosing scope
    n = len(features)
    sf = gl.SFrame(features)
    sf = sf.add_row_number('row_id')
    sg = gl.SGraph().add_vertices(sf, vid_field='row_id')
    edges = [gl.Edge(u, v, attr={'distance': None})
             for (u, v) in itertools.combinations(range(n), 2)]
    sg = sg.add_edges(edges)
    sg_dist = sg.triple_apply(graph_JSD, mutated_fields=['distance'])
    #knn = sg_dist.edges.groupby("__src_id", {"knn": gl.aggregate.CONCAT("__dst_id", "distance")}).sort("__src_id")
    #top_neighbors = knn.apply(lambda row: sorted(row['knn'], key=row['knn'].get)[:N])
    top_neighbors = []
    for idx in xrange(n):
        topN_sf = sg_dist.get_edges(src_ids=[idx, None],
                                    dst_ids=[None, idx]).topk('distance',
                                                              k=N,
                                                              reverse=True)
        topN = topN_sf.apply(lambda row: row['__src_id']
                             if row['__dst_id'] == idx else row['__dst_id'])
        top_neighbors.append(topN)
    return gl.SArray(top_neighbors)
def _detect_array(self, arr, class_score_threshold=0.5, nms_threshold=0.3):
    try:
        import graphlab as _gl
    except ImportError:
        try:
            import sframe as _gl
        except ImportError:
            raise ImportError('Requires GraphLab Create or SFrame')
    ret = _gl.SFrame()
    det_cnt = [0] * len(arr)
    for i in range(len(arr)):
        tmp = self._postprocess(self._detect(arr[i]),
                                class_score_threshold, nms_threshold)
        det_cnt[i] = len(tmp)
        # if nothing detected:
        if len(tmp) == 0:
            if self._executor_with_feature:
                tmp = _gl.SFrame({"box": [[0., 0., 0., 0.]],
                                  "class": ["nothing"],
                                  "score": [0.],
                                  "feature": [[0.]]})
            else:
                tmp = _gl.SFrame({"box": [[0., 0., 0., 0.]],
                                  "class": ["nothing"],
                                  "score": [0.]})
            det_cnt[i] = 1
        ret = ret.append(tmp)
    image_id = []
    for i in range(len(arr)):
        image_id.extend([i] * det_cnt[i])
    ret.add_column(_gl.SArray(image_id), "id")
    return ret
def _create_images(self, x):
    sarray = gl.SArray(x)
    images = sarray.pixel_array_to_image(self._w, self._h, self._d,
                                         allow_rounding=True)
    return images
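# Usage sketch for pixel_array_to_image itself (standalone; the 2x2
# grayscale pixel values are made up):
# flat = gl.SArray([[0., 64., 128., 255.]])
# imgs = flat.pixel_array_to_image(2, 2, 1, allow_rounding=True)
# imgs[0] is a 2x2 single-channel gl.Image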
def _wrap_function_return(val):
    """
    Recursively walks each thing in val, opening lists and dictionaries,
    converting all occurrences of UnityGraphProxy to an SGraph,
    UnitySFrameProxy to SFrame, and UnitySArrayProxy to SArray.
    """
    if type(val) == _UnityGraphProxy:
        return _gl.SGraph(_proxy=val)
    elif type(val) == _UnitySFrameProxy:
        return _gl.SFrame(_proxy=val)
    elif type(val) == _UnitySArrayProxy:
        return _gl.SArray(_proxy=val)
    elif type(val) == _UnityModel:
        # we need to cast it up to the appropriate type
        try:
            if '__uid__' in val.list_fields():
                uid = val.get('__uid__')
                if uid in class_uid_to_class:
                    return class_uid_to_class[uid](_proxy=val)
        except:
            pass
        return val
    elif type(val) == list:
        return [_wrap_function_return(i) for i in val]
    elif type(val) == dict:
        return {i: _wrap_function_return(val[i]) for i in val}
    else:
        return val
def _get_gl_object_from_persistent_id(type_tag, gl_archive_abs_path):
    """
    Internal util to get a GLC object from a persistent ID in the pickle file.

    Parameters
    ----------
    type_tag : The name of the glc class as saved in the GLC pickler.

    gl_archive_abs_path: An absolute path to the GLC archive where the
                         object was saved.

    Returns
    ----------
    The GLC object.
    """
    if type_tag == "SFrame":
        obj = _gl.SFrame(gl_archive_abs_path)
    elif type_tag == "SGraph":
        obj = _gl.load_graph(gl_archive_abs_path)
    elif type_tag == "SArray":
        obj = _gl.SArray(gl_archive_abs_path)
    elif type_tag == "Model":
        obj = _gl.load_model(gl_archive_abs_path)
    else:
        raise _pickle.UnpicklingError(
            "GraphLab pickling Error: Unsupported object."
            " Only SFrames, SGraphs, SArrays, and Models are supported.")
    return obj
def uniform_numeric_column(n, col_type=float, range=(0, 1), missingness=0.):
    """
    Return an SArray of uniformly random numeric values.

    Parameters
    ----------
    n : int
        Number of entries in the output SArray.

    col_type : type, optional
        Type of the output SArray. Default is float.

    range : tuple[int, int], optional
        Minimum and maximum of the uniform distribution from which values
        are chosen.

    missingness : float, optional
        Probability that a given entry in the output is missing.

    Returns
    -------
    out : SArray
    """
    if col_type == int:
        v = np.random.randint(low=range[0], high=range[1],
                              size=n).astype(float)
    else:
        v = np.random.rand(n)
        v = v * (range[1] - range[0]) + range[0]

    idx_na = np.random.rand(n) < missingness
    v[idx_na] = None
    v = np.where(np.isnan(v), None, v)

    return gl.SArray(v, dtype=col_type)
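# Usage sketch (synthetic column; values vary per run):
# col = uniform_numeric_column(1000, col_type=float, range=(-5, 5),
#                              missingness=0.1)
# roughly 10% of col is None; the rest lies in [-5, 5)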
def parse_speakers(filename):
    """ Clean up the description for each speaker. """
    speakers = gl.SArray(filename)[0]
    speakers = gl.SFrame({'url_suffix': speakers.keys(),
                          'data': speakers.values()})

    def clean_bio(y):
        x = y['bio']
        x = x.replace('Website', '')
        x = x.replace('| Attendee Directory', '')
        x = x.replace('Profile', '')
        x = x.replace('\n', ' ')
        x = x.replace('\r', '')
        y['bio'] = x.strip()
        return y

    speakers['data'] = speakers['data'].apply(clean_bio)
    sdict = {}
    for s in speakers:
        k = s['url_suffix']
        sdict[k] = s['data']
    return sdict, speakers
def polynomial_sframe(sf, feature, degree):
    sf_feature_power = graphlab.SFrame()
    sf_feature_power[feature] = sf[feature]
    if degree > 1:
        for i in range(2, degree + 1):
            # sf[feature] is already an SArray; apply the power directly
            sf_feature_power[feature + '_power_' + str(i)] = \
                sf[feature].apply(lambda x: x ** i)
    return sf_feature_power
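# Usage sketch (tiny made-up SFrame):
# sf = graphlab.SFrame({'sqft': [1., 2., 3.]})
# poly = polynomial_sframe(sf, 'sqft', 3)
# poly.column_names() == ['sqft', 'sqft_power_2', 'sqft_power_3']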
def loadData(filename, type=None):
    data = {}
    data['label'] = []
    data['image'] = []
    input = []
    output = []
    size = 128, 128
    # print filename
    with open(filename, 'r') as f:
        for line in f:
            train = line.strip().split()
            if type == 'train':
                if len(train) < 50:
                    continue
                classID = int(train[0])
                for j in range(3):
                    output.append(classID)
                    # 105*122 features
                    image = []
                    for i in range(12810):
                        image.append(0.0)  # initialize
                    for i in range(1, len(train)):
                        pixel = train[i].split(':')
                        k = random.random()
                        if j == 0:
                            k = 1
                        if float(pixel[1]) < 0.1:
                            pixel[1] = 0
                        elif float(pixel[1]) < 0.4 and k < 0.3:
                            pixel[1] = 0
                        else:
                            pixel[1] = 1
                        image[int(pixel[0]) - 1] = float(pixel[1]) * 255
                    input.append(image)
            if type == 'test':
                break
    data['label'] = gl.SArray(output)
    scaled_input = gl.SArray(input)
    img_sarray = gl.SArray.pixel_array_to_image(scaled_input, 105, 122, 1,
                                                allow_rounding=True)
    data['image'] = img_sarray
    return data
def test_rebuild_bitset(self):
    signal = gl.SFrame(gen_signal())
    nb_layers = signal['layer'].max() + 1  # starts at 0
    res = sptgraph_fast.aggregate_layers(signal, 'baseID', 'layer', nb_layers)
    l1 = res['layers'].apply(utils.reform_layer_int_from_blocks)
    l2 = map(lambda x: int(x, 2),
             res['layers'].apply(sptgraph_fast.flex_bitset_to_flex_string))
    m = l1 == gl.SArray(l2)
    self.assertTrue(m.all(), 'Layers should be equal')
def save_as_train_and_test(X, train_loc, valid_loc):
    # Can't just randomly sample the indices
    all_names = list(X["name"].unique())
    n_valid = (2 * len(all_names)) / 100
    random.shuffle(all_names)
    tr_names = gl.SArray(all_names[n_valid:])
    valid_names = gl.SArray(all_names[:n_valid])
    X_train = X.filter_by(tr_names, 'name')
    X_valid = X.filter_by(valid_names, 'name')
    X_train.save(train_loc)
    X_valid.save(valid_loc)
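# Usage sketch (hypothetical SFrame with a 'name' column grouping rows
# that must stay together across the split):
# save_as_train_and_test(images_sf, 'train.sframe', 'valid.sframe')
# ~2% of the unique names end up in the validation set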
def find_communities(sgraph, threads=4):
    timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
    timestamp = timestamp + str(random.randint(1, 1000))
    input_f = _cache_dir + '/input/%s.txt' % timestamp
    if not os.path.exists(os.path.dirname(input_f)):
        os.makedirs(os.path.dirname(input_f))
    output_dir = _cache_dir + '/output/'
    if not os.path.exists(os.path.dirname(output_dir)):
        os.makedirs(os.path.dirname(output_dir))

    # input file
    vertices = sgraph.get_vertices()
    vertices = vertices.sort('__id')  # sort() returns a new SFrame
    v_idx_map = dict([(x['__id'], i) for i, x in enumerate(vertices)])
    with open(input_f, 'w') as f:
        for row in sgraph.edges:
            f.write('%s %s %s \n' % (v_idx_map[row['__src_id']],
                                     v_idx_map[row['__dst_id']],
                                     row['weight']))

    # run relaxmap
    command = ("%(_this_dir)s/ompRelaxmap %(seed)s %(network_data)s "
               "%(threads)s %(attempts)s %(threshold)s %(vThresh)s "
               "%(maxIter)s %(outDir)s %(prior)s >/dev/null 2>&1")
    params = {
        '_this_dir': _this_dir,
        'seed': 1,
        'network_data': input_f,
        'threads': threads,
        'attempts': 1,
        'threshold': 1e-4,
        'vThresh': 0.0,
        'maxIter': 15,
        'outDir': output_dir,
        'prior': 'prior'
    }
    command = command % params
    os.system(command)

    # output file
    mdl = 0
    node_comm_map = {}
    with open(output_dir + '%s.tree' % timestamp, 'r') as f:
        for i, line in enumerate(f):
            if i == 0:
                mdl = float(line.split(' ')[3])
                continue
            comm_id = line.split(':')[0]
            node_id = line.split()[-1].strip('"')
            node_comm_map[node_id] = comm_id
    comm_ids = []
    for i in range(vertices.num_rows()):
        comm_id = node_comm_map.get(str(i),
                                    random.choice(node_comm_map.values()))
        comm_ids.append(comm_id)

    # vertices sframe
    vertices['community_id'] = gl.SArray(comm_ids)
    return vertices, mdl
def tf_idf(dataset):
    """
    Compute the TF-IDF scores for each word in each document. The collection
    of documents must be in bag-of-words format.

    .. math::
        \mbox{TF-IDF}(w, d) = tf(w, d) * log(N / f(w))

    where :math:`tf(w, d)` is the number of times word :math:`w` appeared
    in document :math:`d`, :math:`f(w)` is the number of documents word
    :math:`w` appeared in, :math:`N` is the number of documents, and we use
    the natural logarithm.

    This function is implemented using
    :py:class:`~graphlab.toolkits.feature_engineering._tf_idf.TFIDF`.

    Parameters
    ----------
    dataset : SArray[str | dict | list]
        Input text data. See
        :py:class:`~graphlab.toolkits.feature_engineering._tf_idf.TFIDF`
        documentation for details on how string, dict, and list inputs are
        handled.

    Returns
    -------
    out : SArray[dict]
        The same document corpus where each score has been replaced by the
        TF-IDF transformation.

    See Also
    --------
    count_words, count_ngrams, tokenize,
    graphlab.toolkits.feature_engineering._tfidf.TFIDF

    References
    ----------
    - `Wikipedia - TF-IDF <https://en.wikipedia.org/wiki/TFIDF>`_

    Examples
    --------
    .. sourcecode:: python

        >>> import graphlab

        >>> docs = graphlab.SArray('https://static.turi.com/datasets/nips-text')
        >>> docs_tfidf = graphlab.text_analytics.tf_idf(docs)
    """
    _mt._get_metric_tracker().track('toolkit.text_analytics.tf_idf')
    _raise_error_if_not_sarray(dataset, "dataset")

    if len(dataset) == 0:
        return _graphlab.SArray()

    dataset = _graphlab.SFrame({'docs': dataset})
    scores = _graphlab.feature_engineering.TFIDF('docs').fit_transform(dataset)

    return scores['docs']
def adaboost_with_tree_stumps(data, features, target, num_tree_stumps):
    # start with unweighted data
    alpha = graphlab.SArray([1.] * len(data))
    weights = []
    tree_stumps = []
    target_values = data[target]

    for t in xrange(num_tree_stumps):
        print '====================================================='
        print 'Adaboost Iteration %d' % t
        print '====================================================='

        # Learn a weighted decision tree stump. Use max_depth=1
        tree_stump = weighted_decision_tree_create(data, features, target,
                                                   data_weights=alpha,
                                                   max_depth=1)
        tree_stumps.append(tree_stump)

        # Make predictions
        predictions = data.apply(lambda x: classify_weighted(tree_stump, x))

        # Produce a Boolean array indicating whether
        # each data point was correctly classified
        is_correct = predictions == target_values
        is_wrong = predictions != target_values

        # Compute weighted error: total weight of the misclassified points
        # divided by the total weight
        weighted_error = (alpha * is_wrong).sum() / alpha.sum()

        # Compute model coefficient using weighted error
        weight = math.log((1 - weighted_error) / weighted_error) / 2.0
        weights.append(weight)

        # Adjust weights on data points: shrink the weight of correctly
        # classified points, grow the weight of misclassified ones
        adjustment = is_correct.apply(lambda is_correct: math.exp(-weight)
                                      if is_correct else math.exp(weight))

        # Scale alpha by multiplying by adjustment,
        # then normalize the data point weights
        alpha = alpha * adjustment
        alpha = alpha / alpha.sum()

    return weights, tree_stumps
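# Usage sketch (assumes weighted_decision_tree_create and classify_weighted
# from the same codebase are in scope; 'train_data', the feature list, and
# the target name are placeholders):
# stump_weights, tree_stumps = adaboost_with_tree_stumps(
#     train_data, features, target='safe_loans', num_tree_stumps=10)
# the results feed directly into predict_adaboost below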
def getData(self, reco):
    pn = self.queryAmazon(reco['ProductId'])
    pn = [x.encode('utf-8') if len(x) != 0 else "Some Awesome Product"
          for x in pn]
    reco.add_column(gl.SArray(pn), name='ProductName')
    rn = self.queryAmazon(reco['ProductId'], 'Images')
    rn = [x.encode('utf-8') if len(x) != 0 else "static/img/default.jpeg"
          for x in rn]
    reco.add_column(gl.SArray(rn), name='ProductURL')
    reco = reco.pack_columns(
        columns=['score', 'rank', 'ProductName', 'ProductURL'],
        new_column_name='Details')
    df = reco.to_dataframe().set_index('ProductId')
    recommendations = df.to_dict(orient='dict')['Details']
    return recommendations
def save_as_train_and_test(X, train_loc, valid_loc, valid_split_percent=2):
    # Can't just randomly sample the indices
    all_names = list(X["name"].unique())
    n_valid = (valid_split_percent * len(all_names)) / 100
    random.shuffle(all_names)
    tr_names = gl.SArray(all_names[n_valid:])
    valid_names = gl.SArray(all_names[:n_valid])
    X_train = X.filter_by(tr_names, 'name')
    X_valid = X.filter_by(valid_names, 'name')
    print "Saving %d images in training set" % X_train.num_rows()
    X_train.save(train_loc)
    print "Saving %d images in validation set" % X_valid.num_rows()
    X_valid.save(valid_loc)
def loadData(filename, type=None, ignore_indices=[]):
    data = {}
    data['label'] = []
    data['image'] = []
    input = []
    output = []
    size = 128, 128
    idx = 0
    with open(filename, 'r') as f:
        for line in f:
            if idx in ignore_indices:
                idx += 1
                continue
            else:
                idx += 1
            train = line.strip().split()
            if type == 'train':
                if len(train) < 10:
                    continue
                classID = int(train[0])
                output.append(classID)
                # 105*122 features
                image = []
                for i in range(12810):
                    image.append(0.0)
                for i in range(1, len(train)):
                    pixel = train[i].split(':')
                    image[int(pixel[0]) - 1] = float(pixel[1]) * 255
                input.append(image)
    data['label'] = gl.SArray(output)
    scaled_input = gl.SArray(input)
    img_sarray = gl.SArray.pixel_array_to_image(scaled_input, 105, 122, 1,
                                                allow_rounding=True)
    data['image'] = img_sarray
    return data
def _get_density_estimate(self, predictions, bandwidth=0.1):
    '''
    Compute 200 numbers to represent the distribution of the predicted
    probability. The output x and y are scaled to the range of [0, 1].
    '''
    output_size = 200.0

    def epanechnikov(u):
        if u > 1 or u < -1:
            return 0.0
        else:
            return 0.75 - 0.75 * u * u

    # use binned data to approximate density
    probability = predictions['mean_probability']
    weight = predictions['count']
    total_weight = predictions['count'].sum()

    x_val = _gl.SArray(range(int(output_size)))
    x_val = x_val / output_size  # [0.0, 0.005, ..., 0.995]
    estimate = []
    for i, x in enumerate(x_val):
        est = 0
        for j, prob in enumerate(probability):
            est += weight[j] * epanechnikov((x - prob) / bandwidth)
        est = est / total_weight / bandwidth
        estimate.append(est)

    # then scale the estimate to [0, 1] range
    estimate = _gl.SArray(estimate)
    est_max = estimate.max()
    est_min = min(estimate.min(), 0)
    if (est_max - est_min) > 0:
        estimate = (estimate - float(est_min)) / float(est_max - est_min)
    else:
        # min == max, it is a constant signal
        estimate = estimate * 0 + 0.5  # make it [0.5, ..., 0.5]

    ret = _gl.SFrame()
    ret['x'] = x_val
    ret['density'] = estimate
    return ret
def predict_adaboost(stump_weights, tree_stumps, data):
    scores = graphlab.SArray([0.] * len(data))
    for i, tree_stump in enumerate(tree_stumps):
        predictions = data.apply(lambda x: classify(tree_stump, x))
        # Accumulate weighted predictions on the scores array
        scores += stump_weights[i] * predictions
    return scores.apply(lambda score: +1 if score > 0 else -1)
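# Usage sketch (follows on from adaboost_with_tree_stumps above;
# 'test_data' is a placeholder SFrame with the same feature columns):
# predictions = predict_adaboost(stump_weights, tree_stumps, test_data)
# accuracy = (predictions == test_data[target]).sum() / float(len(test_data))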
def _get_instance_and_data(cls):
    from PIL import Image as _PIL_Image
    import random

    _format = {'JPG': 0, 'PNG': 1, 'RAW': 2, 'UNDEFINED': 3}

    # Note: This needs to be added to the OSS repo as an exposed function.
    def from_pil_image(pil_img):
        height = pil_img.size[1]
        width = pil_img.size[0]
        if pil_img.mode == 'L':
            image_data = bytearray([z for z in pil_img.getdata()])
            channels = 1
        elif pil_img.mode == 'RGB':
            image_data = bytearray([z for l in pil_img.getdata() for z in l])
            channels = 3
        else:
            image_data = bytearray([z for l in pil_img.getdata() for z in l])
            channels = 4
        format_enum = _format['RAW']
        image_data_size = len(image_data)
        img = _gl.Image(_image_data=image_data,
                        _width=width,
                        _height=height,
                        _channels=channels,
                        _format_enum=format_enum,
                        _image_data_size=image_data_size)
        return img

    num_examples = 100
    dims = (28, 28)
    images = []
    for i in range(num_examples):
        def rand_image():
            return [random.randint(0, 255)] * (28 * 28)
        pil_img = _PIL_Image.new('RGB', dims)
        pil_img.putdata(list(zip(rand_image(), rand_image(), rand_image())))
        images.append(from_pil_image(pil_img))
    # one random label per example; a single scalar would broadcast the
    # same label to every row
    random_labels = [random.randint(0, 1) for _ in range(num_examples)]
    data = _gl.SFrame({'image': _gl.SArray(images)})
    data['label'] = random_labels
    nn_model = _gl.neuralnet_classifier.create(data, 'label')
    data.remove_column('label')
    extractor = _gl.feature_engineering.DeepFeatureExtractor(
        features=['image'], model=nn_model)
    extractor = extractor.fit(data)
    return extractor, data
def load_edges(cls, edge_csv, header=False):
    sf = gl.SFrame.read_csv(edge_csv, header=header,
                            column_type_hints=str, verbose=False)
    assert sf.num_cols() in [2, 3], "edge_csv must be 2 or 3 columns"
    if sf.num_cols() == 2:
        # unweighted input: default every edge weight to 1
        sa = gl.SArray([1] * sf.num_rows())
        sf.add_column(sa, 'weight')
    col_names = ['__src_id', '__dst_id', 'weight']
    rename = dict(zip(sf.column_names(), col_names))
    sf.rename(rename)
    return sf
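# Usage sketch (the CSV path and owning class name are hypothetical):
# edges = GraphLoader.load_edges('edges.csv', header=False)
# edges.column_names() == ['__src_id', '__dst_id', 'weight']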