def test_save(self):
    t = XFrame({'id': [30, 20, 10], 'val': ['a', 'b', 'c']})
    path = '{}/tmp/frame'.format(hdfs_prefix)
    t.save(path, format='binary')
    with fileio.open_file(os.path.join(path, '_metadata')) as f:
        metadata = pickle.load(f)
    self.assertListEqual([['id', 'val'], [int, str]], metadata)
    # TODO find some way to check the data
    fileio.delete(path)
def test_construct_auto_str_psv(self):
    path = '{}/user/xpatterns/files/test-frame.psv'.format(hdfs_prefix)
    res = XFrame(path)
    self.assertEqualLen(3, res)
    self.assertListEqual(['id', 'val'], res.column_names())
    self.assertListEqual([int, str], res.column_types())
    self.assertDictEqual({'id': 1, 'val': 'a'}, res[0])
    self.assertDictEqual({'id': 2, 'val': 'b'}, res[1])
    self.assertDictEqual({'id': 3, 'val': 'c'}, res[2])
def test_construct_str_xframe(self):
    # construct an XFrame given a saved xframe
    path = '{}/user/xpatterns/files/test-frame'.format(hdfs_prefix)
    res = XFrame(path, format='xframe')
    res = res.sort('id')
    self.assertEqualLen(3, res)
    self.assertListEqual(['id', 'val'], res.column_names())
    self.assertListEqual([int, str], res.column_types())
    self.assertDictEqual({'id': 1, 'val': 'a'}, res[0])
    self.assertDictEqual({'id': 2, 'val': 'b'}, res[1])
    self.assertDictEqual({'id': 3, 'val': 'c'}, res[2])
def test_construct_str_csv(self):
    # construct an XFrame given a text file
    # interpret as csv
    path = '{}/user/xpatterns/files/test-frame.txt'.format(hdfs_prefix)
    res = XFrame(path, format='csv')
    self.assertEqualLen(3, res)
    self.assertListEqual(['id', 'val'], res.column_names())
    self.assertListEqual([int, str], res.column_types())
    self.assertDictEqual({'id': 1, 'val': 'a'}, res[0])
    self.assertDictEqual({'id': 2, 'val': 'b'}, res[1])
    self.assertDictEqual({'id': 3, 'val': 'c'}, res[2])
def test_construct_auto_str_xframe(self):
    # construct an XFrame given a file with an unrecognized file extension
    path = '{}/user/xpatterns/files/test-frame'.format(hdfs_prefix)
    res = XFrame(path)
    res = res.sort('id')
    self.assertEqualLen(3, res)
    self.assertListEqual(['id', 'val'], res.column_names())
    self.assertListEqual([int, str], res.column_types())
    self.assertDictEqual({'id': 1, 'val': 'a'}, res[0])
    self.assertDictEqual({'id': 2, 'val': 'b'}, res[1])
    self.assertDictEqual({'id': 3, 'val': 'c'}, res[2])
def test_save(self):
    t = XFrame({'id': [30, 20, 10], 'val': ['a', 'b', 'c']})
    path = '{}/tmp/frame-csv'.format(hdfs_prefix)
    t.save(path, format='csv')
    with fileio.open_file(path + '.csv') as f:
        heading = f.readline().rstrip()
        self.assertEqual('id,val', heading)
        self.assertEqual('30,a', f.readline().rstrip())
        self.assertEqual('20,b', f.readline().rstrip())
        self.assertEqual('10,c', f.readline().rstrip())
    fileio.delete(path + '.csv')
def test_construct_auto_dataframe(self):
    path = '{}/user/xpatterns/files/test-frame-auto.csv'.format(hdfs_prefix)
    res = XFrame(path)
    self.assertEqualLen(3, res)
    self.assertListEqual(['val_int', 'val_int_signed', 'val_float', 'val_float_signed',
                          'val_str', 'val_list', 'val_dict'], res.column_names())
    self.assertListEqual([int, int, float, float, str, list, dict], res.column_types())
    self.assertDictEqual({'val_int': 1, 'val_int_signed': -1, 'val_float': 1.0, 'val_float_signed': -1.0,
                          'val_str': 'a', 'val_list': ['a'], 'val_dict': {1: 'a'}}, res[0])
    self.assertDictEqual({'val_int': 2, 'val_int_signed': -2, 'val_float': 2.0, 'val_float_signed': -2.0,
                          'val_str': 'b', 'val_list': ['b'], 'val_dict': {2: 'b'}}, res[1])
    self.assertDictEqual({'val_int': 3, 'val_int_signed': -3, 'val_float': 3.0, 'val_float_signed': -3.0,
                          'val_str': 'c', 'val_list': ['c'], 'val_dict': {3: 'c'}}, res[2])
def load(cls, path):
    """
    Load a model that was saved previously.

    Parameters
    ----------
    path : str
        The path where the model files are stored.
        This is the same path that was passed to ``save``.
        There are three files/directories based on this path, with
        extensions '.model', '.ratings', and '.metadata'.

    Returns
    -------
    out : MatrixFactorizationModel
        A model that can be used to predict ratings.
    """
    sc = CommonSparkContext.Instance().sc()
    model_path, ratings_path, metadata_path = cls._file_paths(path)
    # load model
    model = recommendation.MatrixFactorizationModel.load(sc, model_path)
    # load ratings
    ratings = XFrame.load(ratings_path)
    # load metadata
    with open(metadata_path) as f:
        user_col, item_col, rating_col = pickle.load(f)
    return cls(model, ratings, user_col, item_col, rating_col)
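For orientation, a minimal save/load round trip. This is a hypothetical sketch: the path and user id are placeholders, and it assumes only the ``save``, ``load``, and ``predict_all`` methods defined on this class.

    # Hypothetical usage; 'model' is an already-built instance of this class.
    model.save('hdfs://nn/tmp/reco-model')      # writes .model, .ratings, .metadata
    restored = MatrixFactorizationModel.load('hdfs://nn/tmp/reco-model')
    predictions = restored.predict_all(42)      # see predict_all below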
def test_read_text(self):
    path = '{}/user/xpatterns/files/test-frame-text.txt'.format(hdfs_prefix)
    res = XFrame.read_text(path)
    self.assertEqualLen(3, res)
    self.assertListEqual(['text'], res.column_names())
    self.assertListEqual([str], res.column_types())
    self.assertDictEqual({'text': 'This is a test'}, res[0])
    self.assertDictEqual({'text': 'of read_text.'}, res[1])
    self.assertDictEqual({'text': 'Here is another sentence.'}, res[2])
def test_read_parquet_str(self):
    t = XFrame({'id': [1, 2, 3], 'val': ['a', 'b', 'c']})
    path = '{}/tmp/frame-parquet'.format(hdfs_prefix)
    t.save(path, format='parquet')
    res = XFrame('{}/tmp/frame-parquet.parquet'.format(hdfs_prefix))
    # results may not come back in the same order
    res = res.sort('id')
    self.assertEqualLen(3, res)
    self.assertListEqual(['id', 'val'], res.column_names())
    self.assertListEqual([int, str], res.column_types())
    self.assertDictEqual({'id': 1, 'val': 'a'}, res[0])
    self.assertDictEqual({'id': 2, 'val': 'b'}, res[1])
    self.assertDictEqual({'id': 3, 'val': 'c'}, res[2])
    fileio.delete(path + '.parquet')
def __init__(self, features, labels, standardize=False):
    self.standardize = standardize
    self.means = None
    self.stdevs = None
    if standardize:
        self.features = self._standardize(features)
    else:
        self.features = features
    self.labels = labels
    self.feature_cols = features.column_names()
    labeled_feature_vector = XFrame(features)
    label_col = 'label'                 # TODO what if there is a feature with this name ?
    feature_cols = self.feature_cols    # need local reference
    labeled_feature_vector[label_col] = labels

    def build_labeled_features(row):
        label = row[label_col]
        features = [row[col] for col in feature_cols]
        return LabeledPoint(label, features)

    self.labeled_feature_vector = labeled_feature_vector.apply(build_labeled_features)
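As a standalone illustration of what ``build_labeled_features`` produces for one row: the column names below are invented for the example, and ``LabeledPoint`` is pyspark.mllib's, taking a label and a list of feature values.

    from pyspark.mllib.regression import LabeledPoint

    # One input row with two feature columns plus the appended label column.
    row = {'f1': 0.5, 'f2': 1.5, 'label': 1}
    lp = LabeledPoint(row['label'], [row[col] for col in ['f1', 'f2']])
    # lp is LabeledPoint(1.0, [0.5, 1.5])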
def predict_all(self, user):
    """
    Predict ratings for all items.

    Parameters
    ----------
    user : int
        The user to make predictions for.

    Returns
    -------
    out : XFrame
        Each row of the frame consists of a user id, an item id, and a
        predicted rating.
    """
    # build rdd to pass to predictAll
    user_item = XFrame()
    user_item[self.item_col] = self.items
    user_item[self.user_col] = user
    # predictAll expects (user, item) pairs, so put the user column first
    user_item.swap_columns(self.item_col, self.user_col)
    rdd = user_item.to_rdd()
    res = self.model.predictAll(rdd)
    res = res.map(lambda rating: (rating.user, rating.product, rating.rating))
    col_names = [self.user_col, self.item_col, self.rating_col]
    user_type = self.users.dtype()
    item_type = self.items.dtype()
    col_types = [user_type, item_type, float]
    return XFrame.from_rdd(res, column_names=col_names, column_types=col_types)
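A short usage sketch, assuming a fitted instance ``model`` of this class; the sort call mirrors ``XFrame.sort`` as used in the tests, and the ``ascending`` keyword is an assumption.

    # Rank all items for one user by predicted rating, highest first.
    preds = model.predict_all(42)
    preds = preds.sort(model.rating_col, ascending=False)
    print(preds)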
def _base_evaluate(self, data, labels):
    """
    Evaluate the performance of the classifier.

    Use the data to make predictions, then test the effectiveness of
    the predictions against the labels.

    The data must be a collection of items (XArray of SparseVector).

    Returns
    -------
    out : dict
        The raw counts ('correct', 'true_pos', 'true_neg', 'false_pos',
        'false_neg', 'all') along with the derived proportions
        ('accuracy', 'precision', 'recall', 'tpr', 'fpr').
    """
    results = XFrame()
    predictions = self._base_predict(data)
    results['predicted'] = predictions
    results['actual'] = labels

    def evaluate(row):
        prediction = row['predicted']
        actual = row['actual']
        return {'correct': 1 if prediction == actual else 0,
                'true_pos': 1 if prediction == 1 and actual == 1 else 0,
                'true_neg': 1 if prediction == 0 and actual == 0 else 0,
                'false_pos': 1 if prediction == 1 and actual == 0 else 0,
                'false_neg': 1 if prediction == 0 and actual == 1 else 0,
                'positive': 1 if actual == 1 else 0,
                'negative': 1 if actual == 0 else 0}
    score = results.apply(evaluate)

    def sum_item(item):
        return score.apply(lambda x: x[item]).sum()
    all_scores = float(len(labels))
    correct = float(sum_item('correct'))
    tp = float(sum_item('true_pos'))
    tn = float(sum_item('true_neg'))
    fp = float(sum_item('false_pos'))
    fn = float(sum_item('false_neg'))
    pos = float(sum_item('positive'))
    neg = float(sum_item('negative'))

    # precision = true pos / (true pos + false pos)
    # recall = true pos / (true pos + false neg)
    # true pos rate = true pos / positive
    # false pos rate = false pos / negative
    result = {'correct': correct,
              'true_pos': tp,
              'true_neg': tn,
              'false_pos': fp,
              'false_neg': fn,
              'all': all_scores,
              'accuracy': correct / all_scores if all_scores > 0 else float('nan'),
              'precision': tp / (tp + fp) if (tp + fp) > 0 else float('nan'),
              'recall': tp / (tp + fn) if (tp + fn) > 0 else float('nan'),
              'tpr': tp / pos if pos > 0 else float('nan'),
              'fpr': fp / neg if neg > 0 else float('nan')}
    return result
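To sanity-check the metric arithmetic above, here is a worked example with made-up counts (tp=8, tn=6, fp=2, fn=4 over 20 rows):

    tp, tn, fp, fn = 8.0, 6.0, 2.0, 4.0
    pos, neg, all_scores = tp + fn, tn + fp, tp + tn + fp + fn
    assert (tp + tn) / all_scores == 0.7            # accuracy: 14/20
    assert tp / (tp + fp) == 0.8                    # precision: 8/10
    assert abs(tp / (tp + fn) - 2.0 / 3.0) < 1e-9   # recall: 8/12 (equals tpr here)
    assert fp / neg == 0.25                         # false positive rate: 2/8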
from xframes import XFrame

xf = XFrame({'id': [1, 2, 3], 'val': ['a', 'b', 'c']})
print(xf)
def test_save(self):
    t = XFrame({'id': [30, 20, 10], 'val': ['a', 'b', 'c']})
    path = '{}/tmp/frame-parquet'.format(hdfs_prefix)
    t.save(path, format='parquet')
    # TODO verify
    fileio.delete(path + '.parquet')