Example #1
0
 def test_save(self):
     """
     Save an XFrame in binary format and check its pickled metadata.

     The '_metadata' file inside the saved directory is expected to hold
     the column names followed by the column types.
     """
     t = XFrame({'id': [30, 20, 10], 'val': ['a', 'b', 'c']})
     path = '{}/tmp/frame'.format(hdfs_prefix)
     t.save(path, format='binary')
     # Clean up even when a check fails, so a failed run does not leave a
     # stale frame behind for the next run.
     try:
         with fileio.open_file(os.path.join(path, '_metadata')) as f:
             metadata = pickle.load(f)
         self.assertListEqual([['id', 'val'], [int, str]], metadata)
         # TODO find some way to check the data
     finally:
         fileio.delete(path)
Example #2
0
 def test_construct_auto_str_psv(self):
     """Construct an XFrame from a .psv file without giving a format."""
     frame = XFrame('{}/user/xpatterns/files/test-frame.psv'.format(hdfs_prefix))
     self.assertEqualLen(3, frame)
     self.assertListEqual(['id', 'val'], frame.column_names())
     self.assertListEqual([int, str], frame.column_types())
     # rows are checked in file order
     expected_rows = [
         {'id': 1, 'val': 'a'},
         {'id': 2, 'val': 'b'},
         {'id': 3, 'val': 'c'},
     ]
     for index, expected in enumerate(expected_rows):
         self.assertDictEqual(expected, frame[index])
Example #3
0
 def test_construct_str_xframe(self):
     """Construct an XFrame from a saved xframe, giving the format explicitly."""
     location = '{}/user/xpatterns/files/test-frame'.format(hdfs_prefix)
     # sort by id so the rows can be checked by position
     frame = XFrame(location, format='xframe').sort('id')
     self.assertEqualLen(3, frame)
     self.assertListEqual(['id', 'val'], frame.column_names())
     self.assertListEqual([int, str], frame.column_types())
     expected_rows = [
         {'id': 1, 'val': 'a'},
         {'id': 2, 'val': 'b'},
         {'id': 3, 'val': 'c'},
     ]
     for index, expected in enumerate(expected_rows):
         self.assertDictEqual(expected, frame[index])
Example #4
0
 def test_construct_str_csv(self):
     """Construct an XFrame from a .txt file that is read as csv."""
     location = '{}/user/xpatterns/files/test-frame.txt'.format(hdfs_prefix)
     # the format is given explicitly, so the text file is interpreted as csv
     frame = XFrame(location, format='csv')
     self.assertEqualLen(3, frame)
     self.assertListEqual(['id', 'val'], frame.column_names())
     self.assertListEqual([int, str], frame.column_types())
     expected_rows = [
         {'id': 1, 'val': 'a'},
         {'id': 2, 'val': 'b'},
         {'id': 3, 'val': 'c'},
     ]
     for index, expected in enumerate(expected_rows):
         self.assertDictEqual(expected, frame[index])
Example #5
0
 def test_construct_auto_str_xframe(self):
     """Construct an XFrame from a path that has no recognized file extension."""
     location = '{}/user/xpatterns/files/test-frame'.format(hdfs_prefix)
     # no format is given; sort by id so the rows can be checked by position
     frame = XFrame(location).sort('id')
     self.assertEqualLen(3, frame)
     self.assertListEqual(['id', 'val'], frame.column_names())
     self.assertListEqual([int, str], frame.column_types())
     expected_rows = [
         {'id': 1, 'val': 'a'},
         {'id': 2, 'val': 'b'},
         {'id': 3, 'val': 'c'},
     ]
     for index, expected in enumerate(expected_rows):
         self.assertDictEqual(expected, frame[index])
Example #6
0
    def test_save(self):
        """
        Save an XFrame in csv format and check the written lines.

        The frame is saved to a base path; the file is then read back from
        that path with a '.csv' suffix and compared line by line.
        """
        t = XFrame({'id': [30, 20, 10], 'val': ['a', 'b', 'c']})
        path = '{}/tmp/frame-csv'.format(hdfs_prefix)
        t.save(path, format='csv')
        csv_path = path + '.csv'

        # Clean up even when a check fails, so a failed run does not leave a
        # stale file behind for the next run.
        try:
            with fileio.open_file(csv_path) as f:
                heading = f.readline().rstrip()
                self.assertEqual('id,val', heading)
                self.assertEqual('30,a', f.readline().rstrip())
                self.assertEqual('20,b', f.readline().rstrip())
                self.assertEqual('10,c', f.readline().rstrip())
        finally:
            fileio.delete(csv_path)
Example #7
0
 def test_construct_auto_dataframe(self):
     """Construct an XFrame from a csv file and check the detected column types."""
     frame = XFrame('{}/user/xpatterns/files/test-frame-auto.csv'.format(hdfs_prefix))
     self.assertEqualLen(3, frame)
     expected_names = ['val_int', 'val_int_signed', 'val_float', 'val_float_signed',
                       'val_str', 'val_list', 'val_dict']
     self.assertListEqual(expected_names, frame.column_names())
     self.assertListEqual([int, int, float, float, str, list, dict], frame.column_types())
     # row i holds the number i + 1 and the matching letter in each column
     for index, (number, letter) in enumerate([(1, 'a'), (2, 'b'), (3, 'c')]):
         expected = {
             'val_int': number,
             'val_int_signed': -number,
             'val_float': float(number),
             'val_float_signed': -float(number),
             'val_str': letter,
             'val_list': [letter],
             'val_dict': {number: letter},
         }
         self.assertDictEqual(expected, frame[index])
    def load(cls, path):
        """
        Load a model that was saved previously.

        Parameters
        ----------
        path : str
            The path where the model files are stored.
            This is the same path that was passed to ``save``.
            There are three files/directories based on this path, with
            extensions '.model', '.ratings', and '.metadata'.

        Returns
        -------
        out : MatrixFactorizationModel
            A model that can be used to predict ratings.

        Notes
        -----
        The metadata file is read with ``pickle``, which can execute
        arbitrary code: only load models from trusted locations.
        """
        sc = CommonSparkContext.Instance().sc()
        model_path, ratings_path, metadata_path = cls._file_paths(path)
        # load model
        model = recommendation.MatrixFactorizationModel.load(sc, model_path)
        # load ratings
        ratings = XFrame.load(ratings_path)
        # load metadata
        # Pickle data is a byte stream: read it in binary mode so that no
        # newline translation or text decoding is applied to it.
        with open(metadata_path, 'rb') as f:
            user_col, item_col, rating_col = pickle.load(f)

        return cls(model, ratings, user_col, item_col, rating_col)
Example #9
0
 def test_read_text(self):
     """Read a text file with read_text and check the single 'text' column."""
     location = '{}/user/xpatterns/files/test-frame-text.txt'.format(hdfs_prefix)
     frame = XFrame.read_text(location)
     self.assertEqualLen(3, frame)
     self.assertListEqual(['text'], frame.column_names())
     self.assertListEqual([str], frame.column_types())
     # each row is expected to hold one piece of the file's text
     expected_text = ['This is a test', 'of read_text.', 'Here is another sentence.']
     for index, text in enumerate(expected_text):
         self.assertDictEqual({'text': text}, frame[index])
Example #10
0
    def test_read_parquet_str(self):
        """
        Save an XFrame in parquet format, then construct an XFrame from it.

        The frame is saved to a base path and read back from that path with
        a '.parquet' suffix.
        """
        t = XFrame({'id': [1, 2, 3], 'val': ['a', 'b', 'c']})
        path = '{}/tmp/frame-parquet'.format(hdfs_prefix)
        t.save(path, format='parquet')
        # The data is read back from the suffixed path, so that is also the
        # path that has to be removed afterwards.
        parquet_path = path + '.parquet'

        # Clean up even when a check fails, so a failed run does not leave a
        # stale file behind for the next run.
        try:
            res = XFrame(parquet_path)
            # results may not come back in the same order
            res = res.sort('id')
            self.assertEqualLen(3, res)
            self.assertListEqual(['id', 'val'], res.column_names())
            self.assertListEqual([int, str], res.column_types())
            self.assertDictEqual({'id': 1, 'val': 'a'}, res[0])
            self.assertDictEqual({'id': 2, 'val': 'b'}, res[1])
            self.assertDictEqual({'id': 3, 'val': 'c'}, res[2])
        finally:
            fileio.delete(parquet_path)
Example #11
0
    def __init__(self, features, labels, standardize=False):
        """
        Store the training data and build the labeled feature vector.

        Parameters
        ----------
        features : XFrame
            The feature columns; every column is used as a feature.

        labels : XArray
            The label for each row of ``features``.
            (presumably an XArray -- it is assigned as a column of an XFrame)

        standardize : bool, optional
            If True, ``self.features`` holds the result of
            ``self._standardize(features)`` instead of ``features``.
        """
        self.standardize = standardize
        self.means = None
        self.stdevs = None
        if standardize:
            self.features = self._standardize(features)
        else:
            self.features = features
        self.labels = labels
        self.feature_cols = features.column_names()
        # NOTE(review): the labeled vector is built from the raw ``features``
        # even when ``standardize`` is set -- confirm that this is intended.
        labeled_feature_vector = XFrame(features)
        feature_cols = self.feature_cols   # need local reference
        # Pick a label column name that is not already used by a feature, so
        # that adding the labels cannot overwrite a feature column.
        label_col = 'label'
        while label_col in feature_cols:
            label_col = '_' + label_col
        labeled_feature_vector[label_col] = labels

        def build_labeled_features(row):
            # pair the row's label with its feature values, in column order
            label = row[label_col]
            values = [row[col] for col in feature_cols]
            return LabeledPoint(label, values)

        self.labeled_feature_vector = labeled_feature_vector.apply(build_labeled_features)
    def predict_all(self, user):
        """
        Predict the rating of every item for one user.

        Parameters
        ----------
        user : int
            The user to make predictions for.

        Returns
        -------
        out : XFrame
            Each row of the frame consists of a user id, an item id, and a predicted rating.
        """
        # pair the given user with every item; this becomes the rdd for predictAll
        pairs = XFrame()
        pairs[self.item_col] = self.items
        pairs[self.user_col] = user
        # NOTE(review): the return value of swap_columns is not used --
        # confirm that it reorders the columns in place.
        pairs.swap_columns(self.item_col, self.user_col)
        predicted = self.model.predictAll(pairs.to_rdd())
        triples = predicted.map(lambda r: (r.user, r.product, r.rating))
        return XFrame.from_rdd(
            triples,
            column_names=[self.user_col, self.item_col, self.rating_col],
            column_types=[self.users.dtype(), self.items.dtype(), float])
Example #13
0
    def _base_evaluate(self, data, labels):
        """
        Evaluate the performance of the classifier.

        Predictions are made from the data with ``_base_predict``, then
        compared row by row against the labels.  The values 1 and 0 are
        counted as the positive and negative class.

        The data must be a collection of items (XArray of SenseVector).

        Returns
        -------
        out : dict
            - 'correct', 'true_pos', 'true_neg', 'false_pos', 'false_neg' :
              counts, as floats
            - 'all' : the number of labels, as a float
            - 'accuracy' : correct / all
            - 'precision' : true pos / (true pos + false pos)
            - 'recall' : true pos / (true pos + false neg)
            - 'tpr' : true pos / positive
            - 'fpr' : false pos / negative

            A ratio is NaN when its denominator is not greater than zero.
        """
        frame = XFrame()
        frame['predicted'] = self._base_predict(data)
        frame['actual'] = labels

        def score_row(row):
            # one 0/1 indicator per outcome, so that column sums give counts
            predicted, actual = row['predicted'], row['actual']
            return {'correct': int(predicted == actual),
                    'true_pos': int(predicted == 1 and actual == 1),
                    'true_neg': int(predicted == 0 and actual == 0),
                    'false_pos': int(predicted == 1 and actual == 0),
                    'false_neg': int(predicted == 0 and actual == 1),
                    'positive': int(actual == 1),
                    'negative': int(actual == 0)}

        scored = frame.apply(score_row)

        def total(key):
            return float(scored.apply(lambda counts: counts[key]).sum())

        def ratio(numerator, denominator):
            if denominator > 0:
                return numerator / denominator
            return float('nan')

        count = float(len(labels))
        correct = total('correct')
        true_pos = total('true_pos')
        true_neg = total('true_neg')
        false_pos = total('false_pos')
        false_neg = total('false_neg')
        positive = total('positive')
        negative = total('negative')

        return {
            'correct': correct,
            'true_pos': true_pos,
            'true_neg': true_neg,
            'false_pos': false_pos,
            'false_neg': false_neg,
            'all': count,
            'accuracy': ratio(correct, count),
            'precision': ratio(true_pos, true_pos + false_pos),
            'recall': ratio(true_pos, true_pos + false_neg),
            'tpr': ratio(true_pos, positive),
            'fpr': ratio(false_pos, negative),
        }
Example #14
0
from xframes import XFrame

# Build a small two-column XFrame and print it.
xf = XFrame({'id': [1, 2, 3], 'val': ['a', 'b', 'c']})
# The parenthesized form prints the same in Python 2 and is valid in Python 3.
print(xf)
Example #15
0
 def test_save(self):
     """Save an XFrame in parquet format, then delete the saved file."""
     frame = XFrame({'id': [30, 20, 10], 'val': ['a', 'b', 'c']})
     base_path = '{}/tmp/frame-parquet'.format(hdfs_prefix)
     frame.save(base_path, format='parquet')
     # TODO verify
     # the file is deleted at the base path with a '.parquet' suffix
     saved_path = base_path + '.parquet'
     fileio.delete(saved_path)