def KFold(data, n_folds=10):
    """
    Create a K-Fold split of a data set as a generator of K pairs, where each
    pair is a partition of the dataset.  This can be useful for cross
    validation, where each fold is used as a held out dataset while training
    on the remaining data.

    Parameters
    ----------
    data: SFrame
        A non-empty SFrame with at least ``n_folds`` rows.
    n_folds: int
        The number of folds to create. Must be at least 2.

    Notes
    -----
    This does not shuffle the data. Shuffling your data is a useful
    preprocessing step when doing cross validation.

    Because this is a generator function, the arguments are only validated
    when the first fold is requested, not when ``KFold`` is called.

    Yields
    ------
    (train, test)
        For each fold, ``data`` filtered to the rows outside the fold and
        ``data`` filtered to the rows inside the fold (SFrames when ``data``
        is an SFrame).

    Raises
    ------
    ValueError
        If ``n_folds`` is smaller than 2 or ``data`` has fewer rows than
        ``n_folds``.

    Examples
    --------
        >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv'
        >>> sf = tc.SFrame.read_csv(url)
        >>> folds = KFold(sf)
    """
    if n_folds < 2:
        raise ValueError("n_folds must be at least 2, got %r" % (n_folds,))
    num_rows = data.num_rows()
    if num_rows < n_folds:
        raise ValueError(
            "Cannot create %d folds from a dataset with only %d rows"
            % (n_folds, num_rows)
        )
    for st, end in _kfold_sections(data, n_folds):
        # 0/1 mask over all rows: 1 marks the held-out (test) rows of this
        # fold, so ``1 - idx`` selects the complementary training rows.
        idx = np.zeros(len(data))
        idx[st:end] = 1
        yield data[tc.SArray(1 - idx)], data[tc.SArray(idx)]
    def test_exceptions(self):
        """
        Check that create() accepts well-formed input and that create() and
        predict() raise on malformed input.
        """
        valid_sarray = turicreate.SArray([{'a': 5, 'b': 7}])
        valid_sframe = turicreate.SFrame({'bow': valid_sarray})
        empty_doc = turicreate.SArray([{}])
        int_column = turicreate.SFrame({'x': [0, 1, 2, 3]})
        two_columns = turicreate.SFrame({'x': [{'0': 3}], 'y': [{'3': 5}]})
        missing_doc = turicreate.SArray([{'a': 5, 'b': 3}, None, {'a': 10}])
        missing_count = turicreate.SArray([{'a': 5, 'b': None}, {'a': 3}])

        # Well-formed input must yield a model.
        for dataset in (valid_sarray, valid_sframe, empty_doc):
            self.assertTrue(topic_model.create(dataset) is not None)

        # Test that create() throws on bad input
        rejected_by_create = (
            (int_column, Exception),
            (two_columns, Exception),
            (missing_doc, ToolkitError),
            (missing_count, ToolkitError),
        )
        for dataset, error_type in rejected_by_create:
            with self.assertRaises(error_type):
                topic_model.create(dataset)

        # predict() on an already-built model must reject bad input as well.
        trained = self.models[0]
        for dataset in (int_column, two_columns, missing_doc):
            with self.assertRaises(Exception):
                trained.predict(dataset)
Beispiel #3
0
    def test_rmse(self):
        """rmse() must agree with a hand-computed root-mean-squared error."""
        targets = turicreate.SArray([1, 2, 1, 2])
        predictions = turicreate.SArray([3, -1, 1, 0])

        # Squared residuals are 4, 9, 0 and 4 across the four observations.
        expected = (float(4 + 9 + 0 + 4) / 4) ** 0.5
        actual = turicreate.toolkits.evaluation.rmse(targets, predictions)

        self.assertAlmostEqual(actual, expected)
Beispiel #4
0
    def test_confusion_matrix(self):
        """confusion_matrix() must report the expected count per label pair."""
        targets = turicreate.SArray([1, 1, 0, 1, 1, 0, 1])
        predictions = turicreate.SArray([0, 1, 0, 0, 1, 1, 0])

        matrix = turicreate.toolkits.evaluation.confusion_matrix(
            targets, predictions)
        # Sort so the counts come out in a deterministic
        # (predicted_label, target_label) order before comparing.
        counts = matrix.sort(["predicted_label", "target_label"])["count"]
        expected_counts = turicreate.SArray([1, 3, 1, 2])

        self.assertTrue((counts == expected_counts).all())
Beispiel #5
0
    def test_missing_values(self):
        """
        Compare the classification metrics against scikit-learn when class 2
        is replaced by None in the SArrays handed to turicreate.
        """
        # Arrange
        t, p = _generate_classes_and_scores(3, n=100, hard_predictions=True)
        predictions = turicreate.SArray(
            [None if label == 2 else label for label in p])
        targets = turicreate.SArray(
            [None if label == 2 else label for label in t])
        evaluation = turicreate.toolkits.evaluation

        # Act & Assert [accuracy]
        expected = accuracy_score(t, p)
        actual = evaluation.accuracy(targets, predictions)
        self.assertAlmostEqual(expected, actual)

        # Act & Assert [precision]
        expected = precision_score(t, p, average="macro")
        actual = evaluation.precision(targets, predictions)
        self.assertAlmostEqual(expected, actual)

        # Act & Assert [recall]
        expected = recall_score(t, p, average="macro")
        actual = evaluation.recall(targets, predictions)
        self.assertAlmostEqual(expected, actual)

        # Act & Assert [f1_score]
        expected = f1_score(t, p, average="macro")
        actual = evaluation.f1_score(targets, predictions)
        self.assertAlmostEqual(expected, actual)

        # Act & Assert [fbeta_score]
        expected = fbeta_score(t, p, beta=2.0, average="macro")
        actual = evaluation.fbeta_score(targets, predictions, beta=2.0)
        self.assertAlmostEqual(expected, actual)
    def test_export_coreml(self):
        """
        Export the trained model to Core ML, run a prediction through the
        exported model where the OS supports it, and check that a second,
        single-class model can be trained and exported too.
        """
        import os
        from PIL import Image
        import coremltools

        # mkstemp() returns an already-open OS-level descriptor together with
        # the path. Only the path is needed here, so close the descriptor
        # immediately instead of leaking it.
        fd, filename = tempfile.mkstemp('bingo.mlmodel')
        os.close(fd)
        self.model.export_coreml(filename,
                                 include_non_maximum_suppression=False)

        coreml_model = coremltools.models.MLModel(filename)
        img = self.train[0:1][self.feature][0]
        img_fixed = tc.image_analysis.resize(img, 416, 416, 3)
        pil_img = Image.fromarray(img_fixed.pixel_data)
        # Prediction through the exported model is only attempted on
        # macOS 10.13 or newer.
        if _mac_ver() >= (10, 13):
            ret = coreml_model.predict({self.feature: pil_img},
                                       usesCPUOnly=True)
            self.assertEqual(ret['coordinates'].shape[1], 4)
            self.assertEqual(ret['confidence'].shape[1], len(_CLASSES))
            self.assertEqual(ret['coordinates'].shape[0],
                             ret['confidence'].shape[0])

        # Also check if we can train a second model and export it (there could
        # be naming issues in mxnet)
        fd2, filename2 = tempfile.mkstemp('bingo2.mlmodel')
        os.close(fd2)
        # We also test at the same time if we can export a model with a single
        # class
        sf = tc.SFrame({
            'image': tc.SArray([self.train[self.feature][0]]),
            'label': tc.SArray([self.train[self.target][0]])
        })
        model2 = tc.one_shot_object_detector.create(sf,
                                                    'label',
                                                    max_iterations=1)
        model2.export_coreml(filename2, include_non_maximum_suppression=False)
Beispiel #7
0
    def test_bm25(self):
        """
        Check correctness of the BM25 query.
        """
        weighted_query = {'a': 5, 'b': 3, 'c': 1}

        # Test input formats: list, SArray and dict queries are all accepted.
        accepted_queries = (
            ['a', 'b', 'c'],
            tc.SArray(['a', 'b', 'c']),
            weighted_query,
        )
        for query in accepted_queries:
            assert tc.text_analytics.bm25(self.data, query) is not None

        # Only documents containing query words are included in result
        scored = tc.text_analytics.bm25(self.data, weighted_query)
        assert scored.num_rows() == 4

        # Of these five rows, one is None and one only contains the
        # non-query word 'f'; three rows are expected in the result.
        dataset = tc.SArray([
            {'a': 5, 'b': 7, 'c': 10},
            {'a': 3, 'c': 1, 'd': 2},
            None,
            {'a': 1},
            {'f': 5},
        ])

        scored = tc.text_analytics.bm25(dataset, weighted_query)
        assert scored.num_rows() == 3
    def get_backgrounds(self):
        """
        Download (if needed), extract (if needed), verify and load the
        backgrounds SArray.

        The tar file listed in ``self.sarray_url_md5_pairs`` is fetched into
        the "data" cache directory. If ``self.destination_sarray_path`` does
        not exist yet, the archive is extracted there. The extracted files
        are then checked against ``self.extracted_file_to_md5`` (file names
        and MD5 of the file contents). If verification or loading fails for
        any reason, the extracted directory is removed, the archive is
        extracted again and the SArray is loaded from the fresh copy.

        Returns
        -------
        The SArray loaded from ``self.destination_sarray_path``.
        """
        import hashlib as _hashlib

        cache_dir = _get_cache_dir("data")

        # Download tar file, if not already downloaded
        # Get tar file path
        tarfile_path = _download_and_checksum_files(self.sarray_url_md5_pairs,
                                                    cache_dir)[0]

        def _extract():
            # Close the archive handle deterministically after extraction.
            with _tarfile.open(tarfile_path) as backgrounds_tar:
                backgrounds_tar.extractall(cache_dir)

        # Extract SArray from tar file, if not already extracted
        if not _os.path.exists(self.destination_sarray_path):
            _extract()

        # Verify and load the extracted SArray
        try:
            # Check we extracted the file we expected
            expected_extracted_files = set(self.extracted_file_to_md5.keys())
            extracted_files = set(_os.listdir(self.destination_sarray_path))
            if expected_extracted_files != extracted_files:
                raise ValueError("Unexpected set of extracted files")

            # Check each of the files is what we expect. The names were
            # listed from destination_sarray_path, so resolve them against
            # that directory and hash the file *contents* in chunks.
            for filename, expected_md5 in self.extracted_file_to_md5.items():
                full_path = _os.path.join(self.destination_sarray_path,
                                          filename)
                digest = _hashlib.md5()
                with open(full_path, "rb") as extracted_file:
                    for chunk in iter(lambda: extracted_file.read(1 << 20),
                                      b""):
                        digest.update(chunk)
                if digest.hexdigest() != expected_md5:
                    raise ValueError("Checksum mismatch for %s" % filename)

            backgrounds = _tc.SArray(self.destination_sarray_path)
        except Exception:
            # delete the incompletely/incorrectly extracted tarball bits on disk
            if _os.path.exists(self.destination_sarray_path):
                _shutil.rmtree(self.destination_sarray_path)
            # and re-extract
            _extract()
            backgrounds = _tc.SArray(self.destination_sarray_path)

        return backgrounds
Beispiel #9
0
def _get_data(feature, target):
    """
    Build synthetic image data from a fixed random seed.

    Returns a ``(train, test, backgrounds)`` tuple: ``train`` is an SFrame
    with 5 random images under ``feature`` and class names cycled from
    ``_CLASSES`` under ``target``; ``test`` is an SFrame with 100 random
    images under ``feature``; ``backgrounds`` is the first 5 test images.
    """
    from PIL import Image as _PIL_Image

    rs = np.random.RandomState(1234)
    FORMATS = ["png", "jpeg", "raw"]

    def from_pil_image(pil_img, image_format="png"):
        # Encoded formats go through a temporary file on disk; "raw" is
        # built directly from the pixel buffer.
        if image_format != "raw":
            with tempfile.NamedTemporaryFile(mode="w+b",
                                             suffix="." + image_format) as f:
                pil_img.save(f, format=image_format)
                return tc.Image(f.name)

        image = np.array(pil_img)
        FORMAT_RAW = 2
        return tc.Image(
            _image_data=image.tobytes(),
            _width=image.shape[1],
            _height=image.shape[0],
            _channels=image.shape[2],
            _format_enum=FORMAT_RAW,
            _image_data_size=image.size,
        )

    def random_image():
        # The draws from `rs` happen in a fixed order (size, pixels, format)
        # so the generated data stays reproducible.
        # Randomly determine image size (should handle large and small)
        img_shape = tuple(rs.randint(100, 1000, size=2)) + (3, )
        img = rs.randint(255, size=img_shape)
        pil_img = _PIL_Image.fromarray(img, mode="RGB")
        # Randomly select image format
        image_format = FORMATS[rs.randint(len(FORMATS))]
        return from_pil_image(pil_img, image_format=image_format)

    num_examples = 100
    num_starter_images = 5

    images = [random_image() for _ in range(num_examples)]

    starter_images = []
    starter_target = []
    for index in range(num_starter_images):
        starter_images.append(random_image())
        starter_target.append(_CLASSES[index % len(_CLASSES)])

    train = tc.SFrame({
        feature: tc.SArray(starter_images),
        target: tc.SArray(starter_target),
    })
    test = tc.SFrame({
        feature: tc.SArray(images),
    })
    backgrounds = test[feature].head(5)
    return train, test, backgrounds
Beispiel #10
0
    def test_roc_curve_str(self):
        """roc_curve() with string targets must yield rates within [0, 1]."""
        targets = turicreate.SArray(['a', 'b', 'a', 'b'])
        scores = turicreate.SArray([.1, .2, .3, .4])

        curve = turicreate.toolkits.evaluation.roc_curve(targets, scores)
        # The result is not inspected; the call only checks that the curve
        # points can be de-duplicated and sorted without raising.
        curve[['fpr', 'tpr']].unique().sort(['fpr', 'tpr'])

        for rate in ('tpr', 'fpr'):
            values = curve[rate]
            self.assertTrue(all(values >= 0) and all(values <= 1))
Beispiel #11
0
    def test_roc_curve_str(self):
        """Check that every ROC point for string labels lies in [0, 1]."""
        labels = turicreate.SArray(["a", "b", "a", "b"])
        scores = turicreate.SArray([0.1, 0.2, 0.3, 0.4])
        roc = turicreate.toolkits.evaluation.roc_curve(labels, scores)

        # Exercised only for side effects: unique() and sort() must succeed.
        rate_columns = ["fpr", "tpr"]
        roc[rate_columns].unique().sort(rate_columns)

        tpr = roc["tpr"]
        self.assertTrue(all(tpr >= 0) and all(tpr <= 1))
        fpr = roc["fpr"]
        self.assertTrue(all(fpr >= 0) and all(fpr <= 1))
Beispiel #12
0
    def test_drop_words(self):
        """
        Exercise drop_words() on bad input types, missing values, non-English
        text, custom delimiters and stop-word removal.
        """
        ## Bogus input type
        numeric = tc.SArray([1, 2, 3])
        with self.assertRaises(RuntimeError):
            text_analytics.drop_words(numeric)

        with_missing = tc.SArray(["str", None])
        # no throw, just give warning and skip
        # avoid segfault
        stop_words = text_analytics.stop_words()
        text_analytics.drop_words(with_missing, stop_words=stop_words)

        ## Other languages
        non_english = [
            "中文 应该也 行 中文 应该也 行",
            "Сблъсъкът между Сблъсъкът между",
        ]
        lowered_expected = [
            "this is someurl http someurl this is someurl http someurl",
        ] + non_english
        cased_expected = [
            "This is someurl http someurl This is someurl http someurl",
        ] + non_english

        lowered = text_analytics.drop_words(self.languages_double)
        self.assertEqual(lowered.dtype, str)
        self.sframe_comparer._assert_sarray_equal(lowered, lowered_expected)

        cased = text_analytics.drop_words(self.languages_double,
                                          to_lower=False)
        self.assertEqual(cased.dtype, str)
        self.sframe_comparer._assert_sarray_equal(cased, cased_expected)

        ## Check that delimiters work properly by default and when modified.
        default_delimiters_expected = [
            "this is some url http www someurl com this is some url http www someurl com",
            "should we yes we should should we yes we should",
        ]
        custom_delimiters_expected = [
            "this is some url http://www.someurl.com this is some url http://www.someurl.com",
            "should we yes we should. should we yes we should.",
        ]
        stop_words_expected = ["url http www someurl url http www someurl", ""]

        # All three variants are computed up front, then verified in turn.
        with_default_delimiters = text_analytics.drop_words(
            self.punctuated_double)
        with_custom_delimiters = text_analytics.drop_words(
            self.punctuated_double, delimiters=["?", "!", ",", " "])
        without_stop_words = text_analytics.drop_words(
            self.punctuated_double, stop_words=text_analytics.stop_words())

        checks = (
            (with_default_delimiters, default_delimiters_expected),
            (with_custom_delimiters, custom_delimiters_expected),
            (without_stop_words, stop_words_expected),
        )
        for actual, expected in checks:
            self.assertEqual(actual.dtype, str)
            self.sframe_comparer._assert_sarray_equal(actual, expected)
 def test_invalid_data_set(self):
     """create() must raise ToolkitError when the data contains None."""
     # infer dtype str
     with_missing = tc.SArray(['str', None])
     complete = tc.SArray(['str', 'str'])

     # target contains none
     frame = tc.SFrame({'a': with_missing, 'b': complete})
     with self.assertRaises(ToolkitError):
         tc.text_classifier.create(
             frame, target='a', features=['b'], word_count_threshold=1)

     # feature contains none, Github #2402
     # NOTE(review): with the columns swapped AND target='b', the None still
     # sits in the target column, so this looks like a repeat of the case
     # above rather than a feature-with-None case -- confirm the intent.
     frame = tc.SFrame({'b': with_missing, 'a': complete})
     with self.assertRaises(ToolkitError):
         tc.text_classifier.create(
             frame, target='b', features=['a'], word_count_threshold=1)
Beispiel #14
0
    def test_auc_multi_class_score(self):
        """
        Compare per-class and macro-averaged AUC against one-vs-rest
        roc_auc_score values, for both integer and string targets.
        """
        # Arrange
        t, p = _generate_classes_and_scores(3, n=100, hard_predictions=False)
        class_ids = [0, 1, 2]
        class_scores = {c: p[:, c] for c in class_ids}
        class_truth = {c: t == c for c in class_ids}
        targets = turicreate.SArray(t)
        predictions = turicreate.SArray(p)
        str_targets = targets.astype(str)
        auc = turicreate.toolkits.evaluation.auc

        # Act
        sk_score = {
            c: roc_auc_score(class_truth[c], class_scores[c])
            for c in class_ids
        }

        # Act [Average = None]
        score = auc(targets, predictions, average=None)
        str_score = auc(str_targets, predictions, average=None)

        # Assert
        self.assertEqual(type(score), dict)
        self.assertEqual(set(score.keys()), {0, 1, 2})
        self.assertEqual(set(str_score.keys()), {"0", "1", "2"})

        # Note: Explicitly not putting it into a for loop for ease of
        # debugging when the tests fail.
        self.assertAlmostEqual(sk_score[0], score[0])
        self.assertAlmostEqual(sk_score[0], str_score['0'])
        self.assertAlmostEqual(sk_score[1], score[1])
        self.assertAlmostEqual(sk_score[1], str_score['1'])
        self.assertAlmostEqual(sk_score[2], score[2])
        self.assertAlmostEqual(sk_score[2], str_score['2'])

        # Act [Average = 'macro']
        score = auc(targets, predictions, average='macro')
        str_score = auc(str_targets, predictions, average='macro')

        # The macro average is the unweighted mean of the per-class scores.
        avg_score = 0.0
        for c in class_ids:
            avg_score = avg_score + sk_score[c]
        avg_score = avg_score / 3.0
        self.assertAlmostEqual(avg_score, score)
        self.assertAlmostEqual(avg_score, str_score)
Beispiel #15
0
    def test_logloss_clipping(self):
        """log_loss() must stay finite when probabilities hit 0.0 or 1.0."""
        # (targets, predicted probabilities): two multi-class cases followed
        # by two binary cases, each containing an extreme probability.
        cases = [
            ([0, 1, 2, 0],
             [[0.9, 0.0, 0.1], [0.8, 0.1, 0.1], [0.1, 0.1, 0.8], [0.1, 0.1, 0.8]]),
            ([0, 1, 2, 0],
             [[1.0, 0.0, 0.0], [0.8, 0.1, 0.1], [0.1, 0.1, 0.8], [0.1, 0.1, 0.8]]),
            ([0, 1, 0, 0], [0.0, 0.9, 0.1, 0.1]),
            ([0, 1, 0, 0], [0.1, 1.0, 0.1, 0.1]),
        ]

        for labels, probabilities in cases:
            y = turicreate.SArray(labels)
            yhat = turicreate.SArray(probabilities)
            loss = turicreate.toolkits.evaluation.log_loss(y, yhat)
            self.assertTrue(loss != inf)
Beispiel #16
0
    def get_backgrounds(self):
        """
        Download the backgrounds archive if needed and load the SArray.

        The SArray is first loaded directly from
        ``self.destination_sarray_path``. If that fails, any partially
        extracted data at that path is deleted, the downloaded tar file is
        extracted into the "data" cache directory, and the load is retried.

        Returns
        -------
        The SArray loaded from ``self.destination_sarray_path``.
        """
        cache_dir = _get_cache_dir("data")
        tarfile_path = _download_and_checksum_files(self.sarray_url_md5_pairs,
                                                    cache_dir)[0]
        try:
            backgrounds = _tc.SArray(self.destination_sarray_path)
        except Exception:
            # delete the incompletely extracted tarball bits on disk
            if _os.path.exists(self.destination_sarray_path):
                _shutil.rmtree(self.destination_sarray_path)
            # and re-extract; the archive is opened only when it is actually
            # needed and is closed again as soon as extraction finishes.
            with _tarfile.open(tarfile_path) as backgrounds_tar:
                backgrounds_tar.extractall(cache_dir)
            backgrounds = _tc.SArray(self.destination_sarray_path)

        return backgrounds
Beispiel #17
0
    def test_integer_probabilities(self):
        """
        log_loss, auc and roc_curve must accept integer-valued predictions,
        both as per-class lists and as a flat binary column, without raising.
        """
        evaluation = turicreate.toolkits.evaluation
        cases = [
            # multi-class: one integer "probability" per class
            ([0, 1, 2, 0], [[1, 0, 1], [1, 0, 1], [0, 1, 1], [0, 0, 1]]),
            # binary: a single integer score per row
            ([0, 1, 0, 0], [0, 1, 0, 0]),
        ]

        for labels, scores in cases:
            y = turicreate.SArray(labels)
            yhat = turicreate.SArray(scores)
            # Results are not checked; the calls only need to succeed.
            evaluation.log_loss(y, yhat)
            evaluation.auc(y, yhat)
            evaluation.roc_curve(y, yhat)
Beispiel #18
0
    def test_list_and_dict_type(self):
        """
        Run the classifier check on the plain data and on a copy extended
        with random list- and dict-typed noise columns.
        """
        accuracy_threshold = 0.8

        simple_data = self.data
        simple_split = simple_data.random_split(0.8, seed=1)

        # make a more complicated dataset containing list and dictionary type columns
        complex_data = copy.copy(simple_data)
        n_rows = complex_data.num_rows()
        list_noise = [[random.gauss(0, 1) for _ in range(3)]
                      for _ in range(n_rows)]
        complex_data['random_list_noise'] = tc.SArray(list_noise)
        dict_noise = [{'x0': random.gauss(0, 1)} for _ in range(n_rows)]
        complex_data['random_dict_noise'] = tc.SArray(dict_noise)
        complex_split = complex_data.random_split(0.8, seed=1)

        for train, test in (simple_split, complex_split):
            self._test_classifier_model(train, test, accuracy_threshold)
    def setUpClass(self):
        """
        Set up (Run only once)

        Builds a random 3-feature data set whose features are packed into a
        single array-typed "vec" column, and fits a scikit-learn LinearSVC on
        the same data to obtain reference coefficients.
        """
        np.random.seed(15)
        n, d = 100, 3

        self.sf = tc.SFrame()
        for _ in range(d):
            self.sf.add_column(tc.SArray(np.random.rand(n)), inplace=True)
        self.sf["target"] = np.random.randint(2, size=n)
        self.target = "target"

        # Pack the columns X1..Xd into one list column, then convert that
        # column to array.array dtype.
        packed = self.sf.apply(
            lambda row: [row["X{}".format(k + 1)] for k in range(d)])
        self.sf["vec"] = packed
        self.sf["vec"] = self.sf["vec"].apply(lambda x: x, array.array)

        self.features = ["vec"]
        self.unpacked_features = ["vec[%s]" % (k) for k in range(d)]
        self.def_kwargs = _DEFAULT_SOLVER_OPTIONS

        ## Compute the correct answers with Scikit-Learn
        X_train = list(self.sf["vec"])
        y_train = list(self.sf[self.target])
        reference = svm.LinearSVC(C=1.0, loss="hinge")
        reference.fit(X_train, y_train)
        # Stored as intercept first, followed by the feature weights.
        self.coef = list(reference.intercept_) + list(reference.coef_[0])
Beispiel #20
0
    def setUpClass(self):
        """
        Set up (Run only once)

        Builds a random 3-feature data set whose features are packed into a
        single dict-typed "dict" column, and fits a scikit-learn LinearSVC on
        the same data to obtain reference coefficients.
        """

        np.random.seed(15)
        n, d = 100, 3
        self.sf = tc.SFrame()

        for i in range(d):
            self.sf.add_column(tc.SArray(np.random.rand(n)), inplace=True)
        self.sf['target'] = np.random.randint(2, size=n)
        self.target = 'target'
        # Pack the columns X1..Xd into one dict column keyed by 0..d-1.
        self.sf['dict'] = self.sf.apply(
            lambda row: {i: row['X{}'.format(i + 1)]
                         for i in range(d)})
        self.features = ['dict']
        self.unpacked_features = ['dict[%s]' % i for i in range(d)]
        self.def_kwargs = _DEFAULT_SOLVER_OPTIONS

        ## Compute the correct answers with Scikit-Learn
        # Sort the keys so every row yields its features in the same order.
        X_train = list(self.sf['dict'].apply(
            lambda x: [x[k] for k in sorted(x.keys())]))
        y_train = list(self.sf[self.target])
        # 'hinge' is the current name of the loss formerly spelled 'l1';
        # the old alias is deprecated in scikit-learn.
        sm_model = svm.LinearSVC(C=1.0, loss='hinge')
        sm_model.fit(X_train, y_train)
        # Stored as intercept first, followed by the feature weights.
        self.coef = list(sm_model.intercept_) + list(sm_model.coef_[0])
Beispiel #21
0
    def setUpClass(self):
        """
        Build a random 10-feature binary classification data set and the
        default/solver option dictionaries used by the tests.
        """
        ## Simulate test data
        np.random.seed(10)
        n, d = 100, 10
        self.sf = tc.SFrame()
        for _ in range(d):
            self.sf.add_column(tc.SArray(np.random.randn(n)), inplace=True)
        target = np.random.randint(2, size=n)

        ## Create the model
        self.sf['target'] = target
        self.def_kwargs = _DEFAULT_SOLVER_OPTIONS

        # Start from the default solver options and layer the model-specific
        # defaults on top (later keys win).
        default_options = dict(self.def_kwargs)
        default_options.update({
            'solver': 'auto',
            'feature_rescaling': True,
            'class_weights': None,
            'penalty': 1.0
        })
        self.def_opts = default_options

        self.solver = 'auto'
        self.opts = self.def_opts.copy()
        self.opts['max_iterations'] = 500

        # Two independent lists with the same content: X1 .. Xd.
        column_indices = range(1, d + 1)
        self.features = ['X{}'.format(k) for k in column_indices]
        self.unpacked_features = ['X{}'.format(k) for k in column_indices]
        self.target = 'target'
Beispiel #22
0
    def setUpClass(self):
        """
        Setup required for all tests that don't require a trained model.

        Builds a random 10-feature binary classification data set and fits a
        scikit-learn LinearSVC on it to obtain reference coefficients.
        """

        np.random.seed(8)
        n, d = 100, 10
        self.sf = tc.SFrame()

        for i in range(d):
            self.sf.add_column(tc.SArray(np.random.randn(n)), inplace=True)
        target = np.random.randint(2, size=n)
        # Pin the first two labels so both classes are always present.
        target[0] = 0
        target[1] = 1

        ## Create the model
        self.sf['target'] = target
        self.def_kwargs = _DEFAULT_SOLVER_OPTIONS

        self.solver = 'auto'
        self.target = 'target'
        self.features = ['X{}'.format(i) for i in range(1, d + 1)]

        ## Compute the correct answers with Scikit-Learns
        feature_names = self.features
        X_train = list(self.sf.apply(
            lambda row: [row[k] for k in feature_names]))
        y_train = list(self.sf[self.target])
        # 'hinge' is the current name of the loss formerly spelled 'l1';
        # the old alias is deprecated in scikit-learn.
        sm_model = svm.LinearSVC(C=1.0, loss='hinge')
        sm_model.fit(X_train, y_train)
        # Stored as intercept first, followed by the feature weights.
        self.coef = list(sm_model.intercept_) + list(sm_model.coef_[0])
Beispiel #23
0
    def test_combination_gl_python_types(self):
        """
        Round-trip a list and a dict that each hold an SGraph, an SFrame and
        an SArray through GLPickler/GLUnpickler and compare the contents.
        """
        vertices = [
            tc.Vertex(1, {'fluffy': 1}),
            tc.Vertex(2, {'fluffy': 1, 'woof': 1}),
            tc.Vertex(3, {}),
        ]
        graph = tc.SGraph().add_vertices(vertices)
        sarray = tc.SArray([1, 2, 3])
        sframe = tc.SFrame([1, 2, 3])

        # Both containers can be indexed with 0, 1 and 2, so the comparisons
        # below work unchanged for the list and for the dict.
        as_list = [graph, sframe, sarray]
        as_dict = {0: graph, 1: sframe, 2: sarray}

        for original in (as_list, as_dict):
            pickler = gl_pickle.GLPickler(self.filename)
            pickler.dump(original)
            pickler.close()

            unpickler = gl_pickle.GLUnpickler(self.filename)
            restored = unpickler.load()
            unpickler.close()

            assert_sframe_equal(original[0].get_vertices(),
                                restored[0].get_vertices())
            assert_sframe_equal(original[0].get_edges(),
                                restored[0].get_edges())
            assert_sframe_equal(original[1], restored[1])
            assert list(original[2]) == list(restored[2])
Beispiel #24
0
    def test_custom_initial_centers(self):
        """
        Test that the user can pass hard-coded initial cluster centers, and
        that these are actually used to initialize the clusters.
        """
        ## Empty initial centers
        empty_centers = tc.SFrame()
        with self.assertRaises(ValueError):
            tc.kmeans.create(dataset=self.sf, initial_centers=empty_centers,
                             max_iterations=self.max_iter, verbose=False)

        ## Initial centers as an SArray of indices
        index_centers = tc.SArray([1, 2, 3])
        with self.assertRaises(TypeError):
            tc.kmeans.create(dataset=self.sf, initial_centers=index_centers,
                             max_iterations=self.max_iter, verbose=False)

        ## Initial centers with a schema that doesn't match the data
        mismatched_centers = make_clustering_data(n=10, d=self.dim-1, seed=43)
        with self.assertRaises(ValueError):
            tc.kmeans.create(dataset=self.sf,
                             initial_centers=mismatched_centers,
                             max_iterations=self.max_iter, verbose=False)

        ## Good initial centers
        good_centers = make_clustering_data(n=10, d=self.dim, seed=43)
        # exclude int feature because these *are* changed.
        compared_features = ['float0', 'float1', 'dict0']

        # With zero iterations the reported cluster centers should be the
        # ones that were passed in.
        model = tc.kmeans.create(self.sf, features=compared_features,
                                 initial_centers=good_centers,
                                 max_iterations=0, verbose=False)

        reported_centers = model.cluster_info
        assert_sframe_equal(good_centers[compared_features],
                            reported_centers[compared_features])
Beispiel #25
0
    def test_predict(self):
        """
        Test that we can make predictions using the model.
        """
        docs = self.docs
        for model in self.models:
            # Default output: one integer prediction per document.
            assignments = model.predict(docs)
            self.assertTrue(isinstance(assignments, turicreate.SArray))
            self.assertEqual(len(assignments), len(docs))
            self.assertEqual(assignments.dtype, int)

            # Probability output: each row must sum to 1 (within 1e-6).
            probabilities = model.predict(docs, output_type='probability')
            self.assertTrue(isinstance(probabilities, turicreate.SArray))
            self.assertTrue(len(probabilities) == len(docs))
            row_sums = probabilities.apply(lambda x: sum(x))
            deviations = row_sums.apply(lambda x: abs(x - 1))
            self.assertTrue((deviations < .000001).all())

            # Test predictions when docs have new words
            new_docs = turicreate.SArray([{
                '-1,-1': 3.0,
                '0,4': 5.0,
                '0,3': 2.0
            }])
            new_doc_predictions = model.predict(new_docs)
            self.assertEqual(len(new_doc_predictions), len(new_docs))

            # Test additional burnin. Ideally we could show that things
            # converge as you increase burnin.
            without_burnin = model.predict(docs,
                                           output_type='probability',
                                           num_burnin=0)
            self.assertEqual(len(without_burnin), len(docs))
Beispiel #26
0
    def test_pickling_sarray_types(self):
        """
        Round-trip integer, float and string SArrays through
        GLPickler/GLUnpickler and check the values survive unchanged.
        """
        samples = (
            tc.SArray([1, 2, 3]),
            tc.SArray([1.0, 2.0, 3.5]),
            tc.SArray(["foo", "bar"]),
        )
        for original in samples:
            pickler = gl_pickle.GLPickler(self.filename)
            pickler.dump(original)
            pickler.close()

            unpickler = gl_pickle.GLUnpickler(self.filename)
            restored = unpickler.load()
            unpickler.close()

            failure_message = "Failed pickling in %s (Got back %s)" % (
                original, restored)
            assert list(original) == list(restored), failure_message
Beispiel #27
0
    def test_grouped_precision_recall(self):
        """
        Check the schema, the users and the row count returned by
        precision_recall_by_user for one and for several cutoffs.
        """
        observations = turicreate.SFrame()
        observations['user_id'] = ["a", "b", "b", "c", "c", "c"]
        observations['item_id'] = ['x', 'x', 'y', 'v', 'w', 'z']
        observations['rating'] = [0, 1, 2, 3, 4, 5]
        model = turicreate.recommender.item_similarity_recommender.create(
            observations)
        recs = model.recommend()

        test_data = turicreate.SFrame()
        test_data['user_id'] = ['a', 'b']
        test_data['item_id'] = ['v', 'z']
        test_data['rating'] = [7, 8]

        precision_recall_by_user = \
            turicreate.recommender.util.precision_recall_by_user

        # Single cutoff: check the schema and the listed users.
        pr = precision_recall_by_user(test_data, recs, cutoffs=[3])
        self.assertEqual(type(pr), turicreate.SFrame)
        expected_columns = [
            'user_id', 'cutoff', 'precision', 'recall', 'count']
        self.assertEqual(pr.column_names(), expected_columns)
        expected_users = list(turicreate.SArray(['a', 'b', 'c']))
        self.assertEqual(list(pr['user_id']), expected_users)

        # Three cutoffs for three users should give nine rows.
        pr = precision_recall_by_user(test_data, recs, cutoffs=[5, 10, 15])
        self.assertEqual(pr.num_rows(), 9)
    def setUpClass(self):
        """
        Create a random 10-feature binary classification data set together
        with the default and per-test solver options.
        """
        ## Simulate test data
        np.random.seed(10)
        n, d = 100, 10
        self.sf = tc.SFrame()
        for _ in range(d):
            self.sf.add_column(tc.SArray(np.random.randn(n)), inplace=True)
        target = np.random.randint(2, size=n)

        ## Create the model
        self.sf["target"] = target
        self.def_kwargs = _DEFAULT_SOLVER_OPTIONS

        # Model-specific defaults override the shared solver defaults.
        overrides = {
            "solver": "auto",
            "feature_rescaling": True,
            "class_weights": None,
            "penalty": 1.0,
        }
        merged = dict(self.def_kwargs)
        merged.update(overrides)
        self.def_opts = merged

        self.solver = "auto"
        self.opts = self.def_opts.copy()
        self.opts["max_iterations"] = 500

        # Column names X1 .. Xd, kept as two separate list objects.
        self.features = ["X{}".format(k) for k in range(1, d + 1)]
        self.unpacked_features = list(
            "X{}".format(k) for k in range(1, d + 1))
        self.target = "target"
Beispiel #29
0
    def test_cat(self):
        """
        Train a boosted trees classifier on string class labels and check
        that the evaluation's confusion matrix reports exactly those labels.
        """
        import numpy as np

        # Arrange
        np.random.seed(8)
        n, d = 1000, 100
        sf = tc.SFrame()
        for i in range(d):
            sf.add_column(tc.SArray(np.random.rand(n)), inplace=True)
        # Draw the binary target once, after all feature columns exist,
        # instead of regenerating and overwriting it on every iteration.
        target = np.random.randint(2, size=n)
        sf["target"] = target

        # Turn the 0/1 labels into the strings "cat-0" / "cat-1".
        sf["target"] = sf["target"].astype(str)
        sf["target"] = "cat-" + sf["target"]
        model = tc.boosted_trees_classifier.create(sf, "target")

        # Act
        evaluation = model.evaluate(sf)

        # Assert
        self.assertEqual(
            ["cat-0", "cat-1"],
            sorted(
                list(evaluation["confusion_matrix"]["target_label"].unique())),
        )
 def test_invalid_data_set(self):
     """Creating a text classifier on data with None must raise ToolkitError."""
     # infer dtype str
     has_none = tc.SArray(["str", None])
     no_none = tc.SArray(["str", "str"])

     # (column layout, target, features) for each rejected configuration.
     # NOTE(review): the second entry is labelled "feature contains none,
     # Github #2402", but its target column "b" is the one holding the None,
     # so it appears to repeat the first case -- confirm the intent.
     rejected = (
         # target contains none
         ({"a": has_none, "b": no_none}, "a", ["b"]),
         # feature contains none, Github #2402
         ({"b": has_none, "a": no_none}, "b", ["a"]),
     )
     for columns, target, features in rejected:
         sf = tc.SFrame(columns)
         with self.assertRaises(ToolkitError):
             tc.text_classifier.create(
                 sf, target=target, features=features,
                 word_count_threshold=1
             )