class TestPaiClassifiers(TestBase):
    def setUp(self):
        super(TestPaiClassifiers, self).setUp()
        self.create_test_table()
        self.pai_context = PAIContext(self.odps)

    def tearDown(self):
        self.pai_context.cleanup()
        # Drop synchronously so the drop cannot race with the next test's setUp.
        self.odps.execute_sql("drop table if exists " + IONOSPHERE_TABLE)
        super(TestPaiClassifiers, self).tearDown()

    def create_test_table(self):
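        # Create the ionosphere table with 34 double feature columns (a01..a34) and a
        # bigint label column "class", then upload IONOSPHERE_FILE_PATH through the
        # table tunnel as block 0.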
        fields = ','.join('a%02d double' % i for i in range(1, 35)) + ', class bigint'
        self.odps.execute_sql("drop table if exists " + IONOSPHERE_TABLE)
        self.odps.execute_sql("create table %s (%s)" % (IONOSPHERE_TABLE, fields))

        upload_ss = self.tunnel.create_upload_session(IONOSPHERE_TABLE)
        writer = upload_ss.open_record_writer(0)

        # Parse each CSV line into a tunnel record, casting every value according to
        # its column type, then write the record and commit the single upload block.
        with open(IONOSPHERE_FILE_PATH, 'r') as ionosphere_file:
            for line in ionosphere_file:
                rec = upload_ss.new_record()
                cols = [float(c) if rec._columns[i].type == 'double' else int(c)
                        for i, c in enumerate(line.split(','))]
                for i, val in enumerate(cols):
                    rec.set(i, val)
                writer.write(rec)
        writer.close()
        upload_ss.commit([0])

    def test_mock_logistic_regression(self):
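        # Mock-mode run of the common flow: split 60/40, label the training part with
        # "class", train a LogisticRegression model, store it, then score the hold-out
        # split and store the predictions ("execution.mock" presumably builds the flow
        # without launching real PAI jobs).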
        self.pai_context.set_config("execution.mock", True)

        dataset = self.pai_context.odps_data(IONOSPHERE_TABLE)
        splited = dataset.split(0.6)

        labeled_data = splited[0].set_label_field("class")
        # labeled_data = dataset.set_label_field("category")
        lr = LogisticRegression(epsilon=0.001).set_max_iter(50)
        model = lr.train(labeled_data)
        model.store_odps("testOutModel")

        predicted = model.predict(splited[1])
        # predicted = model.predict(dataset)
        predicted.store_odps("testOut")

    def test_mock_xgboost(self):
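        # Same mock-mode flow with an Xgboost classifier (silent=1, eta=0.3).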
        self.pai_context.set_config("execution.mock", True)

        dataset = self.pai_context.odps_data(IONOSPHERE_TABLE)
        splited = dataset.split(0.6)

        labeled_data = splited[0].set_label_field("class")
        # labeled_data = dataset.set_label_field("category")
        xgboost = Xgboost(silent=1).set_eta(0.3)
        model = xgboost.train(labeled_data)
        model.store_odps("testOutModel")

        predicted = model.predict(splited[1])
        # predicted = model.predict(dataset)
        predicted.store_odps("testOut")

    def test_random_forests(self):
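        # Same mock-mode flow with a RandomForests classifier (10 trees, max depth 10).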
        self.pai_context.set_config("execution.mock", True)

        dataset = self.pai_context.odps_data(IONOSPHERE_TABLE)
        splited = dataset.split(0.6)

        labeled_data = splited[0].set_label_field("class")
        # labeled_data = dataset.set_label_field("category")
        rf = RandomForests(tree_num=10).set_max_tree_deep(10)
        model = rf.train(labeled_data)
        model.store_odps("testOutModel")

        predicted = model.predict(splited[1])
        # predicted = model.predict(dataset)
        predicted.store_odps("testOut")

    def test_gbdt_lr(self):
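        # Same mock-mode flow with a GBDT + LR classifier (500 trees, shrinkage 0.05).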
        self.pai_context.set_config("execution.mock", True)

        dataset = self.pai_context.odps_data(IONOSPHERE_TABLE)
        splited = dataset.split(0.6)

        labeled_data = splited[0].set_label_field("class")
        gbdt_lr = GBDTLR(tree_count=500).set_shrinkage(0.05)
        model = gbdt_lr.train(labeled_data)
        model.store_odps("testOutModel")

        predicted = model.predict(splited[1])
        predicted.store_odps("testOut")

    def test_linear_svm(self):
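        # Same mock-mode flow with a linear SVM (epsilon=0.001, cost=1).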
        self.pai_context.set_config("execution.mock", True)

        dataset = self.pai_context.odps_data(IONOSPHERE_TABLE)
        splited = dataset.split(0.6)

        labeled_data = splited[0].set_label_field("class")
        svm = LinearSVM(epsilon=0.001).set_cost(1)
        model = svm.train(labeled_data)
        model.store_odps("testOutModel")

        predicted = model.predict(splited[1])
        predicted.store_odps("testOut")

    def test_naive_bayes(self):
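        # Same mock-mode flow with a NaiveBayes classifier using default parameters.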
        self.pai_context.set_config("execution.mock", True)

        dataset = self.pai_context.odps_data(IONOSPHERE_TABLE)
        splited = dataset.split(0.6)

        labeled_data = splited[0].set_label_field("class")
        naive_bayes = NaiveBayes()
        model = naive_bayes.train(labeled_data)
        model.store_odps("testOutModel")

        predicted = model.predict(splited[1])
        predicted.store_odps("testOut")

    def test_logistic_regression(self):
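        # Non-mock run: the flow is actually executed against ODPS/PAI, and the ROC
        # curve of the stored predictions is checked for consistent lengths.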
        self.pai_context.set_config("execution.mock", False)

        dataset = self.pai_context.odps_data(IONOSPHERE_TABLE)
        splited = dataset.split(0.6)

        labeled_data = splited[0].set_label_field("class")
        lr = LogisticRegression(epsilon=0.001).set_max_iter(50)
        model = lr.train(labeled_data)
        model.store_odps("testOutModel")

        predicted = model.predict(splited[1])
        # store_odps is an operational node, which triggers execution of the whole flow
        predicted.store_odps("testOut")

        fpr, tpr, thresh = roc_curve(predicted, 1, "class")
        self.assertEqual(len(fpr), len(tpr))
        self.assertEqual(len(thresh), len(fpr))