def test_tree_depth(self):
     """Run ``TreeDepth`` on the tree-graph edge table and persist the result.

     The transformed output gets a case built by ``gen_check_params_case``
     from the expected algorithm parameters listed below.
     """
     self.create_tree_graph(TREE_GRAPH_EDGE_TABLE)
     edge_table = self.odps.get_table(TREE_GRAPH_EDGE_TABLE)
     edges = DataFrame(edge_table).roles(from_vertex='flow_out_id', to_vertex='flow_in_id')
     depth_df = TreeDepth().transform(edges)
     expected_params = {
         'outputTableName': tn('pyodps_test_ml_tree_depth'),
         'fromVertexCol': 'flow_out_id',
         'workerMem': '4096',
         'inputEdgeTableName': tn('pyodps_test_ml_tree_graph_edge'),
         'toVertexCol': 'flow_in_id',
         'splitSize': '64',
     }
     checked = depth_df._add_case(self.gen_check_params_case(expected_params))
     checked.persist(TREE_DEPTH_TABLE)
    def test_quantile(self):
        """Call ``quantile`` on the ionosphere data with ``options.ml.dry_run`` set.

        The expected parameters are handed to ``gen_check_params_case`` and
        passed to ``quantile`` through ``_cases``; the result is persisted.
        """
        options.ml.dry_run = True

        ionosphere = self.odps.get_table(IONOSPHERE_TABLE)
        df = DataFrame(ionosphere).roles(label='class')
        feature_columns = ','.join('a%02d' % i for i in range(1, 35))
        expected_params = {
            'inputTableName': tn('pyodps_test_ml_ionosphere'),
            'outputTableName': tn('pyodps_test_ml_iono_quantile'),
            'colName': feature_columns,
            'N': '100',
        }
        check_case = self.gen_check_params_case(expected_params)
        qt = quantile(df, _cases=check_case)
        qt.persist(IONOSPHERE_QUANTILE_TABLE)
    def test_quantile(self):
        """Call ``quantile`` on the ionosphere data with ``options.runner.dry_run`` set.

        A case built by ``gen_check_params_case`` from the expected parameters
        is supplied via ``_cases``; the returned object is then persisted.
        """
        options.runner.dry_run = True

        source_table = self.odps.get_table(IONOSPHERE_TABLE)
        df = DataFrame(source_table).roles(label='class')
        columns = ','.join('a%02d' % i for i in range(1, 35))
        params = {
            'inputTableName': tn('pyodps_test_ml_ionosphere'),
            'outputTableName': tn('pyodps_test_ml_iono_quantile'),
            'colName': columns,
            'N': '100',
        }
        quantiles = quantile(df, _cases=self.gen_check_params_case(params))
        quantiles.persist(IONOSPHERE_QUANTILE_TABLE)
 def test_doc2vec(self):
     """Transform ``self.df`` with ``Doc2Vec`` and persist the document output.

     ``transform`` is unpacked into three results; only the second one (the
     document frame) receives the parameter case and is persisted.
     """
     _, doc_vectors, _ = Doc2Vec().transform(self.df)
     expected_params = {
         'minCount': '5',
         'docColName': 'content',
         'hs': '1',
         'inputTableName': tn('pyodps_test_ml_corpus'),
         'negative': '0',
         'layerSize': '100',
         'sample': '0',
         'randomWindow': '1',
         'window': '5',
         'docIdColName': 'id',
         'iterTrain': '1',
         'alpha': '0.025',
         'cbow': '0',
         'outVocabularyTableName': 'tmp_pyodps__doc2_vec',
         'outputWordTableName': 'tmp_pyodps__doc2_vec',
         'outputDocTableName': tn('pyodps_test_ml_doc2vec_doc_result'),
     }
     doc_vectors._add_case(self.gen_check_params_case(expected_params))
     doc_vectors.persist(DOC2VEC_DOC_TABLE)
 def test_semantic_vector_distance(self):
     """Call ``semantic_vector_distance`` on ``self.df`` and persist the result.

     A case built by ``gen_check_params_case`` from the expected parameters is
     attached to the result before it is persisted.
     """
     distances = semantic_vector_distance(self.df)
     expected_params = {
         'topN': '5',
         'outputTableName': tn('pyodps_test_ml_semantic_dist_result'),
         'distanceType': 'euclidean',
         'inputTableName': tn('pyodps_test_ml_corpus'),
     }
     check_case = self.gen_check_params_case(expected_params)
     distances._add_case(check_case)
     distances.persist(SEMANTIC_DIST_TABLE)
    def test_mat_pearson(self):
        """Call ``matrix_pearson`` on the ionosphere data with runner dry-run set.

        The expected parameters are wrapped by ``gen_check_params_case`` and
        passed through ``_cases``.
        """
        options.runner.dry_run = True

        ionosphere = self.odps.get_table(IONOSPHERE_TABLE)
        df = DataFrame(ionosphere).roles(label='class')
        selected = ','.join('a%02d' % i for i in range(1, 35))
        expected_params = {
            'outputTableName': 'tmp_pyodps_ml_matrix_pearson_0_2_res',
            'selectedColNames': selected,
            'inputTableName': tn('pyodps_test_ml_ionosphere'),
        }
        check_case = self.gen_check_params_case(expected_params)
        matrix_pearson(df, _cases=check_case)
    def test_chisquare(self):
        """Call ``chi_square`` for column ``a01`` against ``class`` with runner dry-run set.

        The expected parameters are wrapped by ``gen_check_params_case`` and
        passed through ``_cases``.
        """
        options.runner.dry_run = True

        df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
        # Resolve the x column before building the case, as the original call did.
        x_column = df.a01
        expected_params = {
            'yColName': 'class',
            'xColName': 'a01',
            'outputDetailTableName': 'tmp_pyodps_ml_chi_square_0_1_res_2',
            'outputTableName': 'tmp_pyodps_ml_chi_square_0_1_res_1',
            'inputTableName': tn('pyodps_test_ml_ionosphere'),
        }
        check_case = self.gen_check_params_case(expected_params)
        chi_square(df, x_col=x_column, y_col='class', _cases=check_case)
    def test_chisquare(self):
        """Call ``chi_square`` for column ``a01`` against ``class`` with ML dry-run set.

        A case built by ``gen_check_params_case`` from the expected parameters
        is supplied via ``_cases``.
        """
        options.ml.dry_run = True

        ionosphere = self.odps.get_table(IONOSPHERE_TABLE)
        df = DataFrame(ionosphere)
        # Resolve the x column before building the case, as the original call did.
        x_column = df.a01
        params = {
            'yColName': 'class',
            'xColName': 'a01',
            'outputDetailTableName': 'tmp_pyodps__chi_square',
            'outputTableName': 'tmp_pyodps__chi_square',
            'inputTableName': tn('pyodps_test_ml_ionosphere'),
        }
        chi_square(df, x_col=x_column, y_col='class',
                   _cases=self.gen_check_params_case(params))
    def test_mat_pearson(self):
        """Call ``matrix_pearson`` on the ionosphere data with ML dry-run set.

        A case built by ``gen_check_params_case`` from the expected parameters
        is supplied via ``_cases``.
        """
        options.ml.dry_run = True

        source_table = self.odps.get_table(IONOSPHERE_TABLE)
        df = DataFrame(source_table).roles(label='class')
        column_list = ','.join('a%02d' % i for i in range(1, 35))
        params = {
            'outputTableName': 'tmp_pyodps__matrix_pearson',
            'selectedColNames': column_list,
            'inputTableName': tn('pyodps_test_ml_ionosphere'),
        }
        matrix_pearson(df, _cases=self.gen_check_params_case(params))
 def test_tree_depth(self):
     """Transform the tree-graph edge table with ``TreeDepth`` and persist it.

     The expected algorithm parameters are wrapped by
     ``gen_check_params_case`` and attached to the transformed output.
     """
     self.create_tree_graph(TREE_GRAPH_EDGE_TABLE)
     source = self.odps.get_table(TREE_GRAPH_EDGE_TABLE)
     tree_ds = DataFrame(source).roles(from_vertex='flow_out_id', to_vertex='flow_in_id')
     transformed = TreeDepth().transform(tree_ds)
     params = {
         'outputTableName': tn('pyodps_test_ml_tree_depth'),
         'fromVertexCol': 'flow_out_id',
         'workerMem': '4096',
         'inputEdgeTableName': tn('pyodps_test_ml_tree_graph_edge'),
         'toVertexCol': 'flow_in_id',
         'splitSize': '64',
     }
     output = transformed._add_case(self.gen_check_params_case(params))
     output.persist(TREE_DEPTH_TABLE)
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

from __future__ import print_function

import logging

from odps.df import DataFrame
from odps.ml.classifiers import LogisticRegression
from odps.ml.cross_validation import cross_val_score
from odps.ml.tests.base import MLTestBase, tn, ci_skip_case

logger = logging.getLogger(__name__)

IONOSPHERE_TABLE = tn('pyodps_test_ml_ionosphere')


class TestCrossValidation(MLTestBase):
    """Cross-validation test case working on the ionosphere table."""

    def setUp(self):
        """Run the base set-up and expose the ionosphere data as ``self.df``.

        ``self.df`` is the table wrapped in a ``DataFrame`` with ``class``
        given the label role.
        """
        super(TestCrossValidation, self).setUp()
        self.create_ionosphere(IONOSPHERE_TABLE)
        ionosphere = self.odps.get_table(IONOSPHERE_TABLE)
        self.df = DataFrame(ionosphere).roles(label='class')

    def tearDown(self):
        """Delegate tear-down to the base test case."""
        super(TestCrossValidation, self).tearDown()

    @ci_skip_case
    def test_logistic_regression(self):
        """Print the ``cross_val_score`` result of a logistic regression on ``self.df``."""
        classifier = LogisticRegression(epsilon=0.001)
        classifier = classifier.set_max_iter(50)
        scores = cross_val_score(classifier, self.df)
        print(scores)
from __future__ import print_function

from odps.df import DataFrame
from odps.config import options
from odps.ml.utils import TEMP_TABLE_PREFIX
from odps.ml.regression import *
from odps.ml.feature import *
from odps.ml.statistics import *
from odps.ml.tests.base import MLTestBase, tn, otm, ci_skip_case
from odps.ml.metrics import *

import logging
logger = logging.getLogger(__name__)

IONOSPHERE_TABLE = tn('pyodps_test_ml_ionosphere')
XGBOOST_OUT_TABLE = tn('pyodps_test_xgboost_out')
GBDT_OUT_TABLE = tn('pyodps_test_gbdt_out')
LINEAR_REGRESSION_OUT_TABLE = tn('pyodps_test_linear_reg_out')
LINEAR_SVR_OUT_TABLE = tn('pyodps_test_linear_svr_out')
LASSO_OUT_TABLE = tn('pyodps_test_lasso_out')
RIDGE_OUT_TABLE = tn('pyodps_test_ridge_out')

MODEL_NAME = tn('pyodps_test_out_model')


class TestMLRegression(MLTestBase):
    """Regression test case; its set-up works on ``IONOSPHERE_TABLE``."""

    def setUp(self):
        """Run the base set-up, then call ``create_ionosphere`` for ``IONOSPHERE_TABLE``."""
        super(TestMLRegression, self).setUp()
        self.create_ionosphere(IONOSPHERE_TABLE)
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import textwrap

from odps import options
from odps.df import DataFrame
from odps.ml.tests.base import MLTestBase, tn

IRIS_TABLE = tn('pyodps_test_ml_iris')
TEMP_TABLE_1_NAME = tn('pyodps_test_mixin_test_table1')
TEMP_TABLE_2_NAME = tn('pyodps_test_mixin_test_table2')


def _df_roles(df):
    return dict(
        (f.name, ','.join(r.name for r in f.role)) for f in df._ml_fields)


def _df_continuity(df):
    return dict((f.name, f.continuity.name) for f in df._ml_fields)


def _df_key_value(df):
    return dict((f.name, repr(f.kv_config) if f.kv_config else '')
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import textwrap

from odps import options
from odps.df import DataFrame
from odps.runner import adapter_from_df
from odps.ml.adapter import merge_data
from odps.ml.tests.base import MLTestBase, tn

IRIS_TABLE = tn("pyodps_test_ml_iris")
TEMP_TABLE_1_NAME = tn("pyodps_test_mixin_test_table1")
TEMP_TABLE_2_NAME = tn("pyodps_test_mixin_test_table2")


def _df_roles(df):
    """Map each field name of ``df``'s adapter to its role names joined by commas.

    Fields are read from ``adapter_from_df(df).fields``.
    """
    roles_by_field = {}
    for field in adapter_from_df(df).fields:
        roles_by_field[field.name] = ",".join(role.name for role in field.role)
    return roles_by_field


def _df_continuity(df):
    """Map each field name of ``df``'s adapter to the name of its ``continuity`` value."""
    adapter_fields = adapter_from_df(df).fields
    return {field.name: field.continuity.name for field in adapter_fields}


def _df_key_value(df):
    """Map each field name of ``df``'s adapter to the ``repr`` of its ``kv_config``.

    Fields whose ``kv_config`` is falsy map to an empty string.
    """
    kv_by_field = {}
    for field in adapter_from_df(df).fields:
        if field.kv_config:
            kv_by_field[field.name] = repr(field.kv_config)
        else:
            kv_by_field[field.name] = ""
    return kv_by_field
Beispiel #15
0
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json

from odps import utils
from odps.ml import utils as ml_utils
from odps.ml.tests.base import MLTestBase, tn

TEST_LR_MODEL_NAME = tn('pyodps_test_lr_model')
TEST_TABLE_MODEL_NAME = tn('pyodps_table_model')
TEST_TEMP_TABLE_MODEL_NAME = tn(utils.TEMP_TABLE_PREFIX + 'table_model')
IONOSPHERE_TABLE = tn('pyodps_test_ml_ionosphere')


class Test(MLTestBase):
    def testNonTemp(self):
        # Payload that is JSON-encoded into the table comment below.
        model_comment = dict(key='value')

        # Recreate the 'st1' model table from scratch: drop any leftover table,
        # then create it with the escaped JSON comment and a lifecycle of 1.
        model_table_name1 = ml_utils.build_model_table_name(TEST_TABLE_MODEL_NAME, 'st1')
        self.odps.execute_sql('drop table if exists {0}'.format(model_table_name1))
        self.odps.execute_sql('create table if not exists {0} (col1 string) comment \'{1}\' lifecycle 1'.format(
            model_table_name1, utils.escape_odps_string(json.dumps(model_comment))
        ))
        # Name of the second ('st2') table belonging to the same model.
        model_table_name2 = ml_utils.build_model_table_name(TEST_TABLE_MODEL_NAME, 'st2')
Beispiel #16
0
# You may obtain a copy of the License at
# 
#      http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from odps import options, DataFrame
from odps.ml.feature import *
from odps.ml.expr import PmmlModel
from odps.ml.tests.base import MLTestBase, tn

TEST_LR_MODEL_NAME = tn('pyodps_test_lr_model')
IONOSPHERE_TABLE = tn('pyodps_test_ml_ionosphere')
SELECT_FEATURE_OUTPUT_TABLE = tn('pyodps_test_ml_select_feature_output')


class Test(MLTestBase):
    def setUp(self):
        """Prepare ``self.df`` and ``self.model`` and set ``options.ml.dry_run``.

        ``self.df`` wraps the ionosphere table with ``class`` as label field;
        ``self.model`` is a ``PmmlModel`` whose source data is the offline
        model named ``TEST_LR_MODEL_NAME``.
        """
        super(Test, self).setUp()
        self.create_test_pmml_model(TEST_LR_MODEL_NAME)
        self.create_ionosphere(IONOSPHERE_TABLE)

        ionosphere = self.odps.get_table(IONOSPHERE_TABLE)
        self.df = DataFrame(ionosphere).label_field('class')

        offline_model = self.odps.get_offline_model(TEST_LR_MODEL_NAME)
        self.model = PmmlModel(_source_data=offline_model)

        options.ml.dry_run = True

    def test_rf_importance(self):
        rf_importance(self.df, self.model, core_num=1, core_mem=1024, _cases=self.gen_check_params_case({
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

from __future__ import print_function

from odps.df import DataFrame
from odps.config import options
from odps.ml.utils import TEMP_TABLE_PREFIX
from odps.ml.text import *
from odps.ml.tests.base import MLTestBase, tn

CORPUS_TABLE = tn('pyodps_test_ml_corpus')
WORD_TRIPLE_TABLE = tn('pyodps_test_ml_word_triple')
SPLITED_TABLE = tn('pyodps_test_ml_splited_text')
NOISE_TABLE = tn('pyodps_test_ml_noises')
W2V_TABLE = tn('pyodps_test_ml_w2v')
TFIDF_TABLE = tn('pyodps_test_ml_tf_idf')
LDA_TABLE = tn('pyodps_test_ml_plda')
STR_COMP_TABLE = tn('pyodps_test_ml_str_comp')
COMP_RESULT_TABLE = tn('pyodps_test_ml_str_comp_result')
TOP_N_TABLE = tn('pyodps_test_ml_top_n_result')
FILTERED_WORDS_TABLE = tn('pyodps_test_ml_filtered_words_result')
KW_EXTRACTED_TABLE = tn('pyodps_test_ml_kw_extracted_result')
TEXT_SUMMARIZED_TABLE = tn('pyodps_test_ml_text_summarized_result')
COUNT_NGRAM_TABLE = tn('pyodps_test_ml_count_ngram_result')

# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

from odps.df import DataFrame
from odps.config import options
from odps.ml.clustering import *
from odps.ml.metrics import *
from odps.ml.tests.base import MLTestBase, tn, ci_skip_case

import logging
logger = logging.getLogger(__name__)

IONOSPHERE_TABLE = tn('pyodps_test_ml_ionosphere')
IONOSPHERE_CLUSTER_LABEL_TABLE = tn('pyodps_test_ml_iono_cluster_label')
IONOSPHERE_CLUSTER_MODEL = tn('pyodps_test_ml_kmeans_model')


class TestMLClustering(MLTestBase):
    """Clustering test case; its set-up works on ``IONOSPHERE_TABLE``."""

    def setUp(self):
        """Run the base set-up, then call ``create_ionosphere`` for ``IONOSPHERE_TABLE``."""
        super(TestMLClustering, self).setUp()
        self.create_ionosphere(IONOSPHERE_TABLE)

    @ci_skip_case
    def test_kmeans(self):
        self.delete_table(IONOSPHERE_CLUSTER_LABEL_TABLE)
        self.delete_offline_model(IONOSPHERE_CLUSTER_MODEL)
        df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
        labeled, model = KMeans(center_count=3).transform(
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

from odps.df import DataFrame
from odps.config import options
from odps.ml.utils import TEMP_TABLE_PREFIX
from odps.ml.text import *
from odps.ml.tests.base import MLTestBase, tn

CORPUS_TABLE = tn('pyodps_test_ml_corpus')
WORD_TRIPLE_TABLE = tn('pyodps_test_ml_word_triple')
SPLITED_TABLE = tn('pyodps_test_ml_splited_text')
NOISE_TABLE = tn('pyodps_test_ml_noises')
W2V_TABLE = tn('pyodps_test_ml_w2v')
TFIDF_TABLE = tn('pyodps_test_ml_tf_idf')
LDA_TABLE = tn('pyodps_test_ml_plda')
STR_COMP_TABLE = tn('pyodps_test_ml_str_comp')
COMP_RESULT_TABLE = tn('pyodps_test_ml_str_comp_result')
TOP_N_TABLE = tn('pyodps_test_ml_top_n_result')
FILTERED_WORDS_TABLE = tn('pyodps_test_ml_filtered_words_result')
KW_EXTRACTED_TABLE = tn('pyodps_test_ml_kw_extracted_result')
TEXT_SUMMARIZED_TABLE = tn('pyodps_test_ml_text_summarized_result')
COUNT_NGRAM_TABLE = tn('pyodps_test_ml_count_ngram_result')
DOC2VEC_DOC_TABLE = tn('pyodps_test_ml_doc2vec_doc_result')
SEMANTIC_DIST_TABLE = tn('pyodps_test_ml_semantic_dist_result')
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

from odps.df import DataFrame
from odps.config import options
from odps.ml.utils import TEMP_TABLE_PREFIX
from odps.ml.statistics import *
from odps.ml.tests.base import MLTestBase, tn

IONOSPHERE_TABLE = tn('pyodps_test_ml_ionosphere')
IRIS_TABLE = tn('pyodps_test_ml_iris')
IONOSPHERE_PRINCOMP_TABLE = tn('pyodps_test_ml_iono_princomp')
IONOSPHERE_FEATURE_STATS = tn('pyodps_test_ml_iono_feature_stats')
IONOSPHERE_REPLACE_WOE = tn('pyodps_test_ml_iono_replace_woe')
IONOSPHERE_QUANTILE_TABLE = tn('pyodps_test_ml_iono_quantile')


class TestStatistics(MLTestBase):
    """Statistics test case; its set-up works on ``IONOSPHERE_TABLE``."""

    def setUp(self):
        """Run the base set-up, then call ``create_ionosphere`` for ``IONOSPHERE_TABLE``."""
        super(TestStatistics, self).setUp()
        self.create_ionosphere(IONOSPHERE_TABLE)

    def test_histograms(self):
        options.ml.dry_run = True
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

from __future__ import print_function

from odps.config import options
from odps.df import DataFrame
from odps.ml.recommend import *
from odps.ml.tests.base import MLTestBase, tn

USER_ITEM_TABLE = tn("pyodps_test_ml_user_item_table")
USER_ITEM_PAYLOAD_TABLE = tn("pyodps_test_ml_user_item_payload_table")
ASSOC_RESULT_TABLE = tn("pyodps_test_ml_assoc_result")
ETREC_RESULT_TABLE = tn("pyodps_test_ml_etrec_result")
ALSCF_RESULT_TABLE = tn("pyodps_test_ml_als_cf_result")
ALSCF_RECOMMEND_TABLE = tn("pyodps_test_ml_als_cf_rec")
SVDCF_RESULT_TABLE = tn("pyodps_test_ml_svd_cf_result")
SVDCF_RECOMMEND_TABLE = tn("pyodps_test_ml_svd_cf_rec")


class TestRecommend(MLTestBase):
    """Recommendation test case; every test starts with runner dry-run enabled."""

    def setUp(self):
        """Run the base set-up, then set ``options.runner.dry_run`` to ``True``."""
        super(TestRecommend, self).setUp()
        options.runner.dry_run = True

    def test_etrec(self):
# under the License.

from __future__ import print_function

import logging

from odps.config import options
from odps.df import DataFrame
from odps.ml.utils import TEMP_TABLE_PREFIX
from odps.ml.classifiers import *
from odps.ml.preprocess import *
from odps.ml.tests.base import MLTestBase, tn, ci_skip_case

logger = logging.getLogger(__name__)

IONOSPHERE_TABLE_ONE_PART = tn(TEMP_TABLE_PREFIX + 'ionosphere_one_part')
IONOSPHERE_TABLE_TWO_PARTS = tn(TEMP_TABLE_PREFIX + 'ionosphere_two_parts')
IONOSPHERE_NORMALIZED_TABLE = tn(TEMP_TABLE_PREFIX + 'iono_normalized_part')
TEST_OUTPUT_TABLE_NAME = tn(TEMP_TABLE_PREFIX + 'out_parted')

MODEL_NAME = tn('pyodps_test_out_model')


class TestPartitions(MLTestBase):
    """Test case for partitioned inputs (see the ``*_ONE_PART`` / ``*_TWO_PARTS`` table names)."""

    def setUp(self):
        """Delegate to the base set-up; no extra fixtures are created here."""
        super(TestPartitions, self).setUp()

    def tearDown(self):
        """Delegate to the base tear-down."""
        super(TestPartitions, self).tearDown()

    def test_logistic_one_part_input(self):
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import json

from odps import utils
from odps.ml import utils as ml_utils
from odps.ml.models import TablesModel, PmmlModel, list_tables_model
from odps.ml.tests.base import MLTestBase, tn

TEST_LR_MODEL_NAME = tn("pyodps_test_lr_model")
TEST_TABLE_MODEL_NAME = tn("pyodps_table_model")
IONOSPHERE_TABLE = tn("pyodps_test_ml_ionosphere")


class TestBaseModel(MLTestBase):
    def test_odps_model(self):
        """Wrap the test offline model in ``PmmlModel`` and check its bind node.

        The bind node is expected to have code name ``pmml_input`` and a
        ``modelName`` parameter equal to ``TEST_LR_MODEL_NAME``.
        """
        self.create_test_pmml_model(TEST_LR_MODEL_NAME)
        offline_model = self.odps.get_offline_model(TEST_LR_MODEL_NAME)
        model = PmmlModel(offline_model)

        self.assertEqual(model._bind_node.code_name, "pmml_input")
        self.assertEqual(
            model._bind_node.parameters["modelName"],
            TEST_LR_MODEL_NAME,
        )

    def test_tables_model(self):
        model_comment = dict(className="odps.ml.models.TablesModel", key="value")

        model_table_name1 = "".join(
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

from collections import namedtuple

from odps.df import DataFrame
from odps.ml.text import *
from odps.ml.classifiers import *
from odps.ml.pipeline import Pipeline, FeatureUnion
from odps.ml.pipeline.core import PipelineStep
from odps.ml.tests.base import MLTestBase, tn, ci_skip_case

CORPUS_TABLE = tn('pyodps_test_ml_corpus')
W2V_TABLE = tn('pyodps_test_ml_w2v')
TFIDF_TABLE = tn('pyodps_test_ml_tf_idf')
LDA_TABLE = tn('pyodps_test_ml_plda')

IONOSPHERE_TABLE = tn('pyodps_test_ml_ionosphere')
IONOSPHERE_LR_MODEL = tn('pyodps_test_out_model')


class MockTransformStep(PipelineStep):
    def __init__(self,
                 test_cls,
                 step_name,
                 action=None,
                 params=None,
                 outputs=None):
# limitations under the License.

from __future__ import print_function

import logging

from odps.df import DataFrame
from odps.config import options
from odps.ml.utils import TEMP_TABLE_PREFIX
from odps.ml.classifiers import *
from odps.ml.metrics.classification import roc_curve
from odps.ml.tests.base import MLTestBase, tn, ci_skip_case

logger = logging.getLogger(__name__)

IRIS_KV_TABLE = tn('pyodps_test_ml_iris_sparse')

LR_TEST_TABLE = tn('pyodps_lr_output_table')
XGBOOST_TEST_TABLE = tn('pyodps_xgboost_output_table')

MODEL_NAME = tn('pyodps_test_out_model')


class TestSparseClassifiers(MLTestBase):
    """Classifier test case working on the key-value iris table."""

    def setUp(self):
        """Run the base set-up and expose the iris KV data as ``self.df``.

        ``self.df`` has ``category`` as label field and ``content`` marked
        through ``key_value``.
        """
        super(TestSparseClassifiers, self).setUp()
        self.create_iris_kv(IRIS_KV_TABLE)
        iris_kv = DataFrame(self.odps.get_table(IRIS_KV_TABLE))
        labelled = iris_kv.label_field('category')
        self.df = labelled.key_value('content')

    def tearDown(self):
        """Delegate tear-down to the base test case."""
        super(TestSparseClassifiers, self).tearDown()
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import logging

from odps.config import options
from odps.df import DataFrame
from odps.ml.utils import TEMP_TABLE_PREFIX
from odps.ml.classifiers import *
from odps.ml.tests.base import MLTestBase, tn

logger = logging.getLogger(__name__)

IONOSPHERE_TABLE_ONE_PART = tn(TEMP_TABLE_PREFIX + 'ionosphere_one_part')
IONOSPHERE_TABLE_TWO_PARTS = tn(TEMP_TABLE_PREFIX + 'ionosphere_two_parts')
TEST_OUTPUT_TABLE_NAME = tn(TEMP_TABLE_PREFIX + 'out_parted')

MODEL_NAME = tn('pyodps_test_out_model')


class TestPartitions(MLTestBase):
    """Test case for partitioned inputs (see the ``*_ONE_PART`` / ``*_TWO_PARTS`` table names)."""

    def setUp(self):
        """Delegate to the base set-up; no extra fixtures are created here."""
        super(TestPartitions, self).setUp()

    def tearDown(self):
        """Delegate to the base tear-down."""
        super(TestPartitions, self).tearDown()

    def test_logistic_one_part_input(self):
        options.ml.dry_run = True
from __future__ import print_function

import logging

from odps.config import options
from odps.df import DataFrame
from odps.ml.classifiers import *
from odps.ml.feature import *
from odps.ml.utils import TEMP_TABLE_PREFIX
from odps.ml.metrics import roc_curve, roc_auc_score, confusion_matrix
from odps.ml.tests.base import MLTestBase, tn, ci_skip_case

logger = logging.getLogger(__name__)

IONOSPHERE_TABLE = tn('pyodps_test_ml_ionosphere')

LR_TEST_TABLE = tn('pyodps_lr_output_table')
XGBOOST_TEST_TABLE = tn('pyodps_xgboost_output_table')
RANDOM_FORESTS_TEST_TABLE = tn('pyodps_random_forests_output_table')
GBDT_LR_TEST_TABLE = tn('pyodps_gbdt_lr_output_table')
LINEAR_SVM_TEST_TABLE = tn('pyodps_linear_svm_output_table')
NAIVE_BAYES_TEST_TABLE = tn('pyodps_naive_bayes_output_table')
KNN_TEST_TABLE = tn('pyodps_knn_output_table')

MODEL_NAME = tn('pyodps_test_out_model')


class Test(MLTestBase):
    def setUp(self):
        super(Test, self).setUp()
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import functools

from odps.df import DataFrame
from odps.df.expr.expressions import CollectionExpr
from odps.ml.tests.base import MLTestBase, tn
from odps.runner import DFAdapter, adapter_from_df, PartitionSelection
from odps.ml.adapter.op import *
from odps.ml.utils import KVConfig

TEMP_TABLE_1_NAME = tn('pyodps_test_ops_test_table1')
TEMP_TABLE_2_NAME = tn('pyodps_test_ops_test_table2')


class TestOp(MLTestBase):
    def test_base_methods(self):
        # Five string fields f00..f04, each created with the FEATURE role.
        fields = [MLField('f%02d' % fid, 'string', FieldRole.FEATURE) for fid in range(5)]
        # Giving WEIGHT to f00 is expected to leave it with FEATURE and WEIGHT.
        fields_set_singleton = list(DFAdapterOperation._set_singleton_role(fields, {'f00': FieldRole.WEIGHT}))
        self.assertSetEqual(fields_set_singleton[0].role, set([FieldRole.FEATURE, FieldRole.WEIGHT]))

        # Giving WEIGHT to f01 afterwards is expected to take it away from f00
        # (FEATURE only) while f01 carries FEATURE and WEIGHT.
        fields_set_singleton2 = list(DFAdapterOperation._set_singleton_role(fields_set_singleton, {'f01': FieldRole.WEIGHT}))
        self.assertSetEqual(fields_set_singleton2[0].role, set([FieldRole.FEATURE, ]))
        self.assertSetEqual(fields_set_singleton2[1].role, set([FieldRole.FEATURE, FieldRole.WEIGHT]))

        # Request a LABEL role for 'category', a name not among f00..f04.
        fields_set_singleton_expect = list(DFAdapterOperation._set_singleton_role(fields_set_singleton2,
                                                                                {'category': FieldRole.LABEL}))
Beispiel #29
0
#      http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

from odps.df import DataFrame
from odps.ml.classifiers import LogisticRegression
from odps.ml.cross_validation import cross_val_score
from odps.ml.tests.base import MLTestBase, tn, ci_skip_case

IONOSPHERE_TABLE = tn('pyodps_test_ml_ionosphere')


class TestCrossValidation(MLTestBase):
    """Test case that cross-validates a classifier on the ionosphere table."""

    def setUp(self):
        """Run the base set-up and build ``self.df`` with ``class`` in the label role."""
        super(TestCrossValidation, self).setUp()
        self.create_ionosphere(IONOSPHERE_TABLE)
        source_table = self.odps.get_table(IONOSPHERE_TABLE)
        frame = DataFrame(source_table)
        self.df = frame.roles(label='class')

    def tearDown(self):
        """Hand tear-down over to the base test case."""
        super(TestCrossValidation, self).tearDown()

    @ci_skip_case
    def test_logistic_regression(self):
        """Cross-validate a logistic regression on ``self.df`` and print the score."""
        estimator = LogisticRegression(epsilon=0.001).set_max_iter(50)
        result = cross_val_score(estimator, self.df)
        print(result)
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

from __future__ import print_function

from odps.df import DataFrame
from odps.config import options
from odps.ml.utils import TEMP_TABLE_PREFIX
from odps.ml.statistics import *
from odps.ml.tests.base import MLTestBase, tn

IONOSPHERE_TABLE = tn('pyodps_test_ml_ionosphere')
IRIS_TABLE = tn('pyodps_test_ml_iris')
IONOSPHERE_PRINCOMP_TABLE = tn('pyodps_test_ml_iono_princomp')
IONOSPHERE_FEATURE_STATS = tn('pyodps_test_ml_iono_feature_stats')
IONOSPHERE_REPLACE_WOE = tn('pyodps_test_ml_iono_replace_woe')
IONOSPHERE_QUANTILE_TABLE = tn('pyodps_test_ml_iono_quantile')


class TestStatistics(MLTestBase):
    """Statistics test case; its set-up works on ``IONOSPHERE_TABLE``."""

    def setUp(self):
        """Run the base set-up, then call ``create_ionosphere`` for ``IONOSPHERE_TABLE``."""
        super(TestStatistics, self).setUp()
        self.create_ionosphere(IONOSPHERE_TABLE)

    def test_histograms(self):
        options.runner.dry_run = True
# under the License.

from __future__ import print_function

import logging

from odps.df import DataFrame
from odps.config import options
from odps.ml.utils import TEMP_TABLE_PREFIX
from odps.ml.classifiers import *
from odps.ml.metrics.classification import roc_curve
from odps.ml.tests.base import MLTestBase, tn, ci_skip_case

logger = logging.getLogger(__name__)

IRIS_KV_TABLE = tn('pyodps_test_ml_iris_sparse')

LR_TEST_TABLE = tn('pyodps_lr_output_table')
XGBOOST_TEST_TABLE = tn('pyodps_xgboost_output_table')

MODEL_NAME = tn('pyodps_test_out_model')


class TestSparseClassifiers(MLTestBase):
    """Test case for classifiers fed from the key-value iris table."""

    def setUp(self):
        """Run the base set-up and build ``self.df`` from the iris KV table.

        The frame gets ``category`` as label field and ``content`` marked
        through ``key_value``.
        """
        super(TestSparseClassifiers, self).setUp()
        self.create_iris_kv(IRIS_KV_TABLE)
        source_table = self.odps.get_table(IRIS_KV_TABLE)
        with_label = DataFrame(source_table).label_field('category')
        self.df = with_label.key_value('content')

    def tearDown(self):
        """Hand tear-down over to the base test case."""
        super(TestSparseClassifiers, self).tearDown()
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

from __future__ import print_function

from odps.df import DataFrame
from odps.config import options
from odps.ml.clustering import *
from odps.ml.metrics import *
from odps.ml.tests.base import MLTestBase, tn, ci_skip_case

import logging
logger = logging.getLogger(__name__)

IONOSPHERE_TABLE = tn('pyodps_test_ml_ionosphere')
IONOSPHERE_CLUSTER_LABEL_TABLE = tn('pyodps_test_ml_iono_cluster_label')
IONOSPHERE_CLUSTER_MODEL = tn('pyodps_test_ml_kmeans_model')


class TestMLClustering(MLTestBase):
    """Clustering test case; its set-up works on ``IONOSPHERE_TABLE``."""

    def setUp(self):
        """Run the base set-up, then call ``create_ionosphere`` for ``IONOSPHERE_TABLE``."""
        super(TestMLClustering, self).setUp()
        self.create_ionosphere(IONOSPHERE_TABLE)

    @ci_skip_case
    def test_kmeans(self):
        # Call the delete helpers for the label table and the cluster model
        # before running (presumably to clear leftovers of earlier runs).
        self.delete_table(IONOSPHERE_CLUSTER_LABEL_TABLE)
        self.delete_offline_model(IONOSPHERE_CLUSTER_MODEL)
        df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
        # Run KMeans with 3 centers on the data with the 'class' field excluded;
        # transform is unpacked into a labeled frame and a model.
        labeled, model = KMeans(center_count=3).transform(df.exclude_fields('class'))
Beispiel #33
0
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import functools

from odps.df.expr.expressions import CollectionExpr
from odps.df.expr.tests.core import MockTable
from odps.df.types import validate_data_type
from odps.ml.expr.op import *
from odps.ml.tests.base import MLTestBase, tn
from odps.ml.utils import KVConfig
from odps.models.table import TableSchema as Schema

TEMP_TABLE_1_NAME = tn('pyodps_test_ops_test_table1')
TEMP_TABLE_2_NAME = tn('pyodps_test_ops_test_table2')

def datatypes(*types):
    """Return the validated data type for every given type spec.

    Each positional argument is passed through ``validate_data_type``; the
    results come back as a list in argument order. Calling it without
    arguments yields an empty list.
    """
    return [validate_data_type(t) for t in types]


class TestOp(MLTestBase):
    def testBaseMethods(self):
        # Five string fields f00..f04, each created with the FEATURE role.
        fields = [
            MLField('f%02d' % fid, 'string', FieldRole.FEATURE)
            for fid in range(5)
        ]
        # Giving WEIGHT to f00 is expected to leave it with FEATURE and WEIGHT.
        fields_set_singleton = list(
            DFOperation._set_singleton_role(fields, {'f00': FieldRole.WEIGHT}))
        self.assertSetEqual(fields_set_singleton[0].role,
                            set([FieldRole.FEATURE, FieldRole.WEIGHT]))
Beispiel #34
0
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

from odps.df import DataFrame
from odps.ml import merge_data
from odps.ml.preprocess import *
from odps.ml.tests.base import MLTestBase, tn, ci_skip_case

IONOSPHERE_TABLE = tn('pyodps_test_ml_ionosphere')
IONOSPHERE_RANDOM_SAMPLE_TABLE = tn('pyodps_test_ml_iono_rand_sample')
IONOSPHERE_WEIGHTED_SAMPLE_TABLE = tn('pyodps_test_ml_iono_weight_sample')
IONOSPHERE_APPEND_ID_TABLE = tn('pyodps_test_ml_iono_append_id')
IONOSPHERE_MERGED_TABLE = tn('pyodps_test_ml_iono_merged')
IONOSPHERE_PRINCOMP_TABLE = tn('pyodps_test_ml_iono_princomp')
IONOSPHERE_ABNORMAL_TABLE = tn('pyodps_test_ml_iono_abnormal')
USER_ITEM_TABLE = tn('pyodps_test_ml_user_item')
USER_ITEM_UNPIVOT_TABLE = tn('pyodps_test_ml_unpivot_user_item')


class TestPreprocess(MLTestBase):
    """Preprocessing test case; its set-up works on ``IONOSPHERE_TABLE``."""

    def setUp(self):
        """Run the base set-up, then call ``create_ionosphere`` for ``IONOSPHERE_TABLE``."""
        super(TestPreprocess, self).setUp()
        self.create_ionosphere(IONOSPHERE_TABLE)
# specific language governing permissions and limitations
# under the License.

from __future__ import print_function

import logging

from odps.df import DataFrame
from odps.config import options
from odps.ml.utils import TEMP_TABLE_PREFIX
from odps.ml.network import *
from odps.ml.tests.base import MLTestBase, tn

logger = logging.getLogger(__name__)

WEIGHTED_GRAPH_EDGE_TABLE = tn('pyodps_test_ml_weighted_graph_edge')
WEIGHTED_GRAPH_VERTEX_TABLE = tn('pyodps_test_ml_weighted_graph_node')
TREE_GRAPH_EDGE_TABLE = tn('pyodps_test_ml_tree_graph_edge')

NODE_DENSITY_TABLE = tn('pyodps_test_ml_node_density')
EDGE_DENSITY_TABLE = tn('pyodps_test_ml_edge_density')
MAXIMAL_CONNECTED_TABLE = tn('pyodps_test_ml_maximal_connected')
TRIANGLE_COUNT_TABLE = tn('pyodps_test_ml_triangle_count')
PAGE_RANK_TABLE = tn('pyodps_test_ml_page_rank')
LABEL_PROPAGATION_TABLE = tn('pyodps_test_ml_label_prop')
K_CORE_TABLE = tn('pyodps_test_ml__k_core')
SSSP_TABLE = tn('pyodps_test_ml_sssp')
TREE_DEPTH_TABLE = tn('pyodps_test_ml_tree_depth')


class Test(MLTestBase):
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import logging

from odps.df import DataFrame
from odps.config import options
from odps.ml.utils import TEMP_TABLE_PREFIX
from odps.ml.network import *
from odps.ml.tests.base import MLTestBase, tn

logger = logging.getLogger(__name__)

WEIGHTED_GRAPH_EDGE_TABLE = tn('pyodps_test_ml_weighted_graph_edge')
WEIGHTED_GRAPH_VERTEX_TABLE = tn('pyodps_test_ml_weighted_graph_node')
TREE_GRAPH_EDGE_TABLE = tn('pyodps_test_ml_tree_graph_edge')

NODE_DENSITY_TABLE = tn('pyodps_test_ml_node_density')
EDGE_DENSITY_TABLE = tn('pyodps_test_ml_edge_density')
MAXIMAL_CONNECTED_TABLE = tn('pyodps_test_ml_maximal_connected')
TRIANGLE_COUNT_TABLE = tn('pyodps_test_ml_triangle_count')
PAGE_RANK_TABLE = tn('pyodps_test_ml_page_rank')
LABEL_PROPAGATION_TABLE = tn('pyodps_test_ml_label_prop')
K_CORE_TABLE = tn('pyodps_test_ml__k_core')
SSSP_TABLE = tn('pyodps_test_ml_sssp')
TREE_DEPTH_TABLE = tn('pyodps_test_ml_tree_depth')


class Test(MLTestBase):
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

from odps import options, DataFrame
from odps.ml import PmmlModel
from odps.ml.feature import *
from odps.ml.tests.base import MLTestBase, tn

TEST_LR_MODEL_NAME = tn('pyodps_test_lr_model')
IONOSPHERE_TABLE = tn('pyodps_test_ml_ionosphere')
SELECT_FEATURE_OUTPUT_TABLE = tn('pyodps_test_ml_select_feature_output')


class Test(MLTestBase):
    def setUp(self):
        """Prepare ``self.df`` and ``self.model`` and set ``options.runner.dry_run``.

        ``self.df`` wraps the ionosphere table with ``class`` as label field;
        ``self.model`` is a ``PmmlModel`` built from the offline model named
        ``TEST_LR_MODEL_NAME``.
        """
        super(Test, self).setUp()
        self.create_test_pmml_model(TEST_LR_MODEL_NAME)
        self.create_ionosphere(IONOSPHERE_TABLE)

        ionosphere = self.odps.get_table(IONOSPHERE_TABLE)
        self.df = DataFrame(ionosphere).label_field('class')

        offline_model = self.odps.get_offline_model(TEST_LR_MODEL_NAME)
        self.model = PmmlModel(offline_model)

        options.runner.dry_run = True

    def test_rf_importance(self):
        rf_importance(self.df, self.model, core_num=1, core_mem=1024, _cases=self.gen_check_params_case({
Beispiel #38
0
from __future__ import print_function

import logging
import sys

from odps.df import DataFrame
from odps.config import options
from odps.ml.utils import TEMP_TABLE_PREFIX
from odps.ml.algolib import *
from odps.ml.algolib.loader import load_classifiers
from odps.ml.tests.base import MLTestBase, tn

logger = logging.getLogger(__name__)

IONOSPHERE_TABLE = tn('pyodps_test_ml_ionosphere')

MODEL_NAME = tn('pyodps_test_out_model')


class TestAlgoBuild(MLTestBase):
    """Test case for algorithm definitions; its set-up registers an algorithm."""

    def setUp(self):
        """Run the base set-up, call ``create_ionosphere``, then ``register_algorithm``."""
        super(TestAlgoBuild, self).setUp()
        self.create_ionosphere(IONOSPHERE_TABLE)
        self.register_algorithm()

    def tearDown(self):
        """Delegate to the base tear-down."""
        super(TestAlgoBuild, self).tearDown()

    def register_algorithm(self):
        algo_def = XflowAlgorithmDef('MyNaiveBayes',