def setup(self):
    """Build one ODPS-backed and one pandas-backed DataFrame plus both engines.

    Used by the mixed-engine tests that follow; the ODPS table is recreated
    from scratch on every run.
    """
    import pandas as pd

    odps_data = [
        ['name1', 1],
        ['name2', 2],
        ['name1', 3],
    ]

    pd_data = [
        ['name1', 5],
        ['name2', 6]
    ]

    names = ['name', 'id']
    types = ['string', 'bigint']

    table = tn('pyodps_df_mixed')
    # Drop any leftover table so stale rows cannot leak into assertions.
    self.odps.delete_table(table, if_exists=True)
    self.t = self.odps.create_table(table, Schema.from_lists(names, types))
    with self.t.open_writer() as w:
        w.write([self.t.new_record(r) for r in odps_data])

    self.odps_df = DataFrame(self.t)
    self.pd_df = DataFrame(pd.DataFrame(pd_data, columns=names))

    self.engine = MixedEngine(self.odps)
    self.pd_engine = PandasEngine(self.odps)
def testJoin(self):
    """Joining an ODPS frame with a pandas frame must match a pure-pandas join."""
    joined = self.odps_df.join(self.pd_df, 'name')
    result = self.engine.execute(joined.sort('id_x')).values

    # Rebuild the identical pipeline fully in pandas as the reference.
    local_df = DataFrame(self.odps_df.to_pandas())
    reference = local_df.join(self.pd_df, 'name').sort('id_x')
    expected = self.pd_engine.execute(reference).values

    self.assertTrue(result.equals(expected))
def testCacheTable(self):
    """Verify .cache() materializes intermediate results and reuses them."""
    df = self.odps_df.join(self.pd_df, 'name').cache()
    df2 = df.sort('id_x')

    # The compiled DAG should contain three nodes for join -> cache -> sort.
    dag = self.engine._compile_dag(df2)
    self.assertEqual(len(dag.nodes()), 3)

    result = self.engine.execute(df2).values

    df3 = DataFrame(self.odps_df.to_pandas())
    expected = self.pd_engine.execute(df3.join(self.pd_df, 'name').sort('id_x')).values
    self.assertTrue(result.equals(expected))
    # Two intermediate tables should have been generated by the engine.
    self.assertEqual(len(self.engine._generated_table_names), 2)

    table = df._cache_data
    self.assertEqual(len(df.execute()), len(expected))
    # Re-executing the cached frame must reuse the same cached data object.
    self.assertIs(df._cache_data, table)

    df4 = df[df.id_x < 3].count()
    result = self.engine.execute(df4)
    self.assertEqual(result, 2)
    # Scalar results are cached as the plain value.
    self.assertEqual(df4._cache_data, 2)
def testUnion(self):
    """Union of ODPS and pandas frames must equal the pure-pandas union."""
    combined = self.odps_df.union(self.pd_df)
    result = self.engine.execute(combined.sort(['id', 'name'])).values

    # Reference computation: same union executed entirely in pandas.
    local_df = DataFrame(self.odps_df.to_pandas())
    reference = local_df.union(self.pd_df).sort(['id', 'name'])
    expected = self.pd_engine.execute(reference).values

    self.assertTrue(result.equals(expected))
def testPandasGroupbyFilter(self):
    """Filtering a grouped pandas-backed frame works, including via cached results."""
    import pandas as pd

    data = [
        [2001, 1],
        [2002, 2],
        [2003, 3]
    ]
    df = DataFrame(pd.DataFrame(data, columns=['id', 'fid']))

    df2 = df.groupby('id').agg(df.fid.sum())
    df3 = df2[df2.id == 2003]

    expected = [
        [2003, 3]
    ]

    self.assertEqual(df3.execute().values.values.tolist(), expected)

    # Execute the aggregation first so its result is cached, then filter it.
    df2 = df.groupby('id').agg(df.fid.sum())
    df2.execute()
    self.assertIsNotNone(df2._cache_data)

    df3 = df2[df2.id == 2003]
    # Executed twice on purpose: the second run should hit the cache.
    self.assertEqual(df3.execute().values.values.tolist(), expected)
    self.assertEqual(df3.execute().values.values.tolist(), expected)

    # Scalar aggregations should also be stable across repeated execution.
    df4 = df.fid.sum()
    self.assertEqual(df4.execute(), 6)
    self.assertEqual(df4.execute(), 6)
def test_normalize(self):
    """Normalize a partition-filtered ionosphere frame and persist the output."""
    self.delete_table(IONOSPHERE_NORMALIZED_TABLE)
    self.delete_table(IONOSPHERE_TABLE_ONE_PART)
    self.create_ionosphere_one_part(IONOSPHERE_TABLE_ONE_PART)
    # Comma-separated predicate selects rows from both part=0 and part=1.
    df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE_ONE_PART)).filter_partition('part=0, part=1')
    # 'class' is the label column, so it is excluded from normalization.
    normalize(df.exclude_fields('class')).persist(IONOSPHERE_NORMALIZED_TABLE)
def test_df_store(self):
    """Persist a grouped, sorted, partition-filtered DataFrame into a table."""
    self.delete_table(IONOSPHERE_SORTED_TABLE_PART)
    self.create_ionosphere_two_parts(IONOSPHERE_TABLE_TWO_PARTS)
    df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE_TWO_PARTS)).filter_partition('part1=1,part2=2')
    # `async` became a reserved keyword in Python 3.7, making the original
    # `async=False` a SyntaxError; pyodps renamed the parameter to `async_`.
    drop_table(self.odps, IONOSPHERE_SORTED_TABLE_PART, async_=False)
    # Count rows per class, descending, then write the result out.
    sorted_df = df.groupby(df['class']).agg(df.a01.count().rename('count')).sort('class', ascending=False)
    sorted_df.persist(IONOSPHERE_SORTED_TABLE_PART)
def test_batch_persist(self):
    """batch_persist must run each frame's upstream action before its downstream one."""
    options.runner.dry_run = False

    call_seq = []
    dfs = []
    tables = []

    for idx in range(3):
        write_str = "F%d" % idx

        # Bind the tag eagerly; a bare lambda over the loop variable would
        # late-bind and record the final iteration's tag for every frame.
        def gen_fun(wobj):
            return lambda _: call_seq.append(wobj)

        f = gen_fun((write_str, "U"))
        df_upper = self.mock_action(self.df, action=f)
        f = gen_fun((write_str, "D"))
        df_lower = self.mock_action(df_upper, action=f)

        dfs.append(df_lower)
        tables.append("TN" + str(idx))

    DataFrame.batch_persist(dfs, tables)

    # Per frame, the upstream ("U") action must precede the downstream ("D") one.
    for idx in range(3):
        write_str = "F%d" % idx
        self.assertListEqual([p[1] for p in call_seq if p[0] == write_str], list("UD"))

    # All three frames must have fired at each stage.
    # (Loop variable renamed from `dir`, which shadowed the builtin.)
    for direction in "UD":
        self.assertListEqual(sorted(p[0] for p in call_seq if p[1] == direction),
                             ["F0", "F1", "F2"])
def testCachePersist(self):
    """Persisting an expression built from cached sub-expressions into a partition."""
    expr = self.odps_df

    data2 = [["name1", 3.2], ["name3", 2.4]]

    table_name = tn("pyodps_test_mixed_engine_cp_table2")
    self.odps.delete_table(table_name, if_exists=True)
    table2 = self.odps.create_table(
        name=table_name,
        schema=Schema.from_lists(["name", "fid"], ["string", "double"])
    )
    expr2 = DataFrame(table2)
    self.odps.write_table(table2, 0, data2)

    # Identity row-wise UDF; forces an apply step before the cache.
    @output(expr.schema.names, expr.schema.types)
    def h(row):
        yield row

    l = expr.filter(expr.id > 0).apply(h, axis=1).cache()
    r = expr2.filter(expr2.fid > 0)
    # Join condition mixes an equi-key with a predicate on the right side.
    joined = l.join(r, on=["name", r.fid < 4])["id", "fid"].cache()

    output_table = tn("pyodps_test_mixed_engine_cp_output_table")
    self.odps.delete_table(output_table, if_exists=True)
    schema = Schema.from_lists(["id", "fid"], ["bigint", "double"], ["ds"], ["string"])
    output_t = self.odps.create_table(output_table, schema, if_not_exists=True)

    t = joined.persist(output_table, partition="ds=today", create_partition=True)
    self.assertEqual(len(t.execute()), 2)
    output_t.drop()
def persist(self, line):
    """IPython magic: persist a user-namespace DataFrame into an ODPS table.

    Expected line format: ``<frame_name> [<project>.]<table_name>``.
    Accepts pyodps DataFrames and, when pandas is available, raw pandas
    DataFrames (which are wrapped first). Raises if the target table exists.
    """
    try:
        import pandas as pd
        has_pandas = True
    except ImportError:
        # pandas is optional; without it only pyodps DataFrames are handled.
        has_pandas = False

    self._set_odps()

    line = line.strip().strip(';')
    frame_name, table_name = line.split(None, 1)

    # Allow an explicit "project.table" target.
    if '.' in table_name:
        project_name, table_name = tuple(table_name.split('.', 1))
    else:
        project_name = None

    frame = self.shell.user_ns[frame_name]
    # Refuse to overwrite an existing table.
    if self._odps.exist_table(table_name, project=project_name):
        raise TypeError('%s already exists' % table_name)

    if isinstance(frame, DataFrame):
        frame.persist(name=table_name, project=project_name, notify=False)
    elif has_pandas and isinstance(frame, pd.DataFrame):
        # Wrap a raw pandas frame so it can be persisted the same way.
        frame = DataFrame(frame)
        frame.persist(name=table_name, project=project_name, notify=False)

    html_notify('Persist succeeded')
def test_direct_method(self):
    """End-to-end: split, train a logistic regression, predict, and fetch results."""
    self.create_ionosphere(IONOSPHERE_TABLE)
    df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE)).roles(label='class')
    train, test = df.split(0.6)
    lr = LogisticRegression(epsilon=0.01)
    model = lr.train(train)
    predicted = model.predict(test)
    # to_pandas() triggers actual execution of the whole flow.
    predicted.to_pandas()
def test_kmeans(self):
    """Cluster the ionosphere data with KMeans and evaluate the result."""
    self.delete_table(IONOSPHERE_CLUSTER_LABEL_TABLE)
    self.delete_offline_model(IONOSPHERE_CLUSTER_MODEL)
    df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
    # The label column must not participate in clustering.
    labeled, model = KMeans(center_count=3).transform(df.exclude_fields('class'))
    # delay=True defers the persist until the flow is executed.
    model.persist(IONOSPHERE_CLUSTER_MODEL, delay=True)
    pmml = model.load_pmml()
    print(pmml)
    eresult = calinhara_score(labeled, model)
    print(eresult)
def testHeadAndTail(self):
    """head()/tail() honor row counts on the raw table and after a filter."""
    frame = DataFrame(self.table)

    for count in (1, 2):
        self.assertEqual(count, len(frame.head(count)))
    self.assertEqual([3, 'name3'], list(frame.tail(1)[0]))

    filtered = frame[frame.name == 'name2'].head(1)
    self.assertEqual(1, len(filtered))
    self.assertEqual([2, 'name2'], list(filtered[0]))
def test_df_consecutive(self):
    """Chained DataFrame transforms (filter, roles, head) feed into ML training."""
    self.create_ionosphere(IONOSPHERE_TABLE)
    df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
    df = df[df['a04'] != 0]
    df = df.roles(label='class')
    # head() forces an intermediate execution before the split.
    df.head(10)
    train, test = df.split(0.6)
    lr = LogisticRegression(epsilon=0.01)
    model = lr.train(train)
    predicted = model.predict(test)
    # to_pandas() triggers execution of the full flow.
    predicted.to_pandas()
def testHeadAndTail(self):
    """head()/tail() on table and filtered frames; tail() after a filter is unsupported."""
    df = DataFrame(self.table)

    self.assertEqual(1, len(df.head(1)))
    self.assertEqual(2, len(df.head(2)))
    self.assertEqual([3, 'name3'], list(df.tail(1)[0]))

    r = df[df.name == 'name2'].head(1)
    self.assertEqual(1, len(r))
    self.assertEqual([2, 'name2'], list(r[0]))

    # tail() on a filtered collection is expected to be unimplemented here.
    self.assertRaises(NotImplementedError, lambda: df[df.name == 'name2'].tail(1))
def test_mock_kmeans(self):
    """Dry-run KMeans and verify the exact parameters sent to the backend."""
    options.runner.dry_run = True
    self.maxDiff = None

    df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
    labeled, model = KMeans(center_count=3).transform(df.exclude_fields('class'))
    # The case checks the full parameter dict the node would submit.
    labeled._add_case(self.gen_check_params_case(
        {'inputTableName': IONOSPHERE_TABLE, 'centerCount': '3', 'distanceType': 'euclidean',
         'idxTableName': IONOSPHERE_CLUSTER_LABEL_TABLE, 'initCentersMethod': 'sample',
         'modelName': 'pm_k_means_0_2', 'appendColsIndex': ','.join('%d' % i for i in range(0, 35)),
         'selectedColNames': ','.join('a%02d' % i for i in range(1, 35)), 'loop': '100', 'accuracy': '0.0'}))
    # persist triggers the (mocked) execution that runs the case check.
    labeled.persist(IONOSPHERE_CLUSTER_LABEL_TABLE)
def testPandasPersist(self):
    """Persist a pandas-backed DataFrame to ODPS and read it back unchanged."""
    import pandas as pd, numpy as np

    self.odps.to_global()

    tmp_table_name = tn('pyodps_test_mixed_persist')
    self.odps.delete_table(tmp_table_name, if_exists=True)

    pd_df = pd.DataFrame(np.arange(9).reshape(3, 3), columns=list('abc'))
    df = DataFrame(pd_df).persist(tmp_table_name)
    # Round-trip check: what comes back must equal what was written.
    self.assertPandasEqual(df.to_pandas(), pd_df)

    self.odps.delete_table(tmp_table_name)
def test_df_combined(self):
    """Mix projections, maps, cache and ML train/predict, ending in a log-loss expression."""
    self.create_ionosphere(IONOSPHERE_TABLE)
    df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
    df = df[df['a04'] != 0]
    # Project a subset of columns, doubling a05 via a mapped lambda.
    df = df['a01', df.a05.map(lambda v: v * 2).rename('a05'), 'a06', 'class']
    df = df.roles(label='class')
    df = df[df.a05 != 0].cache()
    df = df[df.a05, ((df.a06 + 1) / 2).rename('a06'), 'class']

    train, test = df.split(0.6)
    lr = LogisticRegression(epsilon=0.01)
    model = lr.train(train)
    predicted = model.predict(test)

    # Binary cross-entropy written as a DataFrame expression; execute()
    # triggers the whole pipeline.
    (- 1.0 * ((predicted['class'] * predicted.prediction_score.log().rename('t')).rename('t1') + (
        (1 - predicted['class']) * (1 - predicted.prediction_score).log().rename('t0')).rename('t2')).rename(
        't3').sum() / predicted.prediction_score.count()).rename('t4').execute()
def testMixed(self):
    """A union + join + isin pipeline must match between mixed and pandas engines."""
    def build_pipeline(base):
        # Union the base frame with its join against the pandas frame
        # (keeping name and the renamed join id), then filter by membership.
        joined = base.join(self.pd_df, "name")[lambda x: x.name, lambda x: x.id_x.rename("id")]
        merged = base.union(joined).sort(["name", "id"])
        return merged[merged["name"].isin(self.pd_df["name"])]

    result = self.engine.execute(build_pipeline(self.odps_df)).values

    local_df = DataFrame(self.odps_df.to_pandas())
    expected = self.pd_engine.execute(build_pipeline(local_df)).values

    self.assertTrue(result.equals(expected))
def testExtractKV(self):
    """extract_kv must pivot key-value strings into per-key float columns."""
    data = [
        ["name1", "k1=1,k2=3,k5=10", "1=5,3=7,2=1"],
        ["name1", "", "3=1,4=2"],
        ["name1", "k1=7.1,k7=8.2", "1=1,5=6"],
        ["name2", "k2=1.2,k3=1.5", None],
        ["name2", "k9=1.1,k2=1", "4=2"],
    ]

    table_name = tn("pyodps_test_mixed_engine_extract_kv")
    self.odps.delete_table(table_name, if_exists=True)
    table = self.odps.create_table(
        name=table_name,
        schema=Schema.from_lists(["name", "kv", "kv2"], ["string", "string", "string"])
    )
    expr = DataFrame(table)
    try:
        self.odps.write_table(table, 0, data)

        expr1 = expr.extract_kv(columns=["kv", "kv2"], kv_delim="=")
        res = self.engine.execute(expr1)
        result = self._get_result(res)

        # Output columns are prefixed with the source column name; missing
        # keys surface as None.
        expected_cols = [
            "name",
            "kv_k1", "kv_k2", "kv_k3", "kv_k5", "kv_k7", "kv_k9",
            "kv2_1", "kv2_2", "kv2_3", "kv2_4", "kv2_5",
        ]
        expected = [
            ["name1", 1.0, 3.0, None, 10.0, None, None, 5.0, 1.0, 7.0, None, None],
            ["name1", None, None, None, None, None, None, None, None, 1.0, 2.0, None],
            ["name1", 7.1, None, None, None, 8.2, None, 1.0, None, None, None, 6.0],
            ["name2", None, 1.2, 1.5, None, None, None, None, None, None, None, None],
            ["name2", None, 1.0, None, None, None, 1.1, None, None, None, 2.0, None],
        ]

        self.assertListEqual([c.name for c in res.columns], expected_cols)
        self.assertEqual(result, expected)
    finally:
        table.drop()
class TestSparseClassifiers(MLTestBase):
    """Classifier tests over sparse (key-value encoded) iris features."""

    def setUp(self):
        super(TestSparseClassifiers, self).setUp()
        self.create_iris_kv(IRIS_KV_TABLE)
        # 'category' is the label; 'content' holds key-value encoded features.
        self.df = DataFrame(self.odps.get_table(IRIS_KV_TABLE)).label_field('category').key_value('content')

    def tearDown(self):
        super(TestSparseClassifiers, self).tearDown()

    @ci_skip_case
    def test_logistic_regression(self):
        """Real-run LR on sparse features, then compute an ROC curve."""
        options.runner.dry_run = False
        self.delete_table(LR_TEST_TABLE)
        self.delete_offline_model(MODEL_NAME)

        splited = self.df.split(0.6)
        lr = LogisticRegression(epsilon=0.001).set_max_iter(50)
        model = lr.train(splited[0])
        model.persist(MODEL_NAME)

        predicted = model.predict(splited[1])
        # persist is an operational node which will trigger execution of the flow
        predicted.persist(LR_TEST_TABLE)

        fpr, tpr, thresh = roc_curve(predicted, "category")
        assert len(fpr) == len(tpr) and len(thresh) == len(fpr)

    def test_mock_xgboost(self):
        """Dry-run Xgboost; assert the exact backend parameters for train and predict."""
        options.runner.dry_run = True

        splited = self.df.split(0.6)
        lr = Xgboost()
        model = lr.train(splited[0])._add_case(self.gen_check_params_case(
            {'labelColName': 'category', 'modelName': MODEL_NAME, 'colsample_bytree': '1', 'silent': '1',
             'eval_metric': 'error', 'eta': '0.3', 'itemDelimiter': ',', 'kvDelimiter': ':',
             'inputTableName': TEMP_TABLE_PREFIX + '0_split_2_1', 'max_delta_step': '0',
             'enableSparse': 'true', 'base_score': '0.5', 'seed': '0', 'min_child_weight': '1',
             'objective': 'binary:logistic', 'featureColNames': 'content', 'max_depth': '6',
             'gamma': '0', 'booster': 'gbtree'}))
        model.persist(MODEL_NAME)

        predicted = model.predict(splited[1])._add_case(self.gen_check_params_case(
            {'itemDelimiter': ',', 'modelName': MODEL_NAME, 'appendColNames': 'content,category',
             'inputTableName': TEMP_TABLE_PREFIX + '0_split_2_2', 'enableSparse': 'true',
             'outputTableName': XGBOOST_TEST_TABLE, 'kvDelimiter': ':', 'featureColNames': 'content'}))
        # persist operational node which will trigger execution of the flow
        predicted.persist(XGBOOST_TEST_TABLE)
def test_mock_xgboost(self):
    """Dry-run Xgboost on dense features; assert the exact backend parameters."""
    df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE)).roles(label='class')
    splited = df.split(0.6)

    xgboost = Xgboost()
    # Case dict pins every hyper-parameter the train node should submit.
    model = xgboost.train(splited[0])._add_case(self.gen_check_params_case({
        'labelColName': 'class', 'modelName': MODEL_NAME, 'colsample_bytree': '1', 'silent': '1',
        'eval_metric': 'error', 'eta': '0.3', 'inputTableName': TEMP_TABLE_PREFIX + '0_split_2_1',
        'max_delta_step': '0', 'base_score': '0.5', 'seed': '0', 'min_child_weight': '1',
        'objective': 'reg:linear', 'featureColNames': ','.join('a%02d' % i for i in range(1, 35)),
        'max_depth': '6', 'gamma': '0', 'booster': 'gbtree'}))
    model.persist(MODEL_NAME)

    predicted = model.predict(splited[1])._add_case(self.gen_check_params_case({
        'modelName': MODEL_NAME,
        'appendColNames': ','.join('a%02d' % i for i in range(1, 35)) + ',class',
        'outputTableName': XGBOOST_OUT_TABLE,
        'inputTableName': TEMP_TABLE_PREFIX + '0_split_2_2'}))
    # persist is an operational node which will trigger execution of the flow
    predicted.persist(XGBOOST_OUT_TABLE)
def test_mock_gbdt(self):
    """Dry-run GBDT; assert the exact backend parameters for train and predict."""
    df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE)).roles(label='class')
    splited = df.split(0.6)

    gbdt = GBDT(min_leaf_sample_count=10)
    model = gbdt.train(splited[0])._add_case(self.gen_check_params_case({
        'tau': '0.6', 'modelName': MODEL_NAME, 'inputTableName': TEMP_TABLE_PREFIX + '0_split_2_1',
        'maxLeafCount': '32', 'shrinkage': '0.05', 'featureSplitValueMaxSize': '500',
        'featureRatio': '0.6', 'testRatio': '0.0', 'newtonStep': '0', 'randSeed': '0',
        'sampleRatio': '0.6', 'p': '1', 'treeCount': '500', 'metricType': '2',
        'labelColName': 'class', 'featureColNames': ','.join('a%02d' % i for i in range(1, 35)),
        'minLeafSampleCount': '10', 'lossType': '3', 'maxDepth': '11'}))
    model.persist(MODEL_NAME)

    predicted = model.predict(splited[1])._add_case(self.gen_check_params_case({
        'modelName': MODEL_NAME,
        'appendColNames': ','.join('a%02d' % i for i in range(1, 35)) + ',class',
        'outputTableName': GBDT_OUT_TABLE,
        'inputTableName': TEMP_TABLE_PREFIX + '0_split_2_2'}))
    # persist is an operational node which will trigger execution of the flow
    predicted.persist(GBDT_OUT_TABLE)
def testMixed(self):
    """Mixed-engine pipeline with a non-deterministic rand column appended.

    The rand column is excluded from the comparison, so the check still
    matches the deterministic pandas reference.
    """
    expr = self.odps_df.union(
        self.odps_df.join(self.pd_df, 'name')[lambda x: x.name, lambda x: x.id_x.rename('id')]).sort(
        ['name', 'id'])
    expr = expr[expr['name'].isin(self.pd_df['name'])]
    # Appending func.rand forces ODPS-side function evaluation in the plan.
    expr = expr[expr, func.rand(rtype='float').rename('rand')]
    # Select only the deterministic columns for the equality check.
    result = self.engine.execute(expr).values[['name', 'id']]

    df = DataFrame(self.odps_df.to_pandas())
    test_expr = df.union(
        df.join(self.pd_df, 'name')[lambda x: x.name, lambda x: x.id_x.rename('id')]).sort(
        ['name', 'id'])
    test_expr = test_expr[test_expr['name'].isin(self.pd_df['name'])]
    expected = self.pd_engine.execute(test_expr).values

    self.assertTrue(result.equals(expected))
def testUnicodePdDataFrame(self):
    """Non-ASCII column names and values must survive a pandas-backed round trip."""
    import pandas as pd

    pd_df = pd.DataFrame([['中文'], [to_text('中文2')]], columns=[to_text('字段')])
    df = DataFrame(pd_df)

    r = df['字段'].execute()
    # Normalize both sides with to_text so py2/py3 str/unicode compare equal.
    self.assertEqual(to_text('中文'), to_text(r[0][0]))
    self.assertEqual(to_text('中文2'), to_text(r[1][0]))
def test_keywords_extraction(self):
    """Extract keywords from pre-split documents; assert the backend parameters."""
    self.odps.delete_table(KW_EXTRACTED_TABLE, if_exists=True)
    self.create_splited_words(SPLITED_TABLE)
    # Role annotations tell the algorithm which columns hold id and content.
    df = DataFrame(self.odps.get_table(SPLITED_TABLE)).roles(doc_id='doc_id', doc_content='content')
    extracted = extract_keywords(df)
    extracted._add_case(self.gen_check_params_case(
        {'dumpingFactor': '0.85', 'inputTableName': SPLITED_TABLE, 'epsilon': '0.000001',
         'windowSize': '2', 'topN': '5', 'outputTableName': KW_EXTRACTED_TABLE,
         'docIdCol': 'doc_id', 'maxIter': '100', 'docContent': 'content'}))
    # persist triggers execution, which runs the parameter check.
    extracted.persist(KW_EXTRACTED_TABLE)
def testToPandas(self):
    """to_pandas must round-trip all supported column types, plain and wrapped."""
    table_name = tn('pyodps_test_mixed_engine_to_pandas')
    self.odps.delete_table(table_name, if_exists=True)
    table2 = self.odps.create_table(
        name=table_name,
        schema=Schema.from_lists(
            ['col%s' % i for i in range(7)],
            ['bigint', 'double', 'string', 'datetime', 'boolean', 'decimal', 'datetime']))
    expr2 = DataFrame(table2)

    # One row covering every column type, including a NULL datetime.
    data2 = [
        [1234567, 3.14, 'test', datetime(2016, 6, 1), True, Decimal('3.14'), None]
    ]
    self.odps.write_table(table2, 0, data2)

    pd_df = expr2.to_pandas()
    # .ix was removed in pandas 1.0; .iloc is the positional replacement.
    self.assertSequenceEqual(data2[0], pd_df.iloc[0].tolist())

    # wrap=True returns a pyodps-wrapped frame; also fixed the 'wrapeed' typo.
    wrapped_pd_df = expr2.to_pandas(wrap=True)
    self.assertSequenceEqual(data2[0], list(next(wrapped_pd_df.execute())))
def test_top_n(self):
    """Self-similarity top-N over a string column; assert the backend parameters."""
    self._create_str_compare_table(STR_COMP_TABLE)
    df = DataFrame(self.odps.get_table(STR_COMP_TABLE))
    # The frame is compared against itself (df as both input and map table).
    top_n_df = top_n_similarity(df, df, col='col1', map_col='col1')
    top_n_df._add_case(self.gen_check_params_case({
        'inputTableName': STR_COMP_TABLE, 'k': '2', 'outputColName': 'output',
        'mapSelectedColName': 'col1', 'topN': '10', 'inputSelectedColName': 'col1',
        'outputTableName': TOP_N_TABLE,
        'mapTableName': self.odps.project + '.' + STR_COMP_TABLE,
        'method': 'levenshtein_sim', 'lambda': '0.5'}))
    # persist triggers execution, which runs the parameter check.
    top_n_df.persist(TOP_N_TABLE)
def setUp(self):
    """Prepare weighted-graph vertex/edge frames with role annotations; dry-run mode."""
    super(Test, self).setUp()
    self.create_weighted_graph_edges(WEIGHTED_GRAPH_EDGE_TABLE)
    self.create_weighted_graph_vertices(WEIGHTED_GRAPH_VERTEX_TABLE)

    # Vertex frame: label + weight roles plus the id field the algorithms need.
    self.vertex_df = DataFrame(self.odps.get_table(WEIGHTED_GRAPH_VERTEX_TABLE)) \
        .roles(vertex_label='label', vertex_weight='node_weight').vertex_id_field('node')
    # Edge frame: endpoints and edge weight roles.
    self.edge_df = DataFrame(self.odps.get_table(WEIGHTED_GRAPH_EDGE_TABLE)) \
        .roles(from_vertex='flow_out_id', to_vertex='flow_in_id', edge_weight='edge_weight')

    # These tests only verify submitted parameters, not real execution.
    options.runner.dry_run = True
def test_df_combined(self):
    """Mix projections, maps, cache and ML train/predict, ending in a log-loss expression."""
    self.create_ionosphere(IONOSPHERE_TABLE)
    df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
    df = df[df['a04'] != 0]
    # Project a subset of columns, doubling a05 via a mapped lambda.
    df = df['a01', df.a05.map(lambda v: v * 2).rename('a05'), 'a06', 'class']
    df = df.roles(label='class')
    df = df[df.a05 != 0].cache()
    df = df[df.a05, ((df.a06 + 1) / 2).rename('a06'), 'class']

    train, test = df.split(0.6)
    lr = LogisticRegression(epsilon=0.01)
    model = lr.train(train)
    predicted = model.predict(test)

    # Binary cross-entropy expressed as a DataFrame computation; execute()
    # triggers the whole pipeline.
    (-1.0 * ((predicted['class'] * predicted.prediction_score.log().rename('t')).rename('t1')
             + ((1 - predicted['class']) * (1 - predicted.prediction_score).log().rename('t0')).rename('t2')
             ).rename('t3').sum() / predicted.prediction_score.count()).rename('t4').execute()
def testSparseVectorToMars(self):
    """Round-trip a sparse vector: ODPS table -> Mars tensor via OSS -> ODPS table."""
    import pandas as pd
    import numpy as np

    shape = (50, )
    data = np.random.rand(*shape)
    # Key-value encoding: index column 'i', value column 'v'.
    kv = [(i, data[i]) for i in range(shape[0])]
    pdf = pd.DataFrame(kv, columns=['i', 'v'])
    df = DataFrame(pdf).persist(tn('test_vector_to_mars'), lifecycle=1, odps=self.odps)

    oss_access_id, oss_secret_access_key, oss_bucket_name, oss_endpoint = self.config.oss
    t = df.to_mars_tensor_via_oss(['i'], 'v', 15, oss_access_id=oss_access_id,
                                  oss_access_key=oss_secret_access_key,
                                  oss_bucket_name=oss_bucket_name, oss_endpoint=oss_endpoint,
                                  oss_path=tn('test_vector_to_mars'),
                                  shape=shape, sparse=True)

    table_name = tn('test_vector_to_mars_store')
    self.odps.delete_table(table_name, if_exists=True)
    self.odps.persist_tensor_via_oss(t, table_name, ['x'], 'y', oss_access_id=oss_access_id,
                                     oss_access_key=oss_secret_access_key,
                                     oss_bucket_name=oss_bucket_name, oss_endpoint=oss_endpoint,
                                     oss_path=table_name)

    with self.odps.get_table(table_name).open_reader() as reader:
        result = sorted([(r['x'], r['y']) for r in reader], key=lambda x: x[0])
        # NOTE(review): exact float equality across the OSS round trip —
        # assumes values are preserved bit-for-bit; confirm if this flakes.
        self.assertEqual(kv, result)
def testPivot(self):
    """pivot() variants: single/multiple values, bad column, projection, join."""
    data = [["name1", 1, 1.0, True], ["name1", 2, 2.0, True],
            ["name2", 1, 3.0, False], ["name2", 3, 4.0, False]]

    table_name = tn("pyodps_test_mixed_engine_pivot")
    self.odps.delete_table(table_name, if_exists=True)
    table = self.odps.create_table(
        name=table_name,
        schema=Schema.from_lists(["name", "id", "fid", "ismale"],
                                 ["string", "bigint", "double", "boolean"]),
    )
    expr = DataFrame(table)
    try:
        self.odps.write_table(table, 0, data)

        # Single value column pivoted into one column per name.
        expr1 = expr.pivot(rows="id", columns="name", values="fid").distinct()
        res = self.engine.execute(expr1)
        result = self._get_result(res)
        expected = [[1, 1.0, 3.0], [2, 2.0, None], [3, None, 4.0]]
        self.assertEqual(sorted(result), sorted(expected))

        # Multiple value columns produce one output column per (value, name) pair.
        expr2 = expr.pivot(rows="id", columns="name", values=["fid", "ismale"])
        res = self.engine.execute(expr2)
        result = self._get_result(res)
        expected = [[1, 1.0, 3.0, True, False], [2, 2.0, None, True, None],
                    [3, None, 4.0, None, False]]
        self.assertEqual(sorted(result), sorted(expected))

        # Selecting a column that no pivot value produced must raise.
        expr3 = expr.pivot(rows="id", columns="name", values="fid")["name3"]
        with self.assertRaises(ValueError) as cm:
            self.engine.execute(expr3)
        self.assertIn("name3", str(cm.exception))

        # Projecting a subset of pivoted columns.
        expr4 = expr.pivot(rows="id", columns="name", values="fid")["id", "name1"]
        res = self.engine.execute(expr4)
        result = self._get_result(res)
        expected = [[1, 1.0], [2, 2.0], [3, None]]
        self.assertEqual(sorted(result), sorted(expected))

        # Deriving a new column from a pivoted column.
        expr5 = expr.pivot(rows="id", columns="name", values="fid")
        expr5 = expr5[expr5, (expr5["name1"].astype("int") + 1).rename("new_name")]
        res = self.engine.execute(expr5)
        result = self._get_result(res)
        expected = [[1, 1.0, 3.0, 2.0], [2, 2.0, None, 3.0], [3, None, 4.0, None]]
        self.assertEqual(sorted(result), sorted(expected))

        # Joining a pivoted frame back against another frame.
        expr6 = expr.pivot(rows="id", columns="name", values="fid")
        expr6 = expr6.join(self.odps_df, on="id")[expr6, "name"]
        res = self.engine.execute(expr6)
        result = self._get_result(res)
        expected = [[1, 1.0, 3.0, "name1"], [2, 2.0, None, "name2"],
                    [3, None, 4.0, "name1"]]
        self.assertEqual(sorted(result), sorted(expected))
    finally:
        table.drop()
def create_many_rows(o):
    """Create and fill the `many_rows` helper table (10k int32 rows, one partition).

    Idempotent: skips everything when the table already exists.
    """
    table = 'many_rows'
    if not o.exist_table(table):
        df = pd.DataFrame({'a': np.arange(10000, dtype=np.int32)})
        # Partitioned by string column b; all rows land in the b='blah' partition.
        o.execute_sql(""" CREATE TABLE many_rows ( a INT ) PARTITIONED BY ( b STRING ) """)
        DataFrame(df).persist('many_rows', partition="b='blah'", odps=o)
def testFilterPartition(self):
    """filter_partition: predicate validation, result types, and pandas rejection."""
    # Invalid inputs: empty predicate, wrong partition depth, malformed spec.
    self.assertRaises(ExpressionError, lambda: self.expr.filter_partition(None))
    self.assertRaises(ExpressionError,
                      lambda: self.expr.filter_partition('part1=a/part2=1,part1=b/part2=2'))
    self.assertRaises(ExpressionError,
                      lambda: self.expr2.filter_partition('part1/part2=1,part1=b/part2=2'))

    # Default (exclude=True) yields a partition-filter expr that drops
    # partition columns from the schema.
    filtered1 = self.expr2.filter_partition('part1=a/part2=1,part1=b/part2=2')
    self.assertIsInstance(filtered1, FilterPartitionCollectionExpr)
    self.assertEqual(filtered1.schema, self.expr.schema)
    self.assertEqual(filtered1.predicate_string, 'part1=a/part2=1,part1=b/part2=2')

    # exclude=False degrades to an ordinary row filter.
    filtered2 = self.expr2.filter_partition('part1=a/part2=1,part1=b/part2=2', exclude=False)
    self.assertIsInstance(filtered2, FilterCollectionExpr)

    # pandas-backed frames have no partitions at all; skip if pandas missing.
    try:
        import pandas as pd
        from odps.df import DataFrame

        pd_df = pd.DataFrame([['Col1', 1], ['Col2', 2]], columns=['Field1', 'Field2'])
        df = DataFrame(pd_df)
        self.assertRaises(ExpressionError, lambda: df.filter_partition('Fieldd2=2'))
    except ImportError:
        pass
def setup(self):
    """Build a pandas-backed DataFrame with an explicit ODPS-compatible schema."""
    datatypes = lambda *types: [validate_data_type(t) for t in types]
    schema = Schema.from_lists(
        ['name', 'category', 'id', 'fid', 'isMale', 'scale', 'birth'],
        datatypes('string', 'string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'))
    # Keep the ODPS-side schema around for tests that need it.
    self.schema = df_schema_to_odps_schema(schema)

    import pandas as pd
    self.data = self._gen_data(20, value_range=(-1000, 1000))
    self.df = pd.DataFrame(self.data, columns=schema.names)
    self.expr = DataFrame(self.df, schema=schema)
def test_t_test(self):
    """Dry-run one- and two-sample t-tests; assert the submitted parameters."""
    options.ml.dry_run = True

    ds = DataFrame(self.odps.get_table(IONOSPHERE_TABLE)).roles(label='class')

    # One-sample t-test against mu=0.
    t_test(ds, x_col='a04', _cases=self.gen_check_params_case(
        {'mu': '0', 'outputTableName': TEMP_TABLE_PREFIX + '_t_test', 'confidenceLevel': '0.95',
         'xTableName': self.odps.project + '.' + IONOSPHERE_TABLE,
         'alternative': 'two.sided', 'xColName': 'a04'}))

    # Two-sample t-test between a04 and a05.
    t_test(ds, x_col='a04', y_col='a05', _cases=self.gen_check_params_case(
        {'yTableName': self.odps.project + '.' + IONOSPHERE_TABLE, 'yColName': 'a05',
         'mu': '0', 'outputTableName': TEMP_TABLE_PREFIX + '_t_test', 'confidenceLevel': '0.95',
         'xTableName': self.odps.project + '.' + IONOSPHERE_TABLE,
         'alternative': 'two.sided', 'xColName': 'a04'}))
def setup(self):
    """Prepare one ODPS-backed and one pandas-backed frame plus both engines."""
    import pandas as pd

    odps_data = [["name1", 1], ["name2", 2], ["name1", 3]]
    pd_data = [["name1", 5], ["name2", 6]]
    names = ["name", "id"]
    types = ["string", "bigint"]

    table = tn("pyodps_df_mixed")
    # Drop any leftover table before recreating it with a clean schema.
    self.odps.delete_table(table, if_exists=True)
    self.t = self.odps.create_table(table, Schema.from_lists(names, types))

    with self.t.open_writer() as writer:
        records = [self.t.new_record(row) for row in odps_data]
        writer.write(records)

    self.odps_df = DataFrame(self.t)
    self.pd_df = DataFrame(pd.DataFrame(pd_data, columns=names))
    self.engine = MixedEngine(self.odps)
    self.pd_engine = PandasEngine(self.odps)
def testToPandas(self):
    """to_pandas in all modes: plain, wrapped, column-level, async, delayed, failing."""
    table_name = tn('pyodps_test_mixed_engine_to_pandas')
    self.odps.delete_table(table_name, if_exists=True)
    table2 = self.odps.create_table(
        name=table_name,
        schema=Schema.from_lists(
            ['col%s' % i for i in range(7)],
            ['bigint', 'double', 'string', 'datetime', 'boolean', 'decimal', 'datetime']))
    expr2 = DataFrame(table2)

    data2 = [
        [1234567, 3.14, 'test', datetime(2016, 6, 1), True, Decimal('3.14'), None]
    ]
    self.odps.write_table(table2, 0, data2)

    pd_df = expr2.to_pandas()
    # .ix was removed in pandas 1.0; .iloc is the positional replacement.
    self.assertSequenceEqual(data2[0], pd_df.iloc[0].tolist())

    wrapped_pd_df = expr2.to_pandas(wrap=True)
    self.assertSequenceEqual(data2[0], list(next(wrapped_pd_df.execute())))

    # Column-level to_pandas returns a Series.
    pd_df_col = expr2.col0.to_pandas()
    self.assertSequenceEqual([data2[0][0]], pd_df_col.tolist())

    wrapped_pd_df_col = expr2.col0.to_pandas(wrap=True)
    self.assertSequenceEqual([data2[0][0]], list(next(wrapped_pd_df_col.execute())))

    # `async` became a reserved keyword in Python 3.7 (SyntaxError as a
    # keyword argument); pyodps renamed the parameter to `async_`.
    pd_df_future = expr2.to_pandas(async_=True)
    self.assertSequenceEqual(data2[0], pd_df_future.result().iloc[0].tolist())

    wrapped_pd_df_future = expr2.to_pandas(async_=True, wrap=True)
    self.assertSequenceEqual(data2[0], list(next(wrapped_pd_df_future.result().execute())))

    # Delayed execution: nothing runs until delay.execute().
    delay = Delay()
    pd_df_future = expr2.to_pandas(delay=delay)
    delay.execute()
    self.assertSequenceEqual(data2[0], pd_df_future.result().iloc[0].tolist())

    # Errors from an async run surface when the future's result is fetched.
    exc_future = (expr2.col0 / 0).to_pandas(async_=True)
    self.assertRaises(ODPSError, exc_future.result)
def testExecuteAfterModelCreate(self):
    """A PmmlModel built from a persisted offline model must execute to a PMML result."""
    from odps.ml import classifiers
    from odps.ml.expr.models.pmml import PmmlRegressionResult

    self.create_iris(IRIS_TABLE)
    df = DataFrame(self.odps.get_table(IRIS_TABLE)).roles(label='category')
    model = classifiers.LogisticRegression().train(df)
    # drop_model=True replaces any pre-existing model of the same name.
    model.persist(IRIS_TEST_OFFLINE_MODEL, drop_model=True)

    # Re-load the persisted model by name and execute it.
    expr = PmmlModel(self.odps.get_offline_model(IRIS_TEST_OFFLINE_MODEL))
    result = expr.execute()
    self.assertIsInstance(result, PmmlRegressionResult)
def testFilterParts(self):
    """filter_parts: predicate validation, result types, and pandas rejection."""
    # Invalid inputs: empty predicate, unknown partition, wrong depth, malformed spec.
    self.assertRaises(ExpressionError, lambda: self.expr.filter_parts(None))
    self.assertRaises(ExpressionError, lambda: self.expr.filter_parts('part3=a'))
    self.assertRaises(ExpressionError,
                      lambda: self.expr.filter_parts('part1=a,part2=1/part1=b,part2=2'))
    self.assertRaises(ExpressionError,
                      lambda: self.expr2.filter_parts('part1,part2=1/part1=b,part2=2'))

    # Default (exclude=True) yields a partition-filter expr that drops
    # partition columns from the schema.
    filtered1 = self.expr2.filter_parts('part1=a,part2=1/part1=b,part2=2')
    self.assertIsInstance(filtered1, FilterPartitionCollectionExpr)
    self.assertEqual(filtered1.schema, self.expr.schema)
    self.assertEqual(filtered1.predicate_string, 'part1=a,part2=1/part1=b,part2=2')

    # exclude=False degrades to an ordinary row filter.
    filtered2 = self.expr2.filter_parts('part1=a,part2=1/part1=b,part2=2', exclude=False)
    self.assertIsInstance(filtered2, FilterCollectionExpr)

    # pandas-backed frames have no partitions at all; skip if pandas missing.
    try:
        import pandas as pd
        from odps.df import DataFrame

        pd_df = pd.DataFrame([['Col1', 1], ['Col2', 2]], columns=['Field1', 'Field2'])
        df = DataFrame(pd_df)
        self.assertRaises(ExpressionError, lambda: df.filter_parts('Fieldd2=2'))
    except ImportError:
        pass
def test_merge(self):
    """merge_data: column selection, exclusion flags, auto-rename, and merge_with."""
    self.odps.delete_table(TEMP_TABLE_1_NAME, if_exists=True)
    self.odps.execute_sql(
        "create table {0} (col11 string, col12 string) lifecycle 1".format(TEMP_TABLE_1_NAME))
    self.odps.delete_table(TEMP_TABLE_2_NAME, if_exists=True)
    self.odps.execute_sql(
        "create table {0} (col21 string, col22 string) lifecycle 1".format(TEMP_TABLE_2_NAME))

    df1 = DataFrame(self.odps.get_table(TEMP_TABLE_1_NAME))
    df2 = DataFrame(self.odps.get_table(TEMP_TABLE_2_NAME))

    # Merging a single frame is meaningless and must be rejected.
    self.assertRaises(ValueError, lambda: merge_data(df1))

    # Plain merge keeps every column from both frames as a feature.
    merged1 = merge_data(df1, df2)
    self.assertEqual(_df_roles(merged1),
                     dict(col21="FEATURE", col11="FEATURE", col12="FEATURE", col22="FEATURE"))

    # (frame, column) restricts to that column; a trailing True excludes it.
    merged2 = merge_data((df1, "col11"), (df2, "col21", True))
    self.assertEqual(_df_roles(merged2), dict(col11="FEATURE", col22="FEATURE"))

    # auto_rename prefixes columns with their source-table index.
    merged3 = merge_data((df1, "col11"), (df2, "col21", True), auto_rename=True)
    self.assertEqual(_df_roles(merged3), dict(t0_col11="FEATURE", t1_col22="FEATURE"))

    # merge_with is the method-style spelling of merge_data.
    merged4 = df1.merge_with(df2)
    self.assertEqual(_df_roles(merged4),
                     dict(col21="FEATURE", col11="FEATURE", col12="FEATURE", col22="FEATURE"))
def test_linear(self):
    """Real-run linear regression; train, predict, persist, and log metrics."""
    options.runner.dry_run = False
    self.delete_table(LINEAR_REGRESSION_OUT_TABLE)
    self.delete_offline_model(MODEL_NAME)

    df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE)).roles(label='class')
    splited = df.split(0.6)

    algo = LinearRegression()
    model = algo.train(splited[0])
    model.persist(MODEL_NAME)

    # logging.info takes a %-format string; the original passed the value as
    # a spare positional arg to a placeholder-less message, so the metric
    # values were never rendered in the log output.
    logging.info('Importance: %s', regression_importance(splited[1], model))

    predicted = model.predict(splited[1])
    # persist is an operational node which will trigger execution of the flow
    predicted.persist(LINEAR_REGRESSION_OUT_TABLE)

    logging.info('MSE: %s', mean_squared_error(predicted, 'class'))
    logging.info('MAE: %s', mean_absolute_error(predicted, 'class'))
    logging.info('HIST: %s', residual_histogram(predicted, 'class'))
    # The original message said 'MSE' here while logging pearson.
    logging.info('Pearson: %s', pearson(predicted, col1='class'))
def testCreateDataFrameFromPartition(self):
    """A DataFrame built from a table partition counts only that partition's rows."""
    from odps.types import PartitionSpec

    test_table_name = tn('pyodps_test_dataframe_partition')
    schema = Schema.from_lists(['id', 'name'], ['bigint', 'string'], ['ds'], ['string'])

    self.odps.delete_table(test_table_name, if_exists=True)
    table = self.odps.create_table(test_table_name, schema)

    with table.open_writer('ds=today', create_partition=True) as w:
        w.write([[1, 'name1'], [2, 'name2'], [3, 'name3']])

    try:
        # Construct directly from the partition object.
        df = DataFrame(table.get_partition('ds=today'))
        self.assertEqual(df.count().execute(), 3)

        # And via the partition's to_df() convenience; the backing data must
        # reference the same table and the exact partition spec.
        df = table.get_partition('ds=today').to_df()
        partition = df.data
        self.assertIs(partition.table, table)
        self.assertEqual(partition.partition_spec, PartitionSpec('ds=today'))
        self.assertEqual(df.count().execute(), 3)
    finally:
        table.drop()
def test_custom_algo(self):
    """Dry-run a user-defined algorithm; assert the submitted train parameters."""
    options.ml.dry_run = True

    df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
    splited = df.split(0.6)
    labeled_data = splited[0].label_field("class")

    naive_bayes = MyNaiveBayes()
    model = naive_bayes.train(labeled_data)._add_case(
        self.gen_check_params_case({
            'labelColName': 'class',
            'featureColNames': ','.join('a%02d' % i for i in range(1, 35)),
            'modelName': MODEL_NAME,
            'inputTableName': TEMP_TABLE_PREFIX + '_split'
        }))
    model.persist(MODEL_NAME)

    predicted = model.predict(splited[1])
    # NOTE(review): predictions are persisted under MODEL_NAME (the model's
    # name, not an output-table constant) — looks intentional for a dry run,
    # but confirm this is not a copy-paste slip.
    predicted.persist(MODEL_NAME)
def testPandasPersistODPS2(self):
    """Persisting numpy dtypes must map onto the matching ODPS 2.0 column types."""
    import pandas as pd
    import numpy as np

    # One random value per numpy dtype under test.
    data_int8 = np.random.randint(0, 10, (1,), dtype=np.int8)
    data_int16 = np.random.randint(0, 10, (1,), dtype=np.int16)
    data_int32 = np.random.randint(0, 10, (1,), dtype=np.int32)
    data_int64 = np.random.randint(0, 10, (1,), dtype=np.int64)
    data_float32 = np.random.random((1,)).astype(np.float32)
    data_float64 = np.random.random((1,)).astype(np.float64)

    df = DataFrame(pd.DataFrame(dict(data_int8=data_int8, data_int16=data_int16,
                                     data_int32=data_int32, data_int64=data_int64,
                                     data_float32=data_float32, data_float64=data_float64)))
    tmp_table_name = tn('pyodps_test_mixed_persist_odps2_types')

    self.odps.delete_table(tmp_table_name, if_exists=True)
    df.persist(tmp_table_name, lifecycle=1, drop_table=True, odps=self.odps)

    t = self.odps.get_table(tmp_table_name)
    # numpy dtype -> ODPS type mapping: int8->tinyint ... float64->double.
    expected_types = [odps_types.tinyint, odps_types.smallint, odps_types.int_,
                      odps_types.bigint, odps_types.float_, odps_types.double]
    self.assertEqual(expected_types, t.schema.types)
def testIsIn(self):
    """isin against a pandas-backed column, on both plain and derived columns."""
    # Plain column membership: compare mixed-engine output with pure pandas.
    membership = self.odps_df['name'].isin(self.pd_df['name']).rename('isin')
    actual = self.engine.execute(membership).values

    local_df = DataFrame(self.odps_df.to_pandas())
    reference = local_df['name'].isin(self.pd_df['name']).rename('isin')
    self.assertTrue(actual.equals(self.pd_engine.execute(reference).values))

    # Derived column (id + 2) checked against a fixed expected vector.
    shifted = (self.odps_df.id + 2).isin(self.pd_df['id']).rename('isin')
    res = self.engine.execute(shifted)
    self.assertEqual(self._get_result(res), [[False], [False], [True]])
def test_mat_pearson(self):
    """Dry-run matrix Pearson correlation; assert the submitted parameters."""
    options.ml.dry_run = True

    df = DataFrame(
        self.odps.get_table(IONOSPHERE_TABLE)).roles(label='class')
    # The label column is excluded; only the 34 feature columns are selected.
    matrix_pearson(df, _cases=self.gen_check_params_case({
        'outputTableName': 'tmp_pyodps__matrix_pearson',
        'selectedColNames': ','.join('a%02d' % i for i in range(1, 35)),
        'inputTableName': tn('pyodps_test_ml_ionosphere')
    }))
def testDataFrameFromPandas(self):
    """Type inference from pandas: unknown/object columns need explicit typing."""
    import pandas as pd

    # All-None object columns cannot be inferred and must be rejected.
    pd_df = pd.DataFrame({'a': [1, 2, 3], 'b': [None, None, None]})
    self.assertRaises(TypeError, lambda: DataFrame(pd_df))

    # unknown_as_string maps uninferable columns to string.
    df = DataFrame(pd_df, unknown_as_string=True)
    self.assertEqual(df.schema.get_type('b').name, 'string')

    # as_type forces the given column type.
    df = DataFrame(pd_df[['a']], as_type={'a': 'string'})
    self.assertEqual(df.schema.get_type('a').name, 'string')

    df = DataFrame(pd_df, as_type={'b': 'int'})
    self.assertEqual(df.schema.get_type('b').name, 'int64')

    # List-valued columns also require an explicit element type.
    pd_df = pd.DataFrame({'a': [1, 2, 3], 'b': [[1, 2], [3, 4, 5], [6]]})
    self.assertRaises(TypeError, DataFrame, pd_df)

    df = DataFrame(pd_df, as_type={'b': 'list<int64>'})
    self.assertEqual(df.schema.get_type('b').name, 'list<int64>')

    df = DataFrame(pd_df, as_type={'b': 'list<string>'})
    self.assertEqual(df.schema.get_type('b').name, 'list<string>')

    # Same for dict-valued columns: key and value types must be given.
    pd_df = pd.DataFrame({
        'a': [1, 2, 3],
        'b': [{
            1: 'a',
            2: 'b'
        }, {
            3: 'c',
            4: 'd',
            5: None
        }, {
            6: 'f'
        }]
    })
    self.assertRaises(TypeError, DataFrame, pd_df)

    df = DataFrame(pd_df, as_type={'b': 'dict<int64, string>'})
    self.assertEqual(df.schema.get_type('b').name, 'dict<int64,string>')

    df = DataFrame(pd_df, as_type={'b': 'dict<string, string>'})
    self.assertEqual(df.schema.get_type('b').name, 'dict<string,string>')
def test_histograms(self):
    """Dry-run histogram computation; assert the submitted parameters."""
    options.ml.dry_run = True

    ds = DataFrame(
        self.odps.get_table(IONOSPHERE_TABLE)).roles(label='class')
    # Label column excluded; 10 intervals over each of the 34 features.
    histograms(ds, _cases=self.gen_check_params_case({
        'outputTableName': TEMP_TABLE_PREFIX + '_histograms',
        'selectedColNames': ','.join('a%02d' % i for i in range(1, 35)),
        'intervalNum': '10',
        'inputTableName': IONOSPHERE_TABLE
    }))
def test_quantile(self):
    """Dry-run quantile computation; assert parameters, then persist the result."""
    options.ml.dry_run = True

    df = DataFrame(
        self.odps.get_table(IONOSPHERE_TABLE)).roles(label='class')
    # N=100 requests percentile-granularity quantiles for every feature column.
    qt = quantile(df, _cases=self.gen_check_params_case({
        'inputTableName': tn('pyodps_test_ml_ionosphere'),
        'outputTableName': tn('pyodps_test_ml_iono_quantile'),
        'colName': ','.join('a%02d' % i for i in range(1, 35)),
        'N': '100'
    }))
    qt.persist(IONOSPHERE_QUANTILE_TABLE)
def create_test_pmml_model(self, model_name):
    """Train and persist a logistic-regression offline model for PMML tests.

    Idempotent: returns immediately if the model already exists. Temporarily
    disables dry-run because a real model must actually be trained.
    """
    if self.odps.exist_offline_model(model_name):
        return

    old_dry_run = options.ml.dry_run
    options.ml.dry_run = False

    self.create_iris(IRIS_TABLE)

    from odps.df import DataFrame
    from odps.ml import classifiers

    df = DataFrame(self.odps.get_table(IRIS_TABLE)).roles(label='category')
    lr = classifiers.LogisticRegression(epsilon=0.001).set_max_iter(50)
    lr.train(df).persist(model_name)

    # Restore the caller's dry-run setting.
    options.ml.dry_run = old_dry_run