def testIsIn(self):
    """isin against a pandas-backed frame matches the pure-pandas engine result."""
    isin_expr = self.odps_df['name'].isin(self.pd_df['name']).rename('isin')
    actual = self.engine.execute(isin_expr).values

    local_df = DataFrame(self.odps_df.to_pandas())
    reference = local_df['name'].isin(self.pd_df['name']).rename('isin')
    expected = self.pd_engine.execute(reference).values

    self.assertTrue(actual.equals(expected))
def test_ml_end(self):
    """repr() of a sampled ML DataFrame works while interactive mode is on.

    The interactive flag is restored in a ``finally`` block so a failure
    inside the body cannot leak ``options.interactive = True`` into other
    tests (the original restored it only on the success path).
    """
    old_interactive = options.interactive
    options.interactive = True
    try:
        self.create_ionosphere(IONOSPHERE_TABLE)
        df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE)).sample(n=20)
        repr(df)
    finally:
        options.interactive = old_interactive
def testJoin(self):
    """Joining an ODPS frame with a pandas frame matches the pandas engine."""
    joined = self.odps_df.join(self.pd_df, 'name').sort('id_x')
    actual = self.engine.execute(joined).values

    local_df = DataFrame(self.odps_df.to_pandas())
    expected = self.pd_engine.execute(
        local_df.join(self.pd_df, 'name').sort('id_x')).values

    self.assertTrue(actual.equals(expected))
def testUnion(self):
    """Union of an ODPS frame and a pandas frame matches the pandas engine."""
    unioned = self.odps_df.union(self.pd_df).sort(['id', 'name'])
    actual = self.engine.execute(unioned).values

    local_df = DataFrame(self.odps_df.to_pandas())
    expected = self.pd_engine.execute(
        local_df.union(self.pd_df).sort(['id', 'name'])).values

    self.assertTrue(actual.equals(expected))
def testPandasGroupbyFilter(self):
    """Filtering a grouped pandas-backed frame works with and without caching."""
    import pandas as pd

    data = [[2001, 1], [2002, 2], [2003, 3]]
    df = DataFrame(pd.DataFrame(data, columns=['id', 'fid']))
    expected = [[2003, 3]]

    grouped = df.groupby('id').agg(df.fid.sum())
    filtered = grouped[grouped.id == 2003]
    self.assertEqual(filtered.execute().values.values.tolist(), expected)

    # executing the aggregation first should register it in the cache ...
    grouped = df.groupby('id').agg(df.fid.sum())
    grouped.execute()
    self.assertTrue(context.is_cached(grouped))

    # ... and repeated downstream executions keep returning the same rows
    filtered = grouped[grouped.id == 2003]
    self.assertEqual(filtered.execute().values.values.tolist(), expected)
    self.assertEqual(filtered.execute().values.values.tolist(), expected)

    # scalar aggregation is likewise stable across repeated executions
    total = df.fid.sum()
    self.assertEqual(total.execute(), 6)
    self.assertEqual(total.execute(), 6)
def testCacheTable(self):
    """A cached join is materialized once and reused by downstream expressions."""
    cached = self.odps_df.join(self.pd_df, 'name').cache()
    sorted_expr = cached.sort('id_x')

    dag = sorted_expr.compile()
    self.assertEqual(len(dag.nodes()), 3)

    actual = self.engine.execute(sorted_expr).values
    local_df = DataFrame(self.odps_df.to_pandas())
    expected = self.pd_engine.execute(
        local_df.join(self.pd_df, 'name').sort('id_x')).values
    self.assertTrue(actual.equals(expected))
    self.assertEqual(len(self.engine._generated_table_names), 2)

    cache_table = cached._cache_data
    self.assertEqual(len(cached.execute()), len(expected))
    # re-executing must reuse the very same materialized cache object
    self.assertIs(cached._cache_data, cache_table)

    count_expr = cached[cached.id_x < 3].count()
    self.assertEqual(self.engine.execute(count_expr), 2)
    self.assertEqual(count_expr._cache_data, 2)
def testCachePersist(self):
    """Persisting a cached join into a partitioned table writes the expected rows."""
    expr = self.odps_df

    table_name = tn('pyodps_test_mixed_engine_cp_table2')
    self.odps.delete_table(table_name, if_exists=True)
    table2 = self.odps.create_table(
        name=table_name,
        schema=Schema.from_lists(['name', 'fid'], ['string', 'double']))
    expr2 = DataFrame(table2)
    self.odps.write_table(table2, 0, [['name1', 3.2], ['name3', 2.4]])

    @output(expr.schema.names, expr.schema.types)
    def h(row):
        yield row

    left = expr.filter(expr.id > 0).apply(h, axis=1).cache()
    right = expr2.filter(expr2.fid > 0)
    joined = left.join(right, on=['name', right.fid < 4])['id', 'fid'].cache()

    output_table = tn('pyodps_test_mixed_engine_cp_output_table')
    self.odps.delete_table(output_table, if_exists=True)
    schema = Schema.from_lists(
        ['id', 'fid'], ['bigint', 'double'], ['ds'], ['string'])
    output_t = self.odps.create_table(output_table, schema, if_not_exists=True)

    t = joined.persist(output_table, partition='ds=today',
                       create_partition=True)
    self.assertEqual(len(t.execute()), 2)

    # test seahawks fallback
    self.assertEqual(t.input.count().execute(), 2)

    output_t.drop()
def testFilterParts(self):
    """filter_parts validates predicates and yields the right expression types."""
    predicate = 'part1=a,part2=1/part1=b,part2=2'

    # malformed predicates or missing partition fields must raise
    self.assertRaises(ExpressionError, lambda: self.expr.filter_parts(None))
    self.assertRaises(
        ExpressionError, lambda: self.expr.filter_parts(predicate))
    self.assertRaises(
        ExpressionError,
        lambda: self.expr2.filter_parts('part1,part2=1/part1=b,part2=2'))

    filtered1 = self.expr2.filter_parts(predicate)
    self.assertIsInstance(filtered1, FilterPartitionCollectionExpr)
    self.assertEqual(filtered1.schema, self.expr.schema)
    self.assertEqual(filtered1.predicate_string, predicate)

    # with exclude=False a plain filter expression is produced instead
    filtered2 = self.expr2.filter_parts(predicate, exclude=False)
    self.assertIsInstance(filtered2, FilterCollectionExpr)

    try:
        import pandas as pd
        from odps.df import DataFrame

        pd_df = pd.DataFrame([['Col1', 1], ['Col2', 2]],
                             columns=['Field1', 'Field2'])
        df = DataFrame(pd_df)
        # an unknown field name in the predicate must also raise
        self.assertRaises(
            ExpressionError, lambda: df.filter_parts('Fieldd2=2'))
    except ImportError:
        pass
def testCacheTable(self):
    """Cached-join reuse still works when ODPS execution is forced."""
    self.engine._selecter.force_odps = True

    cached = self.odps_df.join(self.pd_df, 'name').cache()
    sorted_expr = cached.sort('id_x')

    dag = self.engine.compile(sorted_expr)
    self.assertEqual(len(dag.nodes()), 3)

    actual = self.engine.execute(sorted_expr).values
    local_df = DataFrame(self.odps_df.to_pandas())
    expected = self.pd_engine.execute(
        local_df.join(self.pd_df, 'name').sort('id_x')).values
    self.assertTrue(actual.equals(expected))
    self.assertEqual(len(self.engine._generated_table_names), 2)

    cache_table = context.get_cached(cached)
    self.assertEqual(len(self.engine.execute(cached)), len(expected))
    # the cached object must be reused, not rebuilt
    self.assertIs(context.get_cached(cached), cache_table)
    # lifecycle is only checked for non-Seahawks cache tables
    if not isinstance(cache_table, SeahawksTable):
        self.assertEqual(context.get_cached(cached).lifecycle, 1)

    count_expr = cached[cached.id_x < 3].count()
    self.assertEqual(self.engine.execute(count_expr), 2)
    self.assertEqual(context.get_cached(count_expr), 2)
def test_edge_density(self):
    """EdgeDensity transform emits the expected parameter set."""
    edges = DataFrame(self.odps.get_table(WEIGHTED_GRAPH_EDGE_TABLE))
    algo = EdgeDensity(from_vertex_col='flow_out_id',
                       to_vertex_col='flow_in_id')
    result = algo.transform(edges)._add_case(self.gen_check_params_case({
        'splitSize': '64', 'workerMem': '4096',
        'fromVertexCol': 'flow_out_id', 'toVertexCol': 'flow_in_id',
        'outputTableName': EDGE_DENSITY_TABLE,
        'inputEdgeTableName': WEIGHTED_GRAPH_EDGE_TABLE}))
    result.persist(EDGE_DENSITY_TABLE)
def test_maximal_connected(self):
    """MaximalConnectedComponent transform emits the expected parameter set."""
    edges = DataFrame(self.odps.get_table(WEIGHTED_GRAPH_EDGE_TABLE))
    algo = MaximalConnectedComponent(from_vertex_col='flow_out_id',
                                     to_vertex_col='flow_in_id')
    result = algo.transform(edges)._add_case(self.gen_check_params_case({
        'splitSize': '64', 'workerMem': '4096',
        'fromVertexCol': 'flow_out_id', 'toVertexCol': 'flow_in_id',
        'outputTableName': MAXIMAL_CONNECTED_TABLE,
        'inputEdgeTableName': WEIGHTED_GRAPH_EDGE_TABLE}))
    result.persist(MAXIMAL_CONNECTED_TABLE)
def testRepeatSetItem(self):
    """Re-assigning the same derived column keeps the frame consistent."""
    df = DataFrame(self.table)
    # assign the column twice on purpose: the second write overwrites the first
    df['rank'] = df.groupby('name').sort('id').id.rank()
    df['rank'] = df.groupby('name').sort('id').id.rank()
    self.assertEqual(len(df.execute()), 3)
def test_logistic_partition_df(self):
    """LogisticRegression on a single-partition frame produces the right params."""
    options.ml.dry_run = True
    self.maxDiff = None

    self.create_ionosphere_one_part(IONOSPHERE_TABLE_ONE_PART)
    partition = self.odps.get_table(
        IONOSPHERE_TABLE_ONE_PART).get_partition("part=0")
    labeled = DataFrame(partition).roles(label='class')

    lr = LogisticRegression(epsilon=0.001).set_max_iter(50)
    model = lr.train(labeled)._add_case(self.gen_check_params_case({
        'labelColName': 'class',
        'modelName': MODEL_NAME,
        'inputTableName': IONOSPHERE_TABLE_ONE_PART,
        'epsilon': '0.001',
        'inputTablePartitions': "part=0",
        'regularizedLevel': '1',
        'regularizedType': 'l1',
        'maxIter': '50',
        'featureColNames': ','.join('a%02d' % i for i in range(1, 35)),
    }))
    model.persist(MODEL_NAME)
def testDataFrame(self):
    """Basic counting and non-ASCII filtering on a table-backed frame."""
    df = DataFrame(self.table)
    self.assertEqual(3, df.count().execute())
    self.assertEqual(1, df[df.name == 'name1'].count().execute())
    # a filter on non-ASCII content must at least execute without error
    matched = df[df.name.contains('中文')].execute()
    self.assertGreaterEqual(len(matched), 0)
def test_direct_method(self):
    """Train/predict round trip through the direct algorithm API."""
    self.create_ionosphere(IONOSPHERE_TABLE)
    labeled = DataFrame(
        self.odps.get_table(IONOSPHERE_TABLE)).roles(label='class')
    train, test = labeled.split(0.6)
    model = LogisticRegression(epsilon=0.01).train(train)
    # predicting and materializing must both succeed
    model.predict(test).to_pandas()
def test_filter_noises(self):
    """filter_noises wires the noise table into the generated parameters."""
    self.odps.delete_table(FILTERED_WORDS_TABLE, if_exists=True)
    self.create_splited_words(SPLITED_TABLE)
    self._create_noise_table(NOISE_TABLE)

    words_df = DataFrame(
        self.odps.get_table(SPLITED_TABLE)).roles(doc_content='content')
    noise_df = DataFrame(self.odps.get_table(NOISE_TABLE))

    filtered = filter_noises(words_df, noise_df)
    filtered._add_case(self.gen_check_params_case({
        'noiseTableName': NOISE_TABLE,
        'outputTableName': FILTERED_WORDS_TABLE,
        'selectedColNames': 'content',
        'inputTableName': SPLITED_TABLE,
    }))
    filtered.persist(FILTERED_WORDS_TABLE)
def test_count_ngram(self):
    """count_ngram produces the expected parameter set for a word-triple table."""
    self.create_word_triple(WORD_TRIPLE_TABLE)
    triples = DataFrame(
        self.odps.get_table(WORD_TRIPLE_TABLE)).select_features('word')
    counted = count_ngram(triples)
    counted._add_case(self.gen_check_params_case({
        'outputTableName': COUNT_NGRAM_TABLE,
        'inputSelectedColNames': 'word',
        'order': '3',
        'inputTableName': WORD_TRIPLE_TABLE,
    }))
    counted.persist(COUNT_NGRAM_TABLE)
def test_mat_pearson(self):
    """matrix_pearson passes all feature columns through in a dry run."""
    options.ml.dry_run = True
    labeled = DataFrame(
        self.odps.get_table(IONOSPHERE_TABLE)).roles(label='class')
    matrix_pearson(labeled, _cases=self.gen_check_params_case({
        'outputTableName': 'tmp_pyodps__matrix_pearson',
        'selectedColNames': ','.join('a%02d' % i for i in range(1, 35)),
        'inputTableName': tn('pyodps_test_ml_ionosphere'),
    }))
def test_chisquare(self):
    """chi_square maps its x/y columns into the expected parameters."""
    options.ml.dry_run = True
    df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
    chi_square(
        df, x_col=df.a01, y_col='class',
        _cases=self.gen_check_params_case({
            'yColName': 'class',
            'xColName': 'a01',
            'outputDetailTableName': 'tmp_pyodps__chi_square',
            'outputTableName': 'tmp_pyodps__chi_square',
            'inputTableName': tn('pyodps_test_ml_ionosphere'),
        }))
def test_histograms(self):
    """histograms selects every feature column with the default interval count."""
    options.ml.dry_run = True
    labeled = DataFrame(
        self.odps.get_table(IONOSPHERE_TABLE)).roles(label='class')
    histograms(labeled, _cases=self.gen_check_params_case({
        'outputTableName': TEMP_TABLE_PREFIX + '_histograms',
        'selectedColNames': ','.join('a%02d' % i for i in range(1, 35)),
        'intervalNum': '10',
        'inputTableName': IONOSPHERE_TABLE,
    }))
def test_df_store(self):
    """A grouped/sorted frame built from filtered partitions persists correctly."""
    self.delete_table(IONOSPHERE_SORTED_TABLE_PART)
    self.create_ionosphere_two_parts(IONOSPHERE_TABLE_TWO_PARTS)
    df = DataFrame(self.odps.get_table(
        IONOSPHERE_TABLE_TWO_PARTS)).filter_parts('part1=1,part2=2')
    # the helper call above already removed the target table, so this second
    # cleanup must tolerate its absence; the original unconditional delete
    # would raise when the table does not exist
    self.odps.delete_table(IONOSPHERE_SORTED_TABLE_PART, if_exists=True)
    sorted_df = df.groupby(df['class']).agg(
        df.a01.count().rename('count')).sort('class', ascending=False)
    sorted_df.persist(IONOSPHERE_SORTED_TABLE_PART)
def test_str_diff(self):
    """str_diff maps both comparison columns into the expected parameters."""
    self._create_str_compare_table(STR_COMP_TABLE)
    comp_df = DataFrame(self.odps.get_table(STR_COMP_TABLE))
    diff_df = str_diff(comp_df, col1='col1', col2='col2')
    diff_df._add_case(self.gen_check_params_case({
        'inputTableName': STR_COMP_TABLE,
        'k': '2',
        'outputTableName': COMP_RESULT_TABLE,
        'inputSelectedColName2': 'col2',
        'inputSelectedColName1': 'col1',
        'method': 'levenshtein_sim',
        'lambda': '0.5',
        'outputColName': 'output',
    }))
    diff_df.persist(COMP_RESULT_TABLE)
def setup(self):
    """Build a mock ODPS frame, a pandas frame, and a mixed-engine join of both."""
    def datatypes(*names):
        return [validate_data_type(t) for t in names]

    schema = Schema.from_lists(
        ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
        datatypes('string', 'bigint', 'double', 'boolean',
                  'decimal', 'datetime'))
    mock_table = MockTable(name='pyodps_test_expr_table', schema=schema)
    self.tb = DataFrame(mock_table)

    import pandas as pd
    pd_source = pd.DataFrame([['name1', 2, 3.14], ['name2', 100, 2.7]],
                             columns=['name', 'id', 'fid'])
    self.pd = DataFrame(pd_source)

    self.expr = self.tb.join(self.pd, on='name')
    self.engine = MixedEngine(self.odps)
def test_quantile(self):
    """quantile passes the column list and N through to the runner."""
    options.ml.dry_run = True
    labeled = DataFrame(
        self.odps.get_table(IONOSPHERE_TABLE)).roles(label='class')
    qt = quantile(labeled, _cases=self.gen_check_params_case({
        'inputTableName': tn('pyodps_test_ml_ionosphere'),
        'outputTableName': tn('pyodps_test_ml_iono_quantile'),
        'colName': ','.join('a%02d' % i for i in range(1, 35)),
        'N': '100',
    }))
    qt.persist(IONOSPHERE_QUANTILE_TABLE)
def test_tfidf_code(self):
    """A SplitWord -> DocWordStat -> TFIDF pipeline persists its output."""
    self.delete_table(TFIDF_TABLE)
    self.create_corpus(CORPUS_TABLE)
    corpus = DataFrame(
        self.odps.get_table(CORPUS_TABLE)).doc_content_field('content')

    pipeline = Pipeline(SplitWord())
    # chain DocWordStat onto the pipeline, then link TFIDF to its `triple` output
    TFIDF().link(DocWordStat().link(pipeline).triple)

    result = pipeline.transform(corpus)
    result.persist(TFIDF_TABLE)
def testUnicodePdDataFrame(self):
    """Non-ASCII column names and values survive the pandas round trip."""
    import pandas as pd

    source = pd.DataFrame([['中文'], [to_text('中文2')]],
                          columns=[to_text('字段')])
    df = DataFrame(source)
    rows = df['字段'].execute()
    self.assertEqual(to_text('中文'), to_text(rows[0][0]))
    self.assertEqual(to_text('中文2'), to_text(rows[1][0]))
def test_tfidf_array(self):
    """A nested pipeline ending in Word2Vec persists its word features."""
    self.delete_table(W2V_TABLE)
    self.create_corpus(CORPUS_TABLE)
    corpus = DataFrame(
        self.odps.get_table(CORPUS_TABLE)).doc_content_field('content')

    pipeline = Pipeline(
        Pipeline(SplitWord(), (DocWordStat(), 'multi'), Word2Vec()))
    # only the word-feature output is persisted; the second output is unused
    word_feature, _ = pipeline.transform(corpus)
    word_feature.persist(W2V_TABLE)
def testExecuteModel(self):
    """Executing a trained PMML model yields a regression result object."""
    from odps.ml import classifiers
    from odps.ml.expr.models.pmml import PmmlRegressionResult

    self.create_iris(IRIS_TABLE)
    iris = DataFrame(self.odps.get_table(IRIS_TABLE)).roles(label='category')
    model = classifiers.LogisticRegression().train(iris)
    self.assertIsInstance(model.execute(), PmmlRegressionResult)
def test_top_n(self):
    """top_n_similarity maps both frames and columns into the parameters."""
    self._create_str_compare_table(STR_COMP_TABLE)
    comp_df = DataFrame(self.odps.get_table(STR_COMP_TABLE))
    # the frame is compared against itself on the same column
    top_n_df = top_n_similarity(comp_df, comp_df, col='col1', map_col='col1')
    top_n_df._add_case(self.gen_check_params_case({
        'inputTableName': STR_COMP_TABLE,
        'k': '2',
        'outputColName': 'output',
        'mapSelectedColName': 'col1',
        'topN': '10',
        'inputSelectedColName': 'col1',
        'outputTableName': TOP_N_TABLE,
        'mapTableName': self.odps.project + '.' + STR_COMP_TABLE,
        'method': 'levenshtein_sim',
        'lambda': '0.5',
    }))
    top_n_df.persist(TOP_N_TABLE)
def testHeadAndTail(self):
    """head/tail return the right number of rows and the right contents."""
    df = DataFrame(self.table)

    self.assertEqual(1, len(df.head(1)))
    self.assertEqual(2, len(df.head(2)))
    self.assertEqual([3, 'name3'], list(df.tail(1)[0]))

    # head after a filter keeps only the matching row
    filtered_head = df[df.name == 'name2'].head(1)
    self.assertEqual(1, len(filtered_head))
    self.assertEqual([2, 'name2'], list(filtered_head[0]))