def test_df_consecutive(self):
    """End-to-end smoke run: filter and label the ionosphere table, split it,
    train a logistic regression and pull predictions to pandas.

    NOTE(review): a second ``test_df_consecutive`` is defined right after this
    one; if both live in the same class the later definition shadows this one —
    confirm they actually come from separate test modules.
    """
    self.create_ionosphere(IONOSPHERE_TABLE)
    source = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
    labeled = source[source['a04'] != 0].roles(label='class')
    labeled.head(10)  # touch the head of the filtered set
    train_part, test_part = labeled.split(0.6)
    classifier = LogisticRegression(epsilon=0.01)
    fitted = classifier.train(train_part)
    scored = fitted.predict(test_part)
    scored.to_pandas()
def test_df_consecutive(self):
    """Pipeline run that also appends derived columns: one (``b01``) on the
    source frame before splitting, and one (``appended_col``) on the
    prediction frame before materializing to pandas."""
    self.create_ionosphere(IONOSPHERE_TABLE)
    data = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
    data = data[data['a04'] != 0]
    data = data.roles(label='class')
    data.head(10)
    data['b01'] = data['a06']  # derived column prior to the split
    train_part, test_part = data.split(0.6)
    algo = LogisticRegression(epsilon=0.01)
    model = algo.train(train_part)
    predicted = model.predict(test_part)
    # derived column on the prediction output as well
    predicted['appended_col'] = predicted['prediction_score'] * 2
    predicted.to_pandas()
def test_df_combined(self):
    """Chained DataFrame transforms (map/rename, cache, arithmetic projection)
    feeding LR train/predict, ending in a hand-built log-loss expression."""
    self.create_ionosphere(IONOSPHERE_TABLE)
    df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
    df = df[df['a04'] != 0]
    # projection mixing plain column names with a mapped & renamed column
    df = df['a01', df.a05.map(lambda v: v * 2).rename('a05'), 'a06', 'class']
    df = df.roles(label='class')
    df = df[df.a05 != 0].cache()
    df = df[df.a05, ((df.a06 + 1) / 2).rename('a06'), 'class']
    train, test = df.split(0.6)
    lr = LogisticRegression(epsilon=0.01)
    model = lr.train(train)
    predicted = model.predict(test)
    # mean cross-entropy over the predictions:
    #   -(y * log(p) + (1 - y) * log(1 - p)) summed, divided by row count
    (- 1.0 * ((predicted['class'] * predicted.prediction_score.log().rename('t')).rename('t1') + (
        (1 - predicted['class']) * (1 - predicted.prediction_score).log().rename('t0')).rename('t2')).rename(
        't3').sum() / predicted.prediction_score.count()).rename('t4').execute()
def test_dynamic_output(self):
    """select_features on a label-annotated frame returns a filtered frame
    plus a feature-importance output; describe the filtered result."""
    self.create_ionosphere(IONOSPHERE_TABLE)
    frame = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
    frame = frame.roles(label=frame['class'])
    selected, importance = select_features(frame)
    print(selected.describe().execute())
class Test(MLTestBase):
    """Dry-run tests for text-processing algorithms: TF-IDF, string
    similarity, noise filtering, keyword extraction, summarization,
    n-gram counting, doc2vec and semantic vector distance.

    Each test registers the exact algorithm parameters it expects via
    ``_add_case`` and then triggers execution with ``persist``; the
    dry-run machinery checks the generated parameters against the case.
    """

    def setUp(self):
        super(Test, self).setUp()
        self.create_corpus(CORPUS_TABLE)
        # corpus frame with document id / content roles; everything runs dry
        self.df = DataFrame(self.odps.get_table(CORPUS_TABLE)).roles(
            doc_id='id', doc_content='content')
        options.ml.dry_run = True

    def _create_str_compare_table(self, table_name):
        """(Re)create a three-column table of string pairs for similarity tests."""
        data_rows = [
            ['inputTableName', 'inputTableName'],
            ['outputTableName', 'mapTableName'],
            ['inputSelectedColName1', 'outputTableName'],
            ['inputSelectedColName2', 'inputSelectedColName'],
            ['inputAppendColNames', 'mapSelectedColName'],
            ['inputTablePartitions', 'inputAppendColNames'],
            ['outputColName', 'inputAppendRenameColNames'],
            ['method', 'mapAppendColNames'],
            ['lambda', 'mapAppendRenameColNames'],
            ['k', 'inputTablePartitions'],
            ['lifecycle', 'mapTablePartitions'],
            ['coreNum', 'outputColName'],
            ['memSizePerCore', 'method'],
        ]
        # prepend a numeric id to every row
        for idx, r in enumerate(data_rows):
            data_rows[idx] = [idx] + r
        self.odps.execute_sql('drop table if exists ' + table_name)
        self.odps.execute_sql(
            'create table %s (str_id bigint, col1 string, col2 string)' % table_name)
        self.odps.write_table(table_name, data_rows)

    def _create_noise_table(self, table_name):
        """(Re)create a one-column table of noise tokens (punctuation / stop words)."""
        data_rows = (u',', u'。', u'《', u'》', u'的', u'是')
        data_rows = [[v] for v in data_rows]
        self.odps.execute_sql('drop table if exists ' + table_name)
        self.odps.execute_sql('create table %s (noise_col string)' % table_name)
        self.odps.write_table(table_name, data_rows)

    def test_tf_idf(self):
        splited = SplitWord().transform(self.df)
        freq, _ = DocWordStat().transform(splited)
        tf_set = TFIDF().transform(freq)
        # expected algorithm parameters for the dry-run case checker
        tf_set._add_case(
            self.gen_check_params_case({
                'docIdCol': 'id',
                'inputTableName': TEMP_TABLE_PREFIX + '_doc_word_stat',
                'countCol': 'count',
                'outputTableName': TFIDF_TABLE,
                'wordCol': 'word'
            }))
        tf_set.persist(TFIDF_TABLE)

    def test_str_diff(self):
        self._create_str_compare_table(STR_COMP_TABLE)
        df = DataFrame(self.odps.get_table(STR_COMP_TABLE))
        diff_df = str_diff(df, col1='col1', col2='col2')
        diff_df._add_case(
            self.gen_check_params_case({
                'inputTableName': STR_COMP_TABLE,
                'k': '2',
                'outputTableName': COMP_RESULT_TABLE,
                'inputSelectedColName2': 'col2',
                'inputSelectedColName1': 'col1',
                'method': 'levenshtein_sim',
                'lambda': '0.5',
                'outputColName': 'output'
            }))
        diff_df.persist(COMP_RESULT_TABLE)

    def test_top_n(self):
        self._create_str_compare_table(STR_COMP_TABLE)
        df = DataFrame(self.odps.get_table(STR_COMP_TABLE))
        # the frame is compared against itself as the mapping table
        top_n_df = top_n_similarity(df, df, col='col1', map_col='col1')
        top_n_df._add_case(
            self.gen_check_params_case({
                'inputTableName': STR_COMP_TABLE,
                'k': '2',
                'outputColName': 'output',
                'mapSelectedColName': 'col1',
                'topN': '10',
                'inputSelectedColName': 'col1',
                'outputTableName': TOP_N_TABLE,
                'mapTableName': STR_COMP_TABLE,
                'method': 'levenshtein_sim',
                'lambda': '0.5'
            }))
        top_n_df.persist(TOP_N_TABLE)

    def test_filter_noises(self):
        self.odps.delete_table(FILTERED_WORDS_TABLE, if_exists=True)
        self.create_splited_words(SPLITED_TABLE)
        self._create_noise_table(NOISE_TABLE)
        df = DataFrame(
            self.odps.get_table(SPLITED_TABLE)).roles(doc_content='content')
        ndf = DataFrame(self.odps.get_table(NOISE_TABLE))
        filtered = filter_noises(df, ndf)
        filtered._add_case(
            self.gen_check_params_case({
                'noiseTableName': NOISE_TABLE,
                'outputTableName': FILTERED_WORDS_TABLE,
                'selectedColNames': 'content',
                'inputTableName': SPLITED_TABLE
            }))
        filtered.persist(FILTERED_WORDS_TABLE)

    def test_keywords_extraction(self):
        self.odps.delete_table(KW_EXTRACTED_TABLE, if_exists=True)
        self.create_splited_words(SPLITED_TABLE)
        df = DataFrame(self.odps.get_table(SPLITED_TABLE)).roles(
            doc_id='doc_id', doc_content='content')
        extracted = extract_keywords(df)
        extracted._add_case(
            self.gen_check_params_case({
                'dumpingFactor': '0.85',
                'inputTableName': SPLITED_TABLE,
                'epsilon': '0.000001',
                'windowSize': '2',
                'topN': '5',
                'outputTableName': KW_EXTRACTED_TABLE,
                'docIdCol': 'doc_id',
                'maxIter': '100',
                'docContent': 'content'
            }))
        extracted.persist(KW_EXTRACTED_TABLE)

    def test_summarize_text(self):
        self.create_corpus(CORPUS_TABLE)
        summarized = summarize_text(self.df.roles(sentence='content'))
        summarized._add_case(
            self.gen_check_params_case({
                'dumpingFactor': '0.85',
                'inputTableName': CORPUS_TABLE,
                'sentenceCol': 'content',
                'epsilon': '0.000001',
                'k': '2',
                'topN': '3',
                'outputTableName': TEXT_SUMMARIZED_TABLE,
                'docIdCol': 'id',
                'maxIter': '100',
                'similarityType': 'lcs_sim',
                'lambda': '0.5'
            }))
        summarized.persist(TEXT_SUMMARIZED_TABLE)

    def test_count_ngram(self):
        self.create_word_triple(WORD_TRIPLE_TABLE)
        word_triple_df = DataFrame(
            self.odps.get_table(WORD_TRIPLE_TABLE)).select_features('word')
        counted = count_ngram(word_triple_df)
        counted._add_case(
            self.gen_check_params_case({
                'outputTableName': COUNT_NGRAM_TABLE,
                'inputSelectedColNames': 'word',
                'order': '3',
                'inputTableName': WORD_TRIPLE_TABLE
            }))
        counted.persist(COUNT_NGRAM_TABLE)

    def test_doc2vec(self):
        word_df, doc_df, _ = Doc2Vec().transform(self.df)
        doc_df._add_case(
            self.gen_check_params_case({
                'minCount': '5',
                'docColName': 'content',
                'hs': '1',
                'inputTableName': tn('pyodps_test_ml_corpus'),
                'negative': '0',
                'layerSize': '100',
                'sample': '0',
                'randomWindow': '1',
                'window': '5',
                'docIdColName': 'id',
                'iterTrain': '1',
                'alpha': '0.025',
                'cbow': '0',
                'outVocabularyTableName': 'tmp_pyodps__doc2_vec',
                'outputWordTableName': 'tmp_pyodps__doc2_vec',
                'outputDocTableName': tn('pyodps_test_ml_doc2vec_doc_result')
            }))
        doc_df.persist(DOC2VEC_DOC_TABLE)

    def test_semantic_vector_distance(self):
        result_df = semantic_vector_distance(self.df)
        result_df._add_case(
            self.gen_check_params_case({
                'topN': '5',
                'outputTableName': tn('pyodps_test_ml_semantic_dist_result'),
                'distanceType': 'euclidean',
                'inputTableName': tn('pyodps_test_ml_corpus')
            }))
        result_df.persist(SEMANTIC_DIST_TABLE)
class Test(MLTestBase):
    """Tests for field metadata on ML DataFrames over the iris table:
    roles (feature/label/weight), continuity (discrete/continuous),
    key-value configuration, dtype repr, data merging and sampling."""

    def setUp(self):
        super(Test, self).setUp()
        self.create_iris(IRIS_TABLE)
        self.df = DataFrame(self.odps.get_table(IRIS_TABLE))

    def testCollectionLabelling(self):
        """Role/continuity/KV mutators on a collection return new frames
        with the expected per-column metadata."""
        # select_features
        self.assertRaises(ValueError, lambda: self.df.select_features())
        df2 = self.df.select_features('sepal_length sepal_width petal_length')
        self.assertEqual(
            _df_roles(df2),
            dict(category='', sepal_width='FEATURE', sepal_length='FEATURE',
                 petal_length='FEATURE', petal_width=''))
        df3 = df2.select_features('petal_width', add=True)
        self.assertEqual(
            _df_roles(df3),
            dict(category='', sepal_width='FEATURE', sepal_length='FEATURE',
                 petal_length='FEATURE', petal_width='FEATURE'))
        # exclude_fields
        self.assertRaises(ValueError, lambda: self.df.exclude_fields())
        df4 = df3.exclude_fields('sepal_length sepal_width')
        self.assertEqual(
            _df_roles(df4),
            dict(category='', sepal_width='', sepal_length='',
                 petal_length='FEATURE', petal_width='FEATURE'))
        # weight_field
        self.assertRaises(ValueError, lambda: self.df.weight_field(None))
        df5 = df3.weight_field('sepal_width')
        self.assertEqual(
            _df_roles(df5),
            dict(category='', sepal_width='WEIGHT', sepal_length='FEATURE',
                 petal_length='FEATURE', petal_width='FEATURE'))
        # label_field
        self.assertRaises(ValueError, lambda: self.df.label_field(None))
        df6 = self.df.label_field('category')
        self.assertEqual(
            _df_roles(df6),
            dict(category='LABEL', sepal_width='FEATURE', sepal_length='FEATURE',
                 petal_length='FEATURE', petal_width='FEATURE'))
        # roles: no-arg call is the identity
        self.assertIs(self.df, self.df.roles())
        df7 = self.df.roles(label='category', weight='sepal_width')
        self.assertEqual(
            _df_roles(df7),
            dict(category='LABEL', petal_length='FEATURE', petal_width='FEATURE',
                 sepal_width='WEIGHT', sepal_length='FEATURE'))
        # discrete (note: string column `category` is reported DISCRETE too)
        df8 = self.df.discrete('sepal_width, sepal_length')
        self.assertEqual(
            _df_continuity(df8),
            dict(category='DISCRETE', sepal_width='DISCRETE', sepal_length='DISCRETE',
                 petal_length='CONTINUOUS', petal_width='CONTINUOUS'))
        # continuous
        df9 = df8.continuous('sepal_width')
        self.assertEqual(
            _df_continuity(df9),
            dict(category='DISCRETE', sepal_width='CONTINUOUS', sepal_length='DISCRETE',
                 petal_length='CONTINUOUS', petal_width='CONTINUOUS'))
        # key_value
        df10 = self.df.key_value('sepal_length sepal_width')
        self.assertEqual(
            _df_key_value(df10),
            dict(category='', petal_length='', petal_width='',
                 sepal_width='KVConfig(kv=:, item=,)',
                 sepal_length='KVConfig(kv=:, item=,)'))
        df11 = df10.key_value('sepal_length', kv='-', item=';')
        self.assertEqual(
            _df_key_value(df11),
            dict(category='', petal_length='', petal_width='',
                 sepal_width='KVConfig(kv=:, item=,)',
                 sepal_length='KVConfig(kv=-, item=;)'))
        # erase_key_value
        df12 = df10.erase_key_value('sepal_width')
        self.assertEqual(
            _df_key_value(df12),
            dict(category='', petal_length='', petal_width='', sepal_width='',
                 sepal_length='KVConfig(kv=:, item=,)'))

    def testSeqFieldOperations(self):
        """The same metadata mutators exist on a single column (sequence)."""
        seq = self.df.sepal_length
        # roles
        seq1 = seq.role('weight')
        self.assertEqual(_df_roles(seq1), dict(sepal_length='WEIGHT'))
        # discrete
        seq2 = seq.discrete()
        self.assertEqual(_df_continuity(seq2), dict(sepal_length='DISCRETE'))
        # continuous
        seq3 = seq.continuous()
        self.assertEqual(_df_continuity(seq3), dict(sepal_length='CONTINUOUS'))
        # key_value
        seq4 = seq.key_value()
        self.assertEqual(_df_key_value(seq4), dict(sepal_length='KVConfig(kv=:, item=,)'))
        seq5 = seq4.key_value(kv='-', item=';')
        self.assertEqual(_df_key_value(seq5), dict(sepal_length='KVConfig(kv=-, item=;)'))
        # erase_key_value
        seq6 = seq5.erase_key_value()
        self.assertEqual(_df_key_value(seq6), dict(sepal_length=''))

    def testCollectionOperations(self):
        """split/append_id produce algo-backed expressions with the expected
        algorithm name and parameters."""
        splited = self.df.split(0.75)
        self.assertEqual(len(splited), 2)
        self.assertEqual(_df_roles(splited[0]), _df_roles(splited[1]))
        self.assertEqual(splited[0]._algo, 'Split')
        self.assertEqual(splited[0]._params['fraction'], 0.75)
        id_appended = self.df.append_id()
        self.assertEqual(
            _df_roles(id_appended),
            dict(category='FEATURE', petal_length='FEATURE', petal_width='FEATURE',
                 sepal_width='FEATURE', sepal_length='FEATURE', append_id=''))
        self.assertEqual(id_appended._algo, 'AppendID')
        self.assertEqual(id_appended._params['IDColName'], 'append_id')

    def testDTypes(self):
        """dtypes repr reflects KV configs and roles once they are assigned."""
        rstrip_lines = lambda s: '\n'.join(l.rstrip() for l in s.splitlines())
        # NOTE(review): intra-line spacing of these expected reprs was lost in
        # reformatting; the column alignment below is reconstructed — confirm
        # against the actual `repr(df.dtypes)` output.
        old_dtypes_repr = rstrip_lines(
            textwrap.dedent("""
            odps.Schema {
              sepal_length            float64
              sepal_width             float64
              petal_length            float64
              petal_width             float64
              category                string
            }
            """)).strip()
        self.assertEqual(
            rstrip_lines(repr(self.df.dtypes)).strip(), old_dtypes_repr)
        new_df = self.df.roles(label='category').key_value('sepal_length')
        new_dtypes_repr = rstrip_lines(
            textwrap.dedent("""
            odps.Schema {
              sepal_length            KV(':', ',')   FEATURE
              sepal_width             float64        FEATURE
              petal_length            float64        FEATURE
              petal_width             float64        FEATURE
              category                string         LABEL
            }
            """)).strip()
        self.assertEqual(
            rstrip_lines(repr(new_df.dtypes)).strip(), new_dtypes_repr)

    def testMerge(self):
        """merge_data / merge_with combine frames; column selection, exclusion
        and auto-renaming are honored, and the persisted merge generates the
        expected parameters."""
        from odps.ml.expr.mixin import merge_data
        self.odps.delete_table(TEMP_TABLE_1_NAME, if_exists=True)
        self.odps.execute_sql(
            'create table {0} (col11 string, col12 string) lifecycle 1'.format(
                TEMP_TABLE_1_NAME))
        self.odps.delete_table(TEMP_TABLE_2_NAME, if_exists=True)
        self.odps.execute_sql(
            'create table {0} (col21 string, col22 string) lifecycle 1'.format(
                TEMP_TABLE_2_NAME))
        df1 = DataFrame(self.odps.get_table(TEMP_TABLE_1_NAME))
        df2 = DataFrame(self.odps.get_table(TEMP_TABLE_2_NAME))
        # merging a single frame is an error
        self.assertRaises(ValueError, lambda: merge_data(df1))
        merged1 = merge_data(df1, df2)
        self.assertEqual(
            _df_roles(merged1),
            dict(col21='FEATURE', col11='FEATURE', col12='FEATURE',
                 col22='FEATURE'))
        # (frame, cols) selects; (frame, cols, True) excludes
        merged2 = merge_data((df1, 'col11'), (df2, 'col21', True))
        self.assertEqual(_df_roles(merged2), dict(col11='FEATURE', col22='FEATURE'))
        merged3 = merge_data((df1, 'col11'), (df2, 'col21', True), auto_rename=True)
        self.assertEqual(_df_roles(merged3), dict(t0_col11='FEATURE', t1_col22='FEATURE'))
        merged4 = df1.merge_with(df2)
        self.assertEqual(
            _df_roles(merged4),
            dict(col21='FEATURE', col11='FEATURE', col12='FEATURE',
                 col22='FEATURE'))
        options.ml.dry_run = True
        merged4._add_case(
            self.gen_check_params_case({
                'outputTableName': 'merged_table',
                'inputTableNames': TEMP_TABLE_1_NAME + ',' + TEMP_TABLE_2_NAME,
                'inputPartitionsInfoList': ',',
                'selectedColNamesList': 'col11,col12;col21,col22'
            }))
        merged4.persist('merged_table')

    def testSampleClass(self):
        """sample() dispatches to RandomSample / WeightedSample /
        StratifiedSample depending on the arguments supplied."""
        from ..core import AlgoExprMixin
        num_sampled = self.df.sample(n=20)
        self.assertIsInstance(num_sampled, AlgoExprMixin)
        self.assertEqual(num_sampled._algo, 'RandomSample')
        frac_sampled = self.df.sample(frac=0.5)
        self.assertIsInstance(frac_sampled, AlgoExprMixin)
        self.assertEqual(frac_sampled._algo, 'RandomSample')
        weighted_sampled = self.df.sample(frac=0.5, weights=self.df.sepal_length)
        self.assertIsInstance(weighted_sampled, AlgoExprMixin)
        self.assertEqual(weighted_sampled._algo, 'WeightedSample')
        self.assertEqual(weighted_sampled._params['probCol'], 'sepal_length')
        stratified_sampled = self.df.sample(frac={'Iris-setosa': 0.5}, strata='category')
        self.assertIsInstance(stratified_sampled, AlgoExprMixin)
        self.assertEqual(stratified_sampled._algo, 'StratifiedSample')
class Test(MLTestBase):
    """Adapter/node-based variant of the field-operation tests: assertions go
    through ``adapter_from_df(...)._bind_node`` instead of expression
    attributes, plus a batch_persist ordering test."""

    def setUp(self):
        super(Test, self).setUp()
        self.create_iris(IRIS_TABLE)
        self.df = DataFrame(self.odps.get_table(IRIS_TABLE))

    def test_coll_field_operations(self):
        """Role/continuity/KV mutators on a collection yield the expected
        per-column metadata."""
        # select_features
        self.assertRaises(ValueError, lambda: self.df.select_features())
        df2 = self.df.select_features("sepal_length sepal_width petal_length")
        self.assertEqual(
            _df_roles(df2),
            dict(category="", sepal_width="FEATURE", sepal_length="FEATURE",
                 petal_length="FEATURE", petal_width=""),
        )
        df3 = df2.select_features("petal_width", add=True)
        self.assertEqual(
            _df_roles(df3),
            dict(
                category="",
                sepal_width="FEATURE",
                sepal_length="FEATURE",
                petal_length="FEATURE",
                petal_width="FEATURE",
            ),
        )
        # exclude_fields
        self.assertRaises(ValueError, lambda: self.df.exclude_fields())
        df4 = df3.exclude_fields("sepal_length sepal_width")
        self.assertEqual(
            _df_roles(df4),
            dict(category="", sepal_width="", sepal_length="",
                 petal_length="FEATURE", petal_width="FEATURE"),
        )
        # weight_field
        self.assertRaises(ValueError, lambda: self.df.weight_field(None))
        df5 = df3.weight_field("sepal_width")
        self.assertEqual(
            _df_roles(df5),
            dict(
                category="", sepal_width="WEIGHT", sepal_length="FEATURE",
                petal_length="FEATURE", petal_width="FEATURE"
            ),
        )
        # label_field
        self.assertRaises(ValueError, lambda: self.df.label_field(None))
        df6 = self.df.label_field("category")
        self.assertEqual(
            _df_roles(df6),
            dict(
                category="LABEL",
                sepal_width="FEATURE",
                sepal_length="FEATURE",
                petal_length="FEATURE",
                petal_width="FEATURE",
            ),
        )
        # roles: no-arg call is the identity
        self.assertIs(self.df, self.df.roles())
        df7 = self.df.roles(label="category", weight="sepal_width")
        self.assertEqual(
            _df_roles(df7),
            dict(
                category="LABEL",
                petal_length="FEATURE",
                petal_width="FEATURE",
                sepal_width="WEIGHT",
                sepal_length="FEATURE",
            ),
        )
        # discrete (string column `category` is reported DISCRETE as well)
        df8 = self.df.discrete("sepal_width, sepal_length")
        self.assertEqual(
            _df_continuity(df8),
            dict(
                category="DISCRETE",
                sepal_width="DISCRETE",
                sepal_length="DISCRETE",
                petal_length="CONTINUOUS",
                petal_width="CONTINUOUS",
            ),
        )
        # continuous
        df9 = df8.continuous("sepal_width")
        self.assertEqual(
            _df_continuity(df9),
            dict(
                category="DISCRETE",
                sepal_width="CONTINUOUS",
                sepal_length="DISCRETE",
                petal_length="CONTINUOUS",
                petal_width="CONTINUOUS",
            ),
        )
        # key_value
        df10 = self.df.key_value("sepal_length sepal_width")
        self.assertEqual(
            _df_key_value(df10),
            dict(
                category="",
                petal_length="",
                petal_width="",
                sepal_width="KVConfig(kv=:, item=,)",
                sepal_length="KVConfig(kv=:, item=,)",
            ),
        )
        df11 = df10.key_value("sepal_length", kv="-", item=";")
        self.assertEqual(
            _df_key_value(df11),
            dict(
                category="",
                petal_length="",
                petal_width="",
                sepal_width="KVConfig(kv=:, item=,)",
                sepal_length="KVConfig(kv=-, item=;)",
            ),
        )
        # erase_key_value
        df12 = df10.erase_key_value("sepal_width")
        self.assertEqual(
            _df_key_value(df12),
            dict(category="", petal_length="", petal_width="", sepal_width="",
                 sepal_length="KVConfig(kv=:, item=,)"),
        )

    def test_seq_field_operations(self):
        """The same metadata mutators exist on a single column (sequence)."""
        seq = self.df.sepal_length
        # roles
        seq1 = seq.role("weight")
        self.assertEqual(_df_roles(seq1), dict(sepal_length="WEIGHT"))
        # discrete
        seq2 = seq.discrete()
        self.assertEqual(_df_continuity(seq2), dict(sepal_length="DISCRETE"))
        # continuous
        seq3 = seq.continuous()
        self.assertEqual(_df_continuity(seq3), dict(sepal_length="CONTINUOUS"))
        # key_value
        seq4 = seq.key_value()
        self.assertEqual(_df_key_value(seq4), dict(sepal_length="KVConfig(kv=:, item=,)"))
        seq5 = seq4.key_value(kv="-", item=";")
        self.assertEqual(_df_key_value(seq5), dict(sepal_length="KVConfig(kv=-, item=;)"))
        # erase_key_value
        seq6 = seq5.erase_key_value()
        self.assertEqual(_df_key_value(seq6), dict(sepal_length=""))

    def test_coll_df_operations(self):
        """split/append_id bind to the expected algorithm nodes, and a summary
        adapter binds to a SummaryNode."""
        from odps.ml.nodes import transform_nodes as tnodes

        splited = self.df.split(0.75)
        self.assertEqual(len(splited), 2)
        self.assertEqual(_df_roles(splited[0]), _df_roles(splited[1]))
        split_node = adapter_from_df(splited[0])._bind_node
        self.assertEqual(split_node.code_name, "Split")
        self.assertEqual(split_node.parameters["fraction"], 0.75)
        id_appended = self.df.append_id()
        self.assertEqual(
            _df_roles(id_appended),
            dict(
                category="FEATURE",
                petal_length="FEATURE",
                petal_width="FEATURE",
                sepal_width="FEATURE",
                sepal_length="FEATURE",
                append_id="",
            ),
        )
        append_id_node = adapter_from_df(id_appended)._bind_node
        self.assertEqual(append_id_node.code_name, "AppendID")
        self.assertEqual(append_id_node.parameters["IDColName"], "append_id")
        summary_ep = self.df._create_summary_adapter()
        summary_node = summary_ep._bind_node
        self.assertIsInstance(summary_node, tnodes.SummaryNode)

    def test_dtypes(self):
        """dtypes repr reflects KV configs and roles once they are assigned."""
        rstrip_lines = lambda s: "\n".join(l.rstrip() for l in s.splitlines())
        # NOTE(review): intra-line spacing of these expected reprs was lost in
        # reformatting; the column alignment below is reconstructed — confirm
        # against the actual `repr(df.dtypes)` output.
        old_dtypes_repr = rstrip_lines(
            textwrap.dedent(
                """
                odps.Schema {
                  sepal_length            float64
                  sepal_width             float64
                  petal_length            float64
                  petal_width             float64
                  category                string
                }
                """
            )
        ).strip()
        self.assertEqual(rstrip_lines(repr(self.df.dtypes)).strip(), old_dtypes_repr)
        new_df = self.df.roles(label="category").key_value("sepal_length")
        new_dtypes_repr = rstrip_lines(
            textwrap.dedent(
                """
                odps.Schema {
                  sepal_length            KV(':', ',')   FEATURE
                  sepal_width             float64        FEATURE
                  petal_length            float64        FEATURE
                  petal_width             float64        FEATURE
                  category                string         LABEL
                }
                """
            )
        ).strip()
        self.assertEqual(rstrip_lines(repr(new_df.dtypes)).strip(), new_dtypes_repr)

    def test_merge(self):
        """merge_data / merge_with combine frames; column selection, exclusion
        and auto-renaming are honored."""
        self.odps.delete_table(TEMP_TABLE_1_NAME, if_exists=True)
        self.odps.execute_sql("create table {0} (col11 string, col12 string) lifecycle 1".format(TEMP_TABLE_1_NAME))
        self.odps.delete_table(TEMP_TABLE_2_NAME, if_exists=True)
        self.odps.execute_sql("create table {0} (col21 string, col22 string) lifecycle 1".format(TEMP_TABLE_2_NAME))
        df1 = DataFrame(self.odps.get_table(TEMP_TABLE_1_NAME))
        df2 = DataFrame(self.odps.get_table(TEMP_TABLE_2_NAME))
        # merging a single frame is an error
        self.assertRaises(ValueError, lambda: merge_data(df1))
        merged1 = merge_data(df1, df2)
        self.assertEqual(_df_roles(merged1), dict(col21="FEATURE", col11="FEATURE", col12="FEATURE", col22="FEATURE"))
        # (frame, cols) selects; (frame, cols, True) excludes
        merged2 = merge_data((df1, "col11"), (df2, "col21", True))
        self.assertEqual(_df_roles(merged2), dict(col11="FEATURE", col22="FEATURE"))
        merged3 = merge_data((df1, "col11"), (df2, "col21", True), auto_rename=True)
        self.assertEqual(_df_roles(merged3), dict(t0_col11="FEATURE", t1_col22="FEATURE"))
        merged4 = df1.merge_with(df2)
        self.assertEqual(_df_roles(merged4), dict(col21="FEATURE", col11="FEATURE", col12="FEATURE", col22="FEATURE"))

    def test_sample(self):
        """sample() binds to RandomSample / WeightedSample / StratifiedSample
        nodes depending on the arguments supplied."""
        num_sampled = self.df.sample(n=20)
        adapter = adapter_from_df(num_sampled)
        self.assertIsInstance(num_sampled, DataFrame)
        self.assertEqual(adapter._bind_node.code_name, "RandomSample")
        frac_sampled = self.df.sample(frac=0.5)
        adapter = adapter_from_df(frac_sampled)
        self.assertIsInstance(frac_sampled, DataFrame)
        self.assertEqual(adapter._bind_node.code_name, "RandomSample")
        weighted_sampled = self.df.sample(frac=0.5, weights=self.df.sepal_length)
        adapter = adapter_from_df(weighted_sampled)
        self.assertIsInstance(weighted_sampled, DataFrame)
        self.assertEqual(adapter._bind_node.code_name, "WeightedSample")
        self.assertEqual(adapter._bind_node.parameters["probCol"], "sepal_length")
        stratified_sampled = self.df.sample(frac={"Iris-setosa": 0.5}, strata="category")
        adapter = adapter_from_df(stratified_sampled)
        self.assertIsInstance(stratified_sampled, DataFrame)
        self.assertEqual(adapter._bind_node.code_name, "StratifiedSample")

    def test_batch_persist(self):
        """batch_persist runs each frame's upstream action before its
        downstream action, across all three mocked frames."""
        options.runner.dry_run = False
        call_seq = []
        dfs = []
        tables = []
        for idx in range(3):
            write_str = "F%d" % idx

            # factory binds wobj immediately, avoiding the late-binding
            # closure pitfall inside the loop
            def gen_fun(wobj):
                return lambda _: call_seq.append(wobj)

            f = gen_fun((write_str, "U"))
            df_upper = self.mock_action(self.df, action=f)
            f = gen_fun((write_str, "D"))
            df_lower = self.mock_action(df_upper, action=f)
            dfs.append(df_lower)
            tables.append("TN" + str(idx))
        DataFrame.batch_persist(dfs, tables)
        # per frame: upstream ("U") fired before downstream ("D")
        for idx in range(3):
            write_str = "F%d" % idx
            self.assertListEqual([p[1] for p in call_seq if p[0] == write_str], list("UD"))
        # per direction: all three frames participated
        for dir in "UD":  # NOTE(review): `dir` shadows the builtin
            self.assertListEqual(sorted(p[0] for p in call_seq if p[1] == dir), ["F0", "F1", "F2"])
class Test(MLTestBase):
    """Runner-based dry-run tests for the text-processing algorithms
    (uses ``options.runner.dry_run`` rather than ``options.ml.dry_run``).

    Each test registers the exact algorithm parameters it expects via
    ``_add_case`` and triggers execution with ``persist``.
    """

    def setUp(self):
        super(Test, self).setUp()
        self.create_corpus(CORPUS_TABLE)
        # corpus frame with document id / content roles; everything runs dry
        self.df = DataFrame(self.odps.get_table(CORPUS_TABLE)).roles(doc_id='id', doc_content='content')
        options.runner.dry_run = True

    def _create_str_compare_table(self, table_name):
        """(Re)create a three-column table of string pairs for similarity tests."""
        data_rows = [
            ['inputTableName', 'inputTableName'],
            ['outputTableName', 'mapTableName'],
            ['inputSelectedColName1', 'outputTableName'],
            ['inputSelectedColName2', 'inputSelectedColName'],
            ['inputAppendColNames', 'mapSelectedColName'],
            ['inputTablePartitions', 'inputAppendColNames'],
            ['outputColName', 'inputAppendRenameColNames'],
            ['method', 'mapAppendColNames'],
            ['lambda', 'mapAppendRenameColNames'],
            ['k', 'inputTablePartitions'],
            ['lifecycle', 'mapTablePartitions'],
            ['coreNum', 'outputColName'],
            ['memSizePerCore', 'method'],
        ]
        # prepend a numeric id to every row
        for idx, r in enumerate(data_rows):
            data_rows[idx] = [idx] + r
        self.odps.execute_sql('drop table if exists ' + table_name)
        self.odps.execute_sql('create table %s (str_id bigint, col1 string, col2 string)' % table_name)
        self.odps.write_table(table_name, data_rows)

    def _create_noise_table(self, table_name):
        """(Re)create a one-column table of noise tokens (punctuation / stop words)."""
        data_rows = (u',', u'。', u'《', u'》', u'的', u'是')
        data_rows = [[v] for v in data_rows]
        self.odps.execute_sql('drop table if exists ' + table_name)
        self.odps.execute_sql('create table %s (noise_col string)' % table_name)
        self.odps.write_table(table_name, data_rows)

    def test_tf_idf(self):
        splited = SplitWord().transform(self.df)
        freq, _ = DocWordStat().transform(splited)
        tf_set = TFIDF().transform(freq)
        # expected algorithm parameters for the dry-run case checker
        tf_set._add_case(self.gen_check_params_case({
            'docIdCol': 'id', 'inputTableName': TEMP_TABLE_PREFIX + '0_doc_word_stat_3_1',
            'countCol': 'count', 'outputTableName': TFIDF_TABLE, 'wordCol': 'word'}))
        tf_set.persist(TFIDF_TABLE)

    def test_str_diff(self):
        self._create_str_compare_table(STR_COMP_TABLE)
        df = DataFrame(self.odps.get_table(STR_COMP_TABLE))
        diff_df = str_diff(df, col1='col1', col2='col2')
        diff_df._add_case(self.gen_check_params_case({
            'inputTableName': STR_COMP_TABLE, 'k': '2',
            'outputTableName': COMP_RESULT_TABLE, 'inputSelectedColName2': 'col2',
            'inputSelectedColName1': 'col1', 'method': 'levenshtein_sim',
            'lambda': '0.5', 'outputColName': 'output'}))
        diff_df.persist(COMP_RESULT_TABLE)

    def test_top_n(self):
        self._create_str_compare_table(STR_COMP_TABLE)
        df = DataFrame(self.odps.get_table(STR_COMP_TABLE))
        # the frame is compared against itself as the mapping table
        top_n_df = top_n_similarity(df, df, col='col1', map_col='col1')
        top_n_df._add_case(self.gen_check_params_case({
            'inputTableName': STR_COMP_TABLE, 'k': '2', 'outputColName': 'output',
            'mapSelectedColName': 'col1', 'topN': '10', 'inputSelectedColName': 'col1',
            'outputTableName': TOP_N_TABLE, 'mapTableName': STR_COMP_TABLE,
            'method': 'levenshtein_sim', 'lambda': '0.5'}))
        top_n_df.persist(TOP_N_TABLE)

    def test_filter_noises(self):
        self.odps.delete_table(FILTERED_WORDS_TABLE, if_exists=True)
        self.create_splited_words(SPLITED_TABLE)
        self._create_noise_table(NOISE_TABLE)
        df = DataFrame(self.odps.get_table(SPLITED_TABLE)).roles(doc_content='content')
        ndf = DataFrame(self.odps.get_table(NOISE_TABLE))
        filtered = filter_noises(df, ndf)
        filtered._add_case(self.gen_check_params_case({
            'noiseTableName': NOISE_TABLE, 'outputTableName': FILTERED_WORDS_TABLE,
            'selectedColNames': 'content', 'inputTableName': SPLITED_TABLE}))
        filtered.persist(FILTERED_WORDS_TABLE)

    def test_keywords_extraction(self):
        self.odps.delete_table(KW_EXTRACTED_TABLE, if_exists=True)
        self.create_splited_words(SPLITED_TABLE)
        df = DataFrame(self.odps.get_table(SPLITED_TABLE)).roles(doc_id='doc_id', doc_content='content')
        extracted = extract_keywords(df)
        extracted._add_case(self.gen_check_params_case(
            {'dumpingFactor': '0.85', 'inputTableName': SPLITED_TABLE,
             'epsilon': '0.000001', 'windowSize': '2', 'topN': '5',
             'outputTableName': KW_EXTRACTED_TABLE, 'docIdCol': 'doc_id',
             'maxIter': '100', 'docContent': 'content'}))
        extracted.persist(KW_EXTRACTED_TABLE)

    def test_summarize_text(self):
        self.create_corpus(CORPUS_TABLE)
        summarized = summarize_text(self.df.roles(sentence='content'))
        summarized._add_case(self.gen_check_params_case(
            {'dumpingFactor': '0.85', 'inputTableName': CORPUS_TABLE,
             'sentenceCol': 'content', 'epsilon': '0.000001', 'k': '2',
             'topN': '3', 'outputTableName': TEXT_SUMMARIZED_TABLE,
             'docIdCol': 'id', 'maxIter': '100', 'similarityType': 'lcs_sim',
             'lambda': '0.5'}))
        summarized.persist(TEXT_SUMMARIZED_TABLE)

    def test_count_ngram(self):
        self.create_word_triple(WORD_TRIPLE_TABLE)
        word_triple_df = DataFrame(self.odps.get_table(WORD_TRIPLE_TABLE)).select_features('word')
        counted = count_ngram(word_triple_df)
        counted._add_case(self.gen_check_params_case({
            'outputTableName': COUNT_NGRAM_TABLE, 'inputSelectedColNames': 'word',
            'order': '3', 'inputTableName': WORD_TRIPLE_TABLE}))
        counted.persist(COUNT_NGRAM_TABLE)