Exemple #1
0
    def testIsIn(self):
        """Compare ``isin`` run by ``self.engine`` with the same expression run by ``self.pd_engine``."""
        membership = self.odps_df['name'].isin(self.pd_df['name'])
        result = self.engine.execute(membership.rename('isin')).values

        # Reference: the identical expression built on a pandas copy of odps_df.
        local_df = DataFrame(self.odps_df.to_pandas())
        reference = local_df['name'].isin(self.pd_df['name']).rename('isin')
        expected = self.pd_engine.execute(reference).values

        self.assertTrue(result.equals(expected))
 def test_ml_end(self):
     """Build the repr of a 20-row sample while ``options.interactive`` is on.

     ``options`` is an object shared outside this method, so the previous
     ``interactive`` value is restored in a ``finally`` clause; otherwise a
     failure in the body would leave the flag switched on for later code.
     """
     old_interactive = options.interactive
     options.interactive = True
     try:
         self.create_ionosphere(IONOSPHERE_TABLE)
         df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE)).sample(n=20)
         # Only the side effect of computing the repr matters; the text is discarded.
         repr(df)
     finally:
         options.interactive = old_interactive
Exemple #3
0
    def testJoin(self):
        """Join ``self.odps_df`` with ``self.pd_df`` on ``name`` and compare both engines' output."""
        joined = self.odps_df.join(self.pd_df, 'name')
        result = self.engine.execute(joined.sort('id_x')).values

        # Reference: the same join, starting from a pandas copy of odps_df.
        local_df = DataFrame(self.odps_df.to_pandas())
        reference = local_df.join(self.pd_df, 'name').sort('id_x')
        expected = self.pd_engine.execute(reference).values

        self.assertTrue(result.equals(expected))
Exemple #4
0
    def testUnion(self):
        """Union ``self.odps_df`` with ``self.pd_df`` and compare both engines' output."""
        sort_keys = ['id', 'name']
        combined = self.odps_df.union(self.pd_df)
        result = self.engine.execute(combined.sort(sort_keys)).values

        # Reference: the same union, starting from a pandas copy of odps_df.
        local_df = DataFrame(self.odps_df.to_pandas())
        reference = local_df.union(self.pd_df).sort(['id', 'name'])
        expected = self.pd_engine.execute(reference).values

        self.assertTrue(result.equals(expected))
Exemple #5
0
    def testPandasGroupbyFilter(self):
        """Filter a grouped aggregation, both before and after the aggregation is cached."""
        import pandas as pd

        rows = [[2001, 1], [2002, 2], [2003, 3]]
        frame = DataFrame(pd.DataFrame(rows, columns=['id', 'fid']))
        expected = [[2003, 3]]

        # First pass: the aggregation has not been executed on its own.
        grouped = frame.groupby('id').agg(frame.fid.sum())
        filtered = grouped[grouped.id == 2003]
        self.assertEqual(filtered.execute().values.values.tolist(), expected)

        # Second pass: execute the aggregation first, then confirm it is cached.
        grouped = frame.groupby('id').agg(frame.fid.sum())
        grouped.execute()
        self.assertTrue(context.is_cached(grouped))
        filtered = grouped[grouped.id == 2003]
        for _ in range(2):
            self.assertEqual(filtered.execute().values.values.tolist(), expected)

        # A scalar reduction must give the same value on repeated execution.
        total = frame.fid.sum()
        for _ in range(2):
            self.assertEqual(total.execute(), 6)
Exemple #6
0
    def testCacheTable(self):
        """Exercise ``cache()`` on a join and inspect ``_cache_data`` afterwards."""
        cached_join = self.odps_df.join(self.pd_df, 'name').cache()
        ordered = cached_join.sort('id_x')

        node_count = len(ordered.compile().nodes())
        self.assertEqual(node_count, 3)

        result = self.engine.execute(ordered).values

        # Reference: the same join and sort on a pandas copy, without cache().
        local_df = DataFrame(self.odps_df.to_pandas())
        reference = local_df.join(self.pd_df, 'name').sort('id_x')
        expected = self.pd_engine.execute(reference).values
        self.assertTrue(result.equals(expected))

        self.assertEqual(len(self.engine._generated_table_names), 2)

        # Re-executing the cached expression must leave the cache object unchanged.
        cache_before = cached_join._cache_data
        self.assertEqual(len(cached_join.execute()), len(expected))
        self.assertIs(cached_join._cache_data, cache_before)

        row_count = cached_join[cached_join.id_x < 3].count()
        self.assertEqual(self.engine.execute(row_count), 2)

        self.assertEqual(row_count._cache_data, 2)
Exemple #7
0
    def testCachePersist(self):
        """Persist a cached join into partition ``ds=today`` and count the stored rows."""
        source = self.odps_df

        # Second input: a freshly created two-row table.
        side_table_name = tn('pyodps_test_mixed_engine_cp_table2')
        self.odps.delete_table(side_table_name, if_exists=True)
        side_schema = Schema.from_lists(['name', 'fid'], ['string', 'double'])
        side_table = self.odps.create_table(name=side_table_name, schema=side_schema)
        side = DataFrame(side_table)
        self.odps.write_table(side_table, 0, [['name1', 3.2], ['name3', 2.4]])

        # Row-wise function that yields each row unchanged, declared with the source schema.
        @output(source.schema.names, source.schema.types)
        def h(row):
            yield row

        left = source.filter(source.id > 0).apply(h, axis=1).cache()
        right = side.filter(side.fid > 0)
        joined = left.join(right, on=['name', right.fid < 4])['id', 'fid'].cache()

        output_table = tn('pyodps_test_mixed_engine_cp_output_table')
        self.odps.delete_table(output_table, if_exists=True)
        output_schema = Schema.from_lists(['id', 'fid'], ['bigint', 'double'], ['ds'], ['string'])
        output_t = self.odps.create_table(output_table, output_schema, if_not_exists=True)

        persisted = joined.persist(output_table, partition='ds=today', create_partition=True)
        self.assertEqual(len(persisted.execute()), 2)

        # test seahawks fallback
        self.assertEqual(persisted.input.count().execute(), 2)

        output_t.drop()
Exemple #8
0
    def testFilterParts(self):
        """Check ``filter_parts`` argument validation and the expression types it returns.

        ``self.expr`` is expected to reject every partition spec used here, while
        ``self.expr2`` accepts a well-formed one. The final check uses a
        pandas-backed frame and is skipped when the imports it needs are missing.
        """
        partition_spec = 'part1=a,part2=1/part1=b,part2=2'

        with self.assertRaises(ExpressionError):
            self.expr.filter_parts(None)
        with self.assertRaises(ExpressionError):
            self.expr.filter_parts(partition_spec)
        with self.assertRaises(ExpressionError):
            self.expr2.filter_parts('part1,part2=1/part1=b,part2=2')

        filtered1 = self.expr2.filter_parts(partition_spec)
        self.assertIsInstance(filtered1, FilterPartitionCollectionExpr)
        self.assertEqual(filtered1.schema, self.expr.schema)
        self.assertEqual(filtered1.predicate_string, partition_spec)

        filtered2 = self.expr2.filter_parts(partition_spec, exclude=False)
        self.assertIsInstance(filtered2, FilterCollectionExpr)

        # Guard only the imports. An ImportError raised later by the code under
        # test must fail the test instead of being swallowed.
        try:
            import pandas as pd
            from odps.df import DataFrame
        except ImportError:
            return

        pd_df = pd.DataFrame([['Col1', 1], ['Col2', 2]],
                             columns=['Field1', 'Field2'])
        df = DataFrame(pd_df)
        with self.assertRaises(ExpressionError):
            df.filter_parts('Fieldd2=2')
Exemple #9
0
    def testCacheTable(self):
        """Exercise ``cache()`` on a join with ``force_odps`` set, checking ``context`` lookups."""
        self.engine._selecter.force_odps = True

        cached_join = self.odps_df.join(self.pd_df, 'name').cache()
        ordered = cached_join.sort('id_x')

        node_count = len(self.engine.compile(ordered).nodes())
        self.assertEqual(node_count, 3)

        result = self.engine.execute(ordered).values

        # Reference: the same join and sort on a pandas copy, without cache().
        local_df = DataFrame(self.odps_df.to_pandas())
        reference = local_df.join(self.pd_df, 'name').sort('id_x')
        expected = self.pd_engine.execute(reference).values
        self.assertTrue(result.equals(expected))

        self.assertEqual(len(self.engine._generated_table_names), 2)

        # Re-executing the cached expression must leave the cache object unchanged.
        cache_before = context.get_cached(cached_join)
        self.assertEqual(len(self.engine.execute(cached_join)), len(expected))
        self.assertIs(context.get_cached(cached_join), cache_before)

        # The lifecycle check is skipped when the cache is a SeahawksTable.
        if not isinstance(cache_before, SeahawksTable):
            self.assertEqual(context.get_cached(cached_join).lifecycle, 1)

        row_count = cached_join[cached_join.id_x < 3].count()
        self.assertEqual(self.engine.execute(row_count), 2)

        self.assertEqual(context.get_cached(row_count), 2)
 def test_edge_density(self):
     """Apply ``EdgeDensity`` to the weighted edge table and persist the result."""
     edges = DataFrame(self.odps.get_table(WEIGHTED_GRAPH_EDGE_TABLE))
     algorithm = EdgeDensity(from_vertex_col='flow_out_id', to_vertex_col='flow_in_id')
     transformed = algorithm.transform(edges)
     # Parameter mapping handed to gen_check_params_case for this run.
     expected_params = {
         'splitSize': '64',
         'workerMem': '4096',
         'fromVertexCol': 'flow_out_id',
         'toVertexCol': 'flow_in_id',
         'outputTableName': EDGE_DENSITY_TABLE,
         'inputEdgeTableName': WEIGHTED_GRAPH_EDGE_TABLE,
     }
     output = transformed._add_case(self.gen_check_params_case(expected_params))
     output.persist(EDGE_DENSITY_TABLE)
 def test_maximal_connected(self):
     """Apply ``MaximalConnectedComponent`` to the weighted edge table and persist the result."""
     edges = DataFrame(self.odps.get_table(WEIGHTED_GRAPH_EDGE_TABLE))
     algorithm = MaximalConnectedComponent(from_vertex_col='flow_out_id',
                                           to_vertex_col='flow_in_id')
     transformed = algorithm.transform(edges)
     # Parameter mapping handed to gen_check_params_case for this run.
     expected_params = {
         'splitSize': '64',
         'workerMem': '4096',
         'fromVertexCol': 'flow_out_id',
         'toVertexCol': 'flow_in_id',
         'outputTableName': MAXIMAL_CONNECTED_TABLE,
         'inputEdgeTableName': WEIGHTED_GRAPH_EDGE_TABLE,
     }
     output = transformed._add_case(self.gen_check_params_case(expected_params))
     output.persist(MAXIMAL_CONNECTED_TABLE)
    def testRepeatSetItem(self):
        """Assign the same ``rank`` column twice and check the frame still executes to 3 rows."""
        frame = DataFrame(self.table)

        # The assignment is deliberately repeated: the second one overwrites the first.
        for _ in range(2):
            frame['rank'] = frame.groupby('name').sort('id').id.rank()

        executed = frame.execute()
        self.assertEqual(len(executed), 3)
Exemple #13
0
    def test_logistic_partition_df(self):
        """Train logistic regression on partition ``part=0`` with ``options.ml.dry_run`` enabled."""
        options.ml.dry_run = True
        self.maxDiff = None

        self.create_ionosphere_one_part(IONOSPHERE_TABLE_ONE_PART)
        partition = self.odps.get_table(IONOSPHERE_TABLE_ONE_PART).get_partition("part=0")
        df = DataFrame(partition).roles(label='class')

        lr = LogisticRegression(epsilon=0.001).set_max_iter(50)
        trained = lr.train(df)

        # Feature columns are a01 .. a34.
        feature_names = ','.join(['a%02d' % i for i in range(1, 35)])
        expected_params = {
            'labelColName': 'class',
            'modelName': MODEL_NAME,
            'inputTableName': IONOSPHERE_TABLE_ONE_PART,
            'epsilon': '0.001',
            'inputTablePartitions': "part=0",
            'regularizedLevel': '1',
            'regularizedType': 'l1',
            'maxIter': '50',
            'featureColNames': feature_names,
        }
        model = trained._add_case(self.gen_check_params_case(expected_params))
        model.persist(MODEL_NAME)
    def testDataFrame(self):
        """Run count, equality-filter and ``contains`` queries against ``self.table``."""
        frame = DataFrame(self.table)

        total_rows = frame.count().execute()
        self.assertEqual(3, total_rows)
        name1_rows = frame[frame.name == 'name1']
        self.assertEqual(1, name1_rows.count().execute())

        # Any row count passes; this only requires the non-ASCII pattern to execute.
        non_ascii_matches = frame[frame.name.contains('中文')].execute()
        self.assertGreaterEqual(len(non_ascii_matches), 0)
 def test_direct_method(self):
     """Split, train, predict and fetch the prediction via ``to_pandas``."""
     self.create_ionosphere(IONOSPHERE_TABLE)
     ionosphere = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
     labelled = ionosphere.roles(label='class')
     train_part, test_part = labelled.split(0.6)
     classifier = LogisticRegression(epsilon=0.01)
     fitted = classifier.train(train_part)
     # The converted result is discarded; the call itself is what is exercised.
     fitted.predict(test_part).to_pandas()
    def test_filter_noises(self):
        """Run ``filter_noises`` on the split-words frame and persist the filtered output."""
        self.odps.delete_table(FILTERED_WORDS_TABLE, if_exists=True)

        self.create_splited_words(SPLITED_TABLE)
        self._create_noise_table(NOISE_TABLE)

        words = DataFrame(self.odps.get_table(SPLITED_TABLE))
        df = words.roles(doc_content='content')
        ndf = DataFrame(self.odps.get_table(NOISE_TABLE))

        filtered = filter_noises(df, ndf)
        # Parameter mapping handed to gen_check_params_case for this run.
        expected_params = {
            'noiseTableName': NOISE_TABLE,
            'outputTableName': FILTERED_WORDS_TABLE,
            'selectedColNames': 'content',
            'inputTableName': SPLITED_TABLE,
        }
        filtered._add_case(self.gen_check_params_case(expected_params))
        filtered.persist(FILTERED_WORDS_TABLE)
 def test_count_ngram(self):
     """Run ``count_ngram`` on the ``word`` feature and persist the counts."""
     self.create_word_triple(WORD_TRIPLE_TABLE)
     triples = DataFrame(self.odps.get_table(WORD_TRIPLE_TABLE))
     word_triple_df = triples.select_features('word')
     counted = count_ngram(word_triple_df)
     # Parameter mapping handed to gen_check_params_case for this run.
     expected_params = {
         'outputTableName': COUNT_NGRAM_TABLE,
         'inputSelectedColNames': 'word',
         'order': '3',
         'inputTableName': WORD_TRIPLE_TABLE,
     }
     counted._add_case(self.gen_check_params_case(expected_params))
     counted.persist(COUNT_NGRAM_TABLE)
    def test_mat_pearson(self):
        """Call ``matrix_pearson`` on the ionosphere frame with ``options.ml.dry_run`` enabled."""
        options.ml.dry_run = True

        ionosphere = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
        df = ionosphere.roles(label='class')

        # Selected columns are a01 .. a34.
        selected_columns = ','.join(['a%02d' % i for i in range(1, 35)])
        expected_params = {
            'outputTableName': 'tmp_pyodps__matrix_pearson',
            'selectedColNames': selected_columns,
            'inputTableName': tn('pyodps_test_ml_ionosphere'),
        }
        matrix_pearson(df, _cases=self.gen_check_params_case(expected_params))
    def test_chisquare(self):
        """Call ``chi_square`` for ``a01`` against ``class`` with ``options.ml.dry_run`` enabled."""
        options.ml.dry_run = True

        df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
        x_column = df.a01
        # The detail and main output entries carry the same table name.
        expected_params = {
            'yColName': 'class',
            'xColName': 'a01',
            'outputDetailTableName': 'tmp_pyodps__chi_square',
            'outputTableName': 'tmp_pyodps__chi_square',
            'inputTableName': tn('pyodps_test_ml_ionosphere'),
        }
        case = self.gen_check_params_case(expected_params)
        chi_square(df, x_col=x_column, y_col='class', _cases=case)
    def test_histograms(self):
        """Call ``histograms`` on the ionosphere frame with ``options.ml.dry_run`` enabled."""
        options.ml.dry_run = True

        ionosphere = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
        ds = ionosphere.roles(label='class')

        # Selected columns are a01 .. a34.
        selected_columns = ','.join(['a%02d' % i for i in range(1, 35)])
        expected_params = {
            'outputTableName': TEMP_TABLE_PREFIX + '_histograms',
            'selectedColNames': selected_columns,
            'intervalNum': '10',
            'inputTableName': IONOSPHERE_TABLE,
        }
        histograms(ds, _cases=self.gen_check_params_case(expected_params))
 def test_df_store(self):
     """Group, count and sort one partition of the two-part table, then persist the result.

     The output table is cleared twice before persisting: once through the
     ``self.delete_table`` helper and once directly on ``self.odps``. The direct
     call passes ``if_exists=True`` so it tolerates the table already being gone.
     """
     self.delete_table(IONOSPHERE_SORTED_TABLE_PART)
     self.create_ionosphere_two_parts(IONOSPHERE_TABLE_TWO_PARTS)
     df = DataFrame(self.odps.get_table(
         IONOSPHERE_TABLE_TWO_PARTS)).filter_parts('part1=1,part2=2')
     # NOTE(review): presumably the helper above already removed this table;
     # confirm whether this second delete is needed at all.
     self.odps.delete_table(IONOSPHERE_SORTED_TABLE_PART, if_exists=True)
     sorted_df = df.groupby(df['class']).agg(
         df.a01.count().rename('count')).sort('class', ascending=False)
     sorted_df.persist(IONOSPHERE_SORTED_TABLE_PART)
 def test_str_diff(self):
     """Run ``str_diff`` on ``col1``/``col2`` and persist the comparison result."""
     self._create_str_compare_table(STR_COMP_TABLE)
     pairs = DataFrame(self.odps.get_table(STR_COMP_TABLE))
     diff_df = str_diff(pairs, col1='col1', col2='col2')
     # Parameter mapping handed to gen_check_params_case for this run.
     expected_params = {
         'inputTableName': STR_COMP_TABLE,
         'k': '2',
         'outputTableName': COMP_RESULT_TABLE,
         'inputSelectedColName2': 'col2',
         'inputSelectedColName1': 'col1',
         'method': 'levenshtein_sim',
         'lambda': '0.5',
         'outputColName': 'output',
     }
     diff_df._add_case(self.gen_check_params_case(expected_params))
     diff_df.persist(COMP_RESULT_TABLE)
Exemple #23
0
    def setup(self):
        """Build a mock-table frame, a pandas-backed frame, their join and a ``MixedEngine``."""
        column_names = ['name', 'id', 'fid', 'isMale', 'scale', 'birth']
        type_names = ('string', 'bigint', 'double', 'boolean', 'decimal',
                      'datetime')
        column_types = [validate_data_type(type_name) for type_name in type_names]
        schema = Schema.from_lists(column_names, column_types)
        self.tb = DataFrame(MockTable(name='pyodps_test_expr_table', schema=schema))

        import pandas as pd

        local_rows = [['name1', 2, 3.14], ['name2', 100, 2.7]]
        self.pd = DataFrame(pd.DataFrame(local_rows, columns=['name', 'id', 'fid']))

        # Join of the mock-table frame with the pandas-backed frame on the shared column.
        self.expr = self.tb.join(self.pd, on='name')

        self.engine = MixedEngine(self.odps)
    def test_quantile(self):
        """Call ``quantile`` with ``options.ml.dry_run`` enabled and persist the result."""
        options.ml.dry_run = True

        ionosphere = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
        df = ionosphere.roles(label='class')

        # Parameter mapping handed to gen_check_params_case; columns are a01 .. a34.
        expected_params = {
            'inputTableName': tn('pyodps_test_ml_ionosphere'),
            'outputTableName': tn('pyodps_test_ml_iono_quantile'),
            'colName': ','.join(['a%02d' % i for i in range(1, 35)]),
            'N': '100',
        }
        qt = quantile(df, _cases=self.gen_check_params_case(expected_params))
        qt.persist(IONOSPHERE_QUANTILE_TABLE)
 def test_tfidf_code(self):
     """Link ``SplitWord``, ``DocWordStat`` and ``TFIDF`` steps, transform the corpus and persist it."""
     self.delete_table(TFIDF_TABLE)
     self.create_corpus(CORPUS_TABLE)
     corpus = DataFrame(self.odps.get_table(CORPUS_TABLE))
     df = corpus.doc_content_field('content')
     pl = Pipeline(SplitWord())
     # TFIDF is linked to the ``triple`` attribute of the word-stat step's link result.
     tfidf_step = TFIDF()
     word_stat_link = DocWordStat().link(pl)
     tfidf_step.link(word_stat_link.triple)
     pl.transform(df).persist(TFIDF_TABLE)
Exemple #26
0
    def testUnicodePdDataFrame(self):
        """Read back non-ASCII values through a non-ASCII column name of a pandas-backed frame."""
        import pandas as pd

        rows = [['中文'], [to_text('中文2')]]
        column_names = [to_text('字段')]
        frame = DataFrame(pd.DataFrame(rows, columns=column_names))

        fetched = frame['字段'].execute()
        # Both sides go through to_text before comparison, as in the row construction above.
        for position, original in enumerate(('中文', '中文2')):
            self.assertEqual(to_text(original), to_text(fetched[position][0]))
 def test_tfidf_array(self):
     """Transform the corpus with a nested pipeline and persist its first output."""
     self.delete_table(W2V_TABLE)
     self.create_corpus(CORPUS_TABLE)
     corpus = DataFrame(self.odps.get_table(CORPUS_TABLE))
     df = corpus.doc_content_field('content')
     inner_pipeline = Pipeline(SplitWord(), (DocWordStat(), 'multi'), Word2Vec())
     pl = Pipeline(inner_pipeline)
     # transform() is unpacked into two values here; only the first is used.
     word_feature, _ = pl.transform(df)
     word_feature.persist(W2V_TABLE)
Exemple #28
0
    def testExecuteModel(self):
        """Check that executing a trained model yields a ``PmmlRegressionResult``."""
        from odps.ml import classifiers
        from odps.ml.expr.models.pmml import PmmlRegressionResult

        self.create_iris(IRIS_TABLE)
        iris = DataFrame(self.odps.get_table(IRIS_TABLE))
        labelled = iris.roles(label='category')
        trainer = classifiers.LogisticRegression()
        executed = trainer.train(labelled).execute()
        self.assertIsInstance(executed, PmmlRegressionResult)
 def test_top_n(self):
     """Run ``top_n_similarity`` of the compare table against itself and persist the result."""
     self._create_str_compare_table(STR_COMP_TABLE)
     pairs = DataFrame(self.odps.get_table(STR_COMP_TABLE))
     top_n_df = top_n_similarity(pairs, pairs, col='col1', map_col='col1')
     # Parameter mapping handed to gen_check_params_case; the map table name is project-qualified.
     expected_params = {
         'inputTableName': STR_COMP_TABLE,
         'k': '2',
         'outputColName': 'output',
         'mapSelectedColName': 'col1',
         'topN': '10',
         'inputSelectedColName': 'col1',
         'outputTableName': TOP_N_TABLE,
         'mapTableName': self.odps.project + '.' + STR_COMP_TABLE,
         'method': 'levenshtein_sim',
         'lambda': '0.5',
     }
     top_n_df._add_case(self.gen_check_params_case(expected_params))
     top_n_df.persist(TOP_N_TABLE)
Exemple #30
0
    def testHeadAndTail(self):
        """Check row counts from ``head`` and row contents from ``tail`` and a filtered ``head``."""
        frame = DataFrame(self.table)

        # head(n) must return exactly n rows for n = 1 and n = 2.
        for limit in (1, 2):
            self.assertEqual(limit, len(frame.head(limit)))
        last_row = frame.tail(1)[0]
        self.assertEqual([3, 'name3'], list(last_row))

        name2_head = frame[frame.name == 'name2'].head(1)
        self.assertEqual(1, len(name2_head))
        self.assertEqual([2, 'name2'], list(name2_head[0]))