Python estimate_pairwise_similarity Examples, bdbcontrib.parallel.estimate_pairwise_similarity Python Examples

Example #1

0

Show file

File: test_parallel.py Project: jayelm/bdbcontrib

def test_estimate_pairwise_similarity_long():
    """
    Tests larger queries that need to be broken into batch inserts of 500
    values each, as well as the N parameter.

    Checks that the similarity table is empty for N=0, that it holds N**2
    rows of three columns for each accepted N, that N=41 raises BLE, and
    that the parallel result equals ``ESTIMATE SIMILARITY FROM PAIRWISE``.
    """
    # Imported locally so this test does not rely on a module-level import.
    from contextlib import closing

    # NOTE(review): this is set process-wide and never restored, so it stays
    # set for whatever runs after this test -- confirm that is intended.
    os.environ['BAYESDB_WIZARD_MODE'] = '1'

    # closing() guarantees bdb.close() runs, even when an assertion below
    # fails, and that it runs before the temporary .bdb file is cleaned up
    # (context managers in one `with` exit in reverse order).
    with tempfile.NamedTemporaryFile(suffix='.bdb') as bdb_file, \
            closing(bayeslite.bayesdb_open(bdb_file.name)) as bdb:
        with tempfile.NamedTemporaryFile() as temp:
            # n = 40 -> 40**2 -> 1600 rows total
            temp.write(_bigger_csv_data(40))
            # Rewind the handle; the CSV is re-read by file name just below.
            temp.seek(0)
            bayeslite.bayesdb_read_csv_file(
                bdb, 't', temp.name, header=True, create=True)
        bdb.execute('''
            CREATE GENERATOR t_cc FOR t USING crosscat (
                GUESS(*),
                id IGNORE
            )
        ''')

        bdb.execute('INITIALIZE 3 MODELS FOR t_cc')
        bdb.execute('ANALYZE t_cc MODELS 0-2 FOR 10 ITERATIONS WAIT')

        # test N = 0
        parallel.estimate_pairwise_similarity(
            bdb_file.name, 't', 't_cc', N=0
        )
        assert cursor_to_df(
            bdb.execute('SELECT * FROM t_similarity')
        ).shape == (0, 0)

        # test other values of N; overwrite=True because t_similarity
        # already exists from the previous call
        for N in [1, 2, 10, 20, 40]:
            parallel.estimate_pairwise_similarity(
                bdb_file.name, 't', 't_cc', N=N, overwrite=True
            )
            assert cursor_to_df(
                bdb.execute('SELECT * FROM t_similarity')
            ).shape == (N**2, 3)
        # N too high should fail
        with pytest.raises(BLE):
            parallel.estimate_pairwise_similarity(
                bdb_file.name, 't', 't_cc', N=41, overwrite=True
            )

        # The comparison below uses whatever t_similarity holds at this
        # point; the last successful run above was N=40.
        parallel_sim = cursor_to_df(
            bdb.execute('SELECT * FROM t_similarity')
        ).sort_values(by=['rowid0', 'rowid1'])
        # Renumber the index after sorting so it lines up with std_sim.
        parallel_sim.index = range(parallel_sim.shape[0])

        std_sim = cursor_to_df(
            bdb.execute('ESTIMATE SIMILARITY FROM PAIRWISE t_cc')
        )

        assert_frame_equal(std_sim, parallel_sim, check_column_type=True)

Example #2

0

Show file

File: test_parallel.py Project: vishalbelsare/bdbcontrib

def test_estimate_pairwise_similarity_long():
    """
    Tests larger queries that need to be broken into batch inserts of 500
    values each, as well as the N parameter.

    Checks that the similarity table is empty for N=0, that it holds N**2
    rows of three columns for each accepted N, that N=41 raises BLE, and
    that the parallel result equals ``ESTIMATE SIMILARITY FROM PAIRWISE``.
    """
    # Imported locally so this test does not rely on a module-level import.
    from contextlib import closing

    # closing() guarantees bdb.close() runs, even when an assertion below
    # fails, and that it runs before the temporary .bdb file is cleaned up
    # (context managers in one `with` exit in reverse order).
    with tempfile.NamedTemporaryFile(suffix='.bdb') as bdb_file, \
            closing(bayeslite.bayesdb_open(bdb_file.name)) as bdb:
        with tempfile.NamedTemporaryFile() as temp:
            # n = 40 -> 40**2 -> 1600 rows total
            temp.write(_bigger_csv_data(40))
            # Rewind the handle; the CSV is re-read by file name just below.
            temp.seek(0)
            bayeslite.bayesdb_read_csv_file(bdb,
                                            't',
                                            temp.name,
                                            header=True,
                                            create=True)
        bdb.execute('''
            CREATE GENERATOR t_cc FOR t USING crosscat (
                GUESS(*),
                id IGNORE
            )
        ''')

        bdb.execute('INITIALIZE 3 MODELS FOR t_cc')
        bdb.execute('ANALYZE t_cc MODELS 0-2 FOR 10 ITERATIONS WAIT')

        # test N = 0
        parallel.estimate_pairwise_similarity(bdb_file.name, 't', 't_cc', N=0)
        assert cursor_to_df(
            bdb.execute('SELECT * FROM t_similarity')).shape == (0, 0)

        # test other values of N; overwrite=True because t_similarity
        # already exists from the previous call
        for N in [1, 2, 10, 20, 40]:
            parallel.estimate_pairwise_similarity(bdb_file.name,
                                                  't',
                                                  't_cc',
                                                  N=N,
                                                  overwrite=True)
            assert cursor_to_df(
                bdb.execute('SELECT * FROM t_similarity')).shape == (N**2, 3)
        # N too high should fail
        with pytest.raises(BLE):
            parallel.estimate_pairwise_similarity(bdb_file.name,
                                                  't',
                                                  't_cc',
                                                  N=41,
                                                  overwrite=True)

        # The comparison below uses whatever t_similarity holds at this
        # point; the last successful run above was N=40. Rows are ordered in
        # SQL so they can be compared positionally with std_sim.
        parallel_sim = cursor_to_df(
            bdb.execute('SELECT * FROM t_similarity ORDER BY rowid0, rowid1'))
        parallel_sim.index = range(parallel_sim.shape[0])

        std_sim = cursor_to_df(
            bdb.execute('ESTIMATE SIMILARITY FROM PAIRWISE t_cc'))

        assert_frame_equal(std_sim, parallel_sim, check_column_type=True)

Example #3

0

Show file

File: test_parallel.py Project: jayelm/bdbcontrib

def test_estimate_pairwise_similarity():
    """
    Tests basic estimate pairwise similarity functionality against
    existing BQL estimate queries.

    Also exercises the error paths: ``cores=0`` raises BLE, while an
    existing similarity table without ``overwrite``, an unknown
    table/generator, and a file without the table each raise SQLError.
    """
    # Imported locally so this test does not rely on a module-level import.
    from contextlib import closing

    # NOTE(review): this is set process-wide and never restored, so it stays
    # set for whatever runs after this test -- confirm that is intended.
    os.environ['BAYESDB_WIZARD_MODE'] = '1'

    # closing() guarantees bdb.close() runs, even when an assertion below
    # fails, and that it runs before the temporary .bdb file is cleaned up
    # (context managers in one `with` exit in reverse order).
    with tempfile.NamedTemporaryFile(suffix='.bdb') as bdb_file, \
            closing(bayeslite.bayesdb_open(bdb_file.name)) as bdb:
        with tempfile.NamedTemporaryFile() as temp:
            temp.write(test_utils.csv_data)
            # Rewind the handle; the CSV is re-read by file name just below.
            temp.seek(0)
            bayeslite.bayesdb_read_csv_file(
                bdb, 't', temp.name, header=True, create=True)

        bdb.execute('''
            CREATE GENERATOR t_cc FOR t USING crosscat (
                GUESS(*),
                id IGNORE
            )
        ''')

        bdb.execute('INITIALIZE 3 MODELS FOR t_cc')
        bdb.execute('ANALYZE t_cc MODELS 0-2 FOR 10 ITERATIONS WAIT')

        # How to properly use the estimate_pairwise_similarity function.
        parallel.estimate_pairwise_similarity(
            bdb_file.name, 't', 't_cc'
        )

        # Should complain with bad core value
        with pytest.raises(BLE):
            parallel.estimate_pairwise_similarity(
                bdb_file.name, 't', 't_cc', cores=0
            )

        # Should complain if overwrite flag is not set, but t_similarity
        # exists
        with pytest.raises(SQLError):
            parallel.estimate_pairwise_similarity(
                bdb_file.name, 't', 't_cc'
            )
        # Should complain if model and table don't exist
        with pytest.raises(SQLError):
            parallel.estimate_pairwise_similarity(
                bdb_file.name, 'foo', 'foo_cc'
            )
        # Should complain if the file passed as bdb_file has no such table.
        # Note that the temporary file does exist on disk; nothing has been
        # written to it.
        with tempfile.NamedTemporaryFile() as does_not_exist:
            with pytest.raises(SQLError):
                parallel.estimate_pairwise_similarity(
                    does_not_exist.name, 't', 't_cc'
                )

        # Should run fine if overwrite flag is set
        parallel.estimate_pairwise_similarity(
            bdb_file.name, 't', 't_cc', overwrite=True
        )

        # Should be able to specify another table name
        parallel.estimate_pairwise_similarity(
            bdb_file.name, 't', 't_cc', sim_table='t_similarity_2'
        )

        parallel_sim = cursor_to_df(
            bdb.execute('SELECT * FROM t_similarity')
        ).sort_values(by=['rowid0', 'rowid1'])
        parallel_sim_2 = cursor_to_df(
            bdb.execute('SELECT * FROM t_similarity_2')
        ).sort_values(by=['rowid0', 'rowid1'])

        # Results may be returned out of order. So we sort the values,
        # as above, and we reorder the numeric index
        parallel_sim.index = range(parallel_sim.shape[0])
        parallel_sim_2.index = range(parallel_sim_2.shape[0])

        # The data from two successive parallel pairwise estimates should be
        # identical to each other...
        assert_frame_equal(
            parallel_sim, parallel_sim_2, check_column_type=True)
        # ...and to a standard estimate pairwise similarity.
        std_sim = cursor_to_df(
            bdb.execute('ESTIMATE SIMILARITY FROM PAIRWISE t_cc')
        )
        assert_frame_equal(std_sim, parallel_sim, check_column_type=True)

Example #4

0

Show file

File: test_parallel.py Project: vishalbelsare/bdbcontrib

def test_estimate_pairwise_similarity():
    """
    Tests basic estimate pairwise similarity functionality against
    existing BQL estimate queries.

    Also exercises the error paths: ``cores=0`` raises BLE, while an
    existing similarity table without ``overwrite``, an unknown
    table/generator, and a file without the table each raise SQLError.
    """
    # Imported locally so this test does not rely on a module-level import.
    from contextlib import closing

    # closing() guarantees bdb.close() runs, even when an assertion below
    # fails, and that it runs before the temporary .bdb file is cleaned up
    # (context managers in one `with` exit in reverse order).
    with tempfile.NamedTemporaryFile(suffix='.bdb') as bdb_file, \
            closing(bayeslite.bayesdb_open(bdb_file.name)) as bdb:
        with tempfile.NamedTemporaryFile() as temp:
            temp.write(test_bql_utils.csv_data)
            # Rewind the handle; the CSV is re-read by file name just below.
            temp.seek(0)
            bayeslite.bayesdb_read_csv_file(bdb,
                                            't',
                                            temp.name,
                                            header=True,
                                            create=True)

        bdb.execute('''
            CREATE GENERATOR t_cc FOR t USING crosscat (
                GUESS(*),
                id IGNORE
            )
        ''')

        bdb.execute('INITIALIZE 3 MODELS FOR t_cc')
        bdb.execute('ANALYZE t_cc MODELS 0-2 FOR 10 ITERATIONS WAIT')

        # How to properly use the estimate_pairwise_similarity function.
        parallel.estimate_pairwise_similarity(bdb_file.name, 't', 't_cc')

        # Should complain with bad core value
        with pytest.raises(BLE):
            parallel.estimate_pairwise_similarity(bdb_file.name,
                                                  't',
                                                  't_cc',
                                                  cores=0)

        # Should complain if overwrite flag is not set, but t_similarity
        # exists
        with pytest.raises(SQLError):
            parallel.estimate_pairwise_similarity(bdb_file.name, 't', 't_cc')
        # Should complain if model and table don't exist
        with pytest.raises(SQLError):
            parallel.estimate_pairwise_similarity(bdb_file.name, 'foo',
                                                  'foo_cc')
        # Should complain if the file passed as bdb_file has no such table.
        # Note that the temporary file does exist on disk; nothing has been
        # written to it.
        with tempfile.NamedTemporaryFile() as does_not_exist:
            with pytest.raises(SQLError):
                parallel.estimate_pairwise_similarity(does_not_exist.name, 't',
                                                      't_cc')

        # Should run fine if overwrite flag is set
        parallel.estimate_pairwise_similarity(bdb_file.name,
                                              't',
                                              't_cc',
                                              overwrite=True)

        # Should be able to specify another table name
        parallel.estimate_pairwise_similarity(bdb_file.name,
                                              't',
                                              't_cc',
                                              sim_table='t_similarity_2')

        parallel_sim = cursor_to_df(
            bdb.execute('SELECT * FROM t_similarity ORDER BY rowid0, rowid1'))
        parallel_sim_2 = cursor_to_df(
            bdb.execute(
                'SELECT * FROM t_similarity_2 ORDER BY rowid0, rowid1'))

        # Results may be stored out of order, so the queries above order
        # them in SQL; here we only renumber the numeric index.
        parallel_sim.index = range(parallel_sim.shape[0])
        parallel_sim_2.index = range(parallel_sim_2.shape[0])

        # The data from two successive parallel pairwise estimates should be
        # identical to each other...
        assert_frame_equal(parallel_sim,
                           parallel_sim_2,
                           check_column_type=True)
        # ...and to a standard estimate pairwise similarity.
        std_sim = cursor_to_df(
            bdb.execute('ESTIMATE SIMILARITY FROM PAIRWISE t_cc'))
        assert_frame_equal(std_sim, parallel_sim, check_column_type=True)