Example #1
0
    def test_small_dataset_no_sampling(self, transaction, lmd):
        """A dataset smaller than the computed sample size is analyzed whole."""
        # Sampling is enabled, but with these settings the requested sample
        # is not smaller than the 50-row dataset, so every row gets analyzed.
        lmd['sample_settings']['sample_for_analysis'] = True
        lmd['sample_settings']['sample_margin_of_error'] = 0.95
        lmd['sample_settings']['sample_confidence_level'] = 0.05
        lmd['force_column_usage'] = []
        transaction.hmd['sample_function'] = mock.MagicMock(wraps=sample_data)

        type_deductor = TypeDeductor(session=transaction.session,
                                     transaction=transaction)

        n_points = 50
        input_dataframe = pd.DataFrame(
            {'numeric_int': [i % 10 for i in range(n_points)]},
            index=list(range(n_points)))

        input_data = TransactionData()
        input_data.data_frame = input_dataframe

        type_deductor.run(input_data)

        assert transaction.hmd['sample_function'].called

        typing_info = lmd['stats_v2']['numeric_int']['typing']
        assert typing_info['data_type'] == DATA_TYPES.NUMERIC
        assert typing_info['data_subtype'] == DATA_SUBTYPES.INT

        # Distributions covering all 50 rows prove no sampling was applied.
        assert typing_info['data_type_dist'][DATA_TYPES.NUMERIC] == 50
        assert typing_info['data_subtype_dist'][DATA_SUBTYPES.INT] == 50
Example #2
0
    def test_type_mix(self, transaction, lmd):
        """A mostly-numeric column with a few string entries is still deduced
        as numeric, with the non-numeric rows excluded from the type dists."""
        lmd['force_column_usage'] = []
        type_deductor = TypeDeductor(session=transaction.session,
                                     transaction=transaction)

        n_points = 100
        input_dataframe = pd.DataFrame(
            {
                'numeric_float': np.linspace(0, n_points, n_points),
            },
            index=list(range(n_points)))
        # Use a single positional assignment instead of chained indexing
        # (df[col].iloc[...] = ...), which writes to a temporary copy under
        # pandas copy-on-write semantics and is deprecated.
        input_dataframe.iloc[
            :2, input_dataframe.columns.get_loc('numeric_float')
        ] = 'random string'
        input_data = TransactionData()
        input_data.data_frame = input_dataframe
        type_deductor.run(input_data)

        stats_v2 = lmd['stats_v2']
        assert stats_v2['numeric_float']['typing'][
            'data_type'] == DATA_TYPES.NUMERIC
        assert stats_v2['numeric_float']['typing'][
            'data_subtype'] == DATA_SUBTYPES.FLOAT
        # Only the 98 untouched rows count towards the numeric distributions.
        assert stats_v2['numeric_float']['typing']['data_type_dist'][
            DATA_TYPES.NUMERIC] == 98
        assert stats_v2['numeric_float']['typing']['data_subtype_dist'][
            DATA_SUBTYPES.FLOAT] == 98
Example #3
0
    def test_sample(self, transaction, lmd):
        """The analyzer samples when enabled and skips sampling when not."""
        lmd['sample_settings']['sample_for_analysis'] = True
        transaction.hmd['sample_function'] = mock.MagicMock(wraps=sample_data)

        data_analyzer = DataAnalyzer(session=transaction.session,
                                     transaction=transaction)

        n_points = 100
        input_dataframe = pd.DataFrame(
            {'numeric_int': [i % 10 for i in range(n_points)]},
            index=list(range(n_points)))

        lmd['stats_v2'] = self.get_stats_v2(input_dataframe.columns)

        input_data = TransactionData()
        input_data.data_frame = input_dataframe

        data_analyzer.run(input_data)
        assert transaction.hmd['sample_function'].called

        # A sampled histogram can never cover more rows than the dataset has.
        histogram_total = sum(lmd['stats_v2']['numeric_int']['histogram']['y'])
        assert histogram_total <= n_points

        # With sampling disabled, the sample function must stay untouched.
        lmd['sample_settings']['sample_for_analysis'] = False
        transaction.hmd['sample_function'] = mock.MagicMock(wraps=sample_data)

        data_analyzer.run(input_data)
        assert not transaction.hmd['sample_function'].called
Example #4
0
    def test_deduce_foreign_key(self, transaction, lmd):
        """Tests that identifier-like columns (sequential ids, uuids) are
        flagged as identifiers and excluded from further analysis."""
        lmd['force_column_usage'] = []

        type_deductor = TypeDeductor(session=transaction.session,
                                     transaction=transaction)
        n_points = 100

        input_dataframe = pd.DataFrame(
            {
                # Monotonically increasing integers look like a numeric id.
                'numeric_id': list(range(n_points)),
                # Fully-unique uuid strings look like an opaque identifier.
                'uuid': [str(uuid4()) for i in range(n_points)]
            },
            index=list(range(n_points)))

        input_data = TransactionData()
        input_data.data_frame = input_dataframe
        type_deductor.run(input_data)

        stats_v2 = lmd['stats_v2']

        # Both columns carry a human-readable identifier description...
        assert isinstance(stats_v2['numeric_id']['identifier'], str)
        assert isinstance(stats_v2['uuid']['identifier'], str)

        # ...and are dropped from further analysis.
        assert 'numeric_id' in lmd['columns_to_ignore']
        assert 'uuid' in lmd['columns_to_ignore']
Example #5
0
    def __init__(self,
                 session,
                 light_transaction_metadata,
                 heavy_transaction_metadata,
                 logger=log):
        """
        A transaction is the interface to start some MindsDB operation within a session

        :param session: the session this transaction belongs to
        :param light_transaction_metadata: dict, lightweight (serializable) metadata
        :param heavy_transaction_metadata: dict, heavyweight metadata (data, callables, ...)
        :param logger: logger instance used by this transaction
        """

        self.session = session
        self.lmd = light_transaction_metadata
        # Stamp creation time so the metadata records when the transaction started.
        self.lmd['created_at'] = str(datetime.datetime.now())
        self.hmd = heavy_transaction_metadata

        # variables to be defined by setup
        self.error = None
        self.errorMsg = None

        self.input_data = TransactionData()
        self.output_data = TrainTransactionOutputData()

        # variables that can be persisted

        self.log = logger

        # NOTE: the transaction executes immediately on construction.
        self.run()
    def test_ignore_columns(self, transaction, lmd):
        """Columns listed in ``columns_to_ignore`` are dropped by the cleaner."""
        data_cleaner = DataCleaner(session=transaction.session,
                                   transaction=transaction)

        frame = pd.DataFrame({
            'do_use': [1, 2, 3],
            'ignore_this': [0, 1, 100]
        })

        lmd['columns_to_ignore'].append('ignore_this')

        input_data = TransactionData()
        input_data.data_frame = frame

        data_cleaner.transaction.input_data = input_data
        data_cleaner.run()

        remaining_columns = input_data.data_frame.columns
        assert 'do_use' in remaining_columns
        assert 'ignore_this' not in remaining_columns
    def test_user_provided_null_values(self, transaction, lmd):
        """Values listed per-column in ``null_values`` are replaced with NaN."""
        data_cleaner = DataCleaner(session=transaction.session,
                                   transaction=transaction)

        frame = pd.DataFrame({
            'my_column': ['a', 'b', 'NULL', 'c', 'null', 'none', 'Null']
        })

        lmd['null_values'] = {'my_column': ['NULL', 'null', 'none', 'Null']}

        input_data = TransactionData()
        input_data.data_frame = frame

        data_cleaner.transaction.input_data = input_data
        data_cleaner.run()

        cleaned = input_data.data_frame['my_column']
        # Ordinary values survive untouched...
        assert cleaned.iloc[0] == 'a'
        assert cleaned.iloc[1] == 'b'
        assert cleaned.iloc[3] == 'c'
        # ...while every configured null marker becomes NaN.
        for position in (2, 4, 5, 6):
            assert pd.isna(cleaned.iloc[position])
Example #8
0
    def test_empty_values(self, transaction, lmd):
        """The analyzer reports the percentage of missing cells per column."""
        data_analyzer = DataAnalyzer(session=transaction.session,
                                     transaction=transaction)

        n_points = 100
        input_dataframe = pd.DataFrame(
            {
                'numeric_int': [x % 10 for x in list(range(n_points))],
            },
            index=list(range(n_points)))

        stats_v2 = self.get_stats_v2(input_dataframe.columns)

        lmd['stats_v2'] = stats_v2

        # Blank out every other row. A single positional assignment is used
        # instead of chained indexing (df[col].iloc[...] = ...), which writes
        # to a temporary copy under pandas copy-on-write and is deprecated.
        col_idx = input_dataframe.columns.get_loc('numeric_int')
        input_dataframe.iloc[::2, col_idx] = None
        input_data = TransactionData()
        input_data.data_frame = input_dataframe
        data_analyzer.run(input_data)

        stats_v2 = lmd['stats_v2']

        # Exactly half of the rows were emptied above.
        assert stats_v2['numeric_int']['empty']['empty_percentage'] == 50
Example #9
0
    def test_sample(self, transaction, lmd):
        """Type deduction runs on a sample when sampling is enabled and on
        the full dataset when it is not."""
        lmd['sample_settings']['sample_for_analysis'] = True
        lmd['force_column_usage'] = []
        transaction.hmd['sample_function'] = mock.MagicMock(wraps=sample_data)

        type_deductor = TypeDeductor(session=transaction.session,
                                     transaction=transaction)

        n_points = 100
        input_dataframe = pd.DataFrame(
            {'numeric_int': [i % 10 for i in range(n_points)]},
            index=list(range(n_points)))

        input_data = TransactionData()
        input_data.data_frame = input_dataframe

        type_deductor.run(input_data)

        assert transaction.hmd['sample_function'].called

        typing_info = lmd['stats_v2']['numeric_int']['typing']
        assert typing_info['data_type'] == DATA_TYPES.NUMERIC
        assert typing_info['data_subtype'] == DATA_SUBTYPES.INT
        # The sample may be smaller than the dataset, never larger.
        assert typing_info['data_type_dist'][DATA_TYPES.NUMERIC] <= n_points
        assert typing_info['data_subtype_dist'][DATA_SUBTYPES.INT] <= n_points

        # With sampling switched off the sample function is never invoked.
        lmd['sample_settings']['sample_for_analysis'] = False
        transaction.hmd['sample_function'] = mock.MagicMock(wraps=sample_data)

        type_deductor.run(input_data)
        assert not transaction.hmd['sample_function'].called
Example #10
0
    def test_groups(self, transaction, lmd):
        """Splitting indexes rows by every combination of the group-by columns."""
        data_splitter = DataSplitter(session=transaction.session,
                                     transaction=transaction)

        order_column = list(range(100))
        random.shuffle(order_column)

        input_data = TransactionData()
        input_data.data_frame = pd.DataFrame({
            'ob': order_column,
            'gb_1': [1, 1, 2, 2] * 25,
            'gb_2': [1, 2, 1, 2] * 25
        })

        data_splitter.transaction.input_data = input_data
        all_indexes, *_ = data_splitter.run(test_train_ratio=0.25)

        # Each (gb_1, gb_2) combination holds a quarter of the rows.
        for group_key in ((1, 1), (1, 2), (2, 1), (2, 2)):
            assert len(all_indexes[group_key]) == 25
        # The empty key indexes the whole dataset.
        assert len(all_indexes[tuple()]) == 100
Example #11
0
    def test_guess_probability(self, transaction, lmd):
        """``guess_probability`` is the chance of a distribution-weighted
        random guess being correct: the sum of squared value frequencies."""
        data_analyzer = DataAnalyzer(session=transaction.session,
                                     transaction=transaction)

        # NOTE: the original dict literal listed 'categorical_int' twice; the
        # first entry was silently discarded, so only the survivor is kept.
        input_dataframe = pd.DataFrame(
            {
                'categorical_int': [2, 1, 3, 4, 3, 2, 4, 5, 1, 2, 1, 2],
                'categorical_binary': [
                    'cat', 'cat', 'cat', 'dog', 'dog', 'cat', 'cat', 'cat',
                    'cat', 'cat', 'cat', 'dog'
                ]
            },
            index=[*range(12)])

        stats_v2 = self.get_stats_v2(input_dataframe.columns)
        lmd['stats_v2'] = stats_v2

        input_data = TransactionData()
        input_data.data_frame = input_dataframe

        data_analyzer.run(input_data)
        # 9 cats and 3 dogs: P(correct guess) = (9/12)^2 + (3/12)^2
        assert data_analyzer.transaction.lmd['stats_v2']['categorical_binary'][
            'guess_probability'] == (9 / 12)**2 + (3 / 12)**2
Example #12
0
    def test_type_deduction(self, transaction, lmd):
        """Tests that basic cases of type deduction work correctly"""
        hmd = transaction.hmd
        # (A duplicated 'force_column_usage' assignment was removed here.)
        lmd['force_column_usage'] = []
        type_deductor = TypeDeductor(session=transaction.session,
                                     transaction=transaction)

        n_points = 100

        # Apparently for n_category_values = 10 it doesnt work
        n_category_values = 4
        categories_cycle = cycle(range(n_category_values))
        n_multilabel_category_values = 25
        multiple_categories_str_cycle = cycle(
            random.choices(VOCAB[0:20], k=n_multilabel_category_values))

        # One column per data (sub)type the deductor should recognise.
        input_dataframe = pd.DataFrame(
            {
                'numeric_int': [x % 10 for x in list(range(n_points))],
                'numeric_float':
                np.linspace(0, n_points, n_points),
                'date_timestamp': [
                    (datetime.now() - timedelta(minutes=int(i))).isoformat()
                    for i in range(n_points)
                ],
                'date_date': [
                    (datetime.now() - timedelta(days=i)).strftime('%Y-%m-%d')
                    for i in range(n_points)
                ],
                'categorical_str': [
                    f'category_{next(categories_cycle)}'
                    for i in range(n_points)
                ],
                'categorical_int':
                [next(categories_cycle) for i in range(n_points)],
                'categorical_binary': [0, 1] * (n_points // 2),
                'sequential_array':
                [f"1,2,3,4,5,{i}" for i in range(n_points)],
                'multiple_categories_array_str': [
                    ",".join([
                        f'{next(multiple_categories_str_cycle)}'
                        for j in range(random.randint(1, 6))
                    ]) for i in range(n_points)
                ],
                'short_text':
                generate_short_sentences(n_points),
                'rich_text':
                generate_rich_sentences(n_points)
            },
            index=list(range(n_points)))

        input_data = TransactionData()
        input_data.data_frame = input_dataframe
        type_deductor.run(input_data)

        stats_v2 = lmd['stats_v2']

        # Every column is deduced to its expected type/subtype, with all
        # n_points rows counted in the distributions (no sampling here).
        for col_name in input_dataframe.columns:
            expected_type = test_column_types[col_name][0]
            expected_subtype = test_column_types[col_name][1]
            assert stats_v2[col_name]['typing']['data_type'] == expected_type
            assert stats_v2[col_name]['typing'][
                'data_subtype'] == expected_subtype
            assert stats_v2[col_name]['typing']['data_type_dist'][
                expected_type] == n_points
            assert stats_v2[col_name]['typing']['data_subtype_dist'][
                expected_subtype] == n_points

        # None of the analyzed columns should be flagged as an identifier.
        for col_name in stats_v2['columns']:
            if col_name in lmd['columns_to_ignore']:
                continue
            assert stats_v2[col_name]['identifier'] is None

        assert DATA_SUBTYPES.INT in stats_v2['categorical_int'][
            'additional_info']['other_potential_subtypes']
        # Type deduction must not touch the heavy metadata...
        assert hmd == {}

        # ...and the light metadata stays json-serializable and covers
        # exactly the input columns.
        assert isinstance(json.dumps(transaction.lmd), str)
        assert set(transaction.lmd['stats_v2']['columns']) == set(
            input_dataframe.columns)
Example #13
0
    def test_data_analysis(self, transaction, lmd):
        """Tests that data analyzer doesn't crash on common types"""
        data_analyzer = DataAnalyzer(session=transaction.session,
                                     transaction=transaction)

        n_points = 100
        n_category_values = 4
        category_ints = list(range(n_category_values)) * (
            n_points // n_category_values)

        input_dataframe = pd.DataFrame(
            {
                'numeric_int': [i % 10 for i in range(n_points)],
                'numeric_float': np.linspace(0, n_points, n_points),
                'date_timestamp': [
                    (datetime.now() - timedelta(minutes=int(i))).isoformat()
                    for i in range(n_points)
                ],
                'date_date': [
                    (datetime.now() - timedelta(days=i)).strftime('%Y-%m-%d')
                    for i in range(n_points)
                ],
                'categorical_str': [f'a{v}' for v in category_ints],
                'categorical_binary': [0, 1] * (n_points // 2),
                'categorical_int': list(category_ints),
                'sequential_array':
                [f"1,2,3,4,5,{i}" for i in range(n_points)],
                'short_text': generate_short_sentences(n_points),
                'rich_text': generate_rich_sentences(n_points)
            },
            index=list(range(n_points)))

        lmd['stats_v2'] = self.get_stats_v2(input_dataframe.columns)
        hmd = transaction.hmd

        input_data = TransactionData()
        input_data.data_frame = input_dataframe
        data_analyzer.run(input_data)

        stats_v2 = lmd['stats_v2']

        for col_name in input_dataframe.columns:
            col_stats = stats_v2[col_name]
            # No column is empty, and every column gets a histogram,
            # percentage buckets and an entropy-based bias measure.
            assert col_stats['empty']['empty_percentage'] == 0
            assert not col_stats['empty']['is_empty']
            assert col_stats['histogram']
            assert 'percentage_buckets' in col_stats
            assert col_stats['bias']['entropy']

        assert stats_v2['categorical_str']['unique']['unique_values']
        assert stats_v2['categorical_str']['unique'][
            'unique_percentage'] == 4.0

        # Histograms over text columns are built from words.
        for text_col in ('short_text', 'rich_text'):
            assert isinstance(stats_v2[text_col]['histogram']['x'][0], str)

        for col in ['numeric_float', 'numeric_int']:
            outliers = stats_v2[col]['outliers']
            assert isinstance(outliers['outlier_values'], list)
            assert isinstance(outliers['outlier_buckets'], list)
            assert isinstance(outliers['description'], str)
            # Outlier buckets must be a subset of the percentage buckets.
            assert set(outliers['outlier_buckets']) <= set(
                stats_v2[col]['percentage_buckets'])

        # The analyzer must not touch the heavy metadata, and the light
        # metadata must remain json-serializable.
        assert hmd == {}

        assert isinstance(json.dumps(transaction.lmd), str)