Ejemplo n.º 1
0
    def test_column_names(self):
        """Check ``Store.column_names`` with and without column type filters.

        Both data frames handed to the store are the same object, so only
        the column name lookup is exercised here.
        """
        sales = {
            'brand': [
                "Jones LLC", "Alpha Co", "Blue Inc", "Blue Inc", "Alpha Co",
                "Jones LLC", "Alpha Co", "Blue Inc", "Blue Inc", "Alpha Co",
                "Jones LLC"
            ] * 10,
            'payment': [150, 200, 50, 10, 5, 150, 200, 50, 10, 5, 1] * 10,
            'description':
            ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K"] * 10
        }
        df1 = pd.DataFrame.from_dict(sales)
        df2 = df1

        store = Store(df1, df2)

        with self.subTest("Test access with column_type"):
            self.assertEqual(['payment'],
                             store.column_names(ColumnType.numerical))
            self.assertCountEqual(['brand', 'description'],
                                  store.column_names(ColumnType.categorical,
                                                     ColumnType.text))

        with self.subTest("Test access without specifying column_type"):
            self.assertCountEqual(['brand', 'payment', 'description'],
                                  store.column_names())

        with self.subTest("Incorrect column_types"):
            # Call the method under test. The earlier version called the
            # store object itself, so (unless Store happens to be callable)
            # the TypeError came from that call and column_names was never
            # exercised with an invalid column type.
            with self.assertRaises(TypeError):
                store.column_names(ColumnType.numerical, 'no_column_type')
Ejemplo n.º 2
0
    def __init__(self,
                 df1: Union[pd.DataFrame, str],
                 df2: Union[pd.DataFrame, str],
                 delimiter=',',
                 log_print=True,
                 **custom_column_types):
        """Load both data sets and build the store from them.

        :param df1: either a pandas data frame or a string that is handed
            to ``read_from_csv`` (presumably a CSV file path)
        :param df2: same as ``df1`` for the second data set
        :param delimiter: delimiter handed to ``read_from_csv``
        :param log_print: flag handed to ``lprint`` and to the ``Store``
        :param custom_column_types: keyword arguments collected into a dict
            and handed to the ``Store`` as ``custom_column_types``
        :raises TypeError: if df1 or df2 is neither a data frame nor a string
        """
        def load(source, name):
            # isinstance (instead of an exact type comparison) also accepts
            # subclasses of DataFrame and str.
            if isinstance(source, pd.DataFrame):
                return source
            if isinstance(source, str):
                return read_from_csv(source, delimiter)
            # TypeError is a subclass of Exception, so callers catching
            # Exception keep working.
            raise TypeError(
                "{} is not a dataframe or a string".format(name))

        self.df1 = load(df1, 'df1')
        self.df2 = load(df2, 'df2')

        self.log_print = log_print
        self.check_reports = []
        self.store = Store(self.df1,
                           self.df2,
                           log_print=self.log_print,
                           custom_column_types=custom_column_types)

        lprint(
            "Used columns: {}".format(', '.join(
                column_names(self.store.column_names()))), self.log_print)
Ejemplo n.º 3
0
    def setUp(self):
        """Create string-valued and integer-valued frame pairs, their stores and the check."""
        self.df1 = DataFrame.from_dict({
            'shift': ['A'] * 100,
            'no_shift': ['C'] * 100,
        })
        self.df2 = DataFrame.from_dict({
            'shift': ['B'] * 100,
            'no_shift': ['C'] * 100,
        })
        # The two numeric frames differ only in every fourth value (4 vs. 6).
        self.df1_num = DataFrame.from_dict({'cool_numbers': [1, 2, 3, 4] * 10})
        self.df2_num = DataFrame.from_dict({'cool_numbers': [1, 2, 3, 6] * 10})

        self.store = Store(self.df1, self.df2)
        self.store_num = Store(self.df1_num, self.df2_num)
        self.check = DQMetricsCheck()
Ejemplo n.º 4
0
    def setUp(self):
        """Run a word2vec EmbeddingDistanceCheck on two 20-row string frames."""
        tokens = [
            'ab', 'hi', 'jk', 'lm', 'no', 'pq', 'rs', 'tu', 'vw', 'xy',
            'z1', '23', '45', '67', '89', '10', '11', '12', '13', '14'
        ]

        self.df1 = pd.DataFrame({
            'col1': ['ab cd ef'] * 20,
            'col2': list(tokens),
        })
        # df2.col2 equals df1.col2 except for its last entry ('15').
        self.df2 = pd.DataFrame({
            'col1': list(tokens),
            'col2': tokens[:-1] + ['15'],
        })

        self.store = Store(self.df1, self.df2)
        check = EmbeddingDistanceCheck(model='word2vec')
        self.report = check.run(self.store)
Ejemplo n.º 5
0
    def setUp(self):
        """Build the store and one WordPredictionPrecalculation per column."""
        # NOTE(review): PYTHONHASHSEED is normally read at interpreter
        # start-up -- confirm that assigning it here has the intended effect.
        os.environ['PYTHONHASHSEED'] = "0"

        # ['a', 'b', ..., 'z']
        alphabet = [chr(code) for code in range(ord('a'), ord('z') + 1)]

        # Sliding windows of seven consecutive letters joined by spaces:
        # ['a b c d e f g', 'b c d e f g h', ...]
        width = 7
        col = [
            ' '.join(alphabet[start:start + width])
            for start in range(len(alphabet) - width)
        ]

        # One single letter per row, as many rows as there are windows.
        error_col = alphabet[:len(col)]

        first = {'shift': col, 'no_shift': col, 'num_col': error_col}
        second = {
            'shift': ['B B B B B B'] * len(col),
            'no_shift': col,
            'num_col': error_col
        }
        self.df1 = DataFrame.from_dict(first)
        self.df2 = DataFrame.from_dict(second)
        self.store = Store(self.df1, self.df2)

        self.precalculation_shift = WordPredictionPrecalculation(
            'shift', num_epochs_predictor=10, ft_workers=2, seed=1)
        self.precalculation_no_shift = WordPredictionPrecalculation(
            'no_shift', num_epochs_predictor=10, ft_workers=1, seed=1)
        self.precalculation_error = WordPredictionPrecalculation(
            'num_col', num_epochs_predictor=10, ft_workers=1, seed=1)
Ejemplo n.º 6
0
    def setUp(self):
        """Create two 100-row frames with disjoint 'num' ranges and the binning precalculation."""
        self.n = 5
        rows = 100
        constant = [0] * rows
        # 'num' runs 1..100 in df1 and 101..200 in df2; 'cat' is all zeros.
        self.df1 = DataFrame({'num': np.arange(1, rows + 1), 'cat': list(constant)})
        self.df2 = DataFrame({'num': np.arange(rows + 1, 2 * rows + 1), 'cat': list(constant)})

        self.store = Store(self.df1, self.df2)
        self.precalculation = BinningPrecalculation(self.n)
Ejemplo n.º 7
0
    def setUp(self):
        """Build the store and two WordPredictionCheck variants (automatic and explicit columns)."""
        # NOTE(review): PYTHONHASHSEED is normally read at interpreter
        # start-up -- confirm that assigning it here has the intended effect.
        os.environ['PYTHONHASHSEED'] = "0"

        letters = [chr(code) for code in range(ord('a'), ord('z') + 1)]

        # Sliding windows of seven consecutive letters joined by spaces.
        width = 7
        sentences = [
            ' '.join(letters[start:start + width])
            for start in range(len(letters) - width)
        ]
        # One single letter per row.
        col_too_short = letters[:len(sentences)]

        first = {
            'shift': sentences,
            'no_shift': sentences,
            'too_short': col_too_short
        }
        second = {
            'shift': ['B B B B B B'] * len(sentences),
            'no_shift': sentences,
            'too_short': col_too_short
        }
        self.df1 = DataFrame.from_dict(first)
        self.df2 = DataFrame.from_dict(second)
        self.store = Store(self.df1, self.df2)

        self.check_automatic_col_detection = WordPredictionCheck(
            relative_thresh=.15, ft_size=10, ft_workers=1, seed=1)
        self.check_custom_cols = WordPredictionCheck(
            columns=['shift', 'no_shift'],
            relative_thresh=.15,
            ft_size=10,
            ft_workers=1,
            seed=1)
Ejemplo n.º 8
0
 def setUp(self):
     """Create two constant-valued string frames, their store and the check under test."""
     # 'shift' is all 'A' in df1 and all 'B' in df2; 'no_shift' is all 'C' in both.
     self.df1 = DataFrame.from_dict({
         'shift': ['A'] * 100,
         'no_shift': ['C'] * 100,
     })
     self.df2 = DataFrame.from_dict({
         'shift': ['B'] * 100,
         'no_shift': ['C'] * 100,
     })
     self.store = Store(self.df1, self.df2)
     check_class = conditional_probabilities_check.ConditionalProbabilitiesCheck
     self.check = check_class()
Ejemplo n.º 9
0
 def setUp(self):
     """Create two constant-valued string frames, their store and the precalculation."""
     columns = ['shift', 'no_shift']
     # 'shift' is all 'A' in df1 and all 'B' in df2; 'no_shift' is all 'C' in both.
     self.df1 = DataFrame.from_dict({
         'shift': ['A'] * 100,
         'no_shift': ['C'] * 100,
     })
     self.df2 = DataFrame.from_dict({
         'shift': ['B'] * 100,
         'no_shift': ['C'] * 100,
     })
     self.store = Store(self.df1, self.df2)
     self.precalculation = DistinctionPrecalculation(columns, num_epochs=10)
Ejemplo n.º 10
0
 def test_significant(self):
     """Expect one examined, one shifted and one explained column for the significant fixtures."""
     first = pd.DataFrame(self.significant_1)
     second = pd.DataFrame(self.significant_2)
     report = NumericalStatisticalCheck().run(Store(first, second))

     self.assertEqual(1, len(report.examined_columns))
     self.assertEqual(1, len(report.shifted_columns))
     self.assertEqual(1, len(report.explanation))
Ejemplo n.º 11
0
 def test_not_significant(self):
     """Expect no shifted column when the second frame is the first one in reverse order."""
     forward = pd.DataFrame.from_dict({'text': self.poems})
     backward = pd.DataFrame.from_dict({'text': list(reversed(self.poems))})
     report = TextMetadataStatisticalCheck().run(Store(forward, backward))

     self.assertEqual(1, len(report.examined_columns))
     self.assertEqual(0, len(report.shifted_columns))
     self.assertEqual(0, len(report.explanation))
Ejemplo n.º 12
0
 def test_column_order_in_report(self):
     """Expect 'abc' to be reported first although it is the second frame column."""
     column_order = ('text', 'abc')
     # Both columns of a frame carry the same data.
     poems_frame = pd.DataFrame.from_dict(dict.fromkeys(column_order, self.poems))
     phrases_frame = pd.DataFrame.from_dict(dict.fromkeys(column_order, self.phrases))

     check = TextMetadataStatisticalCheck([NumCharsMetadata()])
     report = check.run(Store(poems_frame, phrases_frame))

     self.assertEqual('abc', report.examined_columns[0])
     self.assertEqual('abc', report.shifted_columns[0])
     self.assertEqual(report.examined_columns, report.shifted_columns)
Ejemplo n.º 13
0
 def test_error_on_small_dataframe(self):
     """Expect ValueError when te2 processes a store whose second frame has 12 rows."""
     short_values = [
         'ab', 'hi', 'jk', 'lm', 'no', 'pq', 'rs', 'tu', 'vw', 'xy',
         '12', '34'
     ]
     small_store = Store(self.df1, pd.DataFrame({'col1': short_values}))
     with self.assertRaises(ValueError):
         self.te2.process(small_store)
Ejemplo n.º 14
0
 def test_significant(self):
     """Expect the 'text' column to be reported as shifted between poems and phrases."""
     poems_frame = pd.DataFrame.from_dict({'text': self.poems})
     phrases_frame = pd.DataFrame.from_dict({'text': self.phrases})
     store = Store(poems_frame, phrases_frame)

     metadata = [
         NumCharsMetadata(),
         NumWordsMetadata(),
         DistinctWordsRatioMetadata(),
         LanguagePerParagraph(),
     ]
     report = TextMetadataStatisticalCheck(metadata).run(store)

     self.assertEqual(1, len(report.examined_columns))
     self.assertEqual(1, len(report.shifted_columns))
     self.assertEqual(1, len(report.explanation))
Ejemplo n.º 15
0
 def test_column_order_in_report(self):
     """Expect both statistical checks to report 'abc' first and every examined column as shifted."""
     first = pd.DataFrame([[1, 0]] * 10, columns=['abc', 'def'])
     second = pd.DataFrame([[0, 1]] * 10, columns=['abc', 'def'])
     store = Store(first, second)

     checks = (CategoricalStatisticalCheck(), NumericalStatisticalCheck())
     for check in checks:
         with self.subTest(check=check):
             report = check.run(store)
             self.assertEqual('abc', report.examined_columns[0])
             self.assertEqual('abc', report.shifted_columns[0])
             self.assertEqual(report.examined_columns, report.shifted_columns)
Ejemplo n.º 16
0
    def setUp(self):
        """Run a character 3-gram SorensenDiceCheck on two 20-row string frames."""
        # df1 repeats one sentence; df2 holds 20 different short strings.
        self.df1 = pd.DataFrame({'col1': ['ab cd ef'] * 20})
        self.df2 = pd.DataFrame({
            'col1': [
                'ab c', 'hij', 'jkl', 'lmn', 'nop', 'pqr', 'rst', 'tuv',
                'vwx', 'xyz', 'z12', '234', '456', '678', '890', 'zyx',
                'xwv', 'vut', 'tsr', 'rqp'
            ]
        })

        self.store = Store(self.df1, self.df2)
        check = SorensenDiceCheck(ngram_type=NGramType.character, n=3)
        self.report = check.run(self.store)
Ejemplo n.º 17
0
 def test_error_on_small_dataframe(self):
     """Expect ValueError when the precalculation processes a store whose second frame has 12 rows."""
     short_values = [
         'ab', 'hi', 'jk', 'lm', 'no', 'pq', 'rs', 'tu', 'vw', 'xy',
         '12', '34'
     ]
     small_store = Store(self.df1, pd.DataFrame({'col1': short_values}))
     # The precalculation is constructed inside the block, as before, so an
     # error raised by its constructor would be caught as well.
     with self.assertRaises(ValueError):
         SorensenDicePrecalculations(ngram_type=NGramType.character,
                                     n=3).process(small_store)
Ejemplo n.º 18
0
    def setUp(self):
        """Create three CountVectorizer instances and a duck/goose store."""
        def vectorizer(max_features):
            # Every instance gets its own fresh ``columns`` list.
            return CountVectorizer(columns=['col1'],
                                   stop_words='english',
                                   max_features=max_features)

        # count1 and count2 are configured identically; count3 allows one more feature.
        self.count1 = vectorizer(2)
        self.count2 = vectorizer(2)
        self.count3 = vectorizer(3)

        # Nine times one word followed by a single occurrence of the other.
        self.df1 = pd.DataFrame({'col1': ['duck'] * 9 + ['goose']})
        self.df2 = pd.DataFrame({'col1': ['goose'] * 9 + ['duck']})
        self.store = Store(self.df1, self.df2)
Ejemplo n.º 19
0
    def setUp(self) -> None:
        """Create the precalculation plus one store of number columns and one of colour strings."""
        self.precalculation = DQMetricsPrecalculation()

        first_numbers = {
            'col_1': range(100),
            'col_2': list(range(50)) * 2,
            'col_3': range(0, 200, 2),
        }
        second_numbers = {
            'col_1': range(1, 101),
            # a single value followed by 99 missing entries
            'col_2': [8] + [None] * 99,
            'col_3': list(range(50, 100)) * 2,
        }
        numerical_df_1 = pd.DataFrame.from_dict(first_numbers)
        numerical_df_2 = pd.DataFrame.from_dict(second_numbers)

        first_colours = ['red', 'blue', 'blue', 'green', 'green', 'green']
        second_colours = ['red'] + ['green'] * 5
        categorical_df_1 = pd.DataFrame(first_colours * 20)
        categorical_df_2 = pd.DataFrame(second_colours * 20)

        self.store_numerical = Store(numerical_df_1, numerical_df_2)
        self.store_categorical = Store(categorical_df_1, categorical_df_2)
Ejemplo n.º 20
0
 def test_min_data_size_is_enforced(self):
     """Expect ten rows per frame to be accepted and smaller frames to raise InsufficientDataError."""
     df1 = pd.DataFrame(list(range(10)))
     df2 = pd.DataFrame(list(range(10)))
     store = Store(df1=df1, df2=df2)

     # The store is expected to hand back both frames converted to float.
     for position, frame in enumerate((df1, df2)):
         assert_frame_equal(frame.astype(float),
                            store[ColumnType.numerical][position])

     with self.assertRaises(InsufficientDataError):
         Store(df1=pd.DataFrame(), df2=pd.DataFrame([0]))

     with self.assertRaises(InsufficientDataError):
         Store(df1=pd.DataFrame(list(range(9))),
               df2=pd.DataFrame(list(range(20))))
Ejemplo n.º 21
0
    def test_init_custom_column_types(self):
        """Check Store construction with valid and invalid ``custom_column_types`` arguments."""
        brands = [
            "Jones LLC", "Alpha Co", "Blue Inc", "Blue Inc", "Alpha Co",
            "Jones LLC", "Alpha Co", "Blue Inc", "Blue Inc", "Alpha Co",
            "Jones LLC"
        ]
        payments = [150, 200, 50, 10, 5, 150, 200, 50, 10, 5, 1]
        descriptions = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K"]
        frame = pd.DataFrame.from_dict({
            'brand': brands * 10,
            'payment': payments * 10,
            'description': descriptions * 10,
        })
        # The same frame object is used for both sides of the store.
        df1 = df2 = frame

        with self.subTest("Successful initialisation"):
            Store(df1,
                  df2,
                  custom_column_types={'description': ColumnType.categorical})

        with self.subTest(
                "Exception when no dict is passed as custom_column_types"):
            with self.assertRaises(TypeError):
                Store(df1, df2, custom_column_types='no_dict')

        with self.subTest(
                "Exception when key of custom_column_types is not a string"):
            with self.assertRaises(TypeError):
                Store(df1, df2, custom_column_types={0: ColumnType.numerical})

        with self.subTest(
                "Exception when value of custom_column_types is not a ColumnType"
        ):
            with self.assertRaises(TypeError):
                Store(df1, df2, custom_column_types={'brand': 0})
Ejemplo n.º 22
0
    def test_change_column_type(self):
        """Expect an exception when a column containing 'a' is forced to the numerical type."""
        values = ['a', '200', '50', '10', '5', '150', '200', '50', '10', '5', '1']
        frame = pd.DataFrame.from_dict({'to_numerical': values * 10})
        # The same frame object is used for both sides of the store.
        df1 = df2 = frame
        requested_types = {'to_numerical': ColumnType.numerical}

        with self.subTest(
                "Exception when trying to convert non-numerical column to numerical"
        ):
            with self.assertRaises(Exception):
                Store(df1, df2, custom_column_types=requested_types)
Ejemplo n.º 23
0
 def setUp(self):
     """Create two three-column string frames, their store and the DistinctionCheck."""
     # 'shift' is constant but different per frame, 'small_shift' alternates
     # between two values per frame, 'no_shift' is all 'C' in both frames.
     self.df1 = DataFrame.from_dict({
         'shift': ['A'] * 100,
         'small_shift': ['A', 'B'] * 50,
         'no_shift': ['C'] * 100,
     })
     self.df2 = DataFrame.from_dict({
         'shift': ['B'] * 100,
         'small_shift': ['B', 'C'] * 50,
         'no_shift': ['C'] * 100,
     })
     self.store = Store(self.df1, self.df2)
     self.check = DistinctionCheck()
Ejemplo n.º 24
0
    def test_process(self):
        """Expect the precalculation to return only the 'brand' and 'payment' columns."""
        brands = [
            "Jones LLC", "Alpha Co", "Blue Inc", "Blue Inc", "Alpha Co",
            "Jones LLC", "Alpha Co", "Blue Inc", "Blue Inc", "Alpha Co",
            "Jones LLC"
        ]
        payments = [
            150.0, 200.0, 50.0, 10.0, 5.0, 150.0, 200.0, 50.0, 10.0, 5.0,
            1.0
        ]
        descriptions = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K"]
        frame = DataFrame.from_dict({
            'brand': brands * 10,
            'payment': payments * 10,
            'description': descriptions * 10,
        })

        # The same frame object is used for both sides of the store.
        store = Store(frame, frame)
        processed, _, columns = self.precalculation.process(store)

        expected_columns = ['brand', 'payment']
        self.assertCountEqual(expected_columns, columns)
        self.assertTrue(processed.equals(frame[expected_columns]))
Ejemplo n.º 25
0
    def test_quartile_metrics(self):
        """Compare the reported quartiles and median of 'col_1' with pandas' quantile."""
        numerical_df_1 = pd.DataFrame(list(range(12)), columns=['col_1'])
        numerical_df_2 = pd.DataFrame(list(range(1, 20, 2)), columns=['col_1'])
        new_numerical_store = Store(numerical_df_1, numerical_df_2)

        result = self.precalculation.process(new_numerical_store)
        comparison_numeric = result['numerical_comparison']

        frames = (('df1', numerical_df_1), ('df2', numerical_df_2))
        for key, frame in frames:
            column = frame['col_1']
            # As before, only the first quartile is compared approximately.
            self.assertAlmostEqual(
                comparison_numeric['col_1']['quartile_1'][key],
                column.quantile(.25))
            self.assertEqual(comparison_numeric['col_1']['quartile_3'][key],
                             column.quantile(.75))
            self.assertEqual(comparison_numeric['col_1']['median'][key],
                             column.quantile(.5))
Ejemplo n.º 26
0
    def setUp(self):
        """Create a 20-row and a 15-row string frame, five NGram objects and the store."""
        self.df1 = pd.DataFrame({'col1': ['ab cd ef'] * 20})

        # Every df2 entry is a two-character token followed by one space.
        tokens = [
            'ab', 'hi', 'jk', 'lm', 'no', 'pq', 'rs', 'tu', 'vw', 'xy',
            'z1', '23', '45', '67', '89'
        ]
        self.df2 = pd.DataFrame({'col1': [token + ' ' for token in tokens]})

        # wordng1 and wordng2 are configured identically.
        self.wordng1 = NGram(1, NGramType.word)
        self.wordng2 = NGram(1, NGramType.word)
        self.wordng3 = NGram(2, NGramType.word)
        self.charng1 = NGram(1, NGramType.character)
        self.charng2 = NGram(5, NGramType.character)

        self.store = Store(self.df1, self.df2)
Ejemplo n.º 27
0
    def setUp(self):
        """Create the store and five EmbeddingDistancePrecalculation variants."""
        self.df1 = pd.DataFrame({'col1': ['ab cd ef'] * 20})
        self.df2 = pd.DataFrame({
            'col1': [
                'ab', 'hi', 'jk', 'lm', 'no', 'pq', 'rs', 'tu', 'vw', 'xy',
                'z1', '23', '45', '67', '89', '10', '11', '12', '13', '14'
            ]
        })

        self.store = Store(self.df1, self.df2)

        # te1/te2 share one configuration; te4/te5 share the same model object.
        shared_model = Word2Vec(size=50, window=5, min_count=1, workers=4)
        self.te1 = EmbeddingDistancePrecalculation(model='word2vec')
        self.te2 = EmbeddingDistancePrecalculation(model='word2vec')
        self.te3 = EmbeddingDistancePrecalculation(model='fasttext')
        self.te4 = EmbeddingDistancePrecalculation(trained_model=shared_model)
        self.te5 = EmbeddingDistancePrecalculation(trained_model=shared_model)
Ejemplo n.º 28
0
class Detector:
    """The detector object acts as the central object.
    It is passed the data frames you want to compare.

    :param df1: either a pandas data frame or a file path
    :param df2: either a pandas data frame or a file path
    :param delimiter: delimiter for csv files
    :param log_print: flag handed to ``lprint`` and to the ``Store``
    :param custom_column_types: keyword arguments collected into a dict and
        handed to the ``Store`` as ``custom_column_types``
    :raises TypeError: if df1 or df2 is neither a data frame nor a string
    """
    def __init__(self,
                 df1: Union[pd.DataFrame, str],
                 df2: Union[pd.DataFrame, str],
                 delimiter=',',
                 log_print=True,
                 **custom_column_types):
        self.df1 = self._load_frame(df1, 'df1', delimiter)
        self.df2 = self._load_frame(df2, 'df2', delimiter)

        self.log_print = log_print
        self.check_reports = []
        self.store = Store(self.df1,
                           self.df2,
                           log_print=self.log_print,
                           custom_column_types=custom_column_types)

        lprint(
            "Used columns: {}".format(', '.join(
                column_names(self.store.column_names()))), self.log_print)

    @staticmethod
    def _load_frame(source, name, delimiter):
        """Return ``source`` as a data frame, reading it with ``read_from_csv`` if it is a string."""
        # isinstance (instead of an exact type comparison) also accepts
        # subclasses of DataFrame and str.
        if isinstance(source, pd.DataFrame):
            return source
        if isinstance(source, str):
            return read_from_csv(source, delimiter)
        # TypeError is a subclass of Exception, so callers catching
        # Exception keep working.
        raise TypeError("{} is not a dataframe or a string".format(name))

    def run(self, *checks, logger_level=logger.ERROR):
        """
        Run the Detector with the checks to run.

        A check that raises does not abort the run: its exception is
        recorded in a report without examined or shifted columns.

        :param checks: checks to run
        :param logger_level: level of logging
        :raises Exception: if no checks are given or an element is not a Check
        """
        logger.getLogger().setLevel(logger_level)

        if not checks:
            raise Exception("Please include checks")

        if not all(isinstance(check, Check) for check in checks):
            class_names = (check.__class__.__name__ for check in checks)
            raise Exception(
                "All elements in checks should be a Check. Received: {}".
                format(', '.join(class_names)))

        check_reports = []
        for check in checks:
            check_name = check.__class__.__name__
            lprint("Executing {}".format(check_name), self.log_print)
            try:
                report = check.run(self.store)
            except Exception as e:
                # Deliberate best effort: keep the error as a report so the
                # remaining checks still run.
                report = Report(check_name,
                                examined_columns=[],
                                shifted_columns=[],
                                information={e.__class__.__name__: str(e)})
            check_reports.append(report)
        self.check_reports = check_reports

    def evaluate(self):
        """
        Evaluate the reports.

        Prints an overview, draws a check-by-column matrix and then prints
        every report followed by its figures. The matrix is skipped when
        there are no reports or no examined columns.
        """
        nprint("OVERVIEW", text_formatting='h1')
        num_reports = len(self.check_reports)
        nprint("Executed {} check{}".format(
            num_reports, '' if num_reports == 1 else 's'))

        shifted_by_check = {}
        examined_seen = {}
        for report in self.check_reports:
            # setdefault keeps the columns of earlier reports when several
            # reports share one check name.
            shifted = shifted_by_check.setdefault(report.check_name, set())
            shifted.update(report.shifted_columns)
            for examined_column in report.examined_columns:
                # dict keys give unique columns in first-seen order
                examined_seen[examined_column] = None
        examined = list(examined_seen)

        if shifted_by_check and examined:
            self._plot_check_matrix(shifted_by_check, examined)

        nprint("DETAILS", text_formatting='h1')
        for report in self.check_reports:
            report.print_report()
            for figure in report.figures:
                figure()

    @staticmethod
    def _plot_check_matrix(shifted_by_check, examined):
        """Draw one row per check and one column per examined column (red = shift reported, green = not)."""
        check_matrix = [[1 if col in shifted else 0 for col in examined]
                        for shifted in shifted_by_check.values()]
        check_labels = list(shifted_by_check)

        custom_cmap = colors.ListedColormap(['green', 'red'])

        _, ax = plt.subplots()
        ax.imshow(check_matrix,
                  cmap=custom_cmap,
                  interpolation='none',
                  vmin=0,
                  vmax=1)

        # We want to show all ticks...
        ax.set_xticks(np.arange(len(examined)))
        ax.set_yticks(np.arange(len(check_labels)))
        # ... and label them with the respective list entries
        ax.set_xticklabels(examined)
        ax.set_yticklabels(check_labels)

        plt.setp(ax.get_xticklabels(),
                 rotation=45,
                 ha="right",
                 rotation_mode="anchor")

        # Minor ticks sit on the cell borders so the grid separates the cells
        ax.set_xticks(np.arange(-.5, len(examined), 1), minor=True)
        ax.set_yticks(np.arange(-.5, len(check_labels), 1), minor=True)
        ax.grid(which='minor', color='k', linestyle='-', linewidth=2)

        # NOTE(review): kept as before; confirm whether wrapping plt.show()
        # in display() is needed.
        display(plt.show())
Ejemplo n.º 29
0
 def setUp(self):
     """Build a store from the poems and phrases fixtures of the ``td`` module."""
     first_texts, second_texts = td.poems, td.phrases
     first_frame = pd.DataFrame.from_dict({'text': first_texts})
     second_frame = pd.DataFrame.from_dict({'text': second_texts})
     self.store = Store(first_frame, second_frame)
Ejemplo n.º 30
0
    def setUp(self):
        self.lda_report1 = LdaCheck(shift_threshold=0.1,
                                    n_topics=2,
                                    n_iter=1,
                                    random_state=0,
                                    lib='sklearn')
        self.lda_report2 = LdaCheck(shift_threshold=0.1,
                                    n_topics=2,
                                    n_iter=1,
                                    random_state=0,
                                    lib='sklearn')
        self.lda_report3 = LdaCheck(shift_threshold=0.1,
                                    n_topics=2,
                                    n_iter=8,
                                    random_state=0,
                                    lib='gensim',
                                    columns=['text'])
        self.lda_report4 = LdaCheck(shift_threshold=0.1,
                                    n_topics=2,
                                    n_iter=8,
                                    random_state=0,
                                    lib='gensim',
                                    columns=['abcd'])
        self.lda_report5 = LdaCheck(shift_threshold=0.1,
                                    n_topics='auto',
                                    n_iter=1,
                                    random_state=0,
                                    lib='sklearn',
                                    columns=['text'])
        self.lda_report6 = LdaCheck(shift_threshold=0.1,
                                    n_topics='auto',
                                    n_iter=1,
                                    random_state=0,
                                    lib='gensim',
                                    columns=['text'])

        self.poems = [
            'Tell me not, in mournful numbers,\nLife is but an empty dream!\nFor the soul is dead that slumbers,\nAnd things are not what they seem.',
            'Life is real! Life is earnest!\nAnd the grave is not its goal;\nDust thou art, to dust returnest,\nWas not spoken of the soul.',
            'Not enjoyment, and not sorrow,\nIs our destined end or way;\nBut to act, that each to-morrow\nFind us farther than to-day.',
            'Art is long, and Time is fleeting,\nAnd our hearts, though stout and brave,\nStill, like muffled drums, are beating\nFuneral marches to the grave.',
            'In the world’s broad field of battle,\nIn the bivouac of Life,\nBe not like dumb, driven cattle!\nBe a hero in the strife! ',
            'Trust no Future, howe’er pleasant!\nLet the dead Past bury its dead!\nAct,— act in the living Present!\nHeart within, and God o’erhead! ',
            'LIFE, believe, is not a dream\nSo dark as sages say;\nOft a little morning rain\nForetells a pleasant day.\nSometimes there are clouds of gloom,\nBut these are transient all;\nIf the shower will make the roses bloom,\nO why lament its fall ? ',
            "Rapidly, merrily,\nLife's sunny hours flit by,\nGratefully, cheerily,\nEnjoy them as they fly !",
            "What though Death at times steps in\nAnd calls our Best away ?\nWhat though sorrow seems to win,\nO'er hope, a heavy sway ?\nYet hope again elastic springs,\nUnconquered, though she fell;\nStill buoyant are her golden wings,\nStill strong to bear us well.\nManfully, fearlessly,\nThe day of trial bear,\nFor gloriously, victoriously,\nCan courage quell despair ! ",
            'When sinks my heart in hopeless gloom,\nAnd life can shew no joy for me;\nAnd I behold a yawning tomb,\nWhere bowers and palaces should be;\nIn vain you talk of morbid dreams;\nIn vain you gaily smiling say,\nThat what to me so dreary seems,\nThe healthy mind deems bright and gay.',
            "I too have smiled, and thought like you,\nBut madly smiled, and falsely deemed:\nTruth led me to the present view,\nI'm waking now -- 'twas then I dreamed. ",
            'I lately saw a sunset sky,\nAnd stood enraptured to behold\nIts varied hues of glorious dye:\nFirst, fleecy clouds of shining gold; ',
            'These blushing took a rosy hue;\nBeneath them shone a flood of green;\nNor less divine, the glorious blue\nThat smiled above them and between.',
            'I cannot name each lovely shade;\nI cannot say how bright they shone;\nBut one by one, I saw them fade;\nAnd what remained whey they were gone?',
            "Dull clouds remained, of sombre hue,\nAnd when their borrowed charm was o'er,\nThe azure sky had faded too,\nThat smiled so softly bright before. ",
            'So, gilded by the glow of youth,\nOur varied life looks fair and gay;\nAnd so remains the naked truth,\nWhen that false light is past away. ',
            'Why blame ye, then, my keener sight,\nThat clearly sees a world of woes,\nThrough all the haze of golden light,\nThat flattering Falsehood round it throws? ',
            'When the young mother smiles above\nThe first-born darling of her heart,\nHer bosom glows with earnest love,\nWhile tears of silent transport start. ',
            'Fond dreamer! little does she know\nThe anxious toil, the suffering,\nThe blasted hopes, the burning woe,\nThe object of her joy will bring. ',
            'Her blinded eyes behold not now\nWhat, soon or late, must be his doom;\nThe anguish that will cloud his brow,\nThe bed of death, the dreary tomb. ',
            'As little know the youthful pair,\nIn mutual love supremely blest,\nWhat weariness, and cold despair,\nEre long, will seize the aching breast. ',
            'And, even, should Love and Faith remain,\n(The greatest blessings life can show,)\nAmid adversity and pain,\nTo shine, throughout with cheering glow; ',
            'They do not see how cruel Death\nComes on, their loving hearts to part:\nOne feels not now the gasping breath,\nThe rending of the earth-bound heart, --',
            "The soul's and body's agony,\nEre she may sink to her repose,\nThe sad survivor cannot see\nThe grave above his darling close;",
            'Nor how, despairing and alone,\nHe then must wear his life away;\nAnd linger, feebly toiling on,\nAnd fainting, sink into decay. ',
            'Oh, Youth may listen patiently,\nWhile sad Experience tells her tale;\nBut Doubt sits smiling in his eye,\nFor ardent Hope will still prevail!',
            "He hears how feeble Pleasure dies,\nBy guilt destroyed, and pain and woe;\nHe turns to Hope -\xad and she replies,\n'Believe it not -\xad it is not so!' ",
            "Oh, heed her not!' Experience says,\n'For thus she whispered once to me;\nShe told me, in my youthful days,\nHow glorious manhood's prime would be. ",
            "When, in the time of early Spring,\nToo chill the winds that o'er me pass'd,\nShe said, each coming day would bring\nA fairer heaven, a gentler blast. ",
            "And when the sun too seldom beamed,\nThe sky, o'ercast, too darkly frowned,\nThe soaking rain too constant streamed,\nAnd mists too dreary gathered round;",
            "She told me Summer's glorious ray\nWould chase those vapours all away,\nAnd scatter glories round,\nWith sweetest music fill the trees,\nLoad with rich scent the gentle breeze,\nAnd strew with flowers the ground. ",
            'But when, beneath that scorching ray,\nI languished, weary, through the day,\nWhile birds refused to sing,\nVerdure decayed from field and tree,\nAnd panting Nature mourned with me\nThe freshness of the Spring. ',
            '"Wait but a little while," she said,\n"Till Summer\'s burning days are fled;\nAnd Autumn shall restore,\nWith golden riches of her own,\nAnd Summer\'s glories mellowed down,\nThe freshness you deplore." ',
            'It has neither a beginning nor an end\nYou can never predict where it will bend.',
            'Life is a teacher, it will show you the way\nBut unless you live it...it will run away.',
            'If you have no fear of living, you will find\nNo fear of death and you will not mind.',
            'You have to feel the agonizing sorrow and feel the pain\nOnly then it will heal and you will be whole again.',
            'It is in every leaf, in your smile, in your tears\nIn your toil, in your triumphs and in your fears.',
            'Just enjoy the journey without looking back\nSavour the senses and you will not lack.',
            'Truth is more in the process than the result\nLiberates you from thought and you can exult',
            'To see the truth in the false, thats the key\nTo understand, without changing it...just let it be\nLove life, live it and it will set you free.... ',
            "He wakes, who never thought to wake again,\nWho held the end was Death. He opens eyes\nSlowly, to one long livid oozing plain\nClosed down by the strange eyeless heavens. He lies;\nAnd waits; and once in timeless sick surmise\nThrough the dead air heaves up an unknown hand,\nLike a dry branch. No life is in that land,\nHimself not lives, but is a thing that cries;\nAn unmeaning point upon the mud; a speck\nOf moveless horror; an Immortal One\nCleansed of the world, sentient and dead; a fly\nFast-stuck in grey sweat on a corpse's neck.",
            "I thought when love for you died, I should die.\nIt's dead. Alone, most strangely, I live on. ",
            'Being released from the womb of a woman\nWe walk towards another\nTo be chained again',
            'None is dependable\nNone is lovable ',
            'Life is a coplex road\nWalls, stones, mud, water…',
            'Life is a groaning running through\nGraveyards',
            "Life is too short to be spent\ngriping about the past,\nthings you don't have,\nplaces you haven't seen,\nthings you haven't done.",
            'Life is too short to be spent\nholding grievances against another,\nfinding fault in your brother,\ncounting the wrongs done on you.',
            "Life is just long enough\nto enjoy the beauty of a sunrise\nthe smell of wet earth\nand the sound of laughter\nafter a long day's work.",
            'Life is just long enough\nto practice compassion and generosity,\nto comfort the grieving,\nto lend strength to the fainthearted,\nand direction to the lost. ',
            'A student life is a golden life, truly it is told.\nThe student who has a golden crown of his study.\nWould be a great man in his life with a great hold.\nOtherwise an idle student becomes a wild rowdy.',
            'Every student should use his time in a proper way.\nIf he kills his time, surely time itself always kills him.\nsuch student repents in life every moment of a day.\nA studen must think that his time is a golden rim.',
            'To be a Dr.an engineer, a lawyer or a politician.\nTime makes him all for all to become more of thing.\nGolden time of a golden life even makes a muscian.\nOne who neglects this period of life becomes nothing.',
            'This period has a great charm to make also a magician.\nA life is in the hands of a student to make him something. ',
            "Some say the world will end in fire,\nSome say in ice.\nFrom what I've tasted of desire\nI hold with those who favor fire.\nBut if it had to perish twice,\nI think I know enough of hate\nTo say that for destruction ice\nIs also great\nAnd would suffice. ",
            'I left you in the morning,\nAnd in the morning glow,\nYou walked a way beside me\nTo make me sad to go.\nDo you know me in the gloaming,\nGaunt and dusty gray with roaming?\nAre you dumb because you know me not,\nOr dumb because you know? ',
            "All for me And not a question\nFor the faded flowers gay\nThat could take me from beside you\nFor the ages of a day?\nThey are yours, and be the measure\nOf their worth for you to treasure,\nThe measure of the little while\nThat I've been long away. ",
            "Here come real stars to fill the upper skies,\nAnd here on earth come emulating flies,\nThat though they never equal stars in size,\n(And they were never really stars at heart)\nAchieve at times a very star-like start.\nOnly, of course, they can't sustain the part. ",
            'I stand amid the roar\nOf a surf-tormented shore,\nAnd I hold within my hand\nGrains of the golden sand-\nHow few! yet how they creep\nThrough my fingers to the deep,\nWhile I weep- while I weep!\nO God! can I not grasp\nThem with a tighter clasp?\nO God! can I not save\nOne from the pitiless wave?\nIs all that we see or seem\nBut a dream within a dream? ',
            'If you die before me\nI would jump down into your grave\nand hug you so innocently\nthat angels will become jealous.',
            'I shall kiss you.\nSo intimately shall I kiss you\nthat your breath becomes mine.\nIn one breath of love shall\nwe merge into hugs of true joy.',
            'Your heart shall beat\nin rhythms unheard\nlike the drums of the desert\nand the wild forest in the night.',
            "You shall murmur in my ears:\n'Oh press me to your chest;\nTear open your chest,\nMake way for me\nto enter into your loving heart\nthat beats only for me in resounding colors.",
            'Tell me please Oh my lover\nIs it a rainbow that I see?\nor the glow of a burning pyre?\nWhy is it that I cannot utter it in words?\nTell me glorious angels of love:\nWhat am I experiencing in uncountable\nmoments of indescribable inner comfort? ',
            'Shivering in your presence\nI shall long to dance with you.',
            'If the dark souls lead you to Hades\nI will dance and dance with you\neven in the nether world.',
            'The most constant\nof all the characteristics\nof Illumination is the\nConsciousness\nof the Absolute.',
            'If you take away my breath\nI will fall down and perish.\nMy body shall return to dust.',
            "I feel God's life-giving breath\nwhen in springtime the nature\nwakes up from the winter sleep.",
            'I shall open my mouth\nand gladly praise my God.',
            'Wilt thou forgive that sin where I begun,\nWhich was my sin, though it were done before?\nWilt thou forgive that sin, through which I run,\nAnd do run still, though still I do deplore?\nWhen thou hast done, thou hast not done,\nFor I have more.',
            "Wilt thou forgive that sin which I have won\nOthers to sin, and made my sin their door?\nWilt thou forgive that sin which I did shun\nA year or two, but wallow'd in, a score?\nWhen thou hast done, thou hast not done,\nFor I have more.",
            'I have a sin of fear, that when I have spun\nMy last thread, I shall perish on the shore;\nBut swear by thyself, that at my death thy Son\nShall shine as he shines now, and heretofore;\nAnd, having done that, thou hast done;\nI fear no more.'
        ]
        self.phrases = [
            'Front-line leading edge website',
            'Upgradable upward-trending software',
            'Virtual tangible throughput', 'Robust secondary open system',
            'Devolved multimedia knowledge user',
            'Intuitive encompassing alliance',
            'Automated 3rd generation benchmark',
            'Switchable global info-mediaries', 'Automated 24/7 alliance',
            'Down-sized homogeneous software',
            'Balanced coherent internet solution',
            'Total intangible groupware',
            'Implemented zero defect Graphic Interface',
            'Programmable multi-tasking open system',
            'Extended non-volatile software',
            'Organized fresh-thinking initiative',
            'Public-key demand-driven product',
            'Visionary asymmetric utilisation',
            'Horizontal web-enabled structure',
            'Upgradable intangible paradigm',
            'Grass-roots background contingency',
            'User-centric homogeneous ability',
            'Face to face 5th generation analyzer',
            'Centralized maximized framework',
            'Future-proofed client-server internet solution',
            'Secured mission-critical benchmark',
            'Virtual zero defect throughput', 'Reduced incremental neural-net',
            'Intuitive real-time help-desk', 'Advanced client-server strategy',
            'Advanced secondary adapter',
            'Assimilated attitude-oriented hierarchy',
            'Innovative mobile project', 'Synergized tertiary emulation',
            'Innovative upward-trending framework',
            'Face to face multi-tasking utilisation',
            'Multi-layered maximized parallelism',
            'Versatile 6th generation utilisation',
            'Automated homogeneous pricing structure',
            'Ameliorated cohesive model',
            'Multi-channelled systemic process improvement',
            'Devolved upward-trending strategy',
            'Quality-focused secondary Graphical User Interface',
            'Diverse impactful focus group', 'Fundamental modular monitoring',
            'Cloned exuding hub', 'Secured clear-thinking matrix',
            'Digitized motivating superstructure',
            'Devolved foreground definition', 'Versatile explicit adapter',
            'Pre-emptive intermediate support',
            'Business-focused actuating interface',
            'Compatible empowering internet solution',
            'Customizable tangible neural-net', 'Networked stable methodology',
            'Networked transitional artificial intelligence',
            'Function-based secondary definition',
            'Horizontal 6th generation task-force',
            'Diverse 3rd generation customer loyalty',
            'Organic mobile structure', 'User-friendly empowering complexity',
            'Versatile stable frame',
            'Synchronised directional superstructure',
            'Enhanced logistical protocol',
            'Persistent empowering open architecture',
            'Profit-focused optimal contingency',
            'User-friendly background migration',
            'Re-engineered directional array',
            'Automated upward-trending knowledge base',
            'Automated tangible attitude', 'Multi-channelled mobile core',
            'Implemented real-time initiative', 'Managed homogeneous concept',
            'Integrated attitude-oriented model'
        ]

        self.df1 = pd.DataFrame(self.poems, columns=['text'])
        self.df2 = pd.DataFrame(self.phrases, columns=['text'])
        self.store = Store(self.df1, self.df2)