class StopWordScrubberTest(unittest.TestCase):
    """Exercise StopWordScrubber on a column holding lists of tokens."""

    # Words from the fixture that must not survive scrubbing.
    STOP_WORDS = ('this', 'has', 'too', 'than', 'can')

    def setUp(self):
        """Build a DataModel with one token-list column named 'test'."""
        data = {
            'test': [
                ['this', 'sentence', 'has', 'multiple', 'stopwords'],
                ['this', 'sentence', 'one', 'multiple', 'too'],
                ['verb', 'noun'],
                ['too', 'than', 'can'],
            ]
        }
        metadata = MetaData()
        self.df = pd.DataFrame(data)
        self.data_model = DataModel(self.df)
        self.data_model.metadata = metadata

    def _assert_no_stop_words(self, rows):
        """Assert that none of STOP_WORDS occurs in any row of *rows*."""
        for row in rows:
            for word in self.STOP_WORDS:
                # assertNotIn reports both the word and the offending row.
                self.assertNotIn(word, row)

    def test_scrubbing_new_column(self):
        """Scrub into 'test2' and check the source column keeps its lengths."""
        scrubber = StopWordScrubber(column='test', new_column='test2')
        scrubber.scrub(self.data_model)
        df = self.data_model.get_dataframe()
        self._assert_no_stop_words(df['test2'])
        # The source column must still hold the original number of tokens.
        self.assertEqual(5, len(df['test'][0]))
        self.assertEqual(5, len(df['test'][1]))
        self.assertEqual(2, len(df['test'][2]))
        self.assertEqual(3, len(df['test'][3]))

    def test_scrubbing(self):
        """Scrub the 'test' column without naming a new column."""
        scrubber = StopWordScrubber(column='test')
        scrubber.scrub(self.data_model)
        df = self.data_model.get_dataframe()
        self._assert_no_stop_words(df['test'])

    def test_verbose_scrubbing(self):
        """Same as test_scrubbing, but with verbosity=1."""
        scrubber = StopWordScrubber(column='test', verbosity=1)
        scrubber.scrub(self.data_model)
        df = self.data_model.get_dataframe()
        self._assert_no_stop_words(df['test'])
class testHTMLScrubber(unittest.TestCase):
    """Exercise CodeScrubber on text containing HTML and JavaScript."""

    # Fragments that must be gone from every scrubbed text.
    FORBIDDEN_FRAGMENTS = ('>', '<', '{', 'bla', 'transform', '50%')
    # Plain text in parentheses that the scrubber must leave alone.
    KEPT_PHRASE = 'dont remove this'

    def setUp(self):
        """Build a DataModel with a 'text' column of markup-laden strings."""
        data = {
            'text': [
                '<p>this is some text</p>',
                '<h1 id="bla", class="blabla", style="transform: translatyeY(-50%)">this is some more text</h1>',
                '<p>and even <b>more</b> text, damn</p>',
                'this is my text (dont remove this)',
                # Backslashes are doubled so the literal holds the same
                # characters as before without invalid escape sequences.
                (
                    "this is some text, $('span#TrackingJobBody a').each(function (i, v) { "
                    "if ($(v).attr('href')) { "
                    "var href = $(v).attr('href').toLowerCase(); "
                    "if (href.match(\"^http\")) { switch (true) { "
                    "case /facebook/.test(href): $(v).attr('mns_rt', 'NonJob-Facebook'); break; "
                    "case /linkedin/.test(href): $(v).attr('mns_rt', 'NonJob-Linkedin'); break; "
                    "case /twitter\\.com/.test(href): $(v).attr('mns_rt', 'NonJob-Twitter'); break; "
                    "case /plus\\.google\\.com/.test(href): $(v).attr('mns_rt', 'NonJob-GooglePlus'); break; "
                    "case /youtube/.test(href): $(v).attr('data-track', 'Client-Social-Youtube'); break; "
                    "case /http[s]?\\:\\/\\/([a-z0-9\\-\\.]{1,}\\.[a-z]{2,})[\\/]?$/.test(href): "
                    "$(v).attr('data-track', 'Client-Link-Homepage'); break; "
                    "default: $(v).attr('mns_rt', 'jobcustomapplyonline'); break; "
                    "} } } });"
                ),
            ]
        }
        # Snapshot of the raw inputs: scrubbing may overwrite the 'text'
        # column, and validation needs to know what each row looked like.
        self.raw_texts = list(data['text'])
        metadata = MetaData()
        self.df = pd.DataFrame(data)
        self.data_model = DataModel(self.df)
        self.data_model.metadata = metadata

    def _validate_column(self, column):
        """Validate every scrubbed row of *column* against its raw input.

        Assumes scrubbing keeps the row order and count — TODO confirm.
        """
        df = self.data_model.get_dataframe()
        for original, text in zip(self.raw_texts, df[column]):
            self.validate_string(text, original)

    def testHTMLRemoval(self):
        """Scrub the 'text' column and validate it."""
        scrubber = CodeScrubber('text')
        scrubber.scrub(self.data_model)
        self._validate_column('text')

    def validate_string(self, text, original=None):
        """Assert that *text* is free of markup and keeps plain content.

        :param text: the scrubbed string.
        :param original: the raw input *text* was derived from. When given,
            KEPT_PHRASE is only required if the raw input contained it.
            When omitted, the phrase is required unconditionally.
        """
        for fragment in self.FORBIDDEN_FRAGMENTS:
            self.assertNotIn(fragment, text)
        if original is None or self.KEPT_PHRASE in original:
            self.assertIn(self.KEPT_PHRASE, text)

    def testHTMLRemoval_verbose(self):
        """Same as testHTMLRemoval, but with verbosity=1."""
        scrubber = CodeScrubber('text', verbosity=1)
        scrubber.scrub(self.data_model)
        self._validate_column('text')

    def testHTMLRemoval_new_col(self):
        """Scrub 'text' into the 'new' column and validate that column."""
        scrubber = CodeScrubber(text_column='text', new_text_column='new')
        scrubber.scrub(self.data_model)
        self._validate_column('new')
class testPunctuationScrubber(unittest.TestCase):
    """Exercise PunctuationScrubber on sentences with mixed punctuation."""

    # Punctuation marks that must not appear in scrubbed text.
    PUNCTUATION = (',', '.', ';', ':', '!', '?', '(', ')')

    def setUp(self):
        """Build a DataModel with a 'text' column of punctuated sentences."""
        data = {
            'text': [
                'This is, some text.',
                'Is this, (some) text!?',
                'Would you like: ham, spam and eggs; spam, ham and eggs or eggs, ham and spam?',
            ]
        }
        metadata = MetaData()
        self.df = pd.DataFrame(data)
        self.data_model = DataModel(self.df)
        self.data_model.metadata = metadata

    def test_punc_removal(self):
        """Scrub the 'text' column and validate each row."""
        scrubber = PunctuationScrubber('text')
        scrubber.scrub(self.data_model)
        df = self.data_model.get_dataframe()
        for text in df['text']:
            self.validate_string(text)

    def validate_string(self, text):
        """Assert that *text* contains none of the PUNCTUATION marks."""
        for mark in self.PUNCTUATION:
            # assertNotIn names the offending mark and text on failure.
            self.assertNotIn(mark, text)

    def test_punc_removal_verbose(self):
        """Same as test_punc_removal, but with verbosity=1."""
        scrubber = PunctuationScrubber('text', verbosity=1)
        scrubber.scrub(self.data_model)
        df = self.data_model.get_dataframe()
        for text in df['text']:
            self.validate_string(text)

    def test_punc_removal_new_col(self):
        """Scrub 'text' into the 'new' column and validate that column."""
        scrubber = PunctuationScrubber(text_column='text',
                                       new_text_column='new')
        scrubber.scrub(self.data_model)
        df = self.data_model.get_dataframe()
        for text in df['new']:
            self.validate_string(text)
class WordStemmerTest(unittest.TestCase):
    """Exercise WordStemmer on a column holding lists of tokens."""

    def setUp(self):
        """Build a DataModel with two tokenised sentences in column 'test'."""
        rows = [
            ['is', 'this', 'a', 'stemmable', 'sentence'],
            ['cats', 'are', 'smarter', 'than', 'dogs'],
        ]
        metadata = MetaData()
        self.df = pd.DataFrame({'test': rows})
        self.data_model = DataModel(self.df)
        self.data_model.metadata = metadata

    def test_scrubbing(self):
        """Stem the 'test' column and compare with the expected stems."""
        WordStemmer(column='test').scrub(self.data_model)
        stemmed = self.data_model.get_dataframe()['test']
        self.assertEqual(stemmed[0],
                         ['is', 'this', 'a', 'stemmabl', 'sentenc'])
        self.assertEqual(stemmed[1],
                         ['cat', 'are', 'smarter', 'than', 'dog'])

    def test_scrubbing_verbose(self):
        """Same as test_scrubbing, but with verbosity=1."""
        WordStemmer(column='test', verbosity=1).scrub(self.data_model)
        stemmed = self.data_model.get_dataframe()['test']
        self.assertEqual(stemmed[0],
                         ['is', 'this', 'a', 'stemmabl', 'sentenc'])
        self.assertEqual(stemmed[1],
                         ['cat', 'are', 'smarter', 'than', 'dog'])

    def test_scrubbing_new_col(self):
        """Stem into 'new' and check that 'test' keeps the original tokens."""
        WordStemmer(column='test', new_column='new').scrub(self.data_model)
        frame = self.data_model.get_dataframe()
        # Stemmed output lands in the new column ...
        self.assertEqual(frame['new'][0],
                         ['is', 'this', 'a', 'stemmabl', 'sentenc'])
        self.assertEqual(frame['new'][1],
                         ['cat', 'are', 'smarter', 'than', 'dog'])
        # ... while the source column is expected to be unchanged.
        self.assertEqual(frame['test'][0],
                         ['is', 'this', 'a', 'stemmable', 'sentence'])
        self.assertEqual(frame['test'][1],
                         ['cats', 'are', 'smarter', 'than', 'dogs'])
def separate_by_target_categories(data_model: DataModel, target_column_name):
    """Split the model's dataframe into the rows of its first two categories.

    :param data_model: model whose dataframe is split.
    :param target_column_name: column whose unique values define the split.
    :return: a ``(first, second)`` pair of dataframes holding the rows whose
        target equals the first and second unique value respectively.
    :raises IndexError: if the column has fewer than two unique values.

    NOTE(review): only the first two unique values are used; rows belonging
    to any further category are not returned — confirm callers expect this.
    """
    frame = data_model.get_dataframe()
    target = frame[target_column_name]
    labels = target.unique()
    first_rows = frame.loc[target == labels[0]]
    second_rows = frame.loc[target == labels[1]]
    return first_rows, second_rows
class TestAverageColumnScrubber(unittest.TestCase):
    """Exercise AverageColumnScrubber validation and averaging."""

    def setUp(self):
        """Build a DataModel with three columns declared as numerical."""
        self._data = {
            'column_1': [0, 2, 3],
            'column_2': [3, 2, 1],
            'column_3': [3, 2, 2],
        }
        self._dataframe = pd.DataFrame(data=self._data)
        self._data_model = DataModel(self._dataframe)
        self._data_model.metadata.define_numerical_columns(
            ['column_1', 'column_2', 'column_3'])

    def test_invalid_column(self):
        """validate() must raise when an input column is not in the data."""
        scrubber = AverageColumnScrubber(
            ('column_1', 'column_2', 'invalid_1'), 'average')
        with self.assertRaises(RuntimeError):
            scrubber.validate(self._data_model)

    def test_valid_metadata(self):
        """validate_metadata() returns None for all-numerical inputs."""
        scrubber = AverageColumnScrubber(
            ('column_1', 'column_2', 'column_3'), 'average')
        outcome = scrubber.validate_metadata(self._data_model.metadata)
        self.assertIsNone(outcome)

    def test_invalid_metadata(self):
        """validate_metadata() must raise once an input is categorical."""
        # Re-declare one of the inputs as categorical before validating.
        self._data_model.metadata.define_categorical_columns(['column_3'])
        scrubber = AverageColumnScrubber(
            ('column_1', 'column_2', 'column_3'), 'average')
        with self.assertRaises(RuntimeError):
            scrubber.validate_metadata(self._data_model.metadata)

    def test_averaging_3_columns(self):
        """Averaging the three columns row by row yields 2 for every row."""
        target = 'average'
        scrubber = AverageColumnScrubber(
            ('column_1', 'column_2', 'column_3'), target)
        scrubber.validate(self._data_model)
        scrubber.scrub(self._data_model)
        averaged = self._data_model.get_dataframe()[target].tolist()
        self.assertListEqual([2, 2, 2], averaged)
class testLowerTextScrubber(unittest.TestCase):
    """Exercise LowerTextScrubber on mixed-case sentences."""

    def setUp(self):
        """Build a DataModel with a 'text' column of mixed-case strings."""
        sentences = [
            'This is some text.',
            'Get some text ASAP?',
            'This is some text for John and Joan',
        ]
        metadata = MetaData()
        self.df = pd.DataFrame({'text': sentences})
        self.data_model = DataModel(self.df)
        self.data_model.metadata = metadata

    def test_lower(self):
        """Scrub the 'text' column and check every row is lower case."""
        LowerTextScrubber('text').scrub(self.data_model)
        for value in self.data_model.get_dataframe()['text']:
            self.validate_string(value)

    def validate_string(self, text: str):
        """Assert that *text* equals its own lower-cased form."""
        lowered = text.lower()
        self.assertEqual(text, lowered)

    def test_lower_verbose(self):
        """Same as test_lower, but with verbosity=1."""
        LowerTextScrubber('text', verbosity=1).scrub(self.data_model)
        for value in self.data_model.get_dataframe()['text']:
            self.validate_string(value)

    def test_lower_new_col(self):
        """Scrub 'text' into the 'new' column and validate that column."""
        scrubber = LowerTextScrubber(text_column='text',
                                     new_text_column='new')
        scrubber.scrub(self.data_model)
        for value in self.data_model.get_dataframe()['new']:
            self.validate_string(value)
class TestMultipleCatListToMultipleHot(unittest.TestCase):
    """Exercise MultipleCatListToMultipleHotScrubber on a list column."""

    # One indicator column is expected per distinct value in 'list_1'.
    EXPECTED_HOT_COLUMNS = (
        'list_1_EUR',
        'list_1_GBP',
        'list_1_USD',
        'list_1_JPY',
        'list_1_AUD',
    )

    def testScrubbing(self):
        """Scrub 'list_1' and check new columns, contents and metadata."""
        data = {
            'num_1': [0, 1, 2, 3, 4, 5, 6, 7, 8, 50],
            'list_1': [
                ['EUR', 'USD'],
                ['USD', 'JPY', 'AUD'],
                ['EUR'],
                ['EUR', 'GBP', 'AUD'],
                ['USD'],
                ['EUR', 'JPY'],
                ['EUR', 'GBP'],
                ['USD', 'JPY'],
                ['EUR', 'GBP'],
                ['USD'],
            ],
        }
        metadata = MetaData()
        metadata.define_numerical_columns(['num_1'])
        metadata.define_multiple_cat_columns(['list_1'])
        self.df = pd.DataFrame(data)
        self.data_model = DataModel(self.df)
        self.data_model.metadata = metadata

        scrubber = MultipleCatListToMultipleHotScrubber(col_name='list_1')
        scrubber.validate(self.data_model)
        scrubber.scrub(self.data_model)

        new_df = self.data_model.get_dataframe()
        columns = list(new_df.columns.values)

        # test new columns
        self.assertEqual(len(columns), 7)
        for name in self.EXPECTED_HOT_COLUMNS:
            self.assertIn(name, columns)

        # check column contents
        self.assertEqual(new_df['list_1_EUR'].tolist(),
                         [1, 0, 1, 1, 0, 1, 1, 0, 1, 0])

        # test metadata: the names must be registered in the metadata
        # itself, not merely present in the dataframe.
        # Assumes binary_columns is a container of column names — TODO confirm.
        binary_columns = self.data_model.metadata.binary_columns
        self.assertEqual(len(binary_columns), 5)
        for name in self.EXPECTED_HOT_COLUMNS:
            self.assertIn(name, binary_columns)
def balance_data(self, data_model: DataModel,
                 target_column_name: str) -> DataModel:
    """Attach a per-row weights column to the model's dataframe.

    Weights are obtained from ``map_weights_by_cat_label`` and
    ``get_weights_list`` and stored under ``WEIGHTS_COLUMN``. A warning
    reminds the caller to pass that column on to the estimator and data
    builder.

    :param data_model: model whose dataframe receives the weights column.
    :param target_column_name: name of the target column handed to both
        weight helpers.
    :return: the same ``data_model``, with its dataframe set to the
        weighted frame.
    """
    label_weights = self.map_weights_by_cat_label(data_model,
                                                  target_column_name)
    frame = data_model.get_dataframe()
    row_weights = self.get_weights_list(frame, target_column_name,
                                        label_weights)

    message = ('note: adding weights column ({}), make sure it is passed '
               'to the estimator- and data builder!').format(WEIGHTS_COLUMN)
    warnings.warn(message)

    frame[WEIGHTS_COLUMN] = row_weights
    data_model.set_dataframe(frame)
    return data_model
class TestMissingDataReplacer(unittest.TestCase):
    """Exercise MissingDataReplacer on categorical and numerical gaps."""

    def setUp(self):
        """Build a DataModel whose columns each contain at most one None."""
        self._data = {
            'numerical_1': [2, None, 3, 4],
            'numerical_3': [None, 2, 3, 4],
            'categorical_1': ['one', None, 'two', 'three'],
            'categorical_2': ['apple', 'pie', None, 'three'],
            'categorical_3': ['apple', 'pie', None, 'three'],
            'unknown_1': [9, 10, 11, 12],
        }
        self._dataframe = pd.DataFrame(data=self._data)
        self._data_model = DataModel(self._dataframe)

    def test_categorize_columns(self):
        """Run two replacers in sequence and check only their columns change."""
        metadata = self._data_model.metadata
        metadata.define_categorical_columns(
            ['categorical_1', 'categorical_2', 'categorical_3'])
        metadata.define_numerical_columns(['numerical_1', 'numerical_3'])

        unknown_category = 'unknown'

        # First pass: named category for the listed categorical columns,
        # 'average' for numerical_1.
        first_replacer = MissingDataReplacer(
            missing_category_name=unknown_category,
            missing_numerical_value='average',
            scrub_columns=['categorical_1', 'categorical_2', 'numerical_1'])
        first_replacer.validate(self._data_model)
        first_replacer.scrub(self._data_model)

        # Second pass: constant 1 for numerical_3 only.
        second_replacer = MissingDataReplacer(
            missing_numerical_value=1, scrub_columns=['numerical_3'])
        second_replacer.validate(self._data_model)
        second_replacer.scrub(self._data_model)

        frame = self._data_model.get_dataframe()
        self.assertEqual(unknown_category, frame['categorical_1'][1])
        self.assertEqual(unknown_category, frame['categorical_2'][2])
        # categorical_3 was not listed in scrub_columns, so its gap stays.
        self.assertEqual(None, frame['categorical_3'][2])
        self.assertEqual(2, frame['numerical_1'][0])
        self.assertEqual(3, frame['numerical_1'][1])
        self.assertEqual(1, frame['numerical_3'][0])
        self.assertEqual(2, frame['numerical_3'][1])
        self.assertEqual(9, frame['unknown_1'][0])
class TestDateScrubber(unittest.TestCase):
    """Exercise StringToDateScrubber on ISO-like timestamp strings."""

    def setUp(self):
        """Build a DataModel with a single 'date' column of strings."""
        self._data = {
            'date': [
                '2017-03-26T05:04:46.539Z',
                '2017-12-01T23:04:46.539Z',
                '2017-02-08T07:38:48.129Z',
            ],
        }
        self._dataframe = pd.DataFrame(data=self._data)
        self.data_model = DataModel(self._dataframe)

    def test_validate(self):
        """validate() is expected to complete without raising."""
        scrubber = StringToDateScrubber(date_columns={'date': '%Y-%m-%d'})
        scrubber.validate(self.data_model)

    def test_scrub(self):
        """The second row is expected to become the date 2017-12-01."""
        scrubber = StringToDateScrubber(date_columns={'date': '%Y-%m-%d'})
        scrubber.scrub(self.data_model)
        expected = datetime.strptime('2017-12-01', '%Y-%m-%d')
        scrubbed = self.data_model.get_dataframe()['date']
        self.assertEqual(expected, scrubbed[1])
class TestMultipleCatToListColumnScrubber(unittest.TestCase):
    """Exercise MultipleCatToListScrubber on comma-separated strings."""

    def setUp(self):
        """Build a DataModel with 'mh_1' declared as a multiple-cat column."""
        currencies = [
            'EUR,USD', 'USD,JPY,AUD', 'EUR', 'EUR,GBP,AUD', 'USD',
            'EUR,JPY', 'EUR,GBP', 'USD,JPY', 'EUR,GBP', 'USD',
        ]
        data = {
            'num_1': [0, 1, 2, 3, 4, 5, 6, 7, 8, 50],
            'mh_1': currencies,
        }
        metadata = MetaData()
        metadata.define_numerical_columns(['num_1'])
        metadata.define_multiple_cat_columns(['mh_1'])
        self.df = pd.DataFrame(data)
        self.data_model = DataModel(self.df)
        self.data_model.metadata = metadata

    def testScrubbing(self):
        """After scrubbing, the first 'mh_1' value must be a list."""
        # 'sepperator' is the keyword spelling used by the scrubber here.
        MultipleCatToListScrubber(sepperator=',').scrub(self.data_model)
        scrubbed = self.data_model.get_dataframe()['mh_1']
        self.assertIsInstance(scrubbed[0], list)
def randomize_data(data: DataModel, seed: int):
    """Shuffle the rows of the model's dataframe reproducibly.

    All rows are sampled (``frac=1``) with ``random_state=seed``, and the
    result is stored back on ``data``. Nothing is returned.

    :param data: model whose dataframe is shuffled.
    :param seed: random state passed to ``DataFrame.sample``.
    """
    shuffled = data.get_dataframe().sample(frac=1, random_state=seed)
    data.set_dataframe(shuffled)
def validate_data_frame(self, data_frame: DataModel, columns: list):
    """Assert the model's target name and its set of dataframe columns.

    :param data_frame: model under test (a DataModel despite the name).
    :param columns: expected column names; order is ignored.
    """
    self.assertEqual(data_frame.target_column_name, 'target_1')
    actual_columns = data_frame.get_dataframe().columns.tolist()
    self.assertCountEqual(actual_columns, columns)
class TestOutlierScrubbing(unittest.TestCase):
    """Exercise OutlierScrubber construction, validation and scrubbing."""

    def setUp(self):
        """Build a DataModel with two numerical and one categorical column."""
        numbers = [0, 1, 2, 3, 4, 5, 6, 7, 8, 50]
        data = {
            'num_1': list(numbers),
            'num_2': list(numbers),
            'cat_1': [
                'EUR', 'USD', 'EUR', 'EUR', 'USD',
                'EUR', 'EUR', 'USD', 'EUR', 'USD',
            ],
        }
        metadata = MetaData()
        metadata.define_numerical_columns(['num_1', 'num_2'])
        metadata.define_categorical_columns(['cat_1'])
        self.df = pd.DataFrame(data)
        self.data_model = DataModel(self.df)
        self.data_model.metadata = metadata

    def testOutlierScrubberCreate1(self):
        """Construction with per-column z values must not raise."""
        OutlierScrubber(col_z={'num_1': 2, 'num_2': 3})

    def testOutlierScrubberCreate2(self):
        """Construction with a single all_z value must not raise."""
        OutlierScrubber(all_z=2)

    def testOutlierScrubberCreate3(self):
        """Passing both col_z and all_z must raise AssertionError."""
        with self.assertRaises(AssertionError):
            OutlierScrubber(col_z={'num_1': 2, 'num_2': 3}, all_z=2)

    def testOutlierScrubberCreate4(self):
        """Passing neither col_z nor all_z must raise AssertionError."""
        with self.assertRaises(AssertionError):
            OutlierScrubber()

    def testOutlierScrubberConfig(self):
        """Each col_z column is listed as numerical in the scrubber config."""
        config = OutlierScrubber(
            col_z={'num_1': 2, 'num_2': 3}).scrubber_config_list
        self.assertEqual(config['num_1'], MetaData.NUMERICAL_DATA_TYPE)
        self.assertEqual(config['num_2'], MetaData.NUMERICAL_DATA_TYPE)

    def testOutlierValidateMetaData(self):
        """validate_metadata() accepts columns declared as numerical."""
        candidate = OutlierScrubber(col_z={'num_1': 2, 'num_2': 3})
        candidate.validate_metadata(self.data_model.metadata)

    def testOutlierValidateMetaDataFalse(self):
        """validate_metadata() raises RuntimeError for unknown 'num_3'."""
        candidate = OutlierScrubber(col_z={'num_1': 2, 'num_3': 3})
        with self.assertRaises(RuntimeError):
            candidate.validate_metadata(self.data_model.metadata)

    def testOutlierValidate(self):
        """validate() accepts columns that exist in the data."""
        candidate = OutlierScrubber(col_z={'num_1': 2, 'num_2': 3})
        candidate.validate(self.data_model)

    def testOutlierValidateFalse(self):
        """validate() raises AssertionError for unknown 'num_3'."""
        candidate = OutlierScrubber(col_z={'num_1': 2, 'num_3': 3})
        with self.assertRaises(AssertionError):
            candidate.validate(self.data_model)

    def setOutlierScrubber1(self):
        """Scrubbing with a categorical column in col_z should raise.

        NOTE(review): the name starts with ``set`` rather than ``test``, so
        the default unittest loader never runs this method. Looks like a
        typo for ``testOutlierScrubber1`` — confirm before renaming, since
        enabling it may expose a failure.
        """
        with self.assertRaises(AssertionError):
            candidate = OutlierScrubber(col_z={'num_1': 2, 'cat_1': 3})
            candidate.scrub(self.data_model)

    def testOutlierScrubber2(self):
        """Per-column z values leave 9 of the 10 rows."""
        OutlierScrubber(col_z={'num_1': 2, 'num_2': 3}).scrub(self.data_model)
        remaining = len(self.data_model.get_dataframe())
        self.assertEqual(remaining, 9)

    def testOutlierScrubber3a(self):
        """all_z=2 leaves 9 of the 10 rows."""
        OutlierScrubber(all_z=2).scrub(self.data_model)
        remaining = len(self.data_model.get_dataframe())
        self.assertEqual(remaining, 9)

    def testOutlierScrubber3b(self):
        """all_z=1 leaves 5 of the 10 rows."""
        OutlierScrubber(all_z=1).scrub(self.data_model)
        remaining = len(self.data_model.get_dataframe())
        self.assertEqual(remaining, 5)
class TestAndScrubber(unittest.TestCase):
    """Exercise AndScrubber chaining an average and a currency scrubber."""

    def setUp(self):
        """Build a DataModel with two value columns, a currency and a date."""
        date_1 = datetime.strptime('01-01-2018', '%d-%m-%Y')
        date_2 = datetime.strptime('01-01-2017', '%d-%m-%Y')
        date_3 = datetime.now()
        self._data = {
            'value_1': [0, 1, 50],
            'value_2': [0, 3, 150],
            'currency': ['EUR', 'USD', 'EUR'],
            'date': [date_3, date_1, date_2],
        }
        self._df = pd.DataFrame(self._data)
        self.data_model = DataModel(self._df)
        metadata = self.data_model.metadata
        metadata.define_numerical_columns(['value_1', 'value_2'])
        metadata.define_categorical_columns(['currency'])

    def test_multiple_validation(self):
        """Metadata validation of both chained scrubbers must not raise."""
        averager = AverageColumnScrubber(('value_1', 'value_2'), 'value')
        converter = ConvertCurrencyScrubber(
            'value', 'new_value', 'currency', 'USD', 'date')
        self.and_scrubber = AndScrubber(averager, converter)
        self.and_scrubber.validate_metadata(self.data_model.metadata)

    def test_first_invalid(self):
        """Metadata validation raises when the first scrubber is invalid.

        NOTE(review): 'value_2, value_3' is a single string, and the
        converter is built with four positional arguments where other tests
        pass five — confirm both are intentional.
        """
        averager = AverageColumnScrubber(
            ('value_1', 'value_2, value_3'), 'value')
        converter = ConvertCurrencyScrubber(
            'value', 'currency', 'USD', 'date')
        self.and_scrubber = AndScrubber(averager, converter)
        with self.assertRaises(RuntimeError):
            self.and_scrubber.validate_metadata(self.data_model.metadata)

    def test_second_invalid(self):
        """Metadata validation raises when the second scrubber is invalid."""
        averager = AverageColumnScrubber(('value_1', 'value_2'), 'value')
        converter = ConvertCurrencyScrubber(
            'value', 'invalid', 'USD', 'date')
        self.and_scrubber = AndScrubber(averager, converter)
        with self.assertRaises(RuntimeError):
            self.and_scrubber.validate_metadata(self.data_model.metadata)

    def test_multiple_scrubbing(self):
        """Average the value columns, then convert the result to USD.

        NOTE(review): the expected 105 presumably depends on the exchange
        rate the converter uses for the row's date — confirm it is stable.
        """
        averager = AverageColumnScrubber(('value_1', 'value_2'), 'value')
        converter = ConvertCurrencyScrubber(
            'value', 'new_value', 'currency', 'USD', 'date')
        self.and_scrubber = AndScrubber(averager, converter)
        self.and_scrubber.scrub(self.data_model)
        converted = self.data_model.get_dataframe()['new_value']
        self.assertEqual(0.0, converted[0])
        self.assertEqual(2.0, converted[1])
        self.assertEqual(105, round(converted[2]))
class TestConvertCurrencyScrubber(unittest.TestCase):
    """Exercise ConvertCurrencyScrubber validation and conversion.

    NOTE(review): several tests build the scrubber with four positional
    arguments while others pass five; the four-argument tests all expect
    RuntimeError. Confirm against the scrubber's signature that this is
    what each test means to cover.
    """

    def setUp(self):
        """Build a DataModel with value, currency and date columns."""
        date_1 = datetime.strptime('01-01-2018', '%d-%m-%Y')
        date_2 = datetime.strptime('01-01-2017', '%d-%m-%Y')
        date_3 = datetime.now()
        self._data = {
            'value': [0, 2, 100],
            'currency': ['EUR', 'USD', 'EUR'],
            'date': [date_3, date_1, date_2],
        }
        self._dataframe = pd.DataFrame(data=self._data)
        self._data_model = DataModel(self._dataframe)

    def test_valid_columns(self):
        """validate() raises RuntimeError for this four-argument scrubber.

        NOTE(review): the test name says "valid" but the test expects a
        failure — confirm the intent.
        """
        currency_converter = ConvertCurrencyScrubber('value', 'currency',
                                                     'USD', 'date')
        with self.assertRaises(RuntimeError):
            currency_converter.validate(self._data_model)

    def test_valid_metadata(self):
        """validate_metadata() accepts numerical value + categorical currency."""
        self._data_model.metadata.define_numerical_columns(['value'])
        self._data_model.metadata.define_categorical_columns(['currency'])
        currency_converter = ConvertCurrencyScrubber('value', 'value',
                                                     'currency', 'USD', 'date')
        currency_converter.validate_metadata(self._data_model.metadata)

    def test_valid_columns_without_date(self):
        """validate() must not raise when no date column is given."""
        currency_converter = ConvertCurrencyScrubber('value', 'value',
                                                     'currency', 'USD')
        currency_converter.validate(self._data_model)

    def test_invalid_column(self):
        """validate() raises RuntimeError when a column is named 'invalid'."""
        currency_converter = ConvertCurrencyScrubber('value', 'invalid',
                                                     'USD', 'date')
        with self.assertRaises(RuntimeError):
            currency_converter.validate(self._data_model)

    def test_invalid_metadata(self):
        """validate_metadata() raises when 'currency' is declared numerical."""
        self._data_model.metadata.define_numerical_columns(
            ['value', 'currency'])
        currency_converter = ConvertCurrencyScrubber('value', 'currency',
                                                     'USD', 'date')
        with self.assertRaises(RuntimeError):
            currency_converter.validate_metadata(self._data_model.metadata)

    def test_invalid_date_column(self):
        """validate() raises RuntimeError for an unknown date column."""
        currency_converter = ConvertCurrencyScrubber('value', 'invalid',
                                                     'USD', 'invalid_date')
        # Only the raise matters here; comparing the return value inside
        # assertRaises could never make the test pass.
        with self.assertRaises(RuntimeError):
            currency_converter.validate(self._data_model)

    def test_converting_with_date_column(self):
        """Convert 'value' to USD in place, using the 'date' column.

        NOTE(review): the expected 105 presumably reflects the rate for the
        row's date — confirm where the converter gets its rates.
        """
        currency_converter = ConvertCurrencyScrubber('value', 'value',
                                                     'currency', 'USD', 'date')
        currency_converter.scrub(self._data_model)
        df = self._data_model.get_dataframe()
        result = df['value']
        self.assertEqual(0.0, result[0])
        self.assertEqual(2.0, result[1])
        self.assertEqual(105, round(result[2]))

    def test_converting_without_date_column(self):
        """Convert 'value' to USD in place without a date column.

        NOTE(review): the expected 116 presumably depends on whatever rate
        the converter uses when no date is supplied; if that rate is
        fetched live, this test will drift — confirm.
        """
        currency_converter = ConvertCurrencyScrubber('value', 'value',
                                                     'currency', 'USD')
        currency_converter.scrub(self._data_model)
        df = self._data_model.get_dataframe()
        result = df['value']
        self.assertEqual(0.0, result[0])
        self.assertEqual(2.0, result[1])
        self.assertEqual(116, round(result[2]))