def test_scrape_bookings(self, output_mock, all_htmls_mock, input_mock,
                         ensure_foreign_key_mock, new_mail_mock):
    """Run EnhanceBookingsWithScraper on fixture bookings and check its CSV.

    The mock parameters are configured here so that the task reads its
    bookings and HTML file list from MockTargets and writes its result to
    another MockTarget. The test then verifies row order, the untouched
    pre-existing column, and a hash over the scraped columns.
    """
    fixture = pd.read_csv(
        'tests/test_data/gomus/scrape_bookings_data.csv')

    # 'some_other_value' is filler: the scraper needs no calculation on it.
    # It stands in for already-existing booking data that the scraper
    # must leave unmodified.
    fixture.insert(1, 'some_other_value', fixture.index)
    known_columns = ['booking_id', 'some_other_value']
    extracted_bookings = fixture.filter(known_columns)

    bookings_target = MockTarget('extracted_bookings_out', format=UTF8)
    input_mock.return_value = bookings_target
    with bookings_target.open('w') as bookings_file:
        extracted_bookings.to_csv(bookings_file, index=False)

    # Fetch one HTML page per booking (similar to what FetchBookingsHTML
    # does) and remember where each page was stored.
    html_paths = []
    for _, booking in extracted_bookings.iterrows():
        booking_id = booking['booking_id']
        fetch_task = FetchGomusHTML(url=f"/admin/bookings/{booking_id}")
        fetch_task.run()
        html_paths.append(fetch_task.output().path)

    html_list_target = MockTarget('bookings_htmls_out', format=UTF8)
    all_htmls_mock.return_value = html_list_target
    with html_list_target.open('w') as html_list_file:
        html_list_file.write('\n'.join(html_paths))

    result_target = MockTarget('enhanced_bookings_out', format=UTF8)
    output_mock.return_value = result_target

    # Also make sure fetch_updated_mail would be invoked for non-empty
    # invalid values (its actual functionality is tested elsewhere).
    invalid_values = pd.DataFrame([0], columns=['booking_id'])

    def report_invalid_then_passthrough(df, invalid_handler):
        invalid_handler(invalid_values, None)
        return df

    ensure_foreign_key_mock.side_effect = report_invalid_then_passthrough
    replacement_tasks = [
        FetchGomusHTML(url='test1'),
        FetchGomusHTML(url='test2'),
    ]
    new_mail_mock.return_value = iter(replacement_tasks)

    # -- execute code under test --
    self.task = EnhanceBookingsWithScraper(columns=BOOKING_COLUMNS)
    for yielded_task in self.task.run():
        self.assertIsInstance(yielded_task, FetchGomusHTML)

    # -- inspect results --
    expected_output = fixture.filter(known_columns + ['expected_hash'])
    with result_target.open('r') as result_file:
        actual_output = pd.read_csv(result_file)

    row_count = len(actual_output.index)
    self.assertEqual(len(expected_output.index), row_count)

    scraped_columns = ('customer_id', 'order_date', 'language')
    for position in range(row_count):
        expected_row = expected_output.iloc[position]
        actual_row = actual_output.iloc[position]

        # the order of the bookings must be preserved
        self.assertEqual(expected_row['booking_id'],
                         actual_row['booking_id'])

        # pre-existing data must not have been modified
        self.assertEqual(expected_row['some_other_value'],
                         actual_row['some_other_value'])

        # the scraped values must hash to the expected fixture value
        hash_str = ','.join(
            str(actual_row[column]) for column in scraped_columns)
        actual_hash = mmh3.hash(hash_str, seed=self.hash_seed)
        self.assertEqual(
            actual_hash,
            expected_row['expected_hash'],
            msg=f"Scraper got wrong values:\n\
{str(actual_row) if sys.stdin.isatty() else 'REDACTED ON NON-TTY'}")
def get_target(cls, scheme, path, fragment, username, password, hostname,
               port, query, **kwargs):
    """Build a MockTarget from the host name, path and query options.

    The target path is ``hostname`` (treated as empty when falsy) followed
    by ``path``. ``kwargs`` are merged into ``query`` in place, and the
    merged mapping is passed to MockTarget as keyword arguments.
    """
    host_prefix = hostname if hostname else ''
    query.update(kwargs)
    return MockTarget(host_prefix + path, **query)
def output(self):
    """Return a MockTarget whose path is the value of ``self.p``."""
    target_path = self.p
    return MockTarget(target_path)
def output(self):
    """Return the "IngestData" MockTarget, created with mirror_on_stderr=True."""
    target_name = "IngestData"
    return MockTarget(target_name, mirror_on_stderr=True)
def output(self):
    """Return a MockTarget whose path is formatted from ``self.date``."""
    target_path = self.date.strftime('/tmp/copy-data-%Y-%m-%d.txt')
    return MockTarget(target_path)
def output(self):
    """Return a MockTarget whose path is formatted from ``self.d``."""
    path_pattern = '/n2000y01a05n/%Y_%m-_-%daww/21mm01dara21/ooo'
    return MockTarget(self.d.strftime(path_pattern))
def output(self):
    """Return a MockTarget under ``TaskA/`` with a path formatted from ``self.dh``."""
    target_path = self.dh.strftime('TaskA/%Y-%m-%d/%H')
    return MockTarget(target_path)
def output(self):
    """Return the MockTarget stored under the fixed path 'output'."""
    target_path = 'output'
    return MockTarget(target_path)
def test_multi_csv_download(self, mock_open):
    """Check that merging the batch results yields ``self.all_lines``.

    ``mock_open`` is accepted but not used by the test body.
    """
    qsf = TestQuerySalesforce()
    qsf.merge_batch_results(self.result_ids)

    # Read inside a context manager so the handle is closed even when the
    # read fails, instead of leaving it open until garbage collection.
    with MockTarget(qsf.output().path).open('r') as merged_file:
        merged_content = merged_file.read()

    self.assertEqual(merged_content, self.all_lines)
def output(self):
    """Return a MockTarget whose path is formatted from ``self.date``."""
    target_path = self.date.strftime('/tmp/b-%Y-%m-%d.xml')
    return MockTarget(target_path)
def input(self):
    """Return the MockTarget stored under the fixed path 'input'."""
    source_path = 'input'
    return MockTarget(source_path)
def output(self):
    """Return the MockTarget stored under the fixed path '/tmp/a.xml'."""
    target_path = '/tmp/a.xml'
    return MockTarget(target_path)
def output(self):
    """Return the MockTarget stored under the fixed path '/tmp/a.txt'."""
    target_path = '/tmp/a.txt'
    return MockTarget(target_path)
def output(self):
    """Return the "BaseTask" MockTarget, created with mirror_on_stderr=True."""
    target_name = "BaseTask"
    return MockTarget(target_name, mirror_on_stderr=True)
def output(self):
    """Return a MockTarget whose path combines ``self.dh`` and ``self.complicator``.

    The ``%%s`` in the strftime pattern comes out as a literal ``%s``,
    which the subsequent ``%`` formatting fills with ``self.complicator``.
    """
    path_template = self.dh.strftime('TaskB/%%s%Y-%m-%d/%H')
    return MockTarget(path_template % self.complicator)
def mocked_open(*args, **kwargs):
    """Stand-in for ``open`` that redirects ``job_data*`` paths to a MockTarget.

    The first positional argument is the path. When it is a string that
    starts with "job_data", the file is opened through
    ``MockTarget(path).open(mode)``; every other call is forwarded to
    ``old__open`` (defined outside this block, presumably the saved
    original ``open``).

    The mode is taken from the second positional argument, then from a
    ``mode`` keyword, and otherwise defaults to 'r'.
    """
    path = args[0]
    # Paths that are not str cannot be regex-matched against a str pattern;
    # they are handed to the fallback opener instead of raising TypeError.
    if isinstance(path, str) and re.match("job_data", path):
        mode = args[1] if len(args) > 1 else kwargs.get('mode', 'r')
        return MockTarget(path).open(mode)
    # Forward keyword arguments too, so options such as encoding= survive.
    return old__open(*args, **kwargs)
def output(self):
    """Return a MockTarget whose path is formatted from ``self.dh``."""
    target_path = self.dh.strftime('not/a/real/path/%Y-%m-%d/%H')
    return MockTarget(target_path)
def output(self):
    """Return the MockTarget stored under the fixed path 'job_data.csv'."""
    target_path = 'job_data.csv'
    return MockTarget(target_path)
def output(self):
    """Return a MockTarget whose path is formatted from ``self.d``."""
    path_pattern = '/data/2014/p/v/z/%Y_/_%m-_-%doctor/20/ZOOO'
    return MockTarget(self.d.strftime(path_pattern))
def output(self):
    """Return the MockTarget stored under the fixed path "output"."""
    # A mock output is returned so that training can be repeated
    # any number of times.
    target_path = "output"
    return MockTarget(target_path)
def output(self):
    """Return the "CleanData" MockTarget, created with mirror_on_stderr=True."""
    target_name = "CleanData"
    return MockTarget(target_name, mirror_on_stderr=True)
def output(self):
    """Return the MockTarget stored under the fixed path 'dummy.txt'."""
    target_path = 'dummy.txt'
    return MockTarget(target_path)
def output(self):
    """Return a MockTarget at ``/<class name>/<self.param>``."""
    class_name = self.__class__.__name__
    target_path = '/%s/%u' % (class_name, self.param)
    return MockTarget(target_path)
def output(self):
    """Return the MockTarget stored under the fixed path 'words.txt'."""
    target_path = 'words.txt'
    return MockTarget(target_path)
def output(self):
    """Return a MockTarget at ``/tmp/test_<self.n>``."""
    target_path = '/tmp/test_%d' % self.n
    return MockTarget(target_path)
@patch('luigi.LocalTarget', new=lambda path, **kwargs: MockTarget(path, **kwargs))
class TestInfo(unittest.TestCase):
    """Tests for the tree rendering of ``gokart.info.make_tree_info``.

    ``luigi.LocalTarget`` is replaced by a MockTarget factory for every
    test method via the class-level patch. The expected patterns are raw
    regexes handed to ``assertRegex``; whitespace inside them is part of
    the pattern, so they are kept verbatim.
    """

    def setUp(self) -> None:
        # Start every test from an empty mock file system.
        MockFileSystem().clear()

    @staticmethod
    def _built_double_load_task():
        """Create a _DoubleLoadSubTask with two equal sub-tasks and build it."""
        root = _DoubleLoadSubTask(
            sub1=_Task(param=1, sub=_SubTask(param=2)),
            sub2=_Task(param=1, sub=_SubTask(param=2)),
        )
        luigi.build([root], local_scheduler=True)
        return root

    def test_make_tree_info_pending(self):
        root = _Task(param=1, sub=_SubTask(param=2))

        # Nothing has been run yet, so both tasks are reported as pending.
        rendered = gokart.info.make_tree_info(root)
        expected = r""" └─-\(PENDING\) _Task\[[a-z0-9]*\] └─-\(PENDING\) _SubTask\[[a-z0-9]*\]"""
        self.assertRegex(rendered, expected)

    def test_make_tree_info_complete(self):
        root = _Task(param=1, sub=_SubTask(param=2))

        # Render the tree only after the task and its sub task have run.
        luigi.build([root], local_scheduler=True)
        rendered = gokart.info.make_tree_info(root)
        expected = r""" └─-\(COMPLETE\) _Task\[[a-z0-9]*\] └─-\(COMPLETE\) _SubTask\[[a-z0-9]*\]"""
        self.assertRegex(rendered, expected)

    def test_make_tree_info_abbreviation(self):
        root = self._built_double_load_task()

        # With default arguments the second, identical subtree is
        # expected to be rendered as "...".
        rendered = gokart.info.make_tree_info(root)
        expected = r""" └─-\(COMPLETE\) _DoubleLoadSubTask\[[a-z0-9]*\] |--\(COMPLETE\) _Task\[[a-z0-9]*\] | └─-\(COMPLETE\) _SubTask\[[a-z0-9]*\] └─-\(COMPLETE\) _Task\[[a-z0-9]*\] └─- \.\.\."""
        self.assertRegex(rendered, expected)

    def test_make_tree_info_not_compress(self):
        root = self._built_double_load_task()

        # With abbr=False the repeated subtree is expected in full.
        rendered = gokart.info.make_tree_info(root, abbr=False)
        expected = r""" └─-\(COMPLETE\) _DoubleLoadSubTask\[[a-z0-9]*\] |--\(COMPLETE\) _Task\[[a-z0-9]*\] | └─-\(COMPLETE\) _SubTask\[[a-z0-9]*\] └─-\(COMPLETE\) _Task\[[a-z0-9]*\] └─-\(COMPLETE\) _SubTask\[[a-z0-9]*\]"""
        self.assertRegex(rendered, expected)
def output(self):
    """Return the MockTarget stored under the fixed path "mock"."""
    target_path = "mock"
    return MockTarget(target_path)
def output(self):
    """Return a MockTarget named ``banana-dep-<self.x>-<self.y>``."""
    coordinates = (self.x, self.y)
    return MockTarget('banana-dep-%s-%s' % coordinates)
def output(self):
    """Return a MockTarget named by the items of ``self.p`` joined with dots."""
    target_path = '.'.join(str(part) for part in self.p)
    return MockTarget(target_path)
def _touch(self, path):
    """Open the MockTarget at ``path`` for writing and close it without writing."""
    with MockTarget(path).open('w'):
        # Nothing is written; the target is only opened and closed.
        pass