def test_no_overwrite(self):
    """With the default overwrite=False, completion tracks the Hive partition target's existence."""
    # Construct with defaults: overwrite defaults to False, so the task is
    # complete exactly when the partition target already exists.
    task = ImportStudentCourseEnrollmentTask()
    with patch('edx.analytics.tasks.database_imports.HivePartitionTarget') as mock_target:
        output = mock_target()
        # Make MagicMock act more like a regular mock, so that flatten() does the right thing.
        del output.__iter__
        del output.__getitem__
        output.exists = Mock(return_value=False)
        self.assertFalse(task.complete())
        self.assertTrue(output.exists.called)
        output.exists = Mock(return_value=True)
        self.assertTrue(task.complete())
        self.assertTrue(output.exists.called)
def test_no_overwrite(self):
    """Completion of a default (overwrite=False) task depends on the Hive partition target existing."""
    # No kwargs needed: overwrite already defaults to False, which is the
    # behavior under test here.
    task = ImportStudentCourseEnrollmentTask()
    with patch('edx.analytics.tasks.database_imports.HivePartitionTarget') as mock_target:
        output = mock_target()
        # Make MagicMock act more like a regular mock, so that flatten() does the right thing.
        del output.__iter__
        del output.__getitem__
        output.exists = Mock(return_value=False)
        self.assertFalse(task.complete())
        self.assertTrue(output.exists.called)
        output.exists = Mock(return_value=True)
        self.assertTrue(task.complete())
        self.assertTrue(output.exists.called)
def requires(self):
    """Declare the Hive computation, MySQL insertion, and database-import dependencies."""
    # Import parameters 'destination', 'num_mappers', 'verbose', and 'date'
    # are intentionally omitted so that their default values are used.
    db_import_kwargs = {'overwrite': self.overwrite}
    # Both country-of-user tasks take the identical parameter set.
    country_kwargs = dict(
        mapreduce_engine=self.mapreduce_engine,
        n_reduce_tasks=self.n_reduce_tasks,
        source=self.source,
        interval=self.interval,
        pattern=self.pattern,
        geolocation_data=self.geolocation_data,
        overwrite=self.overwrite,
        user_country_output=self.user_country_output,
    )
    yield (
        ImportLastCountryOfUserToHiveTask(**country_kwargs),
        InsertToMysqlLastCountryOfUserTask(**country_kwargs),
        # We can't make explicit dependencies on these yet, until we
        # solve the multiple-credentials problem, as well as the
        # split-kwargs problem.
        ImportStudentCourseEnrollmentTask(**db_import_kwargs),
        ImportAuthUserTask(**db_import_kwargs),
    )
def test_query_with_date(self):
    """The generated Hive query should embed the explicit import_date in the partition clause."""
    # datetime.date(...) is the direct equivalent of parsing '2014-07-01'
    # with strptime and calling .date() on the result.
    kwargs = {'import_date': datetime.date(2014, 7, 1)}
    task = ImportStudentCourseEnrollmentTask(**kwargs)
    query = task.query()
    expected_query = textwrap.dedent(
        """
        USE default;
        DROP TABLE IF EXISTS student_courseenrollment;
        CREATE EXTERNAL TABLE student_courseenrollment (
            id INT,user_id INT,course_id STRING,created TIMESTAMP,is_active BOOLEAN,mode STRING
        )
        PARTITIONED BY (dt STRING)
        LOCATION 's3://foo/bar/student_courseenrollment';
        ALTER TABLE student_courseenrollment ADD PARTITION (dt = '2014-07-01');
        """
    )
    # assertEquals is a deprecated alias removed in Python 3.12.
    self.assertEqual(query, expected_query)
def test_query_with_date(self):
    """Verify the Hive query uses the supplied import_date for the ADD PARTITION clause."""
    # Construct the date directly instead of round-tripping through strptime.
    kwargs = {
        'import_date': datetime.date(2014, 7, 1)
    }
    task = ImportStudentCourseEnrollmentTask(**kwargs)
    query = task.query()
    expected_query = textwrap.dedent("""
        USE default;
        DROP TABLE IF EXISTS student_courseenrollment;
        CREATE EXTERNAL TABLE student_courseenrollment (
            id INT,user_id INT,course_id STRING,created TIMESTAMP,is_active BOOLEAN,mode STRING
        )
        PARTITIONED BY (dt STRING)
        LOCATION 's3://foo/bar/student_courseenrollment';
        ALTER TABLE student_courseenrollment ADD PARTITION (dt = '2014-07-01');
        """)
    # assertEquals is a deprecated alias removed in Python 3.12.
    self.assertEqual(query, expected_query)
def requires(self):
    """Require the course-mode and enrollment imports plus the reconciled order-transaction table."""
    date = self.import_date
    yield (
        ImportCourseModeTask(import_date=date),
        ImportStudentCourseEnrollmentTask(import_date=date),
        ReconciledOrderTransactionTableTask(
            import_date=date,
            n_reduce_tasks=self.n_reduce_tasks,
        ),
    )
def requires(self):
    """Declare the country-partition task and the database-import dependencies."""
    # Import parameters 'destination', 'num_mappers', 'verbose', and 'date'
    # are intentionally omitted so that their default values are used.
    db_import_kwargs = {'overwrite': self.overwrite}
    partition_task = LastCountryOfUserPartitionTask(
        mapreduce_engine=self.mapreduce_engine,
        n_reduce_tasks=self.n_reduce_tasks,
        source=self.source,
        pattern=self.pattern,
        warehouse_path=self.warehouse_path,
        interval=self.interval,
        interval_start=self.interval_start,
        interval_end=self.interval_end,
        overwrite_n_days=self.overwrite_n_days,
        geolocation_data=self.geolocation_data,
        overwrite=self.overwrite,
    )
    yield (
        partition_task,
        ImportStudentCourseEnrollmentTask(**db_import_kwargs),
        ImportAuthUserTask(**db_import_kwargs),
    )
def test_overwrite(self):
    """A task constructed with overwrite=True must always report itself incomplete."""
    task = ImportStudentCourseEnrollmentTask(overwrite=True)
    self.assertFalse(task.complete())
def requires_hadoop(self):
    """Require an actual dump of the enrollment table, not just its output directory.

    There is no way to dump only the MySQL table, so this dependency also
    handles the Hive table definition.
    """
    yield ImportStudentCourseEnrollmentTask(credentials=self.credentials)
def requires(self):
    """Depend on the enrollment import for this run's date, written under the warehouse path."""
    return ImportStudentCourseEnrollmentTask(
        import_date=self.run_date,
        destination=self.warehouse_path,
    )