def test_location_by_course(self):
        self.upload_tracking_log(self.INPUT_FILE, self.START_DATE)

        for fixture_file_name in self.SQL_FIXTURES:
            self.execute_sql_fixture_file(fixture_file_name)

        self.task.launch([
            'InsertToMysqlCourseEnrollByCountryWorkflow',
            '--source', self.test_src,
            '--interval', self.DATE_INTERVAL.to_string(),
            '--user-country-output', url_path_join(self.test_out, 'user'),
            '--course-country-output', url_path_join(self.test_out, 'country'),
            '--n-reduce-tasks', str(self.NUM_REDUCERS),
        ])

        with self.export_db.cursor() as cursor:
            cursor.execute('SELECT * FROM course_enrollment_location_current ORDER BY country_code')
            results = cursor.fetchall()

        self.maxDiff = None
        # TODO: What happens if the test starts near the UTC day boundary? The task sees that today is day "X",
        # but this code sees the following day because the boundary was crossed in between.
        today = datetime.utcnow().date()

        self.assertItemsEqual([
            row[1:5] for row in results
        ], [
            (today, self.COURSE_ID, '', 1),
            (today, self.COURSE_ID, 'IE', 1),
            (today, self.COURSE_ID, 'TH', 1),
            (today, self.COURSE_ID2, 'TH', 1),
        ])
    def test_enrollment_trends(self):
        self.upload_tracking_log(self.INPUT_FILE, datetime.date(2014, 8, 1))

        blacklist_path = url_path_join(self.test_src, 'blacklist')
        blacklist_date = '2014-08-29'
        blacklist_url = url_path_join(blacklist_path, 'dt=' + blacklist_date, 'blacklist.tsv')
        with S3Target(blacklist_url).open('w') as f:
            f.write('edX/Open_DemoX/edx_demo_course3')

        config_override = {
            'enrollments': {
                'blacklist_date': blacklist_date,
                'blacklist_path': blacklist_path,
            }
        }

        self.task.launch([
            'ImportCourseDailyFactsIntoMysql',
            '--credentials', self.export_db.credentials_file_url,
            '--src', self.test_src,
            '--dest', self.test_out,
            '--name', 'test',
            '--include', '"*"',
            '--run-date', '2014-08-06',
            '--manifest', url_path_join(self.test_root, 'manifest.txt'),
            '--lib-jar', self.oddjob_jar,
            '--n-reduce-tasks', str(self.NUM_REDUCERS),
        ], config_override=config_override)

        self.validate_output()
 def setUp(self):
     super(DeidentificationAcceptanceTest, self).setUp()
     self.temporary_dir = tempfile.mkdtemp()
     self.addCleanup(shutil.rmtree, self.temporary_dir)
     self.dump_root = url_path_join(self.test_src, 'course_exports', 'raw')
     self.filename_safe_course_id = get_filename_safe_course_id(self.COURSE_ID)
     self.test_gpg_key_dir = url_path_join(self.test_root, 'gpg-keys')
    def test_enrollment_validation(self):
        # Initial setup.
        self.upload_tracking_log(self.INPUT_FILE, self.START_DATE)
        self.execute_sql_fixture_file(self.SQL_FIXTURE)
        self.test_validate = url_path_join(self.test_root, 'validate')

        # Run once.  This will generate the new validation events, but
        # will not include them in the validation run (because the
        # requirements for the validation run are computed before any
        # validation events are generated).
        self.test_first_run = url_path_join(self.test_out, 'first_run')
        self.launch_task(self.test_first_run, run_with_validation_events=False)

        # Check that validation took place.
        self.check_validation_events()

        # Run again, with the validation events generated by the first run.
        self.test_second_run = url_path_join(self.test_out, 'second_run')
        self.launch_task(self.test_second_run)

        # Check that synthetic events were created.
        self.check_synthetic_events(self.test_second_run)

        # Run again, with the synthetic events generated by the second run.
        self.test_third_run = url_path_join(self.test_out, 'third_run')
        self.launch_task(self.test_third_run, extra_source=self.test_second_run)

        # Check that no events are output.
        self.check_no_synthetic_events(self.test_third_run)
 def _get_required_tasks(self):
     """Internal method to actually calculate required tasks once."""
     start_date = self.interval.date_a
     end_date = self.interval.date_b
     table_name = "student_courseenrollment"
     source_root = url_path_join(self.warehouse_path, table_name)
     today_datestring = datetime.datetime.utcnow().strftime('%Y-%m-%d')
     current_date = start_date
     while current_date <= end_date:
         datestring = current_date.strftime('%Y-%m-%d')
         current_date += datetime.timedelta(days=1)
         src_datestring = "dt={}".format(datestring)
         source_dir = url_path_join(source_root, src_datestring)
         target = get_target_from_url(source_dir)
         output_dir = url_path_join(self.output_root, datestring)
         if datestring == today_datestring:
             yield CreateEnrollmentValidationEventsForTodayTask(
                 source_dir=source_dir,
                 output_root=output_dir,
                 n_reduce_tasks=self.n_reduce_tasks,
                 credentials=self.credentials,
             )
         elif target.exists():
             yield CreateEnrollmentValidationEventsTask(
                 source_dir=source_dir,
                 output_root=output_dir,
                 n_reduce_tasks=self.n_reduce_tasks,
             )
 def setUp(self):
     super(ObfuscationAcceptanceTest, self).setUp()
     self.temporary_dir = tempfile.mkdtemp()
     self.addCleanup(shutil.rmtree, self.temporary_dir)
     self.dump_root = url_path_join(self.test_src, "course_exports", "raw")
     self.filename_safe_course_id = get_filename_safe_course_id(self.COURSE_ID)
     self.test_gpg_key_dir = url_path_join(self.test_root, "gpg-keys")
    def check_validation_events(self):
        """Confirm that validation data was properly created."""
        validate_output_dir = url_path_join(self.test_validate, str(self.END_DATE))
        outputs = self.s3_client.list(validate_output_dir)
        outputs = [url_path_join(validate_output_dir, p) for p in outputs]

        # There are 2 courses in the test data.
        self.assertEqual(len(outputs), 2)
    def requires(self):
        """
        Runs each task
        """

        output_destination = url_path_join(self.destination, self.name, str(self.date))

        if self.manifest_path is not None:
            manifest = url_path_join(self.manifest_path, "executive-reports", self.name, str(self.date))
        else:
            manifest = None

        common_parameters = {
            "name": self.name,
            "src": self.src,
            "include": self.include,
            "manifest": manifest,
            "credentials": self.credentials,
            "blacklist": self.blacklist,
            "mapreduce_engine": self.mapreduce_engine,
            "lib_jar": self.lib_jar,
            "n_reduce_tasks": self.n_reduce_tasks,
            "destination": output_destination,
            "date": self.date,
        }

        yield (
            WeeklyAllUsersAndEnrollments(
                offsets=self.offsets,
                history=self.history,
                weeks=TOTAL_USERS_AND_ENROLLMENTS_NUM_WEEKS,
                **common_parameters
            ),

            WeeklyIncrementalUsersAndEnrollments(
                offsets=self.offsets,
                history=self.history,
                weeks=WEEKLY_ENROLLMENT_REPORT_WEEKS,
                **common_parameters
            ),

            EnrollmentsByWeek(
                offsets=self.offsets,
                statuses=self.statuses,
                weeks=WEEKLY_ENROLLMENT_REPORT_WEEKS,
                **common_parameters
            ),

            DailyRegistrationsEnrollmentsAndCourses(
                days=DEFAULT_NUM_DAYS,
                **common_parameters
            )
        )
    def __init__(self, *args, **kwargs):
        super(DeidentifiedCourseDumpTask, self).__init__(*args, **kwargs)

        filename_safe_course_id = opaque_key_util.get_filename_safe_course_id(self.course)
        auth_userprofile_targets = PathSetTask([url_path_join(self.dump_root, filename_safe_course_id, 'state')],
                                               ['*auth_userprofile*']).output()
        # TODO: Refactor out this logic of getting latest file. Right now we expect a date, so we use that
        dates = [re.search(r"\d{4}-\d{2}-\d{2}", target.path).group() for target in auth_userprofile_targets]
        # TODO: Make the date a parameter that defaults to the most recent, but allows the user to override?
        latest_date = sorted(dates)[-1]
        self.data_directory = url_path_join(self.dump_root, filename_safe_course_id, 'state', latest_date)
        self.output_directory = url_path_join(self.output_root, filename_safe_course_id, 'state', latest_date)
 def run_obfuscation_task(self):
     """Run ObfuscatedCourseTask."""
     self.task.launch([
         'ObfuscatedCourseTask',
         '--course', self.filename_safe_course_id,
         '--dump-root', self.dump_root,
         '--obfuscated-output-root', url_path_join(self.test_root, 'obfuscated-output'),
         '--format-version', self.FORMAT_VERSION,
         '--pipeline-version', self.PIPELINE_VERSION,
         '--auth-user-path', url_path_join(self.test_root, 'warehouse', 'auth_user'),
         '--auth-userprofile-path', url_path_join(self.test_root, 'warehouse', 'auth_userprofile')
     ])
 def test_answer_distribution(self):
     self.task.launch([
         'AnswerDistributionOneFilePerCourseTask',
         '--src', self.test_src,
         '--dest', url_path_join(self.test_root, 'dst'),
         '--name', 'test',
         '--output-root', self.test_out,
         '--include', '"*"',
         '--manifest', url_path_join(self.test_root, 'manifest.txt'),
         '--base-input-format', self.input_format,
         '--lib-jar', self.oddjob_jar,
         '--n-reduce-tasks', str(self.NUM_REDUCERS),
     ])
     self.validate_output()
    def setUp(self):
        super(FinancialReportsAcceptanceTest, self).setUp()

        for input_file_name in ('paypal.tsv', 'cybersource_test.tsv'):
            src = url_path_join(self.data_dir, 'input', input_file_name)
            dst = url_path_join(self.warehouse_path, "payments", "dt=" + self.IMPORT_DATE, input_file_name)
            self.upload_file(src, dst)

        empty_file_path = url_path_join(
            self.warehouse_path, "payments", "dt=" + self.IMPORT_DATE, 'cybersource_empty_test.tsv')
        self.upload_file_with_content(empty_file_path, '')

        self.prepare_database('lms', self.import_db)
        self.prepare_database('otto', self.otto_db)
    def test_answer_distribution_mysql(self):
        self.task.launch([
            'AnswerDistributionToMySQLTaskWorkflow',
            '--src', self.test_src,
            '--dest', url_path_join(self.test_root, 'dst'),
            '--name', 'test',
            '--include', '"*"',
            '--manifest', url_path_join(self.test_root, 'manifest.txt'),
            '--base-input-format', self.input_format,
            '--lib-jar', self.oddjob_jar,
            '--n-reduce-tasks', str(self.NUM_REDUCERS),
            '--credentials', self.export_db.credentials_file_url,
        ])

        self.validate_output()
    def __init__(self, *args, **kwargs):
        super(ObfuscatedCourseDumpTask, self).__init__(*args, **kwargs)

        filename_safe_course_id = opaque_key_util.get_filename_safe_course_id(self.course)
        dump_path = url_path_join(self.dump_root, filename_safe_course_id, 'state')
        auth_userprofile_targets = PathSetTask([dump_path], ['*auth_userprofile*']).output()
        # TODO: Refactor out this logic of getting latest file. Right now we expect a date, so we use that
        dates = [target.path.rsplit('/', 2)[-2] for target in auth_userprofile_targets]
        # TODO: Make the date a parameter that defaults to the most recent, but allows the user to override?
        # This should return an error if no data is found, rather than getting a cryptic 'index out of range' error.
        if len(dates) == 0:
            raise Exception('Missing auth_userprofile data file in {}'.format(dump_path))
        latest_date = sorted(dates)[-1]
        self.data_directory = url_path_join(self.dump_root, filename_safe_course_id, 'state', latest_date)
        self.output_directory = url_path_join(self.output_root, filename_safe_course_id, 'state', latest_date)
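
# Illustrative sketch (not part of the original task): what rsplit('/', 2)[-2] pulls out of a
# state-dump path. The path mirrors the example quoted in ObfuscatedPackageTask's run() further
# down (auth_user rather than auth_userprofile, but the directory layout is the same).
path = 's3://some_bucket/output/edX_Demo_Course/state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql'
print(path.rsplit('/', 2)[-2])  # 2015-11-25
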
    def test_student_engagement(self):
        self.upload_tracking_log(self.INPUT_FILE, datetime.date(2015, 4, 10))
        self.execute_sql_fixture_file('load_student_engagement.sql')

        self.interval = '2015-04-06-2015-04-20'  # run for exactly two weeks

        for interval_type in ['daily', 'weekly', 'all']:

            self.run_task(interval_type)

            for course_id in self.ALL_COURSES:
                hashed_course_id = hashlib.sha1(course_id).hexdigest()
                course_dir = url_path_join(self.test_out, interval_type, hashed_course_id)
                csv_filenames = list(self.s3_client.list(course_dir))

                # Check expected number of CSV files.
                if interval_type == 'daily':
                    self.assertEqual(len(csv_filenames), 14)
                elif interval_type == 'weekly':
                    self.assertEqual(len(csv_filenames), 2)
                elif interval_type == 'all':
                    self.assertEqual(len(csv_filenames), 1)

                # Check that the CSV files contain the expected data.
                for csv_filename in csv_filenames:

                    # Parse expected date from filename.
                    if interval_type == 'all':
                        expected_date = '2015-04-19'
                    else:
                        csv_pattern = '.*student_engagement_.*_(\\d\\d\\d\\d-\\d\\d-\\d\\d)\\.csv'
                        match = re.match(csv_pattern, csv_filename)
                        expected_date = match.group(1)

                    # Build dataframe from csv file generated from events.
                    actual_dataframe = []
                    with S3Target(url_path_join(course_dir, csv_filename)).open() as csvfile:
                        actual_dataframe = read_csv(csvfile)
                        actual_dataframe.fillna('', inplace=True)

                    self.check_engagement_dataframe(actual_dataframe, interval_type, course_id, expected_date)

                    # Validate specific values:
                    expected_dataframe = self.get_expected_engagement(interval_type, hashed_course_id, csv_filename)
                    if expected_dataframe is not None:
                        assert_frame_equal(actual_dataframe, expected_dataframe, check_names=True)
                    else:
                        self.assert_zero_engagement(actual_dataframe)
    def check_validation_events(self):
        """Confirm that validation data was properly created."""
        validate_output_dir = url_path_join(self.test_validate, str(self.END_DATE))
        outputs = self.get_targets_from_remote_path(validate_output_dir)

        # There are 2 courses in the test data.
        self.assertEqual(len(outputs), 2)
    def test_event_log_exports_using_manifest(self):
        config_override = {
            'manifest': {
                'threshold': 1
            }
        }

        folders = {
            'prod': self.PROD_FOLDER,
            'edge': self.EDGE_FOLDER
        }
        for environment in ['prod', 'edge']:
            self.task.launch([
                'EventExportTask',
                '--source', url_path_join(self.test_src, environment),
                '--output-root', self.test_out,
                '--config', self.test_config,
                '--environment', environment,
                '--interval', '2014-05',
                '--gpg-key-dir', self.test_gpg_key_dir,
                '--gpg-master-key', '*****@*****.**',
                '--required-path-text', folders[environment],
                '--n-reduce-tasks', str(self.NUM_REDUCERS),
            ], config_override)

        self.validate_output()
 def output(self):
     return get_target_from_url(
         url_path_join(
             self.destination,
             "incremental_users_and_enrollments_{0}.csv".format(self.name)
         )
     )
 def output(self):
     return get_target_from_url(url_path_join(
         self.output_root,
         'transaction',
         'dt=' + self.import_date.isoformat(),  # pylint: disable=no-member
         'transactions.csv'
     ))
    def multi_output_reducer(self, key, values, output_file):
        """
        Write values to the appropriate file, as determined by the key.
        Output is streamed through gzip, so it is compressed before being encrypted.
        """
        _date_string, org_id = key
        recipients = self.recipients_for_org_id[org_id]
        log.info('Encryption recipients: %s', str(recipients))

        def report_progress(num_bytes):
            """Update hadoop counters as the file is written"""
            self.incr_counter('Event Export', 'Bytes Written to Output', num_bytes)

        key_file_targets = [get_target_from_url(url_path_join(self.gpg_key_dir, recipient)) for recipient in recipients]
        with make_encrypted_file(output_file, key_file_targets, progress=report_progress) as encrypted_output_file:
            outfile = gzip.GzipFile(mode='wb', fileobj=encrypted_output_file)
            try:
                for value in values:
                    outfile.write(value.strip())
                    outfile.write('\n')
                    # WARNING: This line ensures that Hadoop knows that our process is not sitting in an infinite loop.
                    # Do not remove it.
                    self.incr_counter('Event Export', 'Raw Bytes Written', len(value) + 1)
            finally:
                outfile.close()
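
# Illustrative sketch (not part of the original reducer): the gzip-streaming pattern used above,
# shown against a plain local file instead of make_encrypted_file, which is project-specific.
# Values are compressed as they are written, so the whole event stream never has to be held in
# memory. Written for Python 3, hence the explicit encode; the output path is made up.
import gzip

def write_gzipped_lines(values, raw_output_file):
    """Stream newline-terminated values through gzip into an already-open file object."""
    outfile = gzip.GzipFile(mode='wb', fileobj=raw_output_file)
    try:
        for value in values:
            outfile.write(value.strip().encode('utf-8'))
            outfile.write(b'\n')
    finally:
        outfile.close()

with open('/tmp/example-events.log.gz', 'wb') as raw_file:
    write_gzipped_lines(['{"event": "play_video"}', '{"event": "problem_check"}'], raw_file)
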
    def test_demographic_trends(self):
        self.upload_tracking_log(self.INPUT_FILE, datetime.date(2014, 8, 1))
        self.execute_sql_fixture_file('load_auth_userprofile.sql')

        blacklist_date = '2014-08-29'
        blacklist_url = url_path_join(
            self.warehouse_path, 'course_enrollment_blacklist', 'dt=' + blacklist_date, 'blacklist.tsv')
        with S3Target(blacklist_url).open('w') as s3_file:
            s3_file.write('edX/Open_DemoX/edx_demo_course3')

        config_override = {
            'enrollments': {
                'blacklist_date': blacklist_date,
            }
        }

        self.task.launch([
            'ImportDemographicsIntoMysql',
            '--interval', '2014-08-01-2014-08-06',
            '--n-reduce-tasks', str(self.NUM_REDUCERS),
        ], config_override=config_override)

        self.validate_gender()
        self.validate_birth_year()
        self.validate_education_level()
    def test_end_to_end_without_vertica(self):
        # Similar to test_end_to_end, but excludes the Vertica part and checks data values,
        # not just data shape.
        table_name = 'reconciled_order_transactions'
        output_root = url_path_join(
            self.warehouse_path, table_name, 'dt=' + self.UPPER_BOUND_DATE
        ) + '/'
        self.task.launch([
            'ReconcileOrdersAndTransactionsTask',
            '--import-date', self.UPPER_BOUND_DATE,
            '--n-reduce-tasks', str(self.NUM_REDUCERS),
            '--output-root', output_root,
        ])
        final_output_task = LoadInternalReportingOrderTransactionsToWarehouse(
            import_date=luigi.DateParameter().parse(self.UPPER_BOUND_DATE)
        )
        columns = [x[0] for x in final_output_task.columns]

        expected_output_csv = os.path.join(self.data_dir, 'output', 'expected_financial_report.csv')
        expected = pandas.read_csv(expected_output_csv, parse_dates=True)

        raw_output = self.read_dfs_directory(output_root)
        output = StringIO(raw_output.replace('\t\\N', '\t'))
        data = pandas.read_table(output, header=None, names=columns, parse_dates=True)
        # Re-order dataframe for consistent comparison:
        for frame in (data, expected):
            frame.sort(['payment_ref_id', 'transaction_type'], inplace=True, ascending=[True, False])
            frame.reset_index(drop=True, inplace=True)

        self.assert_data_frames_equal(data, expected)
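
# Illustrative sketch (not part of the original test): why '\t\\N' is stripped before parsing.
# Hive/Sqoop output encodes NULL as the literal '\N'; replacing it with an empty field lets
# pandas read it as missing data. The column names and values below are made up, and io.StringIO
# stands in for the Python 2 StringIO used above.
import pandas
from io import StringIO

raw_output = 'EDX-1001\t\\N\t100.00\nEDX-1002\tsale\t25.00\n'
cleaned = StringIO(raw_output.replace('\t\\N', '\t'))
frame = pandas.read_table(cleaned, header=None,
                          names=['payment_ref_id', 'transaction_type', 'amount'])
print(frame)  # the first row's transaction_type is parsed as missing (NaN)
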
 def generate_file_list(self):
     """Yield each individual path given a source folder and a set of file-matching expressions."""
     for src in self.src:
         if src.startswith('s3'):
             # connect lazily as needed:
             if self.s3_conn is None:
                 self.s3_conn = boto.connect_s3()
             for _bucket, _root, path in generate_s3_sources(self.s3_conn, src, self.include, self.include_zero_length):
                 source = url_path_join(src, path)
                 yield ExternalURL(source)
         elif src.startswith('hdfs'):
             for source, size in luigi.hdfs.listdir(src, recursive=True, include_size=True):
                 if not self.include_zero_length and size == 0:
                     continue
                 elif any(fnmatch.fnmatch(source, include_val) for include_val in self.include):
                     yield ExternalURL(source)
         else:
             # Apply the include patterns to the relative path below the src directory.
             # TODO: implement exclude_zero_length to match S3 case.
             for dirpath, _dirnames, files in os.walk(src):
                 for filename in files:
                     filepath = os.path.join(dirpath, filename)
                     relpath = os.path.relpath(filepath, src)
                     if any(fnmatch.fnmatch(relpath, include_val) for include_val in self.include):
                         yield ExternalURL(filepath)
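
# Illustrative sketch (not part of the original task): how the include patterns are applied to
# relative paths in the local-filesystem branch above. The paths and patterns are made up.
import fnmatch

include = ['*.log-*.gz', '*.tsv']
relative_paths = [
    'FakeServerGroup/tracking.log-20140801.gz',
    'blacklist/dt=2014-08-29/blacklist.tsv',
    'manifest.txt',
]
matched = [p for p in relative_paths
           if any(fnmatch.fnmatch(p, pattern) for pattern in include)]
print(matched)  # ['FakeServerGroup/tracking.log-20140801.gz', 'blacklist/dt=2014-08-29/blacklist.tsv']
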
 def output_path_for_key(self, course_id):
     filename_safe_course_id = opaque_key_util.get_filename_safe_course_id(course_id, '_')
     filename = u'{course_id}_enroll_validated_{dumpdate}.log.gz'.format(
         course_id=filename_safe_course_id,
         dumpdate=self.dump_date,
     )
     return url_path_join(self.output_root, filename)
 def partition_location(self):
     """Provides location of Hive database table's partition data."""
     # The actual folder name where the data is stored is expected to be in the format <key>=<value>
     partition_name = '='.join(self.partition.items()[0])
     # Make sure that input path ends with a slash, to indicate a directory.
     # (This is necessary for S3 paths that are output from Hadoop jobs.)
     return url_path_join(self.table_location, partition_name + '/')
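
# Illustrative sketch (not part of the original task): how the Hive-style partition folder name
# above is built. Note that self.partition.items()[0] only works on Python 2; a Python 3
# equivalent needs next(iter(...)) or an explicit list(). The table location is made up.
partition = {'dt': '2014-08-01'}
partition_name = '='.join(next(iter(partition.items())))   # 'dt=2014-08-01'
table_location = 's3://bucket/warehouse/course_enrollment'
print(table_location.rstrip('/') + '/' + partition_name + '/')
# s3://bucket/warehouse/course_enrollment/dt=2014-08-01/
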
 def output(self):
     return get_target_from_url(
         url_path_join(
             self.user_country_output,
             'dt={0}/'.format(self.interval.date_b.strftime('%Y-%m-%d'))  # pylint: disable=no-member
         )
     )
 def output(self):
     return get_target_from_url(
         url_path_join(
             self.output_root,
             'count-user-activity-per-interval-{interval}.tsv/'.format(interval=self.interval),
         )
     )
 def output(self):
     return get_target_from_url(
         url_path_join(
             self.destination,
             'total_users_and_enrollments_{0}-{1}.csv'.format(self.start_date, self.date)
         )
     )
 def output(self):
     if len(self.input()['data']) == 0:
         raise IOError("Course File '{filename}' not found for course '{course}'".format(
             filename=self.file_pattern, course=self.course
         ))
     output_filename = os.path.basename(self.input()['data'][0].path)
     return get_target_from_url(url_path_join(self.output_directory, output_filename))
    def upload_public_keys(self):
        gpg_key_dir = os.path.join('gpg-keys')
        for key_filename in os.listdir(gpg_key_dir):
            full_local_path = os.path.join(gpg_key_dir, key_filename)
            remote_url = url_path_join(self.test_gpg_key_dir, key_filename)

            if not key_filename.endswith('.key'):
                self.s3_client.put(full_local_path, remote_url)
 def requires(self):
     table_name = 'courseware_studentmodule'
     return SqoopImportFromMysql(
         credentials=self.credentials,
         destination=url_path_join(self.dest, table_name),
         table_name=table_name,
         num_mappers=self.num_mappers,
         overwrite=self.sqoop_overwrite,
     )
    def upload_data(self):
        """Puts the test course catalog where the processing task would look for it, bypassing calling the actual API"""
        src = os.path.join(self.data_dir, 'input', self.INPUT_FILE)
        # IMPORTANT: this path should be of the same format as the path that DailyPullCatalogTask uses for output.
        dst = url_path_join(self.warehouse_path, "course_catalog", "catalog",
                            "dt=2015-06-29", self.INPUT_FILE)

        # Upload mocked results of the API call
        self.s3_client.put(src, dst)
 def requires(self):
     table_name = 'courseware_studentmodule'
     return SqoopImportFromMysql(credentials=self.credentials,
                                 destination=url_path_join(
                                     self.dump_root, table_name),
                                 table_name=table_name,
                                 num_mappers=self.num_mappers,
                                 where=self.where,
                                 verbose=self.verbose)
    def output_path_for_key(self, course_id):
        template = "{course_id}-courseware_studentmodule-{suffix}analytics.sql"

        filename = template.format(
            course_id=opaque_key_util.get_filename_safe_course_id(
                course_id, '-'),
            suffix=(self.output_suffix + '-') if self.output_suffix else '')

        return url_path_join(self.output_root, filename)
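
# Illustrative sketch (not part of the original task): the filename the template above produces
# for the acceptance-test course, matching the output file named in run_export_task below.
# get_filename_safe_course_id is approximated here by a simple character replacement.
course_id = 'edX/E929/2014_T1'
output_suffix = 'acceptance'
filename = "{course_id}-courseware_studentmodule-{suffix}analytics.sql".format(
    course_id=course_id.replace('/', '-'),   # stand-in for get_filename_safe_course_id(course_id, '-')
    suffix=(output_suffix + '-') if output_suffix else '')
print(filename)  # edX-E929-2014_T1-courseware_studentmodule-acceptance-analytics.sql
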
    def validate_output_file(self, date, org_id, site, use_master_key=False):
        if use_master_key:
            key_filename = 'insecure_master_secret.key'
        else:
            if org_id == 'edx':
                key_filename = 'insecure_secret.key'
            else:
                key_filename = 'insecure_secret_2.key'

        self.temporary_dir = tempfile.mkdtemp()
        self.addCleanup(shutil.rmtree, self.temporary_dir)

        self.downloaded_outputs = os.path.join(self.temporary_dir, 'output')
        os.makedirs(self.downloaded_outputs)

        local_file_name = '{org}-{site}-events-{date}.log'.format(
            org=org_id,
            site=site,
            date=date,
        )

        year = str(date).split("-")[0]

        remote_url = url_path_join(self.test_out, org_id, site, "events", year,
                                   local_file_name + '.gz.gpg')

        # Files won't appear in S3 instantaneously; wait for them to appear.
        # TODO: use exponential backoff (see the sketch after this method).
        for _index in range(30):
            key = self.s3_client.get_key(remote_url)
            if key is not None:
                break
            else:
                time.sleep(2)

        if key is None:
            self.fail(
                'Unable to find expected output file {0}'.format(remote_url))

        downloaded_output_path = os.path.join(self.downloaded_outputs,
                                              remote_url.split('/')[-1])
        key.get_contents_to_filename(downloaded_output_path)

        # first decrypt file
        decrypted_file_name = downloaded_output_path[:-len('.gpg')]
        fs.decrypt_file(downloaded_output_path, decrypted_file_name,
                        key_filename)

        # now decompress file
        decompressed_file_name = decrypted_file_name[:-len('.gz')]
        fs.decompress_file(decrypted_file_name, decompressed_file_name)

        shell.run([
            'diff', decompressed_file_name,
            os.path.join(self.data_dir, 'output', local_file_name)
        ])
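
# Illustrative sketch (not part of the original test): the exponential backoff the TODO above
# refers to. poll_fn stands in for a call like self.s3_client.get_key(remote_url) and returns
# the key or None; the attempt counts and delays here are assumptions.
import time

def wait_for_key(poll_fn, max_attempts=10, initial_delay=1.0, max_delay=30.0):
    """Poll until poll_fn() returns a non-None value, doubling the delay between attempts."""
    delay = initial_delay
    for _attempt in range(max_attempts):
        result = poll_fn()
        if result is not None:
            return result
        time.sleep(delay)
        delay = min(delay * 2, max_delay)
    return None
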
    def setUp(self):
        super(FinancialReportsAcceptanceTest, self).setUp()

        if not self.should_reset_state:
            return

        for input_file_name in ('paypal.tsv', 'cybersource_test.tsv'):
            src = url_path_join(self.data_dir, 'input', input_file_name)
            dst = url_path_join(self.warehouse_path, "payments",
                                "dt=" + self.IMPORT_DATE, input_file_name)
            self.upload_file(src, dst)

        empty_file_path = url_path_join(self.warehouse_path, "payments",
                                        "dt=" + self.IMPORT_DATE,
                                        'cybersource_empty_test.tsv')
        self.upload_file_with_content(empty_file_path, '')

        self.prepare_database('lms', self.import_db)
        self.prepare_database('otto', self.otto_db)
    def __init__(self, *args, **kwargs):
        super(LoadInternalReportingUserActivityToWarehouse, self).__init__(*args, **kwargs)

        path = url_path_join(self.warehouse_path, 'internal_reporting_user_activity')
        path_targets = PathSetTask([path]).output()
        paths = list(set([os.path.dirname(target.path) for target in path_targets]))
        dates = [path.rsplit('/', 2)[-1] for path in paths]
        latest_date = sorted(dates)[-1]

        self.load_date = datetime.datetime.strptime(latest_date, "dt=%Y-%m-%d").date()
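
# Illustrative sketch (not part of the original task): picking the most recent 'dt=YYYY-MM-DD'
# partition from a list of directory paths. The paths are made up; a lexicographic sort on
# ISO-style date strings is equivalent to a chronological sort.
import datetime

paths = [
    's3://bucket/warehouse/internal_reporting_user_activity/dt=2015-06-28',
    's3://bucket/warehouse/internal_reporting_user_activity/dt=2015-06-29',
]
dates = [path.rsplit('/', 2)[-1] for path in paths]   # ['dt=2015-06-28', 'dt=2015-06-29']
latest_date = sorted(dates)[-1]
load_date = datetime.datetime.strptime(latest_date, "dt=%Y-%m-%d").date()
print(load_date)  # 2015-06-29
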
    def upload_gpg_keys(self):
        """Uploads test gpg keys, needed for encryption."""
        gpg_key_dir = os.path.join('gpg-keys')
        for key_filename in os.listdir(gpg_key_dir):
            local_filepath = os.path.join(gpg_key_dir, key_filename)
            destination_url = url_path_join(self.test_gpg_key_dir,
                                            key_filename)

            if not key_filename.endswith('.key'):
                self.upload_file(local_filepath, destination_url)
    def test_student_engagement(self):
        self.upload_tracking_log(self.INPUT_FILE, datetime.date(2015, 4, 10))
        self.execute_sql_fixture_file('load_student_engagement.sql')

        self.interval = '2015-04-06-2015-04-20'  # run for exactly two weeks

        for interval_type in ['daily', 'weekly', 'all']:

            self.run_task(interval_type)

            for course_id in self.ALL_COURSES:
                hashed_course_id = hashlib.sha1(course_id).hexdigest()
                course_dir = url_path_join(self.test_out, interval_type,
                                           hashed_course_id)
                csv_targets = self.get_targets_from_remote_path(course_dir)

                # Check expected number of CSV files.
                if interval_type == 'daily':
                    self.assertEqual(len(csv_targets), 14)
                elif interval_type == 'weekly':
                    self.assertEqual(len(csv_targets), 2)
                elif interval_type == 'all':
                    self.assertEqual(len(csv_targets), 1)

                # Check that the CSV files contain the expected data.
                for csv_target in csv_targets:

                    # Parse expected date from filename.
                    if interval_type == 'all':
                        expected_date = '2015-04-19'
                    else:
                        csv_pattern = '.*student_engagement_.*_(\\d\\d\\d\\d-\\d\\d-\\d\\d)\\.csv'
                        match = re.match(csv_pattern, csv_target.path)
                        expected_date = match.group(1)

                    # Build dataframe from csv file generated from events.
                    actual_dataframe = []
                    with csv_target.open('r') as csvfile:
                        actual_dataframe = read_csv(csvfile)
                        actual_dataframe.fillna('', inplace=True)

                    self.check_engagement_dataframe(actual_dataframe,
                                                    interval_type, course_id,
                                                    expected_date)

                    # Validate specific values:
                    csv_filename = os.path.basename(csv_target.path)
                    expected_dataframe = self.get_expected_engagement(
                        interval_type, hashed_course_id, csv_filename)
                    if expected_dataframe is not None:
                        assert_frame_equal(actual_dataframe,
                                           expected_dataframe,
                                           check_names=True)
                    else:
                        self.assert_zero_engagement(actual_dataframe)
 def output(self):
     """Output is in the form {output_root}/cybersource/{CCYY-mm}/cybersource_{merchant}_{CCYYmmdd}.csv"""
     month_year_string = self.run_date.strftime('%Y-%m')  # pylint: disable=no-member
     date_string = self.run_date.strftime('%Y%m%d')  # pylint: disable=no-member
     filename = "cybersource_{merchant_id}_{date_string}.{report_format}".format(
         merchant_id=self.merchant_id,
         date_string=date_string,
         report_format=self.REPORT_FORMAT,
     )
     url_with_filename = url_path_join(self.output_root, "cybersource", month_year_string, filename)
     return get_target_from_url(url_with_filename)
    def output(self):
        """
        Output is set up so it can be read in as a Hive table with partitions.

        The form is {output_root}/payments/dt={CCYY-mm-dd}/cybersource_{merchant}.tsv
        """
        date_string = self.run_date.strftime('%Y-%m-%d')  # pylint: disable=no-member
        partition_path_spec = HivePartition('dt', date_string).path_spec
        filename = "cybersource_{}.tsv".format(self.merchant_id)
        url_with_filename = url_path_join(self.output_root, "payments", partition_path_spec, filename)
        return get_target_from_url(url_with_filename)
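
# Illustrative sketch (not part of the original task): the URL the output() above resolves to
# for a sample run date, assuming HivePartition('dt', date).path_spec produces 'dt=<date>'.
# The output root and merchant id are made up.
output_root = 's3://bucket/warehouse'
merchant_id = 'my_merchant'
date_string = '2014-08-01'
partition_path_spec = 'dt=' + date_string
print('/'.join([output_root, 'payments', partition_path_spec,
                'cybersource_{}.tsv'.format(merchant_id)]))
# s3://bucket/warehouse/payments/dt=2014-08-01/cybersource_my_merchant.tsv
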
 def run_obfuscated_package_task(self):
     """Run ObfuscatedPackageTask."""
     self.task.launch([
         'ObfuscatedPackageTask', '--course', self.filename_safe_course_id,
         '--obfuscated-output-root',
         url_path_join(self.test_root,
                       'obfuscated-output'), '--gpg-key-dir',
         self.test_gpg_key_dir, '--gpg-master-key', '*****@*****.**',
         '--output-root', self.test_out, '--recipient', '*****@*****.**',
         '--format-version', self.FORMAT_VERSION
     ])
    def setup_state_files(self):
        """Upload input fixture data files, needed to mimic the output produced by course-exporter which is not a part of this test."""

        state_files_dir = os.path.join(self.data_dir, 'input', 'obfuscation',
                                       'state')
        for filename in os.listdir(state_files_dir):
            local_filepath = os.path.join(state_files_dir, filename)
            dst_url = url_path_join(self.dump_root,
                                    self.filename_safe_course_id, 'state',
                                    self.EXPORT_DATE, filename)
            self.upload_file(local_filepath, dst_url)
    def output_path_for_key(self, course_id):
        """
        Match the course folder hierarchy that is expected by the instructor dashboard.

        The instructor dashboard expects the file to be stored in a folder named sha1(course_id).  All files in that
        directory will be displayed on the instructor dashboard for that course.
        """
        hashed_course_id = hashlib.sha1(course_id).hexdigest()
        filename_safe_course_id = opaque_key_util.get_filename_safe_course_id(course_id, '_')
        filename = u'{course_id}_answer_distribution.csv'.format(course_id=filename_safe_course_id)
        return url_path_join(self.output_root, hashed_course_id, filename)
    def output_path_for_key(self, datestamp):
        if not self.tuple_output:
            # Match tracking.log-{datestamp}.gz format.
            filename = u'synthetic_enroll.log-{datestamp}.gz'.format(
                datestamp=datestamp.replace('-', ''), )
        else:
            # Want to have tsv as extension, rather than date.
            filename = u'synthetic_enroll-{datestamp}.tsv.gz'.format(
                datestamp=datestamp.replace('-', ''), )

        return url_path_join(self.output_root, filename)
    def run_export_task(self):
        """
        Preconditions: Populated courseware_studentmodule table in the MySQL database.
        External Effect: Generates a single text file with the contents of courseware_studentmodule from the MySQL
            database for the test course and stores it in S3.

        Intermediate output will be stored in s3://<tasks_output_url>/intermediate/. This directory
            will contain the complete data set from the MySQL database with all courses interleaved in the data files.

        The final output file will be stored in s3://<tasks_output_url>/edX-E929-2014_T1-courseware_studentmodule-acceptance-analytics.sql
        """
        self.task.launch([
            'StudentModulePerCourseAfterImportWorkflow',
            '--credentials', self.import_db.credentials_file_url,
            '--dump-root', url_path_join(self.test_src, 'intermediate'),
            '--output-root', url_path_join(self.test_src, self.ENVIRONMENT),
            '--output-suffix', self.ENVIRONMENT,
            '--num-mappers', str(self.NUM_MAPPERS),
            '--n-reduce-tasks', str(self.NUM_REDUCERS),
        ])
    def setUp(self):
        """Loads enrollment and course catalog fixtures."""
        super(EnrollmentAcceptanceTest, self).setUp()

        self.upload_tracking_log(self.INPUT_FILE, datetime.date(2014, 7, 30))
        self.execute_sql_fixture_file('load_auth_userprofile.sql')

        self.upload_file(
            os.path.join(self.data_dir, 'input', 'course_catalog.json'),
            url_path_join(self.warehouse_path, 'course_catalog_raw',
                          'dt={}'.format(self.CATALOG_DATE),
                          'course_catalog.json'))
 def requires_hadoop(self):
     # Check first if running locally with Sqoop output.
     target = get_target_from_url(self.source_dir)
     if isinstance(target, luigi.LocalTarget) and os.path.isdir(
             self.source_dir):
         files = [
             f for f in os.listdir(self.source_dir) if f.startswith("part")
         ]
         for filename in files:
             yield ExternalURL(url_path_join(self.source_dir, filename))
     else:
         yield ExternalURL(self.source_dir)
 def output(self):
     config = configuration.get_config()
     base_url = config.get(CONFIG_SECTION, 'path')
     target = get_target_from_url(
         url_path_join(base_url, str(hash(self))) + '.manifest')
     lib_jar = config.get(CONFIG_SECTION, 'lib_jar', None)
     if lib_jar:
         target.lib_jar = [lib_jar]
     input_format = config.get(CONFIG_SECTION, 'input_format', None)
     if input_format:
         target.input_format = input_format
     return target
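
# Illustrative sketch (not part of the original task): the kind of configuration the output()
# above reads via luigi's configuration.get_config(). The section name 'manifest' and every
# value below are assumptions for the example, not taken from the original source;
# configparser stands in for luigi's config parser.
from configparser import ConfigParser

config = ConfigParser()
config.read_string("""
[manifest]
path = hdfs://localhost:9000/tmp/manifests
lib_jar = hdfs://localhost:9000/jars/oddjob.jar
input_format = org.edx.hadoop.input.ManifestTextInputFormat
""")
print(config.get('manifest', 'path'))                        # hdfs://localhost:9000/tmp/manifests
print(config.get('manifest', 'input_format', fallback=None))
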
    def run(self):
        recipients = set(self.recipient)
        if self.gpg_master_key is not None:
            recipients.add(self.gpg_master_key)
        key_file_targets = [
            get_target_from_url(url_path_join(self.gpg_key_dir, recipient))
            for recipient in recipients
        ]

        path_task = PathSetTask([self.course_files_url], ['*.*'])
        with make_temp_directory(prefix='obfuscate-archive.',
                                 dir=self.temporary_dir) as tmp_directory:
            for target in path_task.output():
                with target.open('r') as input_file:
                    # Get path without urlscheme.
                    course_files_path = urlparse.urlparse(
                        self.course_files_url).path
                    # Compute the target's path relative to course_files_path by taking the substring
                    # that follows course_files_path within the target's path.
                    # Needed because target.path includes the URL scheme for S3 targets but not for HDFS targets.
                    # Examples:
                    # target.path: /pipeline/output/edX_Demo_Course/events/edX_Demo_Course-events-2015-08-30.log.gz
                    # relative_path: events/edX_Demo_Course-events-2015-08-30.log.gz
                    # target.path: s3://some_bucket/output/edX_Demo_Course/state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                    # relative_path: state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                    r_index = target.path.find(course_files_path) + len(
                        course_files_path)
                    relative_path = target.path[r_index:].lstrip('/')

                    local_file_path = os.path.join(tmp_directory,
                                                   relative_path)
                    try:
                        os.makedirs(os.path.dirname(local_file_path))
                    except OSError as exc:
                        if exc.errno != errno.EEXIST:
                            raise
                    with open(local_file_path, 'w') as temp_file:
                        copy_file_to_file(input_file, temp_file)

            def report_encrypt_progress(num_bytes):
                """Log encryption progress."""
                log.info('Encrypted %d bytes', num_bytes)

            with self.output().open('w') as output_file:
                with make_encrypted_file(
                        output_file,
                        key_file_targets,
                        progress=report_encrypt_progress,
                        dir=self.temporary_dir) as encrypted_output_file:
                    with tarfile.open(mode='w:gz',
                                      fileobj=encrypted_output_file
                                      ) as output_archive_file:
                        output_archive_file.add(tmp_directory, arcname='')
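
# Illustrative sketch (not part of the original task): the relative-path calculation described
# in the comments above, applied to the two example paths they give. urllib.parse.urlparse is
# the Python 3 equivalent of the urlparse.urlparse used in the Python 2 code.
from urllib.parse import urlparse

course_files_url = 's3://some_bucket/output/edX_Demo_Course'
course_files_path = urlparse(course_files_url).path   # '/output/edX_Demo_Course'

for target_path in [
    '/pipeline/output/edX_Demo_Course/events/edX_Demo_Course-events-2015-08-30.log.gz',
    's3://some_bucket/output/edX_Demo_Course/state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql',
]:
    r_index = target_path.find(course_files_path) + len(course_files_path)
    print(target_path[r_index:].lstrip('/'))
# events/edX_Demo_Course-events-2015-08-30.log.gz
# state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
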
    def upload_data(self):
        """
        Put the test course structure information where the processing task will look for it,
        bypassing the actual API call.
        """
        src = os.path.join(self.data_dir, 'input', self.INPUT_FILE)
        # IMPORTANT: this path should be of the same format as the path that DailyPullCatalogTask uses for output.
        dst = url_path_join(self.warehouse_path, "courses_raw",
                            self.DATE.strftime('dt=%Y-%m-%d'), self.INPUT_FILE)

        # Upload mocked results of the API call
        self.s3_client.put(src, dst)
    def _get_required_tasks(self):
        """Internal method to actually calculate required tasks once."""
        start_date = self.interval.date_a
        end_date = self.interval.date_b
        table_name = "student_courseenrollment"
        source_root = url_path_join(self.warehouse_path, table_name)

        current_date = start_date
        while current_date < end_date:
            datestring = current_date.strftime('%Y-%m-%d')
            current_date += datetime.timedelta(days=1)

            src_datestring = "dt={}".format(datestring)
            source_dir = url_path_join(source_root, src_datestring)
            target = get_target_from_url(source_dir)
            if target.exists():
                output_dir = url_path_join(self.output_root, datestring)
                yield CreateEnrollmentValidationEventsTask(
                    source_dir=source_dir,
                    output_root=output_dir,
                    n_reduce_tasks=self.n_reduce_tasks,
                )