def run(self):
        """Copy the course files to a temp directory, then archive and encrypt them.

        Every file matching ``*.*`` under ``self.course_files_url`` is copied
        into a temporary directory, keeping its path relative to that URL.
        The directory is then added to a ``w:gz`` tar stream that is written
        through ``make_encrypted_file`` into ``self.output()``. One key file
        target is used per entry of ``self.recipient``, plus
        ``self.gpg_master_key`` when it is not None.
        """
        recipients = set(self.recipient)
        if self.gpg_master_key is not None:
            recipients.add(self.gpg_master_key)
        key_file_targets = [
            get_target_from_url(url_path_join(self.gpg_key_dir, recipient))
            for recipient in recipients
        ]

        # Get path without urlscheme. This only depends on the task's own
        # parameter, so it is computed once instead of once per copied file.
        course_files_path = urlparse.urlparse(self.course_files_url).path

        path_task = PathSetTask([self.course_files_url], ['*.*'])
        with make_temp_directory(prefix='obfuscate-archive.',
                                 dir=self.temporary_dir) as tmp_directory:
            for target in path_task.output():
                with target.open('r') as input_file:
                    # Calculates target's relative path to course_files_path by getting the substring that
                    # occurs after course_files_path substring in target's path.
                    # Needed as target.path returns path with urlscheme for s3target & without for hdfstarget.
                    # Examples:
                    # target.path: /pipeline/output/edX_Demo_Course/events/edX_Demo_Course-events-2015-08-30.log.gz
                    # relative_path: events/edX_Demo_Course-events-2015-08-30.log.gz
                    # target.path: s3://some_bucket/output/edX_Demo_Course/state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                    # relative_path: state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                    r_index = target.path.find(course_files_path) + len(
                        course_files_path)
                    relative_path = target.path[r_index:].lstrip('/')

                    local_file_path = os.path.join(tmp_directory,
                                                   relative_path)
                    try:
                        os.makedirs(os.path.dirname(local_file_path))
                    except OSError as exc:
                        # Several files can share a directory; only an
                        # already-existing directory is tolerated.
                        if exc.errno != errno.EEXIST:
                            raise
                    with open(local_file_path, 'w') as temp_file:
                        copy_file_to_file(input_file, temp_file)

            def report_encrypt_progress(num_bytes):
                """Log encryption progress."""
                log.info('Encrypted %d bytes', num_bytes)

            # The archive is built while tmp_directory still exists: tar
            # output goes into the encrypting wrapper, which writes to the
            # task's output target.
            with self.output().open('w') as output_file:
                with make_encrypted_file(
                        output_file,
                        key_file_targets,
                        progress=report_encrypt_progress,
                        dir=self.temporary_dir) as encrypted_output_file:
                    with tarfile.open(mode='w:gz',
                                      fileobj=encrypted_output_file
                                      ) as output_archive_file:
                        output_archive_file.add(tmp_directory, arcname='')
    def __init__(self, *args, **kwargs):
        """Derive ``selection_interval`` and ``run_interval`` from existing output.

        Looks under ``<warehouse_path>/payments`` for files matching
        ``*cybersource_<merchant_id>.tsv``, takes the greatest name among
        their parent directories, parses it as ``dt=YYYY-MM-DD`` and uses the
        following day as the run date. When ``merchant_close_date`` is set,
        both the run date and ``interval_end`` are capped at it.

        Raises:
            IndexError: if no matching file exists, so no latest date can be
                determined.
            ValueError: from ``strptime`` if the directory name is not of the
                form ``dt=YYYY-MM-DD``.
        """
        super(IntervalPullFromCybersourceTask, self).__init__(*args, **kwargs)
        # Provide default for output_root at this level.
        if self.output_root is None:
            self.output_root = self.warehouse_path

        path = url_path_join(self.warehouse_path, 'payments')
        file_pattern = '*cybersource_{}.tsv'.format(self.merchant_id)
        path_targets = PathSetTask([path],
                                   include=[file_pattern],
                                   include_zero_length=True).output()
        # Last path component of each matching file's parent directory.
        dates = {
            os.path.dirname(target.path).rsplit('/', 2)[-1]
            for target in path_targets
        }
        if not dates:
            # Same exception type the bare sorted(dates)[-1] used to raise,
            # but with a message that says what is actually missing.
            raise IndexError(
                'No files matching {} found under {}; cannot determine the '
                'latest completed date'.format(file_pattern, path))
        latest_date = max(dates)

        latest_completion_date = datetime.datetime.strptime(
            latest_date, "dt=%Y-%m-%d").date()
        run_date = latest_completion_date + datetime.timedelta(days=1)

        # Limit intervals to merchant account close date(if any).
        if self.merchant_close_date:
            run_date = min(run_date, self.merchant_close_date)
            self.interval_end = min(self.interval_end,
                                    self.merchant_close_date)

        self.selection_interval = date_interval.Custom(self.interval_start,
                                                       run_date)
        self.run_interval = date_interval.Custom(run_date, self.interval_end)
Example #3
0
    def __init__(self, *args, **kwargs):
        """Pick the latest state dump of the course and derive both directories.

        Sets ``self.data_directory`` (under ``dump_root``) and
        ``self.output_directory`` (under ``output_root``) to
        ``<course>/state/<latest date>``.

        Raises:
            Exception: if no ``*auth_userprofile*`` file is found under the
                course's ``state`` path.
        """
        super(ObfuscatedCourseDumpTask, self).__init__(*args, **kwargs)

        course_dir_name = opaque_key_util.get_filename_safe_course_id(self.course)
        dump_path = url_path_join(self.dump_root, course_dir_name, 'state')
        profile_targets = PathSetTask([dump_path], ['*auth_userprofile*']).output()

        # TODO: Refactor out this logic of getting latest file. Right now we expect a date, so we use that
        # The name of each file's parent directory is what gets compared.
        dates = [target.path.rsplit('/', 2)[-2] for target in profile_targets]

        # TODO: Make the date a parameter that defaults to the most recent, but allows the user to override?
        # This should return an error if no data is found, rather than getting a cryptic 'index out of range' error.
        if not dates:
            raise Exception('Missing auth_userprofile data file in {}'.format(dump_path))
        latest_date = max(dates)

        # Both directories share the same course/state/date suffix.
        state_subpath = (filename_safe for filename_safe in (course_dir_name, 'state', latest_date))
        state_subpath = tuple(state_subpath)
        self.data_directory = url_path_join(self.dump_root, *state_subpath)
        self.output_directory = url_path_join(self.output_root, *state_subpath)
Example #4
0
 def requires(self):
     """Return the data file requirement merged with the user-info requirements.

     The result is a dict with a ``'data'`` entry, updated with whatever
     ``self.user_info_requirements()`` returns.
     """
     # We want to process files that are zero-length.
     data_task = PathSetTask([self.data_directory], [self.file_pattern], include_zero_length=True)
     requirements = {'data': data_task}
     requirements.update(self.user_info_requirements())
     return requirements
Example #5
0
 def get_targets_from_remote_path(remote_path, pattern='*'):
     """Return the targets under ``remote_path`` matching ``pattern``.

     Each target is passed through ``modify_target_for_local_server`` and the
     results are returned as a list, in the order ``PathSetTask`` yields them.
     """
     path_set = PathSetTask([remote_path], [pattern])
     local_targets = []
     for remote_target in path_set.output():
         local_targets.append(modify_target_for_local_server(remote_target))
     return local_targets
Example #6
0
    def __init__(self, *args, **kwargs):
        """Work out ``run_interval`` from the reports already written.

        When ``*affiliate_window.tsv`` files exist under ``output_root``,
        ``interval_start`` is moved to the day after the greatest
        ``dt=YYYY-MM-DD`` directory name; otherwise the configured
        ``interval_start`` is kept. ``run_interval`` then spans
        ``interval_start`` to ``interval_end``.
        """
        super(IntervalPullFromAffiliateWindowTask, self).__init__(*args, **kwargs)

        # Provide default for output_root at this level.
        if self.output_root is None:
            self.output_root = url_path_join(self.warehouse_path, 'fees', 'affiliate_window')

        report_targets = PathSetTask(
            [self.output_root],
            include=['*affiliate_window.tsv'],
            include_zero_length=True
        ).output()

        if not report_targets:
            # If this is the first run, start from the beginning
            print("Couldn't find last completed date, defaulting to start date: {}".format(self.interval_start))
        else:
            # Last path component of each report's parent directory.
            partition_names = {
                os.path.dirname(report.path).rsplit('/', 1)[-1]
                for report in report_targets
            }
            latest_date = max(partition_names)
            latest_completion_date = datetime.datetime.strptime(latest_date, "dt=%Y-%m-%d").date()
            self.interval_start = latest_completion_date + datetime.timedelta(days=1)
            print("Found previous reports to {}".format(latest_date))

        self.run_interval = date_interval.Custom(self.interval_start, self.interval_end)
        print("Running reports from interval {}".format(self.run_interval))
    def run(self):
        """Copy the course files to a temp directory, then archive and encrypt them.

        Every file matching ``*.*`` under ``self.course_files_url`` is copied into a temporary directory,
        keeping its path relative to that URL. The directory is then added to a ``w:gz`` tar stream that is
        written through ``make_encrypted_file`` into ``self.output()``. One key file target is used per entry
        of ``self.recipient``, plus ``self.gpg_master_key`` when it is not None.
        """
        recipients = set(self.recipient)
        if self.gpg_master_key is not None:
            recipients.add(self.gpg_master_key)
        key_file_targets = [
            get_target_from_url(url_path_join(self.gpg_key_dir, recipient))
            for recipient in recipients
        ]

        # Get path without urlscheme. It is the same for every target, so compute it once up front
        # rather than on each loop iteration.
        course_files_path = urlparse.urlparse(self.course_files_url).path

        path_task = PathSetTask([self.course_files_url], ['*.*'])
        with make_temp_directory(prefix='obfuscate-archive.', dir=self.temporary_dir) as tmp_directory:
            for target in path_task.output():
                with target.open('r') as input_file:
                    # Calculates target's relative path to course_files_path by getting the substring that
                    # occurs after course_files_path substring in target's path.
                    # Needed as target.path returns path with urlscheme for s3target & without for hdfstarget.
                    # Examples:
                    # target.path: /pipeline/output/edX_Demo_Course/events/edX_Demo_Course-events-2015-08-30.log.gz
                    # relative_path: events/edX_Demo_Course-events-2015-08-30.log.gz
                    # target.path: s3://some_bucket/output/edX_Demo_Course/state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                    # relative_path: state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                    r_index = target.path.find(course_files_path) + len(course_files_path)
                    relative_path = target.path[r_index:].lstrip('/')

                    local_file_path = os.path.join(tmp_directory, relative_path)
                    try:
                        os.makedirs(os.path.dirname(local_file_path))
                    except OSError as exc:
                        # Several files can share a directory; only an already-existing directory is tolerated.
                        if exc.errno != errno.EEXIST:
                            raise
                    with open(local_file_path, 'w') as temp_file:
                        copy_file_to_file(input_file, temp_file)

            def report_encrypt_progress(num_bytes):
                """Log encryption progress."""
                log.info('Encrypted %d bytes', num_bytes)

            # The archive is built while tmp_directory still exists: tar output goes into the encrypting
            # wrapper, which writes to the task's output target.
            with self.output().open('w') as output_file:
                with make_encrypted_file(
                    output_file, key_file_targets, progress=report_encrypt_progress, dir=self.temporary_dir
                ) as encrypted_output_file:
                    with tarfile.open(mode='w:gz', fileobj=encrypted_output_file) as output_archive_file:
                        output_archive_file.add(tmp_directory, arcname='')
Example #8
0
 def clean_xml_files(self, root_dir):
     """Clean every ``*.xml`` file found under ``root_dir`` in place.

     Each file is parsed, its root element is handed to ``self.clean_element``,
     and the document is written back to the same path.
     """
     log.debug('Cleaning XML files')
     xml_targets = PathSetTask([root_dir], ['*.xml']).output()
     # Collect all paths up front, before any file is rewritten.
     paths_to_clean = [xml_target.path for xml_target in xml_targets]
     for path_to_clean in paths_to_clean:
         tree = xml.etree.ElementTree.parse(path_to_clean)
         self.clean_element(tree.getroot())
         tree.write(path_to_clean)
Example #9
0
    def __init__(self, *args, **kwargs):
        """Set ``self.load_date`` to the latest partition date of ``course_block_records``.

        The date is the greatest name among the parent directories of the files found under
        ``<warehouse_path>/course_block_records``, parsed as ``dt=YYYY-MM-DD``.

        Raises:
            IndexError: if no file is found, so no latest date can be determined.
            ValueError: from ``strptime`` if the directory name is not of the form ``dt=YYYY-MM-DD``.
        """
        super(LoadInternalReportingCourseStructureToSnowflake, self).__init__(*args, **kwargs)
        path = url_path_join(self.warehouse_path, 'course_block_records')
        path_targets = PathSetTask([path]).output()
        # Last path component of each file's parent directory.
        dates = {os.path.dirname(target.path).rsplit('/', 2)[-1] for target in path_targets}
        if not dates:
            # Same exception type the bare sorted(dates)[-1] used to raise, with an actionable message.
            raise IndexError('No data found under {}; cannot determine the latest partition date'.format(path))
        latest_date = max(dates)

        self.load_date = datetime.datetime.strptime(latest_date, "dt=%Y-%m-%d").date()
 def obfuscate_directory(self, input_dir, output_dir):
     """Obfuscate the files in ``input_dir`` whose kind is enabled in ``self.parameters``.

     ``output_dir`` is created first when it is not None. Wiki, courseware and
     forum files are located with ``glob``; event files are located through a
     ``PathSetTask``. Each file found is passed to the matching
     ``obfuscate_*_file`` method together with ``output_dir``.
     """
     if output_dir is not None:
         create_directory(output_dir)

     def files_matching(pattern):
         """Return the local paths in input_dir matching the glob pattern."""
         return glob.glob(os.path.join(input_dir, pattern))

     if self.parameters['wiki']:
         for wiki_path in files_matching('*wiki_articlerevision-prod-analytics.sql'):
             self.obfuscate_wiki_file(wiki_path, output_dir)

     if self.parameters['courseware']:
         for courseware_path in files_matching('*courseware_studentmodule-prod-analytics.sql'):
             self.obfuscate_courseware_file(courseware_path, output_dir)

     if self.parameters['forum']:
         for forum_path in files_matching('*.mongo'):
             self.obfuscate_forum_file(forum_path, output_dir)

     if self.parameters['event']:
         # This is generalized beyond localfs/glob.
         event_task = PathSetTask(src=[input_dir], include=['*-events-*.log.gz'])
         for requirement in event_task.requires():
             self.obfuscate_event_file(requirement.output(), output_dir)
    def __init__(self, *args, **kwargs):
        """Set ``self.load_date`` to the latest partition date of ``internal_reporting_user_activity``.

        The date is the greatest name among the parent directories of the files found under
        ``<warehouse_path>/internal_reporting_user_activity``, parsed as ``dt=YYYY-MM-DD``.

        Raises:
            IndexError: if no file is found, so no latest date can be determined.
            ValueError: from ``strptime`` if the directory name is not of the form ``dt=YYYY-MM-DD``.
        """
        super(LoadInternalReportingUserActivityToWarehouse, self).__init__(*args, **kwargs)

        path = url_path_join(self.warehouse_path, 'internal_reporting_user_activity')
        path_targets = PathSetTask([path]).output()
        # Last path component of each file's parent directory.
        dates = {os.path.dirname(target.path).rsplit('/', 2)[-1] for target in path_targets}
        if not dates:
            # Same exception type the bare sorted(dates)[-1] used to raise, with an actionable message.
            raise IndexError('No data found under {}; cannot determine the latest partition date'.format(path))
        latest_date = max(dates)

        self.load_date = datetime.datetime.strptime(latest_date, "dt=%Y-%m-%d").date()
Example #12
0
    def __init__(self, *args, **kwargs):
        """Set ``self.load_date`` to the latest partition date of ``course_enrollment_summary``.

        The date is the greatest name among the parent directories of the files found under
        ``<warehouse_path>/course_enrollment_summary``, parsed as ``dt=YYYY-MM-DD``.

        Raises:
            IndexError: if no file is found, so no latest date can be determined.
            ValueError: from ``strptime`` if the directory name is not of the form ``dt=YYYY-MM-DD``.
        """
        super(ExternalCourseEnrollmentSummaryPartitionTask, self).__init__(*args, **kwargs)

        # Find the most recent data for the source.
        path = url_path_join(self.warehouse_path, 'course_enrollment_summary')
        path_targets = PathSetTask([path]).output()
        # Last path component of each file's parent directory.
        dates = {os.path.dirname(target.path).rsplit('/', 2)[-1] for target in path_targets}
        if not dates:
            # Same exception type the bare sorted(dates)[-1] used to raise, with an actionable message.
            raise IndexError('No data found under {}; cannot determine the latest partition date'.format(path))
        latest_date = max(dates)
        self.load_date = datetime.datetime.strptime(latest_date, "dt=%Y-%m-%d").date()
Example #13
0
 def obfuscate_directory(self, input_dir, output_dir):
     """Run the enabled obfuscators over the files found in ``input_dir``.

     ``output_dir`` is created first when it is not None. For each of the
     ``wiki``, ``courseware`` and ``forum`` entries of ``self.parameters``
     that is truthy, files are matched with ``glob`` and handed to the
     corresponding ``obfuscate_*_file`` method. When ``event`` is truthy,
     event files are taken from the requirements of a ``PathSetTask``.
     """
     if output_dir is not None:
         create_directory(output_dir)

     # (parameter key, glob pattern, name of the method handling each match),
     # processed in this order.
     glob_based_steps = (
         ('wiki', '*wiki_articlerevision-prod-analytics.sql', 'obfuscate_wiki_file'),
         ('courseware', '*courseware_studentmodule-prod-analytics.sql', 'obfuscate_courseware_file'),
         ('forum', '*.mongo', 'obfuscate_forum_file'),
     )
     for parameter_key, pattern, handler_name in glob_based_steps:
         if not self.parameters[parameter_key]:
             continue
         for filepath in glob.glob(os.path.join(input_dir, pattern)):
             # Looked up per match, so a handler is only touched when needed.
             getattr(self, handler_name)(filepath, output_dir)

     if not self.parameters['event']:
         return
     # This is generalized beyond localfs/glob.
     task = PathSetTask(src=[input_dir], include=['*-events-*.log.gz'])
     for requirement in task.requires():
         self.obfuscate_event_file(requirement.output(), output_dir)
 def __init__(self, *args, **kwargs):
     """Optionally resolve ``self.latest_date`` from the table's newest partition.

     Only when ``self.load_from_latest_partition`` is truthy: the greatest
     name among the parent directories of the files found under
     ``<warehouse_path>/<table_name>`` is parsed as ``dt=YYYY-MM-DD`` and
     stored in ``self.latest_date``.

     Raises:
         IndexError: if loading from the latest partition is enabled but no
             file is found, so no latest date can be determined.
         ValueError: from ``strptime`` if the directory name is not of the
             form ``dt=YYYY-MM-DD``.
     """
     super(LoadHiveTableToVertica, self).__init__(*args, **kwargs)
     # Find the most recent data for the source if load from latest partition is enabled.
     if self.load_from_latest_partition:
         path = url_path_join(self.warehouse_path, self.table_name)
         path_targets = PathSetTask([path]).output()
         # Last path component of each file's parent directory.
         dates = {
             os.path.dirname(target.path).rsplit('/', 2)[-1]
             for target in path_targets
         }
         if not dates:
             # Same exception type the bare sorted(dates)[-1] used to raise,
             # with an actionable message.
             raise IndexError(
                 'No data found under {}; cannot determine the latest '
                 'partition date'.format(path))
         latest_date = max(dates)
         self.latest_date = datetime.datetime.strptime(
             latest_date, "dt=%Y-%m-%d").date()
         log.debug('Loading data for table %s from partition %s',
                   self.table_name, self.latest_date)
Example #15
0
    def __init__(self, *args, **kwargs):
        """Derive ``selection_interval`` and ``run_interval`` from existing PayPal output.

        Looks under ``<warehouse_path>/payments`` for files matching ``*paypal.tsv``, takes the greatest name
        among their parent directories, parses it as ``dt=YYYY-MM-DD`` and uses the following day as the run
        date. ``selection_interval`` spans ``interval_start`` to the run date; ``run_interval`` spans the run
        date to ``interval_end``.

        Raises:
            IndexError: if no matching file exists, so no latest date can be determined.
            ValueError: from ``strptime`` if the directory name is not of the form ``dt=YYYY-MM-DD``.
        """
        super(PaypalTransactionsIntervalTask, self).__init__(*args, **kwargs)
        # Provide default for output_root at this level.
        if self.output_root is None:
            self.output_root = self.warehouse_path

        path = url_path_join(self.warehouse_path, 'payments')
        path_targets = PathSetTask([path], include=['*paypal.tsv']).output()
        # Last path component of each matching file's parent directory.
        dates = {os.path.dirname(target.path).rsplit('/', 2)[-1] for target in path_targets}
        if not dates:
            # Same exception type the bare sorted(dates)[-1] used to raise, with an actionable message.
            raise IndexError(
                'No files matching *paypal.tsv found under {}; cannot determine the latest completed date'.format(path)
            )
        latest_date = max(dates)

        latest_completion_date = datetime.datetime.strptime(latest_date, "dt=%Y-%m-%d").date()
        run_date = latest_completion_date + datetime.timedelta(days=1)

        self.selection_interval = date_interval.Custom(self.interval_start, run_date)
        self.run_interval = date_interval.Custom(run_date, self.interval_end)
Example #16
0
 def requires(self):
     """Return a ``PathSetTask`` built from this task's ``src``, ``include`` and ``manifest``."""
     path_set_args = (self.src, self.include, self.manifest)
     return PathSetTask(*path_set_args)
 def requires(self):
     """Return a ``PathSetTask`` over ``self.dump_root``."""
     # NOTE(review): other call sites pass PathSetTask a list of roots;
     # confirm that dump_root is already list-like here.
     dump_files_task = PathSetTask(self.dump_root)
     return dump_files_task
Example #18
0
 def requires(self):
     """Return a ``PathSetTask`` matching every file (``*``) under the course's ``events`` path.

     The path is ``<dump_root>/<filename-safe course id>/events``.
     """
     course_dir_name = opaque_key_util.get_filename_safe_course_id(self.course)
     events_root = url_path_join(self.dump_root, course_dir_name, 'events')
     include_patterns = ['*']
     return PathSetTask([events_root], include_patterns)