Code example #1
 def output(self):
     return get_target_from_url(
         url_path_join(
             self.hive_partition_path('course_list_raw', partition_value=self.partition_value),
             'course_list.json'
         )
     )
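Every example that follows shares one pattern: assemble a URL string (usually with url_path_join) and hand it to get_target_from_url, which returns a Luigi target whose concrete type is chosen from the URL scheme (local path, hdfs://, s3://; compare examples #11 and #32). A minimal sketch of that dispatch idea, covering only the local case; everything here is a hypothetical stand-in, not the library's implementation:

    import luigi

    def get_target_from_url_sketch(url):
        # Hypothetical: pick a Target class based on the URL scheme. The real
        # function also maps s3:// and hdfs:// URLs to remote target types.
        if '://' in url and not url.startswith('file://'):
            raise NotImplementedError('only local paths in this sketch')
        return luigi.LocalTarget(url.replace('file://', '', 1))

Whatever the backing store, the returned object exposes the same Target API: exists(), open('r') / open('w'), and remove().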
Code example #2
 def output(self):
     return get_target_from_url(url_path_join(
         self.output_root,
         'transaction',
         'dt=' + self.import_date.isoformat(),  # pylint: disable=no-member
         'transactions.csv'
     ))
Code example #3
    def credentials(self):
        """The credentials for connecting to the database, read from a URL."""
        if not hasattr(self, '_credentials'):
            with get_target_from_url(self.vertica_creds_url).open('r') as credentials_file:
                self._credentials = json.load(credentials_file)

        return self._credentials
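Example #3 is a lazy-caching idiom: the credentials file is fetched and parsed only on first access, then memoized on the instance. The same idiom in isolation, sketched with a plain local file (hypothetical path) standing in for vertica_creds_url:

    import json

    class CredentialsMixin(object):
        creds_path = 'credentials.json'  # hypothetical local path

        @property
        def credentials(self):
            """Parse credentials on first access; reuse the cached dict afterwards."""
            if not hasattr(self, '_credentials'):
                with open(self.creds_path) as credentials_file:
                    self._credentials = json.load(credentials_file)
            return self._credentials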
Code example #4
 def _get_required_tasks(self):
     """Internal method to actually calculate required tasks once."""
     start_date = self.interval.date_a  # pylint: disable=no-member
     end_date = self.interval.date_b  # pylint: disable=no-member
     table_name = "student_courseenrollment"
     source_root = url_path_join(self.warehouse_path, table_name)
     today_datestring = datetime.datetime.utcnow().strftime('%Y-%m-%d')
     current_date = start_date
     while current_date <= end_date:
         datestring = current_date.strftime('%Y-%m-%d')
         current_date += datetime.timedelta(days=1)
         src_datestring = "dt={}".format(datestring)
         source_dir = url_path_join(source_root, src_datestring)
         target = get_target_from_url(source_dir)
         output_dir = url_path_join(self.output_root, datestring)
         if datestring == today_datestring:
             yield CreateEnrollmentValidationEventsForTodayTask(
                 source_dir=source_dir,
                 output_root=output_dir,
                 n_reduce_tasks=self.n_reduce_tasks,
                 credentials=self.credentials,
             )
         elif target.exists():
             yield CreateEnrollmentValidationEventsTask(
                 source_dir=source_dir,
                 output_root=output_dir,
                 n_reduce_tasks=self.n_reduce_tasks,
             )
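Example #4 walks the interval one day at a time, derives a Hive-style dt= partition directory per day, and yields a task only when the source data for that day exists (with a dedicated task variant for today's date). The date loop on its own, standard library only:

    import datetime

    def daily_partitions(start_date, end_date):
        """Yield dt=YYYY-mm-dd partition names for each day in [start, end]."""
        current_date = start_date
        while current_date <= end_date:
            yield 'dt={}'.format(current_date.strftime('%Y-%m-%d'))
            current_date += datetime.timedelta(days=1)

    # list(daily_partitions(datetime.date(2016, 1, 1), datetime.date(2016, 1, 3)))
    # == ['dt=2016-01-01', 'dt=2016-01-02', 'dt=2016-01-03']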
Code example #5
 def output(self):
     if len(self.input()['data']) == 0:
         raise IOError("Course File '{filename}' not found for course '{course}'".format(
             filename=self.file_pattern, course=self.course
         ))
     output_filename = os.path.basename(self.input()['data'][0].path)
     return get_target_from_url(url_path_join(self.output_directory, output_filename))
Code example #6
 def output(self):
     return get_target_from_url(
         url_path_join(
             self.hive_partition_path('program_course_order', partition_value=self.date),
             '{0}.tsv'.format('program_course_order')
         )
     )
Code example #7
    def multi_output_reducer(self, key, values, output_file):
        """
        Write values to the appropriate file as determined by the key.
        Write to the encrypted file by streaming through gzip, which compresses before encrypting.
        """
        _date_string, org_id = key
        recipients = self.recipients_for_org_id[org_id]
        log.info('Encryption recipients: %s', str(recipients))

        def report_progress(num_bytes):
            """Update hadoop counters as the file is written"""
            self.event_export_counter(counter_title='Bytes Written to Output', incr_value=num_bytes)

        key_file_targets = [get_target_from_url(url_path_join(self.gpg_key_dir, recipient)) for recipient in recipients]
        try:
            with make_encrypted_file(output_file, key_file_targets, progress=report_progress,
                                     hadoop_counter_incr_func=self.event_export_counter) as encrypted_output_file:
                outfile = gzip.GzipFile(mode='wb', fileobj=encrypted_output_file)
                try:
                    for value in values:
                        outfile.write(value.strip())
                        outfile.write('\n')
                        # WARNING: This line ensures that Hadoop knows that our process is not sitting in an infinite
                        # loop.  Do not remove it.
                        self.event_export_counter(counter_title='Raw Bytes Written', incr_value=(len(value) + 1))
                finally:
                    outfile.close()
        except IOError as err:
            log.error("Error encountered while encrypting and gzipping Organization: %s file: %s Exception: %s",
                      org_id, key_file_targets, err)
            # This counter is set when there is an error during the generation of the encryption file for an
            # organization for any reason, including encryption errors related to an expired GPG key.
            self.event_export_counter(counter_title="{} org with Errors".format(org_id), incr_value=1)
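The stream layering in example #7 is exactly what its docstring says: GzipFile writes compressed bytes into whatever file-like object it is handed, so wrapping the encrypted file compresses first and encrypts second. The layering in self-contained form, with an in-memory buffer standing in for the encrypted output file:

    import gzip
    import io

    encrypted_stream = io.BytesIO()  # stand-in for the encrypted output file
    outfile = gzip.GzipFile(mode='wb', fileobj=encrypted_stream)
    try:
        outfile.write(b'line one\n')
        outfile.write(b'line two\n')
    finally:
        outfile.close()
    # encrypted_stream now holds gzip-compressed bytes, ready to be encrypted.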
Code example #8
 def output(self):
     output_root = url_path_join(
         self.warehouse_path,
         self.partition_task.hive_table_task.table,
         self.partition.path_spec + '/'
     )
     return get_target_from_url(output_root, marker=True)
Code example #9
    def run_job(self, job):
        job.init_hadoop()
        job.init_mapper()
        map_output = StringIO.StringIO()
        input_targets = luigi.task.flatten(job.input_hadoop())
        for input_target in input_targets:
            # If the file is a directory, assume that it is Hadoop output,
            # and loop through its contents:
            if os.path.isdir(input_target.path):
                filenames = os.listdir(input_target.path)
                for filename in filenames:
                    url = url_path_join(input_target.path, filename)
                    input_targets.append(get_target_from_url(url.strip()))
                continue

            with input_target.open('r') as input_file:

                # S3 files not yet supported since they don't support tell() and seek()
                if input_target.path.endswith('.gz'):
                    input_file = gzip.GzipFile(fileobj=input_file)
                elif input_target.path.endswith('.manifest'):
                    for url in input_file:
                        input_targets.append(get_target_from_url(url.strip()))
                    continue

                os.environ['map_input_file'] = input_target.path
                try:
                    outputs = job._map_input((line[:-1] for line in input_file))
                    job.internal_writer(outputs, map_output)
                finally:
                    del os.environ['map_input_file']

        map_output.seek(0)

        reduce_input = self.group(map_output)
        try:
            reduce_output = job.output().open('w')
        except Exception:
            reduce_output = StringIO.StringIO()

        try:
            job._run_reducer(reduce_input, reduce_output)
        finally:
            try:
                reduce_output.close()
            except Exception:
                pass
Code example #10
def remove_manifest_target_if_exists(manifest_id):
    """Given an id and configuration, construct a target that can check and remove a manifest file."""
    manifest_file_path = get_manifest_file_path(manifest_id)
    # We don't need the mixin in order to check for existence or to remove the manifest file.
    manifest_target = get_target_from_url(manifest_file_path)
    if manifest_target.exists():
        log.info('Removing existing manifest found at %s', manifest_target.path)
        manifest_target.remove()
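Because the returned target exposes a uniform API, the check-and-remove in example #10 works unchanged whether the manifest lives on local disk, HDFS, or S3. The same idiom against a plain local target (hypothetical path):

    import luigi

    manifest_target = luigi.LocalTarget('/tmp/manifest.txt')
    if manifest_target.exists():
        manifest_target.remove()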
Code example #11
 def requires_hadoop(self):
     # Check first if running locally with Sqoop output.
     target = get_target_from_url(self.source_dir)
     if isinstance(target, luigi.LocalTarget) and os.path.isdir(self.source_dir):
         files = [f for f in os.listdir(self.source_dir) if f.startswith("part")]
         for filename in files:
             yield ExternalURL(url_path_join(self.source_dir, filename))
     else:
         yield ExternalURL(self.source_dir)
Code example #12
    def manifest_file_list(self):
        """Write each individual path to a manifest file and yield the path to that file."""
        manifest_target = get_target_from_url(self.manifest)
        if not manifest_target.exists():
            with manifest_target.open('w') as manifest_file:
                for external_url_task in self.generate_file_list():
                    manifest_file.write(external_url_task.url + '\n')

        yield ExternalURL(self.manifest)
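Example #12 writes the manifest only when it is not already present. Luigi targets make that safe: LocalTarget.open('w'), for instance, writes to a temporary file and moves it into place on close, so a half-written manifest never passes a later exists() check. A local sketch with hypothetical URLs:

    import luigi

    manifest = luigi.LocalTarget('/tmp/example-manifest.txt')
    if not manifest.exists():
        with manifest.open('w') as manifest_file:
            for url in ['s3://bucket/a.log', 's3://bucket/b.log']:
                manifest_file.write(url + '\n')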
Code example #13
File: video.py Project: edx/edx-analytics-pipeline
    def run(self):
        # Remove the marker file.
        self.remove_output_on_overwrite()
        # Also remove actual output files in case of overwrite.
        if self.overwrite:
            for date in self.overwrite_interval:
                url = self.output_path_for_key(date.isoformat())
                target = get_target_from_url(url)
                if target.exists():
                    target.remove()

        super(UserVideoViewingByDateTask, self).run()

        # Make sure an output file exists for each day within the interval.
        for date in self.overwrite_interval:
            url = self.output_path_for_key(date.isoformat())
            target = get_target_from_url(url)
            if not target.exists():
                target.open("w").close()  # touch the file
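The closing loop in example #13 "touches" an empty file for any day that produced no output, so downstream per-day exists() checks still see a complete interval (example #16 below uses the same trick). The touch idiom in isolation, with a hypothetical path:

    import luigi

    placeholder = luigi.LocalTarget('/tmp/2013-01-01.tsv')
    if not placeholder.exists():
        placeholder.open('w').close()  # create an empty file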
Code example #14
def read_config_file(filename):
    """Read a config file from either an external source (S3, HDFS etc) or the "share" directory of this repo."""
    if os.path.basename(filename) != filename:
        target = get_target_from_url(filename)
        with target.open('r') as config_file:
            yield config_file
    else:
        file_path = os.path.join(sys.prefix, 'share', 'edx.analytics.tasks', filename)
        with open(file_path, 'r') as config_file:
            yield config_file
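Note that read_config_file is a generator that yields an already-open file object, so the with blocks remain live while the caller works and close the file afterwards. Hypothetical usage:

    for config_file in read_config_file('default.cfg'):
        print(config_file.read())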
Code example #15
    def output(self):
        """
        Output is set up so that it can be read in as a Hive table with partitions.

        The form is {warehouse_path}/course_catalog/subjects/dt={CCYY-mm-dd}/subjects.tsv.
        """
        date_string = self.date.strftime('%Y-%m-%d')  # pylint: disable=no-member
        partition_path_spec = HivePartition('dt', date_string).path_spec
        url_with_filename = url_path_join(self.warehouse_path, "course_catalog", "subjects",
                                          partition_path_spec, "subjects.tsv")
        return get_target_from_url(url_with_filename)
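HivePartition('dt', date_string).path_spec produces the key=value directory segment named in the docstring. A trivial stand-in (not the pipeline's actual class) showing the expected shape:

    def path_spec(key, value):
        """Hypothetical stand-in for HivePartition(key, value).path_spec."""
        return '{0}={1}'.format(key, value)

    assert path_spec('dt', '2016-01-01') == 'dt=2016-01-01'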
Code example #16
    def run(self):
        self.remove_output_on_overwrite()
        super(LastDailyIpAddressOfUserTask, self).run()

        # This makes sure that an output file exists for each date in the interval
        # as downstream tasks require that they exist (as provided by downstream_input_tasks()).
        for date in self.interval:
            url = self.output_path_for_key(date.isoformat())
            target = get_target_from_url(url)
            if not target.exists():
                target.open("w").close()  # touch the file
Code example #17
    def output(self):
        """
        Output is set up so it can be read in as a Hive table with partitions.

        The form is {output_root}/payments/dt={CCYY-mm-dd}/cybersource_{merchant}.tsv
        """
        date_string = self.run_date.strftime('%Y-%m-%d')  # pylint: disable=no-member
        partition_path_spec = HivePartition('dt', date_string).path_spec
        filename = "cybersource_{}.tsv".format(self.merchant_id)
        url_with_filename = url_path_join(self.output_root, "payments", partition_path_spec, filename)
        return get_target_from_url(url_with_filename)
Code example #18
 def output(self):
     """Output is in the form {output_root}/cybersource/{CCYY-mm}/cybersource_{merchant}_{CCYYmmdd}.csv"""
     month_year_string = self.run_date.strftime('%Y-%m')  # pylint: disable=no-member
     date_string = self.run_date.strftime('%Y%m%d')  # pylint: disable=no-member
     filename = "cybersource_{merchant_id}_{date_string}.{report_format}".format(
         merchant_id=self.merchant_id,
         date_string=date_string,
         report_format=self.REPORT_FORMAT,
     )
     url_with_filename = url_path_join(self.output_root, "cybersource", month_year_string, filename)
     return get_target_from_url(url_with_filename)
Code example #19
 def __init__(self, *args, **kwargs):
     super(MultiOutputMapReduceJobTask, self).__init__(*args, **kwargs)
     if self.delete_output_root:
         # If requested, make sure that the output directory is empty.  This gets rid
         # of any generated data files from a previous run (that might not get
         # regenerated in this run).  It also makes sure that the marker file
         # (i.e. the output target) will be removed, so that external functionality
         # will know that the generation of data files is not complete.
         output_dir_target = get_target_from_url(self.output_root)
         for target in [self.output(), output_dir_target]:
             if target.exists():
                 target.remove()
Code example #20
    def run(self):
        # Remove the marker file.
        self.remove_output_on_overwrite()
        # Also remove actual output files in case of overwrite.
        if self.overwrite:
            for date in self.interval:
                url = self.output_path_for_key(date.isoformat())
                target = get_target_from_url(url)
                if target.exists():
                    target.remove()

        return super(UserActivityTask, self).run()
Code example #21
    def reducer(self, key, values):
        """
        Write out values from each key into different output files.
        """
        output_path = self.output_path_for_key(key)
        if output_path:
            log.info('Writing output file: %s', output_path)
            output_file_target = get_target_from_url(output_path)
            with output_file_target.open('w') as output_file:
                self.multi_output_reducer(key, values, output_file)

        # Luigi requires the reducer to return an iterable
        return iter(tuple())
Code example #22
    def run(self):
        recipients = set(self.recipient)
        if self.gpg_master_key is not None:
            recipients.add(self.gpg_master_key)
        key_file_targets = [
            get_target_from_url(url_path_join(self.gpg_key_dir, recipient))
            for recipient in recipients
        ]

        path_task = PathSetTask([self.course_files_url], ['*.*'])
        with make_temp_directory(prefix='obfuscate-archive.', dir=self.temporary_dir) as tmp_directory:
            for target in path_task.output():
                with target.open('r') as input_file:
                    # Get the path without the URL scheme.
                    course_files_path = urlparse.urlparse(self.course_files_url).path
                    # Calculate the target's relative path to course_files_path by taking
                    # the substring of target.path that follows course_files_path.
                    # Needed because target.path includes the URL scheme for s3 targets
                    # but not for hdfs targets.
                    # Examples:
                    # target.path: /pipeline/output/edX_Demo_Course/events/edX_Demo_Course-events-2015-08-30.log.gz
                    # relative_path: events/edX_Demo_Course-events-2015-08-30.log.gz
                    # target.path: s3://some_bucket/output/edX_Demo_Course/state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                    # relative_path: state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                    r_index = target.path.find(course_files_path) + len(course_files_path)
                    relative_path = target.path[r_index:].lstrip('/')

                    local_file_path = os.path.join(tmp_directory, relative_path)
                    try:
                        os.makedirs(os.path.dirname(local_file_path))
                    except OSError as exc:
                        if exc.errno != errno.EEXIST:
                            raise
                    with open(local_file_path, 'w') as temp_file:
                        copy_file_to_file(input_file, temp_file)

            def report_encrypt_progress(num_bytes):
                """Log encryption progress."""
                log.info('Encrypted %d bytes', num_bytes)

            with self.output().open('w') as output_file:
                with make_encrypted_file(
                    output_file, key_file_targets, progress=report_encrypt_progress, dir=self.temporary_dir
                ) as encrypted_output_file:
                    with tarfile.open(mode='w:gz', fileobj=encrypted_output_file) as output_archive_file:
                        output_archive_file.add(tmp_directory, arcname='')
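The final lines of example #22 nest three layers: tarfile packs the temporary directory, mode 'w:gz' gzips the archive, and the bytes land in the encrypted output file. The same streaming, sketched with a BytesIO standing in for the encrypted file:

    import io
    import tarfile
    import tempfile

    tmp_directory = tempfile.mkdtemp(prefix='archive-demo.')
    stream = io.BytesIO()  # stand-in for the encrypted output file
    with tarfile.open(mode='w:gz', fileobj=stream) as output_archive_file:
        output_archive_file.add(tmp_directory, arcname='')
    # stream now holds a .tar.gz payload, ready to be encrypted.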
Code example #23
 def output(self):
     return get_target_from_url(url_path_join(self.output_root, 'temp', 'CountCourseEnrollments/'))
Code example #24
 def output(self):
     output_name = u'answer_distribution_per_course_{name}/'.format(name=self.name)
     return get_target_from_url(url_path_join(self.dest, output_name))
Code example #25
 def output(self):
     return get_target_from_url(self.partition_location.rstrip('/') + '/')
Code example #26
 def get_table_metadata_target(self):
     """Returns target for metadata file from the given dump."""
     # Find the .metadata file in the source directory.
     metadata_path = url_path_join(self.s3_location_for_table,
                                   METADATA_FILENAME)
     return get_target_from_url(metadata_path)
Code example #27
 def output(self):
     return get_target_from_url(
         url_path_join(
             self.hive_partition_path(self.table_name, partition_value=self.date), '{0}.tsv'.format(self.table_name)
         )
     )
Code example #28
 def output(self):
     return get_target_from_url(
         url_path_join(
             self.destination,
             "incremental_users_and_enrollments_{0}.csv".format(self.name)))
Code example #29
 def output(self):
     marker_url = url_path_join(self.marker, str(hash(self)))
     return get_target_from_url(marker_url)
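Example #29 (and the identical #47 below) names the marker file after hash(self); Luigi hashes a task by its task id, which is derived from the class name and significant parameters, so re-running with the same parameters resolves to the same marker. A sketch of the marker-as-output pattern with hypothetical paths:

    import luigi

    class TouchMarker(luigi.Task):
        marker = luigi.Parameter(default='/tmp/markers')

        def output(self):
            # The marker file's existence is the only record of completion.
            return luigi.LocalTarget('{0}/{1}'.format(self.marker, hash(self)))

        def run(self):
            with self.output().open('w') as marker_file:
                marker_file.write('done')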
Code example #30
 def output(self):
     filename_safe_course_id = opaque_key_util.get_filename_safe_course_id(
         self.course)
     return get_target_from_url(
         url_path_join(self.obfuscated_output_root, self.format_version,
                       filename_safe_course_id, 'metadata_file.json'))
Code example #31
 def complete(self):
     """
     The task is complete if the output_root/_SUCCESS file is present.
     """
     return get_target_from_url(url_path_join(self.output_root, '_SUCCESS')).exists()
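_SUCCESS is the empty marker Hadoop writes into a job's output directory on successful completion, so its presence is a cheap completeness test (examples #40, #54, and #57 lean on the same convention). In isolation, with a local directory standing in for output_root:

    import luigi

    def job_finished(output_root):
        """Sketch: treat the Hadoop _SUCCESS marker as a completion flag."""
        return luigi.LocalTarget(output_root + '/_SUCCESS').exists()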
Code example #32
def get_target_for_local_server(url):
    # The machine running the acceptance test suite may not have hadoop installed on it, so convert S3 paths (which
    # are normally handled by the hadoop DFS client) to S3+https paths, which are handled by the python native S3
    # client.
    return get_target_from_url(url.replace('s3://', 's3+https://'))
Code example #33
 def upload_file_with_content(self, remote_file_path, content):
     log.debug('Writing %s from string', remote_file_path)
     with get_target_from_url(remote_file_path).open('w') as remote_file:
         remote_file.write(content)
Code example #34
 def upload_file(self, local_file_name, remote_file_path):
     log.debug('Uploading %s to %s', local_file_name, remote_file_path)
     with get_target_from_url(remote_file_path).open('w') as remote_file:
         with open(local_file_name, 'r') as local_file:
             shutil.copyfileobj(local_file, remote_file)
Code example #35
 def output(self):
     return get_target_from_url(self.table_location)
Code example #36
 def output(self):
     return get_target_from_url(
         url_path_join(
             self.hive_partition_path('programs_raw',
                                      partition_value=self.date),
             'programs.json'))
Code example #37
File: sqoop.py Project: edx/edx-analytics-pipeline
 def metadata_output(self):
     """Return target to which metadata about the task execution can be written."""
     return get_target_from_url(url_path_join(self.destination, METADATA_FILENAME))
Code example #38
 def output(self):
     output_url = self.hive_partition_path('active_users_per_week', self.interval.date_b)
     return get_target_from_url(output_url)
Code example #39
 def output(self):
     output_name = u'problem_check_events_{name}/'.format(name=self.name)
     return get_target_from_url(url_path_join(self.dest, output_name))
Code example #40
 def complete(self):
     if self.overwrite and not self.attempted_removal:
         return False
     else:
         return get_target_from_url(
             url_path_join(self.output_url(), '_SUCCESS')).exists()
Code example #41
 def output(self):
     return get_target_from_url(
         url_path_join(
             self.hive_partition_path('course_catalog_raw',
                                      partition_value=self.date),
             'course_catalog.json'))
Code example #42
 def output(self):
     return get_target_from_url(url_path_join(self.output_root, 'temp/CountProgramCohortEnrollments/'))
Code example #43
 def output(self):
     return get_target_from_url(
         url_path_join(
             self.destination,
             "daily_registrations_enrollments_{0}.csv".format(self.name)))
Code example #44
 def output(self):
     return get_target_from_url(
         url_path_join(self.output_root,
                       self.filename_safe_course_id + '.tar.gz.gpg'))
Code example #45
 def output(self):
     url_with_filename = url_path_join(self.destination, self.filename)
     return get_target_from_url(url_with_filename)
Code example #46
 def output(self):
     return get_target_from_url(self.s3_location_for_table)
Code example #47
 def output(self):
     marker_url = url_path_join(self.marker, str(hash(self)))
     return get_target_from_url(marker_url)
Code example #48
 def output(self):
     url_with_filename = url_path_join(self.destination, self.filename)
     return get_target_from_url(url_with_filename)
Code example #49
 def output(self):
     return get_target_from_url(
         url_path_join(
             self.hive_partition_path('discovery_api_raw',
                                      partition_value=self.date),
             'courses.json'))
Code example #50
 def output(self):  # pragma: no cover
     output_root = url_path_join(self.warehouse_path,
                                 self.partition_task.hive_table_task.table,
                                 self.partition.path_spec + '/')
     return get_target_from_url(output_root, marker=True)
Code example #51
 def output(self):
     return get_target_from_url(self.output_root)
Code example #52
 def output(self):
     return get_target_from_url(
         url_path_join(
             self.hive_partition_path('program_course_order',
                                      partition_value=self.date),
             '{0}.tsv'.format('program_course_order')))
Code example #53
 def output(self):
     """Expose the data location target as the output."""
     return get_target_from_url(self.output_root)
Code example #54
 def complete(self):
     """
     The task is complete if the output_root/_SUCCESS file is present.
     """
     return get_target_from_url(url_path_join(self.output_root,
                                              '_SUCCESS')).exists()
Code example #55
File: sqoop.py Project: edx/edx-analytics-pipeline
 def output(self):
     return get_target_from_url(self.destination + '/')
Code example #56
 def complete(self):
     """
     The task is complete if the output_root is present.
     """
     return get_target_from_url(self.output_root).exists()
Code example #57
File: sqoop.py Project: edx/edx-analytics-pipeline
 def marker_output(self):
     """Return target for _SUCCESS marker indicating the task was successfully completed."""
     return get_target_from_url(url_path_join(self.destination, "_SUCCESS"))
Code example #58
 def input_hadoop(self):
     # NOTE: The hadoop job needs the raw data as input, not the hive partition
     # metadata, which is the output of the partition task.
     return get_target_from_url(self.requires().output_root)
Code example #59
 def output(self):
     return get_target_from_url(self.output_root)
Code example #60
 def output(self):
     """
     Use the marker location as an indicator of task "completeness".
     """
     return get_target_from_url(self.marker)