Code example #1
 def output(self):
     return get_target_from_url(
         url_path_join(
             self.user_country_output,
             'dt={0}/'.format(self.interval.date_b.strftime('%Y-%m-%d'))  # pylint: disable=no-member
         )
     )
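A note on the pattern above, which recurs throughout these examples: url_path_join joins URL segments with '/', and get_target_from_url returns a Luigi target appropriate to the URL's scheme (local file, HDFS, or S3). A minimal sketch of the composition, with hypothetical values standing in for the Luigi parameters:

    # Hypothetical values; in the task above they come from Luigi parameters.
    user_country_output = 's3://bucket/warehouse/user_country'
    date_string = '2016-02-01'   # self.interval.date_b.strftime('%Y-%m-%d')
    url = url_path_join(user_country_output, 'dt={0}/'.format(date_string))
    # url == 's3://bucket/warehouse/user_country/dt=2016-02-01/'
    target = get_target_from_url(url)   # an S3 target for this scheme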
Code example #2
 def _get_required_tasks(self):
     """Internal method to actually calculate required tasks once."""
     start_date = self.interval.date_a
     end_date = self.interval.date_b
     table_name = "student_courseenrollment"
     source_root = url_path_join(self.warehouse_path, table_name)
     today_datestring = datetime.datetime.utcnow().strftime('%Y-%m-%d')
     current_date = start_date
     while current_date <= end_date:
         datestring = current_date.strftime('%Y-%m-%d')
         current_date += datetime.timedelta(days=1)
         src_datestring = "dt={}".format(datestring)
         source_dir = url_path_join(source_root, src_datestring)
         target = get_target_from_url(source_dir)
         output_dir = url_path_join(self.output_root, datestring)
         if datestring == today_datestring:
             yield CreateEnrollmentValidationEventsForTodayTask(
                 source_dir=source_dir,
                 output_root=output_dir,
                 n_reduce_tasks=self.n_reduce_tasks,
                 credentials=self.credentials,
             )
         elif target.exists():
             yield CreateEnrollmentValidationEventsTask(
                 source_dir=source_dir,
                 output_root=output_dir,
                 n_reduce_tasks=self.n_reduce_tasks,
             )
Code example #3
 def output(self):
     return get_target_from_url(
         url_path_join(
             self.output_root,
             'count-user-activity-per-interval-{interval}.tsv/'.format(interval=self.interval),
         )
     )
Code example #4
    def validate_obfuscation(self):
        """Validates obfuscation workflow."""
        output_target = PathSetTask([self.test_out], ['*.tar.gz.gpg']).output()[0]
        output_filename = os.path.basename(output_target.path)
        output_filepath = os.path.join(self.temporary_dir, output_filename)

        if output_target.path.startswith('s3://'):
            output_target = get_target_from_url(output_target.path.replace('s3://', 's3+https://'))

        with output_target.open('r') as input_file:
            with open(output_filepath, 'w') as output_file:
                copy_file_to_file(input_file, output_file)

        decrypted_filepath = output_filepath[:-len('.gpg')]
        fs.decrypt_file(output_filepath, decrypted_filepath, 'insecure_secret.key')

        with tarfile.open(decrypted_filepath, 'r:gz') as tfile:
            tfile.extractall(self.temporary_dir)

        # Validate package metadata info.
        metadata_filepath = os.path.join(self.temporary_dir, 'metadata_file.json')
        with open(metadata_filepath) as metadata_file:
            metadata_info = json.load(metadata_file)
        self.assertItemsEqual(metadata_info['format_version'], self.FORMAT_VERSION)
        self.assertItemsEqual(metadata_info['pipeline_version'], self.PIPELINE_VERSION)

        self.validate_data_obfuscation()
        self.validate_events_obfuscation()
Code example #5
 def output(self):
     if len(self.input()['data']) == 0:
         raise IOError("Course File '{filename}' not found for course '{course}'".format(
             filename=self.file_pattern, course=self.course
         ))
     output_filename = os.path.basename(self.input()['data'][0].path)
     return get_target_from_url(url_path_join(self.output_directory, output_filename))
Code example #6
 def output(self):
     return get_target_from_url(url_path_join(
         self.output_root,
         'transaction',
         'dt=' + self.import_date.isoformat(),  # pylint: disable=no-member
         'transactions.csv'
     ))
Code example #7
    def credentials(self):
        """The credentials for connecting to the database, read from a URL."""
        if not hasattr(self, '_credentials'):
            with get_target_from_url(self.vertica_creds_url).open('r') as credentials_file:
                self._credentials = json.load(credentials_file)

        return self._credentials
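The hasattr guard above is a lazy-load idiom: the credentials file is fetched and parsed at most once per task instance, and later accesses return the cached dict. The core of the pattern, pulled out as a standalone sketch (the URL is illustrative):

    import json

    def load_json_config(url):
        """Fetch and parse a JSON file from any URL scheme get_target_from_url supports."""
        with get_target_from_url(url).open('r') as config_file:
            return json.load(config_file)

    creds = load_json_config('s3://bucket/config/vertica_creds.json')  # hypothetical path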
Code example #8
 def output(self):
     return get_target_from_url(
         url_path_join(
             self.destination,
             "incremental_users_and_enrollments_{0}.csv".format(self.name)
         )
     )
Code example #9
 def output(self):
     return get_target_from_url(
         url_path_join(
             self.destination,
             'total_users_and_enrollments_{0}-{1}.csv'.format(self.start_date, self.date)
         )
     )
Code example #10
    def multi_output_reducer(self, key, values, output_file):
        """
        Write values to the appropriate file as determined by the key.
        Write to the encrypted file by streaming through gzip, which compresses before encrypting.
        """
        _date_string, org_id = key
        recipients = self.recipients_for_org_id[org_id]
        log.info('Encryption recipients: %s', str(recipients))

        def report_progress(num_bytes):
            """Update hadoop counters as the file is written"""
            self.incr_counter('Event Export', 'Bytes Written to Output', num_bytes)

        key_file_targets = [get_target_from_url(url_path_join(self.gpg_key_dir, recipient)) for recipient in recipients]
        with make_encrypted_file(output_file, key_file_targets, progress=report_progress) as encrypted_output_file:
            outfile = gzip.GzipFile(mode='wb', fileobj=encrypted_output_file)
            try:
                for value in values:
                    outfile.write(value.strip())
                    outfile.write('\n')
                    # WARNING: This line ensures that Hadoop knows that our process is not sitting in an infinite loop.
                    # Do not remove it.
                    self.incr_counter('Event Export', 'Raw Bytes Written', len(value) + 1)
            finally:
                outfile.close()
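Note the layering above: GzipFile writes compressed bytes into the encrypted stream, so values are compressed before they are encrypted; the reverse order would gain nothing, since ciphertext is effectively incompressible. The explicit close() in the finally block also matters, because it flushes the gzip trailer before the encrypted stream is finalized. The same layering, sketched with an ordinary file standing in for encrypted_output_file:

    import gzip

    with open('events.log.gz', 'wb') as raw_stream:   # stands in for encrypted_output_file
        outfile = gzip.GzipFile(mode='wb', fileobj=raw_stream)
        try:
            for value in [b'first event', b'second event']:
                outfile.write(value.strip())
                outfile.write(b'\n')
        finally:
            outfile.close()   # flush the gzip trailer before the outer stream closes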
Code example #11
File: mapreduce.py  Project: Zarana-Parekh/analytics
    def run_job(self, job):
        job.init_hadoop()
        job.init_mapper()
        map_output = StringIO.StringIO()
        input_targets = luigi.task.flatten(job.input_hadoop())
        for input_target in input_targets:
            # if file is a directory, then assume that it's Hadoop output,
            # and actually loop through its contents:
            if os.path.isdir(input_target.path):
                filenames = os.listdir(input_target.path)
                for filename in filenames:
                    url = url_path_join(input_target.path, filename)
                    input_targets.append(get_target_from_url(url.strip()))
                continue

            with input_target.open('r') as input_file:

                # S3 files not yet supported since they don't support tell() and seek()
                if input_target.path.endswith('.gz'):
                    input_file = gzip.GzipFile(fileobj=input_file)
                elif input_target.path.endswith('.manifest'):
                    for url in input_file:
                        input_targets.append(get_target_from_url(url.strip()))
                    continue

                os.environ['map_input_file'] = input_target.path
                try:
                    outputs = job._map_input((line[:-1] for line in input_file))
                    job.internal_writer(outputs, map_output)
                finally:
                    del os.environ['map_input_file']

        map_output.seek(0)

        reduce_input = self.group(map_output)
        try:
            reduce_output = job.output().open('w')
        except Exception:
            reduce_output = StringIO.StringIO()

        try:
            job._run_reducer(reduce_input, reduce_output)
        finally:
            try:
                reduce_output.close()
            except Exception:
                pass
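Two details in this test-harness runner are easy to miss. First, both the directory branch and the .manifest branch append to input_targets while it is being iterated; a Python for loop over a list advances by index, so entries appended mid-loop are still visited on later iterations. Second, reduce_output falls back to an in-memory buffer when the job's output target cannot be opened for writing. A minimal demonstration of the first point:

    items = ['batch.manifest']
    seen = []
    for item in items:
        if item.endswith('.manifest'):
            items.extend(['a.log', 'b.log'])   # expanded entries are visited later
            continue
        seen.append(item)
    # seen == ['a.log', 'b.log']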
Code example #12
    def output(self):
        if len(self.input()) == 0:
            raise IOError("Course File '{filename}' not found for course '{course}'".format(
                filename=self.file_pattern, course=self.course
            ))

        # TODO: should we change the filename to indicate that it has been de-identified?
        output_filename = os.path.basename(self.input()[0].path)
        return get_target_from_url(url_path_join(self.output_directory, output_filename))
Code example #13
 def reset_external_state(self):
     root_target = get_target_from_url(get_jenkins_safe_url(self.test_root))
     if root_target.exists():
         root_target.remove()
     self.import_db.reset()
     self.export_db.reset()
     self.otto_db.reset()
     self.hive.reset()
     self.vertica.reset()
Code example #14
 def requires_hadoop(self):
     # Check first if running locally with Sqoop output.
     target = get_target_from_url(self.source_dir)
     if isinstance(target, luigi.LocalTarget) and os.path.isdir(self.source_dir):
         files = [f for f in os.listdir(self.source_dir) if f.startswith("part")]
         for filename in files:
             yield ExternalURL(url_path_join(self.source_dir, filename))
     else:
         yield ExternalURL(self.source_dir)
Code example #15
    def manifest_file_list(self):
        """Write each individual path to a manifest file and yield the path to that file."""
        manifest_target = get_target_from_url(self.manifest)
        if not manifest_target.exists():
            with manifest_target.open('w') as manifest_file:
                for external_url_task in self.generate_file_list():
                    manifest_file.write(external_url_task.url + '\n')

        yield ExternalURL(self.manifest)
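The manifest written here is just a newline-delimited list of URLs, which matches the .manifest branch in the run_job example above: consumers re-expand each line into a target. A hedged sketch of that consumption (manifest_url is illustrative):

    with get_target_from_url(manifest_url).open('r') as manifest_file:
        targets = [get_target_from_url(line.strip())
                   for line in manifest_file if line.strip()]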
Code example #16
def when_geolocation_data_available(function):
    config = get_test_config()
    geolocation_data = config.get('geolocation_data')
    geolocation_data_available = bool(geolocation_data)
    if geolocation_data_available:
        geolocation_data_available = get_target_from_url(get_jenkins_safe_url(geolocation_data)).exists()
    return unittest.skipIf(
        not geolocation_data_available, 'Geolocation data is not available'
    )(function)
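Because when_geolocation_data_available returns a standard unittest.skipIf decorator applied to the function, it can be stacked on any test method that needs the geolocation fixture. A hypothetical usage sketch:

    class LastCountryAcceptanceTest(unittest.TestCase):   # hypothetical test class

        @when_geolocation_data_available
        def test_user_location(self):
            pass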
Code example #17
def read_config_file(filename):
    """Read a config file from either an external source (S3, HDFS etc) or the "share" directory of this repo."""
    if os.path.basename(filename) != filename:
        target = get_target_from_url(filename)
        with target.open('r') as config_file:
            yield config_file
    else:
        file_path = os.path.join(sys.prefix, 'share', 'edx.analytics.tasks', filename)
        with open(file_path, 'r') as config_file:
            yield config_file
Code example #18
    def output(self):
        """
        Output is set up so that it can be read as a Hive table with partitions.

        The form is {warehouse_path}/course_structure/dt={CCYY-mm-dd}/courses.tsv.
        """
        date_string = self.run_date.strftime("%Y-%m-%d")  # pylint: disable=no-member
        partition_path_spec = HivePartition("dt", date_string).path_spec
        url_with_filename = url_path_join(self.warehouse_path, "course_structure", partition_path_spec, "courses.tsv")
        return get_target_from_url(url_with_filename)
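Judging by the docstring, HivePartition('dt', date_string).path_spec renders as 'dt=<date>', so the pieces join into the documented layout. An illustrative trace with assumed values:

    # Assuming HivePartition('dt', '2016-02-01').path_spec == 'dt=2016-02-01',
    # per the docstring above:
    url_path_join('s3://bucket/warehouse', 'course_structure',
                  'dt=2016-02-01', 'courses.tsv')
    # -> 's3://bucket/warehouse/course_structure/dt=2016-02-01/courses.tsv'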
Code example #19
    def run(self):
        self.remove_output_on_overwrite()
        super(CourseEnrollmentEventsTask, self).run()

        # This makes sure that an output file exists for each date in the interval,
        # as downstream tasks require that they exist.
        for date in self.interval:
            url = self.output_path_for_key(date.isoformat())
            target = get_target_from_url(url)
            if not target.exists():
                target.open("w").close()  # touch the file
Code example #20
 def output(self):
     """Output is in the form {output_root}/cybersource/{CCYY-mm}/cybersource_{merchant}_{CCYYmmdd}.csv"""
     month_year_string = self.run_date.strftime('%Y-%m')  # pylint: disable=no-member
     date_string = self.run_date.strftime('%Y%m%d')  # pylint: disable=no-member
     filename = "cybersource_{merchant_id}_{date_string}.{report_format}".format(
         merchant_id=self.merchant_id,
         date_string=date_string,
         report_format=self.REPORT_FORMAT,
     )
     url_with_filename = url_path_join(self.output_root, "cybersource", month_year_string, filename)
     return get_target_from_url(url_with_filename)
Code example #21
    def run(self):
        self.remove_output_on_overwrite()
        super(LastDailyIpAddressOfUserTask, self).run()

        # This makes sure that an output file exists for each date in the interval,
        # as downstream tasks require that they exist (as provided by downstream_input_tasks()).
        for date in self.interval:
            url = self.output_path_for_key(date.isoformat())
            target = get_target_from_url(url)
            if not target.exists():
                target.open("w").close()  # touch the file
Code example #22
    def output(self):
        """
        Output is set up so it can be read in as a Hive table with partitions.

        The form is {output_root}/payments/dt={CCYY-mm-dd}/cybersource_{merchant}.tsv
        """
        date_string = self.run_date.strftime('%Y-%m-%d')  # pylint: disable=no-member
        partition_path_spec = HivePartition('dt', date_string).path_spec
        filename = "cybersource_{}.tsv".format(self.merchant_id)
        url_with_filename = url_path_join(self.output_root, "payments", partition_path_spec, filename)
        return get_target_from_url(url_with_filename)
Code example #23
    def output(self):
        """
        Output is set up so that it can be read as a Hive table with partitions.

        The form is {warehouse_path}/course_catalog_api/subjects/dt={CCYY-mm-dd}/subjects.tsv.
        """
        date_string = self.date.strftime('%Y-%m-%d')  # pylint: disable=no-member
        partition_path_spec = HivePartition('dt', date_string).path_spec
        url_with_filename = url_path_join(self.warehouse_path, "course_catalog", "subjects",
                                          partition_path_spec, "subjects.tsv")
        return get_target_from_url(url_with_filename)
Code example #24
 def output(self):
     config = configuration.get_config()
     base_url = config.get(CONFIG_SECTION, 'path')
     target = get_target_from_url(url_path_join(base_url, str(hash(self))) + '.manifest')
     lib_jar = config.get(CONFIG_SECTION, 'lib_jar', None)
     if lib_jar:
         target.lib_jar = [lib_jar]
     input_format = config.get(CONFIG_SECTION, 'input_format', None)
     if input_format:
         target.input_format = input_format
     return target
Code example #25
    def reducer(self, key, values):
        """
        Write out values from each key into different output files.
        """
        output_path = self.output_path_for_key(key)
        if output_path:
            output_file_target = get_target_from_url(output_path)
            with output_file_target.open('w') as output_file:
                self.multi_output_reducer(key, values, output_file)

        # Luigi requires the reducer to return an iterable
        return iter(tuple())
Code example #26
 def reset_external_state(self):
     # The machine running the acceptance test suite may not have hadoop installed on it, so convert S3 paths (which
     # are normally handled by the hadoop DFS client) to S3+https paths, which are handled by the python native S3
     # client.
     root_target = get_target_from_url(self.test_root.replace('s3://', 's3+https://'))
     if root_target.exists():
         root_target.remove()
     self.import_db.reset()
     self.export_db.reset()
     self.otto_db.reset()
     self.hive.reset()
     self.vertica.reset()
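This s3:// to s3+https:// rewrite also appears in the obfuscation validation above, and is presumably what get_jenkins_safe_url does in the other acceptance-test examples. Captured as a helper (an assumption based on the comments, not the project's actual implementation):

    def jenkins_safe_url(url):
        """Route s3:// URLs through the native python S3 client rather than
        the hadoop DFS client (assumed behavior, per the comment above)."""
        return url.replace('s3://', 's3+https://')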
Code example #27
File: mapreduce.py  Project: Zarana-Parekh/analytics
 def __init__(self, *args, **kwargs):
     super(MultiOutputMapReduceJobTask, self).__init__(*args, **kwargs)
     if self.delete_output_root:
         # If requested, make sure that the output directory is empty.  This gets rid
         # of any generated data files from a previous run (that might not get
         # regenerated in this run).  It also makes sure that the marker file
         # (i.e. the output target) will be removed, so that external functionality
         # will know that the generation of data files is not complete.
         output_dir_target = get_target_from_url(self.output_root)
         for target in [self.output(), output_dir_target]:
             if target.exists():
                 target.remove()
Code example #28
    def test_end_to_end_without_vertica(self):
        # Similar to test_end_to_end, but it excludes the Vertica part and checks
        # data values, not just data shape.
        table_name = 'reconciled_order_transactions'
        output_root = url_path_join(
            self.warehouse_path, table_name, 'dt=' + self.UPPER_BOUND_DATE
        ) + '/'
        self.task.launch([
            'ReconcileOrdersAndTransactionsTask',
            '--import-date', self.UPPER_BOUND_DATE,
            '--n-reduce-tasks', str(self.NUM_REDUCERS),
            '--output-root', output_root,
        ])
        final_output_task = LoadInternalReportingOrderTransactionsToWarehouse(
            import_date=luigi.DateParameter().parse(self.UPPER_BOUND_DATE)
        )
        columns = [x[0] for x in final_output_task.columns]
        output_targets = PathSetTask([output_root], ['*']).output()
        raw_output = ""
        for output_target in output_targets:
            output_target = get_target_from_url(get_jenkins_safe_url(output_target.path))
            raw_output += output_target.open('r').read()

        expected_output_csv = os.path.join(self.data_dir, 'output', 'expected_financial_report.csv')
        expected = pandas.read_csv(expected_output_csv, parse_dates=True)

        output = StringIO(raw_output.replace('\t\\N', '\t'))
        data = pandas.read_table(output, header=None, names=columns, parse_dates=True)
        # Re-order dataframe for consistent comparison:
        for frame in (data, expected):
            frame.sort(['payment_ref_id', 'transaction_type'], inplace=True, ascending=[True, False])
            frame.reset_index(drop=True, inplace=True)
        try:
            assert_frame_equal(data, expected)
        except AssertionError:
            pandas.set_option('display.max_columns', None)
            print('----- The report generated this data: -----')
            print(data)
            print('----- vs expected: -----')
            print(expected)
            if data.shape != expected.shape:
                print("Data shapes differ.")
            else:
                for index, series in data.iterrows():
                    # Try to print a more helpful/localized difference message:
                    try:
                        assert_series_equal(data.iloc[index, :], expected.iloc[index, :])
                    except AssertionError:
                        print("First differing row: {index}".format(index=index))
            raise
Code example #29
    def multi_output_reducer(self, key, values, output_file):
        """
        Write values to the appropriate file as determined by the key.
        Write to the encrypted file by streaming through gzip, which compresses before encrypting.
        """
        _date_string, org_id = key
        recipients = self._get_recipients(org_id)

        key_file_targets = [get_target_from_url(url_path_join(self.gpg_key_dir, recipient)) for recipient in recipients]
        with make_encrypted_file(output_file, key_file_targets) as encrypted_output_file:
            outfile = gzip.GzipFile(mode='wb', fileobj=encrypted_output_file)
            try:
                for value in values:
                    outfile.write(value.strip())
                    outfile.write('\n')
            finally:
                outfile.close()
Code example #30
    def run(self):
        recipients = set(self.recipient)
        if self.gpg_master_key is not None:
            recipients.add(self.gpg_master_key)
        key_file_targets = [
            get_target_from_url(url_path_join(self.gpg_key_dir, recipient))
            for recipient in recipients
        ]

        path_task = PathSetTask([self.course_files_url], ['*.*'])
        with make_temp_directory(prefix='obfuscate-archive.', dir=self.temporary_dir) as tmp_directory:
            for target in path_task.output():
                with target.open('r') as input_file:
                    # Get path without urlscheme.
                    course_files_path = urlparse.urlparse(self.course_files_url).path
                    # Calculate the target's path relative to course_files_path by taking the
                    # substring that occurs after course_files_path in the target's path.
                    # Needed because target.path includes the URL scheme for S3 targets but
                    # not for HDFS targets.
                    # Examples:
                    # target.path: /pipeline/output/edX_Demo_Course/events/edX_Demo_Course-events-2015-08-30.log.gz
                    # relative_path: events/edX_Demo_Course-events-2015-08-30.log.gz
                    # target.path: s3://some_bucket/output/edX_Demo_Course/state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                    # relative_path: state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                    r_index = target.path.find(course_files_path) + len(course_files_path)
                    relative_path = target.path[r_index:].lstrip('/')

                    local_file_path = os.path.join(tmp_directory, relative_path)
                    try:
                        os.makedirs(os.path.dirname(local_file_path))
                    except OSError as exc:
                        if exc.errno != errno.EEXIST:
                            raise
                    with open(local_file_path, 'w') as temp_file:
                        copy_file_to_file(input_file, temp_file)

            def report_encrypt_progress(num_bytes):
                """Log encryption progress."""
                log.info('Encrypted %d bytes', num_bytes)

            with self.output().open('w') as output_file:
                with make_encrypted_file(
                    output_file, key_file_targets, progress=report_encrypt_progress, dir=self.temporary_dir
                ) as encrypted_output_file:
                    with tarfile.open(mode='w:gz', fileobj=encrypted_output_file) as output_archive_file:
                        output_archive_file.add(tmp_directory, arcname='')
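The try/except around os.makedirs in this example is the Python 2 spelling of os.makedirs(path, exist_ok=True): an already-existing directory is tolerated, while every other OSError still propagates. As a reusable helper:

    import errno
    import os

    def ensure_directory(path):
        """Create path (and any missing parents), ignoring only 'already exists'."""
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise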
Code example #31
 def complete(self):
     return get_target_from_url(url_path_join(self.output_root,
                                              '_SUCCESS')).exists()
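Hadoop drops an empty _SUCCESS marker file into an output directory when a job finishes writing it, so testing for that file is a cheap completeness check for the whole directory. Generalized as a sketch:

    def directory_is_complete(output_root):
        """True once a Hadoop job has finished writing into output_root."""
        return get_target_from_url(url_path_join(output_root, '_SUCCESS')).exists()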
Code example #32
 def output(self):
     return get_target_from_url(url_path_join(self.output_root, 'event_type_distribution/'))
Code example #33
 def output(self):
     return get_target_from_url(self.output_root)
Code example #34
 def output(self):
     return get_target_from_url(self.course_country_output + "/")