Exemple #1
0
    def requires(self):
        """Return the upstream tasks for this task as a dict keyed by role.

        'enrollments' and 'registrations' are always present.  'offsets',
        'history' and 'blacklist' are added as ExternalURL tasks only when
        the attribute of the same name holds a truthy value.
        """
        # The interval does not include its end date, so move the end one day
        # past self.date to make sure registrations for self.date are gathered.
        day_after = self.date + timedelta(1)

        # The cumulative sum of registrations needs every change from
        # MINIMUM_DATE up to (and including) self.date.
        registrations_task = UserRegistrationsPerDay(
            credentials=self.credentials,
            destination=self.destination,
            date_interval=Custom(MINIMUM_DATE, day_after),
        )

        enrollments_task = CourseEnrollmentChangesPerDay(
            name=self.name,
            src=self.src,
            dest=self.destination,
            include=self.include,
            manifest=self.manifest,
            mapreduce_engine=self.mapreduce_engine,
            lib_jar=self.lib_jar,
            n_reduce_tasks=self.n_reduce_tasks,
        )

        required = {
            'enrollments': enrollments_task,
            'registrations': registrations_task,
        }

        # Optional inputs: each one is required only if it was configured.
        for optional_key in ('offsets', 'history', 'blacklist'):
            location = getattr(self, optional_key)
            if location:
                required[optional_key] = ExternalURL(location)

        return required
Exemple #2
0
 def generate_file_list(self):
     """Yield an ExternalURL for every matching file found under each entry of self.src.

     Sources starting with 's3' are listed through generate_s3_sources,
     sources starting with 'hdfs' through luigi.hdfs.listdir, and anything
     else is walked as a local directory.
     """
     def is_included(candidate):
         """Return True if candidate matches at least one include pattern."""
         return any(
             fnmatch.fnmatch(candidate, pattern)
             for pattern in self.include)

     for root in self.src:
         if root.startswith('s3'):
             # The S3 connection is created on first use only.
             if self.s3_conn is None:
                 self.s3_conn = boto.connect_s3()
             s3_entries = generate_s3_sources(
                 self.s3_conn, root, self.include, self.include_zero_length)
             for _bucket, _root, key_path in s3_entries:
                 yield ExternalURL(url_path_join(root, key_path))
         elif root.startswith('hdfs'):
             hdfs_entries = luigi.hdfs.listdir(
                 root, recursive=True, include_size=True)
             for hdfs_path, byte_count in hdfs_entries:
                 if not self.include_zero_length and byte_count == 0:
                     continue
                 # Here the patterns are applied to the path exactly as
                 # returned by the listing.
                 if is_included(hdfs_path):
                     yield ExternalURL(hdfs_path)
         else:
             # Apply the include patterns to the relative path below the src directory.
             # TODO: implement exclude_zero_length to match S3 case.
             for parent_dir, _subdirs, names in os.walk(root):
                 for name in names:
                     full_path = os.path.join(parent_dir, name)
                     if is_included(os.path.relpath(full_path, root)):
                         yield ExternalURL(full_path)
 def requires_hadoop(self):
     """Yield ExternalURL tasks for the input found at self.source_dir.

     When source_dir resolves to a local target and is an existing
     directory, one task is yielded per entry whose name starts with
     "part"; otherwise a single task for source_dir itself is yielded.
     """
     # Check first if running locally with Sqoop output.
     source_target = get_target_from_url(self.source_dir)
     is_local_dir = (
         isinstance(source_target, luigi.LocalTarget)
         and os.path.isdir(self.source_dir)
     )
     if not is_local_dir:
         yield ExternalURL(self.source_dir)
     else:
         for entry in os.listdir(self.source_dir):
             if entry.startswith("part"):
                 yield ExternalURL(url_path_join(self.source_dir, entry))
 def requires(self):
     """Extend the parent requirements with the external xblock config when needed.

     The config is added under 'xblock_config' only if its value is not a
     bare file name, i.e. it differs from its own basename.
     """
     requirements = super(XBlockConfigMixin, self).requires()
     config_location = self.xblock_obfuscation_config
     is_bare_name = os.path.basename(config_location) == config_location
     if not is_bare_name:
         requirements['xblock_config'] = ExternalURL(config_location)
     return requirements
 def requires(self):
     """Return the required tasks, building and caching them on first call.

     The dict is stored in self.required_tasks so later calls return the
     same object.
     """
     if self.required_tasks is not None:
         return self.required_tasks

     credentials_task = ExternalURL(url=self.credentials)
     self.required_tasks = {
         'credentials': credentials_task,
         'insert_source': self.insert_source_task,
     }
     return self.required_tasks
Exemple #6
0
 def requires(self):
     """Return the warehouse load task ('insert_source') and the credentials file ('credentials')."""
     load_task = LoadInternalReportingUserActivityToWarehouse(
         n_reduce_tasks=self.n_reduce_tasks,
         date=self.date,
         warehouse_path=self.warehouse_path,
         overwrite=self.overwrite,
         schema=self.schema,
         credentials=self.credentials,
     )
     credentials_task = ExternalURL(self.credentials)
     return {
         'insert_source': load_task,
         'credentials': credentials_task,
     }
 def requires_local(self):
     """Return the parent's local requirements plus a 'geolocation_data' entry."""
     # The parent default is an empty list; any falsy result is replaced by
     # a fresh dict, and any real data is assumed to already be a dict.
     local_requirements = super(GeolocationMixin, self).requires_local() or {}
     local_requirements['geolocation_data'] = ExternalURL(self.geolocation_data)
     return local_requirements
Exemple #8
0
    def manifest_file_list(self):
        """Yield one ExternalURL pointing at the manifest file.

        If the manifest target does not exist yet, it is written first with
        one line per URL produced by generate_file_list().
        """
        target = get_target_from_url(self.manifest)
        manifest_missing = not target.exists()
        if manifest_missing:
            with target.open('w') as out:
                for file_task in self.generate_file_list():
                    line = file_task.url + '\n'
                    out.write(line)

        yield ExternalURL(self.manifest)
Exemple #9
0
    def requires(self):
        """Return the upstream tasks as a dict keyed by role.

        'source' is always present; 'offsets' and 'statuses' are added as
        ExternalURL tasks only when the attribute of the same name is truthy.
        """
        source_task = CourseEnrollmentChangesPerDay(
            name=self.name,
            src=self.src,
            dest=self.destination,
            include=self.include,
            manifest=self.manifest,
            mapreduce_engine=self.mapreduce_engine,
            lib_jar=self.lib_jar,
            n_reduce_tasks=self.n_reduce_tasks
        )
        required = {'source': source_task}

        for optional_key in ('offsets', 'statuses'):
            location = getattr(self, optional_key)
            if location:
                required[optional_key] = ExternalURL(location)

        return required
Exemple #10
0
 def generate_file_list(self):
     """Yield an ExternalURL for every matching file found under each entry of self.src.

     Sources starting with 's3' are listed through generate_s3_sources;
     anything else is walked as a local directory.
     """
     for root in self.src:
         if not root.startswith('s3'):
             # Apply the include patterns to the relative path below the src directory.
             for parent_dir, _subdirs, names in os.walk(root):
                 for name in names:
                     full_path = os.path.join(parent_dir, name)
                     relative = os.path.relpath(full_path, root)
                     matched = any(
                         fnmatch.fnmatch(relative, pattern)
                         for pattern in self.include)
                     if matched:
                         yield ExternalURL(full_path)
             continue

         # The S3 connection is created on first use only.
         if self.s3_conn is None:
             self.s3_conn = boto.connect_s3()
         s3_entries = generate_s3_sources(self.s3_conn, root, self.include)
         for _bucket, _root, key_path in s3_entries:
             yield ExternalURL(url_path_join(root, key_path))
Exemple #11
0
 def requires(self):
     """Return the warehouse load task ('source') and the credentials file ('credentials')."""
     warehouse_task = LoadWarehouseTask(
         date=self.date,
         schema=self.schema,
         credentials=self.credentials,
         marker_schema=self.marker_schema,
         overwrite=self.overwrite,
         n_reduce_tasks=self.n_reduce_tasks,
     )
     credentials_task = ExternalURL(self.credentials)
     return {'source': warehouse_task, 'credentials': credentials_task}
Exemple #12
0
def get_mysql_query_results(credentials, database, query):
    """
    Run a MySQL query against the given database and return all fetched rows.

    `credentials` is the URL of a JSON file; its 'username', 'password',
    'host' and 'port' entries are used to open the connection.  The
    connection is closed whether or not the query succeeds.
    """
    with ExternalURL(url=credentials).output().open('r') as credentials_file:
        settings = json.load(credentials_file)

    connect_kwargs = {
        'user': settings.get('username'),
        'password': settings.get('password'),
        'host': settings.get('host'),
        'port': settings.get('port'),
        'database': database,
    }
    connection = mysql.connector.connect(**connect_kwargs)

    try:
        cursor = connection.cursor()
        cursor.execute(query)
        return cursor.fetchall()
    finally:
        connection.close()
Exemple #13
0
    def requires(self):
        """Return the upstream tasks as a dict keyed by role.

        'events' is always present; 'answer_metadata' is added as an
        ExternalURL task only when self.answer_metadata is truthy.
        """
        events_task = ProblemCheckEvent(
            mapreduce_engine=self.mapreduce_engine,
            input_format=self.base_input_format,
            lib_jar=self.lib_jar,
            n_reduce_tasks=self.n_reduce_tasks,
            name=self.name,
            src=self.src,
            dest=self.dest,
            include=self.include,
            manifest=self.manifest,
        )
        required = {'events': events_task}

        if self.answer_metadata:
            required['answer_metadata'] = ExternalURL(self.answer_metadata)

        return required
 def requires(self):
     """Return the upstream tasks as a dict keyed by role.

     'enrollments' and 'registrations' are always present; 'blacklist' is
     added as an ExternalURL task only when self.blacklist is truthy.
     """
     # The registration interval runs from MINIMUM_DATE to the day after
     # self.date.
     day_after = self.date + timedelta(1)

     enrollments_task = CourseEnrollmentChangesPerDay(
         name=self.name,
         src=self.src,
         dest=self.destination,
         include=self.include,
         manifest=self.manifest,
         mapreduce_engine=self.mapreduce_engine,
         lib_jar=self.lib_jar,
         n_reduce_tasks=self.n_reduce_tasks,
     )
     registrations_task = UserRegistrationsPerDay(
         credentials=self.credentials,
         destination=self.destination,
         date_interval=Custom(MINIMUM_DATE, day_after),
     )

     required = {
         'enrollments': enrollments_task,
         'registrations': registrations_task,
     }
     if self.blacklist:
         required['blacklist'] = ExternalURL(self.blacklist)
     return required
Exemple #15
0
 def requires(self):
     """Require only the credentials file, exposed under the 'credentials' key."""
     credentials_task = ExternalURL(self.credentials)
     return {'credentials': credentials_task}
 def insert_source_task(self):
     """Return an ExternalURL for the "course_enrollment" partition under the warehouse path."""
     partition_url = url_path_join(
         self.warehouse_path, "course_enrollment", self.partition.path_spec)
     # The location is always passed on with a trailing slash appended.
     return ExternalURL(url=partition_url + '/')
Exemple #17
0
 def requires_local(self):
     """Return the geolocation data file as the only local requirement."""
     geolocation_task = ExternalURL(self.geolocation_data)
     return geolocation_task
Exemple #18
0
 def insert_source_task(self):
     """Return an ExternalURL wrapping the location stored in self.insert_source."""
     source_url = self.insert_source
     return ExternalURL(url=source_url)
 def insert_source_task(self):
     """Return an ExternalURL for the "internal_reporting_d_country" partition under the warehouse path."""
     partition_url = url_path_join(
         self.warehouse_path,
         "internal_reporting_d_country",
         self.partition.path_spec,
     )
     # The location is always passed on with a trailing slash appended.
     return ExternalURL(url=partition_url + '/')
Exemple #20
0
 def requires(self):
     """Require the 'metadata_file.json' file located under self.course_files_url."""
     metadata_url = url_path_join(self.course_files_url, 'metadata_file.json')
     return ExternalURL(metadata_url)
 def requires(self):
     """Yield an ExternalURL for the 'course_enrollment' partition under the warehouse path."""
     partition_url = url_path_join(
         self.warehouse_path, 'course_enrollment', self.partition.path_spec)
     # The location is always passed on with a trailing slash appended.
     yield ExternalURL(url=partition_url + '/')
 def requires(self):
     """Yield an ExternalURL for the 'last_country_of_user' partition under the warehouse path."""
     partition_url = url_path_join(
         self.warehouse_path, 'last_country_of_user', self.partition_spec())
     # The location is always passed on with a trailing slash appended.
     yield ExternalURL(url=partition_url + '/')
 def insert_source_task(self):
     """Return an ExternalURL for the 'course_enrollment_summary' Hive partition of self.date."""
     partition_url = self.hive_partition_path('course_enrollment_summary', self.date)
     return ExternalURL(url=partition_url)
Exemple #24
0
 def requires(self):
     """Return the event path set ('events') and the geolocation data file ('geoloc_data')."""
     events_task = PathSetTask(self.src, self.include, self.manifest)
     geolocation_task = ExternalURL(self.geolocation_data)
     return {'events': events_task, 'geoloc_data': geolocation_task}
    def requires_local(self):
        """Extend the parent's local requirements with the explicit event whitelist when needed.

        The whitelist is added under 'explicit_events' only if its value is
        not a bare file name, i.e. it differs from its own basename.
        """
        local_requirements = super(ObfuscateCourseEventsTask, self).requires_local()

        whitelist_location = self.explicit_event_whitelist
        is_bare_name = os.path.basename(whitelist_location) == whitelist_location
        if not is_bare_name:
            local_requirements['explicit_events'] = ExternalURL(url=whitelist_location)
        return local_requirements
 def data_task(self):
     """Return an ExternalURL for the parent data task's output path, normalized to end in one '/'."""
     parent_data_task = super(ExternalLastCountryOfUserToHiveTask, self).data_task
     output_path = parent_data_task.output().path
     # Strip any trailing slashes first so exactly one is appended.
     normalized = output_path.rstrip('/') + '/'
     return ExternalURL(normalized)
 def input_hadoop(self):
     """Yield the output target of an ExternalURL built from self.source_dir."""
     # The requires() method will return the Hive target, but what is really
     # needed here is the file dump instead.
     dump_task = ExternalURL(self.source_dir)
     yield dump_task.output()
 def requires_local(self):
     """Return the file at self.config as the only local requirement."""
     config_url = self.config
     return ExternalURL(url=config_url)
Exemple #29
0
 def requires(self):
     """Require the file located at self.counts."""
     counts_task = ExternalURL(self.counts)
     return counts_task
 def requires_local(self):
     """Return the local requirements as a dict.

     'geolocation_data' is always present.  'explicit_events' is added
     only if the whitelist value is not a bare file name, i.e. it differs
     from its own basename.
     """
     local_requirements = {}

     whitelist_location = self.explicit_event_whitelist
     is_bare_name = os.path.basename(whitelist_location) == whitelist_location
     if not is_bare_name:
         local_requirements['explicit_events'] = ExternalURL(url=whitelist_location)

     local_requirements['geolocation_data'] = ExternalURL(self.geolocation_data)
     return local_requirements