Python ExternalURL Exemples, edx.analytics.tasks.url.ExternalURL Python Exemples

Exemple #1

0

Afficher le fichier

    def requires(self):
        """Return the upstream tasks for this job, keyed by role.

        'enrollments' and 'registrations' are always present. 'offsets',
        'history' and 'blacklist' are added as ExternalURL tasks only when
        the attribute of the same name is set to a truthy value.
        """
        # The end of the interval is not included in the result, so move it
        # one day past the requested date to capture that date's registrations.
        day_after = self.date + timedelta(1)

        # A cumulative sum of registrations needs every change up to (and
        # including) the requested date, hence the interval from MINIMUM_DATE.
        registration_task = UserRegistrationsPerDay(
            credentials=self.credentials,
            destination=self.destination,
            date_interval=Custom(MINIMUM_DATE, day_after),
        )

        enrollment_task = CourseEnrollmentChangesPerDay(
            name=self.name,
            src=self.src,
            dest=self.destination,
            include=self.include,
            manifest=self.manifest,
            mapreduce_engine=self.mapreduce_engine,
            lib_jar=self.lib_jar,
            n_reduce_tasks=self.n_reduce_tasks,
        )

        requirements = {
            'enrollments': enrollment_task,
            'registrations': registration_task,
        }

        # Optional inputs: each is required only when configured.
        for optional_key in ('offsets', 'history', 'blacklist'):
            location = getattr(self, optional_key)
            if location:
                requirements[optional_key] = ExternalURL(location)

        return requirements

Exemple #2

0

Afficher le fichier

 def generate_file_list(self):
     """Yield an ExternalURL for each file found under the configured sources.

     Sources starting with 's3' are listed through generate_s3_sources,
     sources starting with 'hdfs' through luigi.hdfs.listdir, and anything
     else is walked as a local directory tree.
     """
     def matches_include(candidate):
         # True when the candidate path matches at least one include pattern.
         return any(
             fnmatch.fnmatch(candidate, pattern) for pattern in self.include)

     for src in self.src:
         if src.startswith('s3'):
             # The S3 connection is created lazily, on first use.
             if self.s3_conn is None:
                 self.s3_conn = boto.connect_s3()
             s3_entries = generate_s3_sources(
                 self.s3_conn, src, self.include, self.include_zero_length)
             for _bucket, _root, path in s3_entries:
                 yield ExternalURL(url_path_join(src, path))
         elif src.startswith('hdfs'):
             listing = luigi.hdfs.listdir(
                 src, recursive=True, include_size=True)
             for source, size in listing:
                 # Empty files are skipped unless explicitly requested.
                 if size == 0 and not self.include_zero_length:
                     continue
                 # Include patterns are matched against the listed path itself.
                 if matches_include(source):
                     yield ExternalURL(source)
         else:
             # Include patterns are matched against the path relative to src.
             # TODO: implement exclude_zero_length to match S3 case.
             for dirpath, _dirnames, files in os.walk(src):
                 for filename in files:
                     filepath = os.path.join(dirpath, filename)
                     if matches_include(os.path.relpath(filepath, src)):
                         yield ExternalURL(filepath)

Exemple #3

0

Afficher le fichier

Fichier : enrollment_validation.py Projet : open-craft/edx-analytics-pipeline

 def requires_hadoop(self):
     """Yield ExternalURL requirements for the source data.

     When source_dir resolves to a local target and is an existing
     directory (running locally with Sqoop output), one ExternalURL is
     yielded per entry whose name starts with "part". Otherwise a single
     ExternalURL for source_dir itself is yielded.
     """
     source_dir = self.source_dir
     target = get_target_from_url(source_dir)
     is_local_dir = (
         isinstance(target, luigi.LocalTarget) and os.path.isdir(source_dir))

     if not is_local_dir:
         yield ExternalURL(source_dir)
         return

     for filename in os.listdir(source_dir):
         if filename.startswith("part"):
             yield ExternalURL(url_path_join(source_dir, filename))

Exemple #4

0

Afficher le fichier

Fichier : data_obfuscation.py Projet : npoed/edx-analytics-pipeline

 def requires(self):
     """Extend the parent requirements with the external xblock config.

     The config is added under 'xblock_config' only when its value is not
     a bare file name, i.e. when we are not using the default one.
     """
     requirements = super(XBlockConfigMixin, self).requires()
     config_location = self.xblock_obfuscation_config
     is_bare_name = os.path.basename(config_location) == config_location
     if not is_bare_name:
         requirements['xblock_config'] = ExternalURL(config_location)
     return requirements

Exemple #5

0

Afficher le fichier

Fichier : vertica_load.py Projet : jrowan/edx-analytics-pipeline

 def requires(self):
     """Return the required tasks, building and caching them on first call.

     The dict holds the 'credentials' ExternalURL and the 'insert_source'
     task; it is stored on self.required_tasks and reused afterwards.
     """
     if self.required_tasks is not None:
         return self.required_tasks

     self.required_tasks = {
         'credentials': ExternalURL(url=self.credentials),
         'insert_source': self.insert_source_task,
     }
     return self.required_tasks

Exemple #6

0

Afficher le fichier

 def requires(self):
     """Return the 'insert_source' warehouse load task and the 'credentials' file."""
     insert_source = LoadInternalReportingUserActivityToWarehouse(
         n_reduce_tasks=self.n_reduce_tasks,
         date=self.date,
         warehouse_path=self.warehouse_path,
         overwrite=self.overwrite,
         schema=self.schema,
         credentials=self.credentials,
     )
     credentials_url = ExternalURL(self.credentials)
     return {
         'insert_source': insert_source,
         'credentials': credentials_url,
     }

Exemple #7

0

Afficher le fichier

Fichier : geolocation.py Projet : npoed/edx-analytics-pipeline

 def requires_local(self):
     """Return the parent's local requirements plus 'geolocation_data'."""
     # The parent's default is an empty list; any real data is assumed to
     # come back as a dict, so a falsy result is replaced by a fresh dict.
     requirements = super(GeolocationMixin, self).requires_local() or {}
     requirements['geolocation_data'] = ExternalURL(self.geolocation_data)
     return requirements

Exemple #8

0

Afficher le fichier

    def manifest_file_list(self):
        """Yield an ExternalURL for the manifest, creating the manifest if needed.

        When the manifest target does not exist yet, the url of every task
        produced by generate_file_list() is written to it, one per line.
        """
        target = get_target_from_url(self.manifest)
        already_written = target.exists()
        if not already_written:
            with target.open('w') as out:
                for task in self.generate_file_list():
                    line = task.url + '\n'
                    out.write(line)

        yield ExternalURL(self.manifest)

Exemple #9

0

Afficher le fichier

    def requires(self):
        """Return the upstream tasks for this job, keyed by role.

        'source' is always present; 'offsets' and 'statuses' are added as
        ExternalURL tasks only when the attribute of the same name is set.
        """
        source_task = CourseEnrollmentChangesPerDay(
            name=self.name,
            src=self.src,
            dest=self.destination,
            include=self.include,
            manifest=self.manifest,
            mapreduce_engine=self.mapreduce_engine,
            lib_jar=self.lib_jar,
            n_reduce_tasks=self.n_reduce_tasks,
        )
        requirements = {'source': source_task}

        # Optional inputs: each is required only when configured.
        for optional_key in ('offsets', 'statuses'):
            location = getattr(self, optional_key)
            if location:
                requirements[optional_key] = ExternalURL(location)

        return requirements

Exemple #10

0

Afficher le fichier

 def generate_file_list(self):
     """Yield an ExternalURL for each file found under the configured sources.

     Sources starting with 's3' are listed through generate_s3_sources;
     anything else is walked as a local directory tree.
     """
     def matches_include(candidate):
         # True when the candidate path matches at least one include pattern.
         return any(
             fnmatch.fnmatch(candidate, pattern) for pattern in self.include)

     for src in self.src:
         if src.startswith('s3'):
             # The S3 connection is created lazily, on first use.
             if self.s3_conn is None:
                 self.s3_conn = boto.connect_s3()
             s3_entries = generate_s3_sources(self.s3_conn, src, self.include)
             for _bucket, _root, path in s3_entries:
                 yield ExternalURL(url_path_join(src, path))
         else:
             # Include patterns are matched against the path relative to src.
             for dirpath, _dirnames, files in os.walk(src):
                 for filename in files:
                     filepath = os.path.join(dirpath, filename)
                     if matches_include(os.path.relpath(filepath, src)):
                         yield ExternalURL(filepath)

Exemple #11

0

Afficher le fichier

 def requires(self):
     """Return the 'source' warehouse load task and the 'credentials' file."""
     source_task = LoadWarehouseTask(
         date=self.date,
         schema=self.schema,
         credentials=self.credentials,
         marker_schema=self.marker_schema,
         overwrite=self.overwrite,
         n_reduce_tasks=self.n_reduce_tasks,
     )
     credentials_url = ExternalURL(self.credentials)
     return {'source': source_task, 'credentials': credentials_url}

Exemple #12

0

Afficher le fichier

def get_mysql_query_results(credentials, database, query):
    """
    Run a MySQL query against the given database and return all fetched rows.

    `credentials` is a URL to a JSON file; its 'username', 'password',
    'host' and 'port' entries are used to open the connection. The
    connection is always closed before returning or propagating an error
    raised while querying.
    """
    with ExternalURL(url=credentials).output().open('r') as credentials_file:
        cred = json.load(credentials_file)

    connect_kwargs = {
        'user': cred.get('username'),
        'password': cred.get('password'),
        'host': cred.get('host'),
        'port': cred.get('port'),
        'database': database,
    }
    connection = mysql.connector.connect(**connect_kwargs)

    try:
        cursor = connection.cursor()
        cursor.execute(query)
        return cursor.fetchall()
    finally:
        connection.close()

Exemple #13

0

Afficher le fichier

    def requires(self):
        """Return the upstream tasks for this job, keyed by role.

        'events' is always present; 'answer_metadata' is added as an
        ExternalURL only when self.answer_metadata is set.
        """
        events_task = ProblemCheckEvent(
            mapreduce_engine=self.mapreduce_engine,
            input_format=self.base_input_format,
            lib_jar=self.lib_jar,
            n_reduce_tasks=self.n_reduce_tasks,
            name=self.name,
            src=self.src,
            dest=self.dest,
            include=self.include,
            manifest=self.manifest,
        )
        requirements = {'events': events_task}

        if self.answer_metadata:
            requirements['answer_metadata'] = ExternalURL(self.answer_metadata)

        return requirements

Exemple #14

0

Afficher le fichier

Fichier : incremental_enrollments.py Projet : rogeriofalcone/edx-analytics-pipeline

 def requires(self):
     """Return the upstream tasks for this job, keyed by role.

     'enrollments' and 'registrations' are always present; 'blacklist' is
     added as an ExternalURL only when self.blacklist is set.
     """
     # The registration interval runs from MINIMUM_DATE to the day after
     # the requested date.
     day_after = self.date + timedelta(1)

     enrollment_task = CourseEnrollmentChangesPerDay(
         name=self.name,
         src=self.src,
         dest=self.destination,
         include=self.include,
         manifest=self.manifest,
         mapreduce_engine=self.mapreduce_engine,
         lib_jar=self.lib_jar,
         n_reduce_tasks=self.n_reduce_tasks,
     )
     registration_task = UserRegistrationsPerDay(
         credentials=self.credentials,
         destination=self.destination,
         date_interval=Custom(MINIMUM_DATE, day_after),
     )

     requirements = {
         'enrollments': enrollment_task,
         'registrations': registration_task,
     }
     if self.blacklist:
         requirements['blacklist'] = ExternalURL(self.blacklist)
     return requirements

Exemple #15

0

Afficher le fichier

 def requires(self):
     """Return a dict whose only entry, 'credentials', is the credentials ExternalURL."""
     credentials_url = ExternalURL(self.credentials)
     return {'credentials': credentials_url}

Exemple #16

0

Afficher le fichier

Fichier : load_internal_reporting_user_course.py Projet : hks-epod/edx-analytics-pipeline

 def insert_source_task(self):
     """Return an ExternalURL for the "course_enrollment" partition directory.

     The location is warehouse_path / table name / partition path spec,
     with a trailing '/' appended.
     """
     path_parts = (
         self.warehouse_path,
         "course_enrollment",
         self.partition.path_spec,
     )
     return ExternalURL(url=url_path_join(*path_parts) + '/')

Exemple #17

0

Afficher le fichier

 def requires_local(self):
     """Return the geolocation data as the single local requirement."""
     geolocation_url = ExternalURL(self.geolocation_data)
     return geolocation_url

Exemple #18

0

Afficher le fichier

 def insert_source_task(self):
     """Return an ExternalURL whose url is self.insert_source."""
     source_location = self.insert_source
     return ExternalURL(url=source_location)

Exemple #19

0

Afficher le fichier

Fichier : load_internal_reporting_country.py Projet : linearregression/edx-analytics-pipeline

 def insert_source_task(self):
     """Return an ExternalURL for the "internal_reporting_d_country" partition directory.

     The location is warehouse_path / table name / partition path spec,
     with a trailing '/' appended.
     """
     path_parts = (
         self.warehouse_path,
         "internal_reporting_d_country",
         self.partition.path_spec,
     )
     return ExternalURL(url=url_path_join(*path_parts) + '/')

Exemple #20

0

Afficher le fichier

 def requires(self):
     """Return an ExternalURL for 'metadata_file.json' under course_files_url."""
     metadata_location = url_path_join(
         self.course_files_url, 'metadata_file.json')
     return ExternalURL(metadata_location)

Exemple #21

0

Afficher le fichier

Fichier : enrollments.py Projet : linearregression/edx-analytics-pipeline

 def requires(self):
     """Yield an ExternalURL for the 'course_enrollment' partition directory.

     The location is warehouse_path / 'course_enrollment' / partition path
     spec, with a trailing '/' appended.
     """
     partition_dir = url_path_join(
         self.warehouse_path, 'course_enrollment', self.partition.path_spec)
     yield ExternalURL(url=partition_dir + '/')

Exemple #22

0

Afficher le fichier

Fichier : location_per_course.py Projet : hks-epod/edx-analytics-pipeline

 def requires(self):
     """Yield an ExternalURL for the 'last_country_of_user' partition directory.

     The location is warehouse_path / 'last_country_of_user' / the value
     of partition_spec(), with a trailing '/' appended.
     """
     partition_dir = url_path_join(
         self.warehouse_path, 'last_country_of_user', self.partition_spec())
     yield ExternalURL(url=partition_dir + '/')

Exemple #23

0

Afficher le fichier

Fichier : load_internal_reporting_user_course.py Projet : npoed/edx-analytics-pipeline

 def insert_source_task(self):
     """Return an ExternalURL for the 'course_enrollment_summary' partition of self.date."""
     partition_location = self.hive_partition_path(
         'course_enrollment_summary', self.date)
     return ExternalURL(url=partition_location)

Exemple #24

0

Afficher le fichier

 def requires(self):
     """Return the 'events' path-set task and the 'geoloc_data' ExternalURL."""
     events_task = PathSetTask(self.src, self.include, self.manifest)
     geolocation_url = ExternalURL(self.geolocation_data)
     return {'events': events_task, 'geoloc_data': geolocation_url}

Exemple #25

0

Afficher le fichier

Fichier : events_obfuscation.py Projet : npoed/edx-analytics-pipeline

    def requires_local(self):
        """Extend the parent's local requirements with the event whitelist.

        The whitelist is added under 'explicit_events' only when its value
        is not a bare file name.
        """
        requirements = super(ObfuscateCourseEventsTask, self).requires_local()

        whitelist = self.explicit_event_whitelist
        is_bare_name = os.path.basename(whitelist) == whitelist
        if not is_bare_name:
            requirements['explicit_events'] = ExternalURL(url=whitelist)
        return requirements

Exemple #26

0

Afficher le fichier

Fichier : location_per_course.py Projet : npoed/edx-analytics-pipeline

 def data_task(self):
     """Return an ExternalURL for the parent data task's output path.

     The path is normalized to end with exactly one trailing '/'.
     """
     parent = super(ExternalLastCountryOfUserToHiveTask, self)
     output_path = parent.data_task.output().path
     normalized = output_path.rstrip('/') + '/'
     return ExternalURL(normalized)

Exemple #27

0

Afficher le fichier

Fichier : enrollment_validation.py Projet : open-craft/edx-analytics-pipeline

 def input_hadoop(self):
     """Yield the output target of an ExternalURL built from source_dir."""
     # requires() would hand back the Hive target, but what is actually
     # needed here is the file dump, so the target is built directly.
     dump_task = ExternalURL(self.source_dir)
     yield dump_task.output()

Exemple #28

0

Afficher le fichier

Fichier : event_exports.py Projet : linearregression/edx-analytics-pipeline

 def requires_local(self):
     """Return an ExternalURL for self.config as the single local requirement."""
     config_location = self.config
     return ExternalURL(url=config_location)

Exemple #29

0

Afficher le fichier

 def requires(self):
     """Return an ExternalURL for self.counts as the single requirement."""
     counts_location = self.counts
     return ExternalURL(counts_location)

Exemple #30

0

Afficher le fichier

Fichier : events_obfuscation.py Projet : linearregression/edx-analytics-pipeline

 def requires_local(self):
     """Return the local requirements: geolocation data and, optionally, the event whitelist.

     'explicit_events' is included only when explicit_event_whitelist is
     not a bare file name; 'geolocation_data' is always included.
     """
     requirements = {}
     whitelist = self.explicit_event_whitelist
     is_bare_name = os.path.basename(whitelist) == whitelist
     if not is_bare_name:
         requirements['explicit_events'] = ExternalURL(url=whitelist)
     requirements['geolocation_data'] = ExternalURL(self.geolocation_data)
     return requirements