def generate_file_list(self):
    """Yield each individual path given a source folder and a set of file-matching expressions."""
    for src in self.src:
        if src.startswith('s3'):
            # Connect lazily as needed.
            if self.s3_conn is None:
                self.s3_conn = ScalableS3Client().s3
            for _bucket, _root, path in generate_s3_sources(self.s3_conn, src, self.include, self.include_zero_length):
                source = url_path_join(src, path)
                yield ExternalURL(source)
        elif src.startswith('hdfs'):
            for source, size in luigi.hdfs.listdir(src, recursive=True, include_size=True):
                if not self.include_zero_length and size == 0:
                    continue
                elif any(fnmatch.fnmatch(source, include_val) for include_val in self.include):
                    yield ExternalURL(source)
        else:
            # Apply the include patterns to the relative path below the src directory.
            # TODO: honor include_zero_length here to match the S3 case.
            for dirpath, _dirnames, files in os.walk(src):
                for filename in files:
                    filepath = os.path.join(dirpath, filename)
                    relpath = os.path.relpath(filepath, src)
                    if any(fnmatch.fnmatch(relpath, include_val) for include_val in self.include):
                        yield ExternalURL(filepath)
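# A minimal sketch of the inputs that generate_file_list() above assumes: self.src is a list of
# source URLs (s3://..., hdfs://..., or local paths), self.include is a list of fnmatch-style
# patterns, and include_zero_length controls whether empty files are yielded.  All concrete
# values below are hypothetical.
EXAMPLE_SRC = ['s3://example-bucket/raw/logs/', '/var/data/local-logs/']  # hypothetical sources
EXAMPLE_INCLUDE = ['*tracking.log-*.gz', '*.tsv']                         # hypothetical fnmatch patterns
EXAMPLE_INCLUDE_ZERO_LENGTH = False                                       # skip zero-length files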
def requires(self):
    yield ExternalURL(url=self.vertica_credentials)
    yield ExternalURL(url=self.gcp_credentials)

    if self.bigquery_dataset is None:
        self.bigquery_dataset = self.vertica_schema_name

    intermediate_warehouse_path = url_path_join(self.s3_warehouse_path, 'import/vertica/sqoop/')

    query = "SELECT table_name FROM all_tables WHERE schema_name='{schema_name}' AND table_type='TABLE' ".format(
        schema_name=self.vertica_schema_name
    )
    table_list = [row[0] for row in get_vertica_results(self.vertica_credentials, query)]

    for table_name in table_list:
        if not self.should_exclude_table(table_name):
            yield LoadVerticaTableToBigQuery(
                date=self.date,
                overwrite=self.overwrite,
                intermediate_warehouse_path=intermediate_warehouse_path,
                dataset_id=self.bigquery_dataset,
                credentials=self.gcp_credentials,
                max_bad_records=self.max_bad_records,
                table_name=table_name,
                vertica_schema_name=self.vertica_schema_name,
                vertica_warehouse_name=self.vertica_warehouse_name,
                vertica_credentials=self.vertica_credentials,
                exclude=self.exclude,
            )
def get_vertica_results(credentials, query):
    """Run a single query in Vertica and return the results."""
    credentials_target = ExternalURL(url=credentials).output()
    cred = None
    with credentials_target.open('r') as credentials_file:
        cred = json.load(credentials_file)

    # Check that the client library is importable before trying to use it.
    if not vertica_client_available:
        raise ImportError('Vertica client library not available')

    # Externalize autocommit and read timeout.
    connection = vertica_python.connect(
        user=cred.get('username'),
        password=cred.get('password'),
        host=cred.get('host'),
        port=cred.get('port'),
        database='warehouse',
        autocommit=False,
        read_timeout=None,
    )

    try:
        cursor = connection.cursor()
        cursor.execute(query)
        results = cursor.fetchall()
    finally:
        connection.close()

    return results
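# A minimal usage sketch for get_vertica_results() above, assuming the credentials target
# contains JSON with 'username', 'password', 'host', and 'port' keys (the only keys the code
# reads).  The path, credential values, and query below are all hypothetical.
#
# Hypothetical contents of the credentials file:
#   {"username": "analyst", "password": "secret", "host": "vertica.example.com", "port": 5433}
def example_vertica_query():
    credentials_url = 's3://example-bucket/config/vertica_creds.json'  # hypothetical path
    query = "SELECT table_name FROM all_tables WHERE schema_name='example_schema'"  # hypothetical
    for row in get_vertica_results(credentials_url, query):
        print(row[0])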
def requires(self):
    # The end date is not included in the result, so we have to add a day
    # to the provided date in order to ensure user registration data is
    # gathered for that date.
    end_date = self.date + timedelta(1)

    # In order to compute the cumulative sum of user registrations we need
    # all changes in registrations up to (and including) the provided date.
    registrations = UserRegistrationsPerDay(
        credentials=self.credentials,
        destination=self.destination,
        date_interval=Custom(MINIMUM_DATE, end_date)
    )

    results = {
        'enrollments': CourseEnrollmentChangesPerDay(
            name=self.name,
            src=self.src,
            dest=self.destination,
            include=self.include,
            manifest=self.manifest,
            mapreduce_engine=self.mapreduce_engine,
            lib_jar=self.lib_jar,
            n_reduce_tasks=self.n_reduce_tasks
        ),
        'registrations': registrations,
    }

    if self.offsets:
        results.update({'offsets': ExternalURL(self.offsets)})
    if self.history:
        results.update({'history': ExternalURL(self.history)})
    if self.blacklist:
        results.update({'blacklist': ExternalURL(self.blacklist)})

    return results
def requires(self):
    if self.required_tasks is None:
        self.required_tasks = {
            'credentials': ExternalURL(url=self.credentials),
            'source_script': ExternalURL(url=self.source_script),
        }

        if self.depends_on is not None:
            self.required_tasks['depends_on'] = self.depends_on

    return self.required_tasks
def requires_hadoop(self):
    # Check first if running locally with Sqoop output.
    target = get_target_from_url(self.source_dir)
    if isinstance(target, luigi.LocalTarget) and os.path.isdir(self.source_dir):
        files = [f for f in os.listdir(self.source_dir) if f.startswith("part")]
        for filename in files:
            yield ExternalURL(url_path_join(self.source_dir, filename))
    else:
        yield ExternalURL(self.source_dir)
def __init__(self, *args, **kwargs):
    super(ImportMysqlDatabaseToBigQueryDatasetTask, self).__init__(*args, **kwargs)
    self.table_list = []
    self.is_complete = False
    self.required_tasks = None

    # If we are overwriting the database output, then delete the entire marker table.
    # That way, any future activity on it should only consist of inserts, rather than any deletes
    # of existing marker entries.  There are quotas on deletes and upserts on a table, of no more
    # than 96 per day.  This allows us to work around hitting those limits.
    # Note that we have to do this early, before scheduling begins, so that no entries are present
    # when scheduling occurs (so everything gets properly scheduled).
    if self.overwrite:
        # First, create a BigQueryTarget object, so we can connect to BigQuery.  This is only
        # for the purpose of deleting the marker table, so use dummy values.
        credentials_target = ExternalURL(url=self.credentials).output()
        target = BigQueryTarget(
            credentials_target=credentials_target,
            dataset_id=self.dataset_id,
            table="dummy_table",
            update_id="dummy_id",
        )
        # Now ask it to delete the marker table completely.
        target.delete_marker_table()
def insert_source_task(self):
    partition_path_spec = HivePartition('dt', self.date.isoformat()).path_spec
    url_with_filename = url_path_join(self.warehouse_path, "course_catalog", "subjects",
                                      partition_path_spec, "subjects.tsv")
    return ExternalURL(url=url_with_filename)
def requires(self): """Require the external config if we are not using the default one""" reqs = super(XBlockConfigMixin, self).requires() if os.path.basename(self.xblock_obfuscation_config ) != self.xblock_obfuscation_config: reqs['xblock_config'] = ExternalURL(self.xblock_obfuscation_config) return reqs
def requires(self):
    if self.required_tasks is None:
        self.required_tasks = {
            'credentials': ExternalURL(url=self.credentials),
            'insert_source': self.insert_source_task,
        }
    return self.required_tasks
def requires(self):
    credentials_target = ExternalURL(url=self.google_credentials).output()
    gs = create_google_spreadsheet_client(credentials_target)
    for spreadsheet_key, config in self.spreadsheets_config.items():
        schema = config['schema']
        scratch_schema = config['scratch_schema']
        database = config['database']
        column_types_row = config.get('column_types_row', False)

        spreadsheet = gs.open_by_key(spreadsheet_key)
        worksheets = spreadsheet.worksheets()
        for worksheet in worksheets:
            yield LoadWorksheetToSnowflake(
                date=self.date,

                # Snowflake-related params.
                credentials=self.sf_credentials,
                run_id=self.sf_run_id,
                sf_database=database,
                schema=schema,
                scratch_schema=scratch_schema,
                warehouse=self.sf_warehouse,
                role=self.sf_role,
                overwrite=self.overwrite,

                # Google-related params.
                google_credentials=self.google_credentials,
                spreadsheet_key=spreadsheet_key,
                worksheet_name=worksheet.title,
                column_types_row=column_types_row,
            )
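# A minimal sketch of the shape that the requires() above assumes for self.spreadsheets_config:
# a mapping of Google spreadsheet key to per-spreadsheet settings.  The key and values below are
# hypothetical; only the dict keys ('schema', 'scratch_schema', 'database', 'column_types_row')
# come from the code above.
EXAMPLE_SPREADSHEETS_CONFIG = {
    '1AbCdEfGhIjKlMnOpQrStUvWxYz0123456789': {    # hypothetical spreadsheet key
        'schema': 'finance',                      # hypothetical target schema
        'scratch_schema': 'finance_scratch',      # hypothetical scratch schema
        'database': 'ANALYTICS',                  # hypothetical Snowflake database
        'column_types_row': True,                 # optional; defaults to False
    },
}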
def requires(self):
    paypal_interval = date_interval.Custom(self.paypal_interval_start, self.import_date)
    for date in paypal_interval:
        url = url_path_join(self.warehouse_path, 'payments', 'dt=' + date.isoformat(), 'paypal.tsv')
        yield ExternalURL(url=url)
def insert_source_task(self):
    hive_table = "user_activity_by_user"
    # User activity data for each day is stored in a dated directory.
    # We want to be able to load all that data into Vertica in one go, hence we use
    # a wildcard ('*') here.
    url = url_path_join(self.warehouse_path, hive_table) + '/dt=*/'
    return ExternalURL(url=url)
def requires_local(self):
    results = super(ObfuscateCourseEventsTask, self).requires_local()
    if os.path.basename(self.explicit_event_whitelist) != self.explicit_event_whitelist:
        results['explicit_events'] = ExternalURL(url=self.explicit_event_whitelist)
    return results
def requires(self):
    return {
        'insert_source': LoadInternalReportingUserActivityToWarehouse(
            n_reduce_tasks=self.n_reduce_tasks,
            date=self.date,
            warehouse_path=self.warehouse_path,
            overwrite=self.overwrite,
            schema=self.schema,
            credentials=self.credentials,
        ),
        'credentials': ExternalURL(self.credentials),
    }
def requires(self):
    if self._required_tasks is None:
        self._required_tasks = {
            'credentials': ExternalURL(url=self.vertica_credentials),
            'sqoop_dump_vertica_table_task': self.sqoop_dump_vertica_table_task,
        }
    return self._required_tasks
def requires_local(self): """Adds geolocation_data as a local requirement.""" result = super(GeolocationMixin, self).requires_local() # Default is an empty list, but assume that any real data added is done # so as a dict. if not result: result = {} result['geolocation_data'] = ExternalURL(self.geolocation_data) return result
def requires(self):
    if self.required_tasks is None:
        self.required_tasks = {
            'credentials': ExternalURL(url=self.credentials),
        }
        if not self.insert_source_task_dynamically:
            self.required_tasks['insert_source'] = self.insert_source_task
    return self.required_tasks
def manifest_file_list(self):
    """Write each individual path to a manifest file and yield the path to that file."""
    manifest_target = get_target_from_url(self.manifest)
    if not manifest_target.exists():
        with manifest_target.open('w') as manifest_file:
            for external_url_task in self.generate_file_list():
                manifest_file.write(external_url_task.url + '\n')

    yield ExternalURL(self.manifest)
def get_downstream_task(self):
    # If no downstream task has been set, load our configuration and generate our tasks and dependency chain.
    if self.downstream_task is None:
        script_conf_target = ExternalURL(url=self.script_configuration).output()
        with script_conf_target.open('r') as script_conf_file:
            config = yaml.safe_load(script_conf_file)
            if config is not None and isinstance(config, dict):
                previous_task = None

                scripts = config.get('scripts', [])

                # Iterate over the scripts in the configuration file in order, linking each task to the
                # one before it via add_dependency().  Yielding the final task then causes Luigi to run
                # the whole chain sequentially, in the intended order: from the top of the file, downwards.
                for script in scripts:
                    if not self.validate_script_entry(script):
                        log.warn("encountered invalid script entry!")
                        continue

                    new_task = RunVerticaSqlScriptTask(
                        credentials=self.credentials,
                        schema=self.schema,
                        marker_schema=self.marker_schema,
                        date=self.date,
                        read_timeout=self.read_timeout,
                        source_script=path.join(self.script_root, script['location']),
                        script_name=script.get('name'),
                    )

                    # If we previously configured a task, set it as a dependency of this one, so it runs prior to it.
                    if previous_task is not None:
                        new_task.add_dependency(previous_task)

                    # Mark this as the previously-created task.
                    previous_task = new_task

                self.downstream_task = previous_task

    # If a downstream task has been set, yield it, triggering Luigi to schedule our scripts.
    if self.downstream_task is not None:
        yield self.downstream_task
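# A minimal sketch of the script configuration that get_downstream_task() above parses with
# yaml.safe_load(): a top-level 'scripts' list whose entries provide a required 'location'
# (joined to script_root) and an optional 'name'.  The names and paths below are hypothetical.
EXAMPLE_SCRIPT_CONFIGURATION = """
scripts:
  - name: "Build daily rollup"          # hypothetical
    location: "rollups/daily.sql"       # hypothetical, relative to script_root
  - name: "Refresh summary view"        # hypothetical
    location: "views/summary.sql"       # hypothetical
"""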
def requires(self):
    results = {
        'source': CourseEnrollmentChangesPerDay(
            name=self.name,
            src=self.src,
            dest=self.destination,
            include=self.include,
            manifest=self.manifest,
            mapreduce_engine=self.mapreduce_engine,
            lib_jar=self.lib_jar,
            n_reduce_tasks=self.n_reduce_tasks,
        ),
    }
    if self.offsets:
        results.update({'offsets': ExternalURL(self.offsets)})
    if self.statuses:
        results.update({'statuses': ExternalURL(self.statuses)})
    return results
def output(self):
    # TODO: Once VerticaCopyTask handles multiple input files, update this
    # to use the outputs of the sub-jobs instead of always returning all files.

    # Affiliate Window reports for each day are stored in dated directories.
    # We want to be able to load all that data into Vertica in one go, hence we use
    # a wildcard ('*') here.
    url = url_path_join(self.warehouse_path, 'fees', 'affiliate_window') + '/dt=*/'
    return ExternalURL(url=url).output()
def requires(self):
    yield ExternalURL(url=self.vertica_credentials)
    yield ExternalURL(url=self.gcp_credentials)

    if self.bigquery_dataset is None:
        self.bigquery_dataset = self.vertica_schema_name

    for table_name in self.get_table_list_for_schema():
        yield LoadVerticaTableFromS3ToBigQueryTask(
            date=self.date,
            overwrite=self.overwrite,
            intermediate_warehouse_path=self.intermediate_warehouse_path,
            dataset_id=self.bigquery_dataset,
            credentials=self.gcp_credentials,
            max_bad_records=self.max_bad_records,
            table_name=table_name,
            vertica_schema_name=self.vertica_schema_name,
            vertica_warehouse_name=self.vertica_warehouse_name,
            vertica_credentials=self.vertica_credentials,
        )
def requires(self):
    return {
        'source': LoadWarehouseTask(
            date=self.date,
            schema=self.schema,
            credentials=self.credentials,
            marker_schema=self.marker_schema,
            overwrite=self.overwrite,
            n_reduce_tasks=self.n_reduce_tasks,
        ),
        'credentials': ExternalURL(self.credentials),
    }
def get_mysql_query_results(credentials, database, query):
    """Execute a MySQL query on the provided database and return the results."""
    credentials_target = ExternalURL(url=credentials).output()
    cred = None
    with credentials_target.open('r') as credentials_file:
        cred = json.load(credentials_file)

    connection = mysql.connector.connect(
        user=cred.get('username'),
        password=cred.get('password'),
        host=cred.get('host'),
        port=cred.get('port'),
        database=database,
    )

    try:
        cursor = connection.cursor()
        cursor.execute(query)
        results = cursor.fetchall()
    finally:
        connection.close()

    return results
def insert_source_task(self): """ We are already exporting vertica tables to S3 using SqoopImportFromVertica through VerticaSchemaToBigQueryTask workflow, so we specify ExternalURL here instead. In the future we can change this to a SqoopImportFromVertica task. """ partition_path_spec = HivePartition('dt', self.date).path_spec intermediate_warehouse_path = url_path_join(self.warehouse_path, 'import/vertica/sqoop/') url = url_path_join(intermediate_warehouse_path, self.vertica_warehouse_name, self.vertica_schema_name, self.table_name, partition_path_spec) + '/' return ExternalURL(url=url)
def requires(self):
    yield ExternalURL(url=self.vertica_credentials)

    for table_name in self.get_table_list_for_schema():
        yield ExportVerticaTableToS3Task(
            date=self.date,
            overwrite=self.overwrite,
            table_name=table_name,
            intermediate_warehouse_path=self.intermediate_warehouse_path,
            vertica_schema_name=self.vertica_schema_name,
            vertica_warehouse_name=self.vertica_warehouse_name,
            vertica_credentials=self.vertica_credentials,
            sqoop_null_string=self.sqoop_null_string,
            sqoop_fields_terminated_by=self.sqoop_fields_terminated_by,
            sqoop_delimiter_replacement=self.sqoop_delimiter_replacement,
        )
def requires(self):
    config = get_config()
    for merchant_id in self.cybersource_merchant_ids:
        section_name = 'cybersource:' + merchant_id
        interval_start = luigi.DateParameter().parse(config.get(section_name, 'interval_start'))
        interval_end = self.import_date

        merchant_close_date = config.get(section_name, 'merchant_close_date', '')
        if merchant_close_date:
            parsed_date = luigi.DateParameter().parse(merchant_close_date)
            interval_end = min(self.import_date, parsed_date)

        cybersource_interval = date_interval.Custom(interval_start, interval_end)

        for date in cybersource_interval:
            filename = "cybersource_{}.tsv".format(merchant_id)
            url = url_path_join(self.warehouse_path, 'payments', 'dt=' + date.isoformat(), filename)
            yield ExternalURL(url=url)
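# A minimal sketch of the configuration that the requires() above reads via get_config(): one
# section per CyberSource merchant id, with a required 'interval_start' and an optional
# 'merchant_close_date'.  The merchant id and dates below are hypothetical.
EXAMPLE_CYBERSOURCE_CONFIG = """
[cybersource:example_merchant]
interval_start = 2016-01-01
merchant_close_date = 2019-06-30
"""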
def requires(self):
    results = {
        'events': ProblemCheckEvent(
            mapreduce_engine=self.mapreduce_engine,
            input_format=self.base_input_format,
            lib_jar=self.lib_jar,
            n_reduce_tasks=self.n_reduce_tasks,
            name=self.name,
            src=self.src,
            dest=self.dest,
            include=self.include,
            manifest=self.manifest,
        ),
    }
    if self.answer_metadata:
        results.update({'answer_metadata': ExternalURL(self.answer_metadata)})
    return results
def requires(self):
    credentials_target = ExternalURL(url=self.google_credentials).output()
    gs = create_google_spreadsheet_client(credentials_target)
    for spreadsheet_key, config in self.spreadsheets_config.items():
        schema = config['schema']
        column_types_row = config.get('column_types_row', False)

        spreadsheet = gs.open_by_key(spreadsheet_key)
        worksheets = spreadsheet.worksheets()
        for worksheet in worksheets:
            yield LoadWorksheetToVertica(
                date=self.date,
                schema=schema,
                google_credentials=self.google_credentials,
                spreadsheet_key=spreadsheet_key,
                worksheet_name=worksheet.title,
                column_types_row=column_types_row,
                overwrite=self.overwrite,
            )