def requires(self):
    """
    Return the requirement dict, building and caching it on the first call.

    The dict holds the credentials file (as an ExternalURL) under 'credentials' and
    ``self.insert_source_task`` under 'insert_source_task'.
    """
    if self.required_tasks is not None:
        # Already built on an earlier call; hand back the cached dict.
        return self.required_tasks

    credentials_task = ExternalURL(url=self.credentials)
    self.required_tasks = {
        'credentials': credentials_task,
        'insert_source_task': self.insert_source_task,
    }
    return self.required_tasks
def get_vertica_results(warehouse_name, credentials, query):
    """
    Run a single query in Vertica and return the results.

    Arguments:
        warehouse_name: name of the Vertica database to connect to.
        credentials: URL of a JSON file supplying 'username', 'password', 'host' and 'port'.
        query: the SQL text to execute.

    Returns:
        The value of ``cursor.fetchall()`` for the executed query.

    Raises:
        ImportError: if the Vertica client library is not available.
    """
    # Fail fast: check for the client library before reading credentials or trying to connect,
    # so the caller sees this error rather than a failure from using the missing client.
    if not vertica_client_available:
        raise ImportError('Vertica client library not available')

    credentials_target = ExternalURL(url=credentials).output()
    with credentials_target.open('r') as credentials_file:
        cred = json.load(credentials_file)

    # TODO: externalize autocommit and read timeout.
    connection = vertica_python.connect(
        user=cred.get('username'),
        password=cred.get('password'),
        host=cred.get('host'),
        port=cred.get('port'),
        database=warehouse_name,
        autocommit=False,
        read_timeout=None,
    )

    try:
        cursor = connection.cursor()
        cursor.execute(query)
        results = cursor.fetchall()
    finally:
        # Always release the connection, even if the query fails.
        connection.close()

    return results
def requires_local(self):
    """
    Return the parent's local requirements, adding the explicit event whitelist when it is a path.

    The whitelist is added under 'explicit_events' only when ``self.explicit_event_whitelist``
    differs from its own basename.
    """
    local_requirements = super(ObfuscateCourseEventsTask, self).requires_local()

    whitelist = self.explicit_event_whitelist
    if os.path.basename(whitelist) == whitelist:
        # A bare file name: nothing extra to require.
        return local_requirements

    local_requirements['explicit_events'] = ExternalURL(url=whitelist)
    return local_requirements
def requires(self):
    """
    Return the requirement dict, building and caching it in ``self._required_tasks`` on first use.

    The dict holds the Vertica credentials file (as an ExternalURL) under 'credentials' and
    ``self.sqoop_dump_vertica_table_task`` under 'sqoop_dump_vertica_table_task'.
    """
    if self._required_tasks is not None:
        # Already built on an earlier call; hand back the cached dict.
        return self._required_tasks

    credentials_task = ExternalURL(url=self.vertica_credentials)
    self._required_tasks = {
        'credentials': credentials_task,
        'sqoop_dump_vertica_table_task': self.sqoop_dump_vertica_table_task,
    }
    return self._required_tasks
def requires(self):
    """
    Return the requirement dict, building and caching it on the first call.

    The credentials file is always required. ``self.insert_source_task`` is added under
    'insert_source' only when ``self.insert_source_task_dynamically`` is falsy.
    """
    if self.required_tasks is not None:
        # Already built on an earlier call; hand back the cached dict.
        return self.required_tasks

    self.required_tasks = {'credentials': ExternalURL(url=self.credentials)}
    if self.insert_source_task_dynamically:
        return self.required_tasks

    self.required_tasks['insert_source'] = self.insert_source_task
    return self.required_tasks
def requires_local(self):
    """Return the parent's local requirements extended with a 'geolocation_data' entry."""
    # The parent's default is an empty list; any falsy value is replaced by a fresh dict, on the
    # assumption that real requirements are always supplied as a dict.
    local_requirements = super(GeolocationMixin, self).requires_local() or {}
    local_requirements['geolocation_data'] = ExternalURL(self.geolocation_data)
    return local_requirements
def requires(self):
    """
    Return the two requirements of this task.

    'insert_source' is a LoadInternalReportingUserActivityToWarehouse task configured from this
    task's own parameters; 'credentials' is the credentials file as an ExternalURL.
    """
    insert_source = LoadInternalReportingUserActivityToWarehouse(
        n_reduce_tasks=self.n_reduce_tasks,
        date=self.date,
        warehouse_path=self.warehouse_path,
        overwrite=self.overwrite,
        schema=self.schema,
        credentials=self.credentials,
    )
    credentials_file = ExternalURL(self.credentials)
    return {
        'insert_source': insert_source,
        'credentials': credentials_file,
    }
def manifest_file_list(self):
    """
    Yield an ExternalURL for the manifest file, writing the manifest first if it does not exist.

    When written, the manifest contains the ``url`` of every task returned by
    ``self.generate_file_list()``, one per line.
    """
    target = get_target_from_url(self.manifest)
    manifest_exists = target.exists()

    if not manifest_exists:
        with target.open('w') as manifest_out:
            for task in self.generate_file_list():
                line = task.url + '\n'
                manifest_out.write(line)

    yield ExternalURL(self.manifest)
def requires(self):
    """
    Return the requirements: the enrollment changes task plus any optional input files.

    'offsets' and 'statuses' are included (as ExternalURL tasks) only when the corresponding
    attribute is truthy.
    """
    source_task = CourseEnrollmentChangesPerDay(
        name=self.name,
        src=self.src,
        dest=self.destination,
        include=self.include,
        manifest=self.manifest,
        mapreduce_engine=self.mapreduce_engine,
        lib_jar=self.lib_jar,
        n_reduce_tasks=self.n_reduce_tasks
    )
    requirements = {'source': source_task}

    if self.offsets:
        requirements['offsets'] = ExternalURL(self.offsets)
    if self.statuses:
        requirements['statuses'] = ExternalURL(self.statuses)

    return requirements
def output(self):
    """Return the output of an ExternalURL covering every dated affiliate window directory."""
    # TODO: Once VerticaCopyTask handles multiple input files update this
    # to use the outputs of the sub-jobs instead of always returning all
    # files.

    # Affiliate Window reports for each day are stored in dated directories.
    # The '*' wildcard selects all of them so the data can be loaded into Vertica in one go.
    report_root = url_path_join(self.warehouse_path, 'fees', 'affiliate_window')
    all_dates_url = report_root + '/dt=*/'
    return ExternalURL(url=all_dates_url).output()
def requires(self):
    """
    Return the two requirements of this task.

    'source' is a LoadWarehouseTask configured from this task's own parameters; 'credentials' is
    the credentials file as an ExternalURL.
    """
    source_task = LoadWarehouseTask(
        date=self.date,
        schema=self.schema,
        credentials=self.credentials,
        marker_schema=self.marker_schema,
        overwrite=self.overwrite,
        n_reduce_tasks=self.n_reduce_tasks,
    )
    credentials_file = ExternalURL(self.credentials)
    return {
        'source': source_task,
        'credentials': credentials_file,
    }
def requires(self):
    """
    Yield both credentials files, then one BigQuery load task per table in the schema.

    If ``self.bigquery_dataset`` is None it is set to ``self.vertica_schema_name`` before the
    per-table tasks are created.
    """
    yield ExternalURL(url=self.vertica_credentials)
    yield ExternalURL(url=self.gcp_credentials)

    # Default the target dataset to the name of the source schema.
    if self.bigquery_dataset is None:
        self.bigquery_dataset = self.vertica_schema_name

    for schema_table in self.get_table_list_for_schema():
        load_task = LoadVerticaTableFromS3ToBigQueryTask(
            date=self.date,
            overwrite=self.overwrite,
            intermediate_warehouse_path=self.intermediate_warehouse_path,
            dataset_id=self.bigquery_dataset,
            credentials=self.gcp_credentials,
            max_bad_records=self.max_bad_records,
            table_name=schema_table,
            vertica_schema_name=self.vertica_schema_name,
            vertica_warehouse_name=self.vertica_warehouse_name,
            vertica_credentials=self.vertica_credentials,
        )
        yield load_task
def insert_source_task(self):
    """
    Return an ExternalURL pointing at this table's dated directory under 'import/vertica/sqoop/'.

    The Vertica tables are already exported to S3 using SqoopImportFromVertica through the
    VerticaSchemaToBigQueryTask workflow, so an ExternalURL is used here. In the future this could
    be changed to a SqoopImportFromVertica task.
    """
    sqoop_root = url_path_join(self.warehouse_path, 'import/vertica/sqoop/')
    partition_dir = HivePartition('dt', self.date).path_spec
    table_dir = url_path_join(
        sqoop_root,
        self.vertica_warehouse_name,
        self.vertica_schema_name,
        self.table_name,
        partition_dir,
    )
    return ExternalURL(url=table_dir + '/')
def requires(self):
    """Yield the Vertica credentials file, then one S3 export task per table in the schema."""
    yield ExternalURL(url=self.vertica_credentials)

    for schema_table in self.get_table_list_for_schema():
        export_task = ExportVerticaTableToS3Task(
            date=self.date,
            overwrite=self.overwrite,
            table_name=schema_table,
            intermediate_warehouse_path=self.intermediate_warehouse_path,
            vertica_schema_name=self.vertica_schema_name,
            vertica_warehouse_name=self.vertica_warehouse_name,
            vertica_credentials=self.vertica_credentials,
            sqoop_null_string=self.sqoop_null_string,
            sqoop_fields_terminated_by=self.sqoop_fields_terminated_by,
            sqoop_delimiter_replacement=self.sqoop_delimiter_replacement,
        )
        yield export_task
def requires(self):
    """
    Yield one ExternalURL per merchant per day for the cybersource payment reports.

    For each merchant the interval starts at the configured 'interval_start' and ends at
    ``self.import_date``, or at the configured 'merchant_close_date' if that is set and earlier.
    """
    config = get_config()

    for merchant_id in self.cybersource_merchant_ids:
        section = 'cybersource:' + merchant_id

        first_day = luigi.DateParameter().parse(config.get(section, 'interval_start'))
        last_day = self.import_date

        close_date_text = config.get(section, 'merchant_close_date', '')
        if close_date_text:
            close_date = luigi.DateParameter().parse(close_date_text)
            last_day = min(self.import_date, close_date)

        for day in date_interval.Custom(first_day, last_day):
            report_name = "cybersource_{}.tsv".format(merchant_id)
            report_url = url_path_join(self.warehouse_path, 'payments', 'dt=' + day.isoformat(), report_name)
            yield ExternalURL(url=report_url)
def get_downstream_task(self):
    """
    Yield the last task of the configured script chain, building the chain on first use.

    When ``self.downstream_task`` is None, the YAML file at ``self.script_configuration`` is loaded.
    If it parses to a dict, one RunVerticaSqlScriptTask is created for each valid entry under its
    'scripts' key, in file order, and each task receives the previously created task as a
    dependency. The last task created (or None if there were no valid entries) is stored in
    ``self.downstream_task``.

    Yields:
        ``self.downstream_task`` if it is not None; otherwise nothing.
    """
    # If no downstream task has been set, load our configuration and generate our tasks and dependency chain.
    if self.downstream_task is None:
        script_conf_target = ExternalURL(url=self.script_configuration).output()
        with script_conf_target.open('r') as script_conf_file:
            config = yaml.safe_load(script_conf_file)

        if isinstance(config, dict):
            previous_task = None

            # A 'scripts' key that is present but empty parses to None; treat that as "no scripts".
            scripts = config.get('scripts') or []

            # Walk the scripts from the top of the file downwards, making each new task depend on
            # the one created before it. Requiring the final task therefore runs every script
            # sequentially, in the order listed in the configuration file.
            for script in scripts:
                if not self.validate_script_entry(script):
                    log.warning("encountered invalid script entry!")
                    continue

                new_task = RunVerticaSqlScriptTask(
                    credentials=self.credentials,
                    schema=self.schema,
                    marker_schema=self.marker_schema,
                    date=self.date,
                    read_timeout=self.read_timeout,
                    source_script=path.join(self.script_root, script['location']),
                    script_name=script.get('name'))

                # If we previously configured a task, set it as a dependency of this one, so it runs prior to.
                if previous_task is not None:
                    new_task.add_dependency(previous_task)

                # Mark this as the previously-created task.
                previous_task = new_task

            self.downstream_task = previous_task

    # If a downstream task has been set, yield it, triggering Luigi to schedule our scripts.
    if self.downstream_task is not None:
        yield self.downstream_task
def requires(self):
    """
    Return the requirements: the problem check events task plus optional answer metadata.

    'answer_metadata' is included (as an ExternalURL) only when ``self.answer_metadata`` is truthy.
    """
    events_task = ProblemCheckEvent(
        mapreduce_engine=self.mapreduce_engine,
        input_format=self.base_input_format,
        lib_jar=self.lib_jar,
        n_reduce_tasks=self.n_reduce_tasks,
        name=self.name,
        src=self.src,
        dest=self.dest,
        include=self.include,
        manifest=self.manifest,
    )
    requirements = {'events': events_task}

    if self.answer_metadata:
        requirements['answer_metadata'] = ExternalURL(self.answer_metadata)

    return requirements
def requires(self):
    """
    Yield one LoadWorksheetToVertica task per worksheet of every configured spreadsheet.

    Each entry of ``self.spreadsheets_config`` maps a spreadsheet key to a config that must
    contain 'schema' and may contain 'column_types_row' (defaulting to False).
    """
    credentials_target = ExternalURL(url=self.google_credentials).output()
    client = create_google_spreadsheet_client(credentials_target)

    for sheet_key, sheet_config in self.spreadsheets_config.items():
        target_schema = sheet_config['schema']
        types_row = sheet_config.get('column_types_row', False)

        for worksheet in client.open_by_key(sheet_key).worksheets():
            yield LoadWorksheetToVertica(
                date=self.date,
                schema=target_schema,
                google_credentials=self.google_credentials,
                spreadsheet_key=sheet_key,
                worksheet_name=worksheet.title,
                column_types_row=types_row,
                overwrite=self.overwrite,
            )
def requires(self):
    """
    Return the enrollment and registration requirements, plus an optional blacklist file.

    Registrations are requested for the interval from MINIMUM_DATE to the day after ``self.date``.
    'blacklist' is included (as an ExternalURL) only when ``self.blacklist`` is truthy.
    """
    day_after = self.date + timedelta(1)

    enrollments_task = CourseEnrollmentChangesPerDay(
        name=self.name,
        src=self.src,
        dest=self.destination,
        include=self.include,
        manifest=self.manifest,
        mapreduce_engine=self.mapreduce_engine,
        lib_jar=self.lib_jar,
        n_reduce_tasks=self.n_reduce_tasks,
    )
    registrations_task = UserRegistrationsPerDay(
        credentials=self.credentials,
        destination=self.destination,
        date_interval=Custom(MINIMUM_DATE, day_after),
    )

    requirements = {
        'enrollments': enrollments_task,
        'registrations': registrations_task,
    }
    if self.blacklist:
        requirements['blacklist'] = ExternalURL(self.blacklist)

    return requirements
def requires(self):
    """Yield the Vertica credentials file, then one Snowflake load task per table in the schema."""
    yield ExternalURL(url=self.vertica_credentials)

    for schema_table in self.get_table_list_for_schema():
        load_task = LoadVerticaTableFromS3ToSnowflakeTask(
            date=self.date,
            overwrite=self.overwrite,
            intermediate_warehouse_path=self.intermediate_warehouse_path,
            credentials=self.credentials,
            warehouse=self.warehouse,
            role=self.role,
            sf_database=self.sf_database,
            schema=self.schema,
            scratch_schema=self.scratch_schema,
            run_id=self.run_id,
            table_name=schema_table,
            vertica_schema_name=self.vertica_schema_name,
            vertica_warehouse_name=self.vertica_warehouse_name,
            vertica_credentials=self.vertica_credentials,
            sqoop_null_string=self.sqoop_null_string,
            sqoop_fields_terminated_by=self.sqoop_fields_terminated_by,
        )
        yield load_task
def get_mysql_query_results(credentials, database, query):
    """
    Execute a MySQL query against the given database and return all result rows.

    Connection settings ('username', 'password', 'host', 'port') are read from the JSON file at
    the ``credentials`` URL. The connection is closed whether or not the query succeeds.
    """
    credentials_target = ExternalURL(url=credentials).output()
    with credentials_target.open('r') as credentials_file:
        settings = json.load(credentials_file)

    connection = mysql.connector.connect(
        user=settings.get('username'),
        password=settings.get('password'),
        host=settings.get('host'),
        port=settings.get('port'),
        database=database,
    )

    try:
        cursor = connection.cursor()
        cursor.execute(query)
        return cursor.fetchall()
    finally:
        connection.close()
def __init__(self, *args, **kwargs):
    """
    Initialise task state and, when overwriting, delete the BigQuery marker table up front.
    """
    super(ImportMysqlDatabaseToBigQueryDatasetTask, self).__init__(*args, **kwargs)
    self.table_list = []
    self.is_complete = False
    self.required_tasks = None

    if not self.overwrite:
        return

    # We are overwriting the database output, so delete the entire marker table.
    # That way, any future activity on it should only consist of inserts, rather than deletes
    # of existing marker entries. There are quotas on deletes and upserts on a table, of no more
    # than 96 per day; this works around hitting those limits.
    # This has to happen early, before scheduling begins, so that no entries are present
    # when scheduling occurs (so everything gets properly scheduled).

    # The BigQueryTarget exists only to reach the marker table, hence the dummy table and id.
    marker_target = BigQueryTarget(
        credentials_target=ExternalURL(url=self.credentials).output(),
        dataset_id=self.dataset_id,
        table="dummy_table",
        update_id="dummy_id",
    )
    marker_target.delete_marker_table()
def insert_source_task(self):
    """Return an ExternalURL for this partition of the internal_reporting_user_activity table."""
    partition_dir = url_path_join(
        self.warehouse_path,
        "internal_reporting_user_activity",
        self.partition.path_spec,
    )
    return ExternalURL(url=partition_dir + '/')
def requires_local(self):
    """Return the events list file, wrapped in an ExternalURL, as the only local requirement."""
    events_list_url = self.events_list_file_path
    return ExternalURL(url=events_list_url)
def requires(self):
    """Return a dict whose single 'credentials' entry is the credentials file as an ExternalURL."""
    credentials_file = ExternalURL(url=self.credentials)
    return {'credentials': credentials_file}
def requires_local(self):
    """Return the file at ``self.config``, wrapped in an ExternalURL, as the only local requirement."""
    config_url = self.config
    return ExternalURL(url=config_url)
def requires(self):
    """Yield the hive table task, then the course_enrollment_summary partition directory."""
    yield self.hive_table_task

    summary_dir = url_path_join(self.warehouse_path, 'course_enrollment_summary', self.partition.path_spec)
    yield ExternalURL(url=summary_dir + '/')
def insert_source_task(self):
    """Return an ExternalURL for 'course_subject.tsv' inside the course_subject partition for ``self.date``."""
    partition_dir = self.hive_partition_path('course_subject', self.date)
    data_file_url = url_path_join(partition_dir, 'course_subject.tsv')
    return ExternalURL(url=data_file_url)
def insert_source_task(self):
    """Return an ExternalURL for the internal_reporting_d_country partition for ``self.date``."""
    partition_url = self.hive_partition_path('internal_reporting_d_country', self.date)
    return ExternalURL(url=partition_url)
def insert_source_task(self):
    """Return an ExternalURL for the course_enrollment_summary partition for ``self.date``."""
    partition_url = self.hive_partition_path('course_enrollment_summary', self.date)
    return ExternalURL(url=partition_url)