class AllState(luigi.WrapperTask):
    """Wrapper task that requires one ``StateFiles`` task per year of a state."""

    state = luigi.Parameter()

    def requires(self):
        """Yield a ``StateFiles`` task for each entry of ``STATE_YEARS[self.state]``."""
        years_for_state = STATE_YEARS[self.state]
        for yr in years_for_state:
            yield StateFiles(year=yr, state=self.state)
class MergeMorphologyBase(luigi.Task):
    """ MergeMorphology base class

    Creates the output dataset of shape ``(number_of_labels, 11)`` and
    distributes the blocks of its first axis over the configured jobs.
    """

    task_name = 'merge_morphology'
    src_file = os.path.abspath(__file__)
    allow_retry = False

    input_path = luigi.Parameter()
    input_key = luigi.Parameter()
    output_path = luigi.Parameter()
    output_key = luigi.Parameter()
    number_of_labels = luigi.IntParameter()
    prefix = luigi.Parameter()
    #
    dependency = luigi.TaskParameter()

    def requires(self):
        """Return the upstream task passed in via ``dependency``."""
        return self.dependency

    def run_impl(self):
        """Create the output dataset, then prepare, submit and check the jobs."""
        # the first global config value is the shebang used to init the task
        shebang = self.global_config_values()[0]
        self.init(shebang)

        task_config = self.get_task_config()

        # one row per label, 11 columns; chunks are capped at 100000 rows
        n_labels = int(self.number_of_labels)
        out_shape = (n_labels, 11)
        out_chunks = (min(n_labels, 100000), 11)
        block_list = vu.blocks_in_volume([out_shape[0]], [out_chunks[0]])

        # make sure the output dataset exists before any job runs
        with vu.file_reader(self.output_path) as f:
            f.require_dataset(self.output_key,
                              shape=out_shape,
                              chunks=out_chunks,
                              compression='gzip',
                              dtype='float64')

        # hand the in/out locations and the output layout to the jobs
        task_config.update({
            'input_path': self.input_path,
            'input_key': self.input_key,
            'output_path': self.output_path,
            'output_key': self.output_key,
            'out_shape': out_shape,
            'out_chunks': out_chunks,
        })

        # never start more jobs than there are blocks
        n_jobs = min(len(block_list), self.max_jobs)
        self.prepare_jobs(n_jobs, block_list, task_config, self.prefix)
        self.submit_jobs(n_jobs, self.prefix)

        # block until the jobs are done, then verify they succeeded
        self.wait_for_jobs(self.prefix)
        self.check_jobs(n_jobs, self.prefix)

    # part of the luigi API
    def output(self):
        """Return the log file in ``tmp_folder`` that marks this task as done."""
        log_name = self.task_name + '_%s.log' % self.prefix
        return luigi.LocalTarget(os.path.join(self.tmp_folder, log_name))
class HivePartitionTask(WarehouseMixin, OverwriteOutputMixin, HiveQueryTask):
    """
    Abstract task describing the metadata of a single partition of a Hive table.

    The task only issues the DDL needed to make the partition exist; it does not
    load any data into it.
    """

    partition_value = luigi.Parameter()

    def query(self):
        """Build the DDL that (re)creates the partition, dropping it first when overwriting."""
        table_name = self.hive_table_task.table
        partition = self.partition

        drop_on_overwrite = ''
        if self.overwrite:
            drop_on_overwrite = 'ALTER TABLE `{table}` DROP IF EXISTS PARTITION ({partition.query_spec});'.format(
                table=table_name,
                partition=partition)

        query_format = """
            USE {database_name};
            {drop_on_overwrite}
            ALTER TABLE `{table}` ADD IF NOT EXISTS PARTITION ({partition.query_spec});
        """
        rendered = query_format.format(
            database_name=hive_database_name(),
            table=table_name,
            partition=partition,
            drop_on_overwrite=drop_on_overwrite)
        return textwrap.dedent(rendered)

    @property
    def hive_table_task(self):
        """The task representing the table this partition belongs to; subclasses must provide it."""
        raise NotImplementedError

    @property
    def data_task(self):
        """A luigi task that inserts real data into this partition, or None (the default)."""
        return None

    @property
    def partition(self):
        """A HivePartition built from the table's ``partition_by`` and this task's ``partition_value``."""
        return HivePartition(self.hive_table_task.partition_by, self.partition_value)

    @property
    def partition_location(self):
        """Full URL of the partition, so external systems can write data into it."""
        relative_path = self.partition.path_spec + '/'
        return url_path_join(self.hive_table_task.table_location, relative_path)

    def requires(self):
        """Yield the optional data task first, then the table task."""
        data_task = self.data_task
        if data_task is not None:
            yield data_task
        yield self.hive_table_task

    def output(self):
        """Return the HivePartitionTarget for this partition."""
        # A change in Luigi 1.0.22 (after our 1.0.17 fork) made ApacheHiveCommandClient.table_exists()
        # throw when checking for a specific partition of a table that doesn't exist, which makes
        # HivePartitionTarget.exists() fail where it used to succeed. Passing fail_missing_table=False
        # avoids that; there is no reason for it anyway.
        return HivePartitionTarget(
            self.hive_table_task.table,
            self.partition.as_dict(),
            database=hive_database_name(),
            fail_missing_table=False)

    def job_runner(self):
        """Return the query runner that is aware of the overwrite flag."""
        return OverwriteAwareHiveQueryRunner()

    def remove_output_on_overwrite(self):
        """Record that removal was attempted; the query itself drops the old partition."""
        if not self.overwrite:
            return
        self.attempted_removal = True
class KickHkmaT0101(MkDir):
    """Kick off the extraction of the HKMA ``/T0101.xls`` workbook."""

    workdir = luigi.Parameter(default='t0101-monetary-base')

    def requires(self):
        """Yield the ``ExtractHkma`` task for ``/T0101.xls``, parameterised with ``givedir``."""
        yield ExtractHkma(**self.givedir, urlpath='/T0101.xls')
class MortarRTask(luigi.Task):
    """
    Luigi Task to run an R script.

    To use this Task in a pipeline, create a subclass that overrides the methods:

    * `rscript`
    * `arguments`

    seealso:: https://help.mortardata.com/technologies/luigi/r_tasks
    """

    # Location where completion tokens are written
    # e.g. s3://my-bucket/my-path
    token_path = luigi.Parameter()

    def output_token(self):
        """
        Luigi Target providing path to a token that indicates
        completion of this Task.

        :rtype: Target:
        :returns: Target for Task completion token
        """
        return target_factory.get_target(
            '%s/%s' % (self.token_path, self.__class__.__name__))

    def output(self):
        """
        The output for this Task. Returns the output token
        by default, so the task only runs if the token does not
        already exist.

        :rtype: Target:
        :returns: Target for Task completion token
        """
        return [self.output_token()]

    @abc.abstractmethod
    def rscript(self):
        """
        Path to the R script to run, relative to the root of your Mortar project.

        Ex:
            If you have two files in your Mortar project:
                * luigiscripts/my_r_luigiscript.py
                * rscripts/my_r_script.R

            You would return:
                "rscripts/my_r_script.R"

        :rtype: str:
        :returns: Path to your R script relative to the root of your Mortar project. e.g. rscripts/my_r_script.R
        """
        raise RuntimeError(
            "Please implement the rscript method in your MortarRTask to specify which script to run."
        )

    def arguments(self):
        """
        Returns list of arguments to be sent to your R script.

        :rtype: list of str:
        :returns: List of arguments to pass to your R script. Default: []
        """
        return []

    def run(self):
        """
        Run an R script using the Rscript program. Pipes stdout and
        stderr back to the logging facility.

        :raises RuntimeError: if Rscript exits with a non-zero return code.
        """
        # Build the command once so the error message reports exactly what was executed.
        cmd = self._subprocess_command()
        # NOTE(review): the command is a string handed to the shell (shell=True), so
        # rscript() and arguments() must only ever return trusted values.
        process = subprocess.Popen(cmd,
                                   shell=True,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT,
                                   bufsize=1)
        # stderr is merged into stdout; forward every line until the pipe hits EOF.
        for line in iter(process.stdout.readline, b''):
            logger.info(line)
        # The pipe is already drained; this waits for exit so returncode is populated.
        process.communicate()
        rc = process.returncode
        if rc != 0:
            raise RuntimeError('%s returned non-zero error code %s' % (cmd, rc))

        # Only mark the task complete once the script has succeeded.
        target_factory.write_file(self.output_token())

    def _subprocess_command(self):
        """Return the shell command line: ``Rscript <script> <space-joined arguments>``."""
        return "Rscript %s %s" % (self.rscript(), " ".join(self.arguments()))
class ForgotParam(luigi.Task):
    """Task with one parameter that has no default, and a no-op ``run``."""

    param = luigi.Parameter()

    def run(self):
        """Do nothing."""
        return None
class KickHkmaT030502(MkDir):
    """Kick off the extraction of the HKMA ``/T030502.xls`` workbook."""

    workdir = luigi.Parameter(default='t030502-econ-sector-loans-and-advances')

    def requires(self):
        """Yield the ``ExtractHkma`` task for ``/T030502.xls``, parameterised with ``givedir``."""
        yield ExtractHkma(**self.givedir, urlpath='/T030502.xls')
class InsignificantParameterTask(luigi.Task):
    """Task with one insignificant parameter (``foo``) and one regular parameter (``bar``)."""

    # declared with significant=False
    foo = luigi.Parameter(significant=False)
    bar = luigi.Parameter()
def testNoValue(self):
    """Reading ``.value`` of a parameter configured only via ``config_path`` must raise ParameterException."""
    def read_value():
        param = luigi.Parameter(config_path={'section': "foo", 'name': "bar"})
        return param.value

    self.assertRaises(ParameterException, read_value)
class ElasticsearchIndexTask(OverwriteOutputMixin, MapReduceJobTask):
    """
    Index a stream of documents in an elasticsearch index.

    This task is intended to do the following:

    * Create a new index that is unique to this task run (all significant parameters).
    * Load all of the documents into this unique index.
    * If the alias is already pointing at one or more indexes, switch it so that it only points at this newly
      loaded index.
    * Delete any indexes that were previously pointed at by the alias, leaving only the newly loaded index.

    Subclasses must implement `document_generator` and `doc_type`; both raise NotImplementedError here.
    """

    host = luigi.Parameter(
        is_list=True,
        config_path={
            'section': 'elasticsearch',
            'name': 'host'
        },
        description=
        'Hostnames for the elasticsearch cluster nodes. They can be specified in any of the formats'
        ' accepted by the elasticsearch-py library. This includes complete URLs such as http://foo.com/, or'
        ' host port pairs such as foo:8000. Note that if you wish to use SSL you should specify a full URL'
        ' and the "https" scheme.')
    timeout = luigi.FloatParameter(
        config_path={
            'section': 'elasticsearch',
            'name': 'timeout'
        },
        significant=False,
        default=60,
        description=
        'Maximum number of seconds to wait when attempting to make connections to the elasticsearch cluster'
        ' before assuming the cluster is not responding and giving up with a timeout error.'
    )
    connection_type = luigi.Parameter(
        config_path={
            'section': 'elasticsearch',
            'name': 'connection_type'
        },
        significant=False,
        default='urllib',
        description=
        'If not specified, default to using urllib3 to make HTTP requests to elasticsearch. The other valid'
        ' value is "aws" which can be used to connect to clusters that are managed by AWS. See'
        ' `AWS elasticsearch service <https://aws.amazon.com/elasticsearch-service/>`_'
    )
    alias = luigi.Parameter(
        description=
        'Name of the alias in elasticsearch that will point to the complete index when loaded. This value '
        ' should match the settings of edx-analytics-data-api.')
    number_of_shards = luigi.Parameter(
        default=None,
        description=
        'Number of `shards <https://www.elastic.co/guide/en/elasticsearch/reference/current/glossary.html'
        '#glossary-shard>`_ to use in the elasticsearch index.')
    throttle = luigi.FloatParameter(
        default=0.1,
        significant=False,
        description=
        'Wait this many seconds between batches of records submitted to the cluster to be indexed. This can'
        ' be used to tune the indexing process, allowing the cluster to successfully "keep up" with the'
        ' loader. Note that often the hadoop cluster can load records much more quickly than the cluster'
        ' can index them, which eventually causes queues to overflow within the elasticsearch cluster.'
    )
    batch_size = luigi.IntParameter(
        default=1000,
        significant=False,
        description=
        'Number of records to submit to the cluster to be indexed in a single request. A small value here'
        ' will result in more, smaller, requests and a larger value will result in fewer, bigger requests.'
    )
    indexing_tasks = luigi.IntParameter(
        default=None,
        significant=False,
        description=
        'Number of parallel processes to use to submit records to be indexed from. The stream of records'
        ' will be divided up evenly among these processes during the indexing procedure.'
    )
    max_attempts = luigi.IntParameter(
        default=10,
        significant=False,
        description=
        'If the elasticsearch cluster rejects a batch of records (usually because it is too busy) the'
        ' indexing process will retry up to this many times before giving up. It uses an exponential back-'
        'off strategy, so a high value here can result in very significant wait times before retrying.'
    )

    # Optional overrides: `settings` is merged into the index settings and `properties` is used as the
    # mapping properties when the index is created in init_local(). Both default to empty.
    settings = {}
    properties = {}

    def __init__(self, *args, **kwargs):
        """Initialise the task and derive the name of the index that this run will load into."""
        super(ElasticsearchIndexTask, self).__init__(*args, **kwargs)

        # Remember the originally configured number of reducers before (optionally) replacing it with the
        # number of indexing tasks; the reducer count controls the parallelism of the load (see reducer()).
        self.other_reduce_tasks = self.n_reduce_tasks
        if self.indexing_tasks is not None:
            self.n_reduce_tasks = self.indexing_tasks

        self.batch_index = 0
        # The index name is the alias plus a hash of update_id() (the task id by default).
        # NOTE(review): hash() of a str is only stable across processes when string hash randomisation is
        # off -- confirm for the interpreter this runs under.
        self.index = self.alias + '_' + str(hash(self.update_id()))
        # Populated in init_local() with the indexes the alias currently points at.
        self.indexes_for_alias = set()

    def init_local(self):
        """
        Prepare the cluster for the load: record the indexes currently behind the alias, validate the current
        state and (re)create the index for this run.

        Raises:
            RuntimeError: if this run's index is already in use by the alias and `overwrite` is not set, or if
                multiple indexes are found for the alias and `overwrite` is not set.
        """
        super(ElasticsearchIndexTask, self).init_local()

        elasticsearch_client = self.create_elasticsearch_client()

        # Find all indexes that are referred to by this alias (currently). These will be deleted after a successful
        # load of the new index.
        aliases = elasticsearch_client.indices.get_aliases(name=self.alias)
        # NOTE(review): dict.iteritems() exists on Python 2 dicts only.
        self.indexes_for_alias.update([
            index for index, alias_info in aliases.iteritems()
            if self.alias in alias_info['aliases'].keys()
        ])

        if self.index in self.indexes_for_alias:
            if not self.overwrite:
                raise RuntimeError(
                    'Index {0} is currently in use by alias {1}'.format(
                        self.index, self.alias))
            else:
                # These indexes will be deleted, after the alias swap, make sure we don't delete the index we just
                # populated.
                self.indexes_for_alias.remove(self.index)

        if not self.overwrite and len(self.indexes_for_alias) > 1:
            raise RuntimeError(
                'Invalid state, multiple existing indexes ({0}) found for alias {1}'
                .format(', '.join(self.indexes_for_alias), self.alias))

        # In order for the OverwriteOutputMixin to recognize that this task has run we need to let it know. This will
        # allow it to actually check if the task is complete after it is run.
        self.attempted_removal = True

        # Start from an empty index: drop any index of the same name before creating it.
        if elasticsearch_client.indices.exists(index=self.index):
            elasticsearch_client.indices.delete(index=self.index)

        # Refreshes are disabled during the load to keep throughput high; commit() refreshes explicitly.
        settings = {
            'refresh_interval': -1,
        }
        if self.number_of_shards is not None:
            settings['number_of_shards'] = self.number_of_shards

        # Class-level `settings` are applied last, so they win over the values set above.
        if self.settings:
            settings.update(self.settings)

        elasticsearch_client.indices.create(index=self.index,
                                            body={
                                                'settings': settings,
                                                'mappings': {
                                                    self.doc_type: {
                                                        'properties': self.properties
                                                    }
                                                }
                                            })

    def create_elasticsearch_client(self):
        """Build an elasticsearch client using the various parameters passed into this task."""
        kwargs = {}
        # The "aws" connection type swaps in a different connection class; anything else uses the client default.
        if self.connection_type == 'aws':
            kwargs['connection_class'] = AwsHttpConnection
        return elasticsearch.Elasticsearch(
            hosts=self.host,
            timeout=self.timeout,
            retry_on_status=(HTTP_CONNECT_TIMEOUT_STATUS_CODE,
                             HTTP_GATEWAY_TIMEOUT_STATUS_CODE),
            retry_on_timeout=True,
            **kwargs)

    def mapper(self, line):
        """Assign each input line (stripped of trailing CR/LF) to a randomly chosen reducer key."""
        yield (random.randrange(int(self.n_reduce_tasks)), line.rstrip('\r\n'))

    def reducer(self, _key, lines):
        """
        Given a batch of records, transmit them to the elasticsearch cluster to be indexed.

        There should be one reducer per parallel indexing thread. Controlling the number of reducers is the way to
        control the level of parallelism in the load process.

        Raises:
            IndexingError: if a batch is rejected `max_attempts` times.
        """
        elasticsearch_client = self.create_elasticsearch_client()

        document_iterator = self.document_generator(lines)
        first_batch = True
        while True:
            bulk_action_batch = self.next_bulk_action_batch(document_iterator)

            # An empty batch means the document iterator is exhausted.
            if not bulk_action_batch:
                break

            # Throttle between batches, but never before the first one.
            if not first_batch and self.throttle:
                time.sleep(self.throttle)
            first_batch = False

            if self.send_bulk_action_batch(elasticsearch_client, bulk_action_batch):
                self.incr_counter('Elasticsearch', 'Committed Batches', 1)

                # Note that each document produces two entries in the bulk_action_batch list.
                # NOTE(review): `/` is integer division on Python 2 only; actions without a data line (see
                # next_bulk_action_batch) contribute a single entry.
                num_records = len(bulk_action_batch) / 2
                self.incr_counter('Elasticsearch', 'Records Indexed', num_records)
            else:
                raise IndexingError(
                    'Batch of records rejected too many times. Aborting.')

        # Luigi requires the reducer to actually return something, so we just return empty strings that are written
        # to a temp file in HDFS that is immediately cleaned up after the job finishes.
        yield ('', '')

    def next_bulk_action_batch(self, document_iterator):
        """
        Read a batch of documents from the iterator and convert them into bulk index actions.

        Elasticsearch expects each document to actually be transmitted on two lines the first of which details the
        action to take, and the second contains the actual document.

        See the `Cheaper in Bulk <https://www.elastic.co/guide/en/elasticsearch/guide/1.x/bulk.html>`_ guide.

        Arguments:
            document_iterator (iterator of dicts):

        Returns: A list of dicts that can be transmitted to elasticsearch using the "bulk" request. At most
            `batch_size` documents are consumed from the iterator per call.
        """
        bulk_action_batch = []
        for raw_data in islice(document_iterator, self.batch_size):
            action, data = elasticsearch.helpers.expand_action(raw_data)
            bulk_action_batch.append(action)
            # Only append the document line when expand_action produced one.
            if data is not None:
                bulk_action_batch.append(data)
        return bulk_action_batch

    def send_bulk_action_batch(self, elasticsearch_client, bulk_action_batch):
        """
        Given a batch of actions, transmit them in bulk to the elasticsearch cluster.

        This method handles back-pressure from the elasticsearch cluster which queues up writes. When the queue is
        full the cluster will start rejecting additional bulk indexing requests. This method implements an
        exponential back-off, allowing the cluster to catch-up with the client.

        Arguments:
            elasticsearch_client (elasticsearch.Elasticsearch): A reference to an elasticsearch client.
            bulk_action_batch (list of dicts): A list of bulk actions followed by their respective documents.

        Raises:
            IndexingError: If a record cannot be indexed by elasticsearch this method assumes that is a fatal error
                and it immediately raises this exception.
            TransportError: Re-raised when the transport error status is neither a rejected request nor a
                "service unavailable" status.

        Returns: True iff the batch of actions was successfully transmitted to and acknowledged by the
            elasticsearch cluster. False if the batch was rejected `max_attempts` times.
        """
        attempts = 0
        batch_written_successfully = False
        while True:
            try:
                resp = elasticsearch_client.bulk(bulk_action_batch,
                                                 index=self.index,
                                                 doc_type=self.doc_type)
            except TransportError as transport_error:
                # Only rejected / service-unavailable responses are retried; anything else propagates.
                if transport_error.status_code not in (
                        REJECTED_REQUEST_STATUS,
                        HTTP_SERVICE_UNAVAILABLE_STATUS_CODE):
                    raise transport_error
            else:
                # The request went through; inspect the per-item status codes of the response.
                num_errors = 0
                for raw_data in resp['items']:
                    _op_type, item = raw_data.popitem()
                    # Items without a status are treated as failures (default 500).
                    successful = 200 <= item.get('status', 500) < 300
                    if not successful:
                        log.error('Failed to index: %s', str(item))
                        num_errors += 1

                if num_errors == 0:
                    batch_written_successfully = True
                    break
                else:
                    raise IndexingError(
                        'Failed to index {0} records. Aborting.'.format(
                            num_errors))

            # Only reached after a retryable transport error.
            attempts += 1
            if attempts < self.max_attempts:
                # Exponential back-off: 2, 4, 8, ... seconds.
                sleep_duration = 2**attempts
                self.incr_counter('Elasticsearch', 'Rejected Batches', 1)
                log.warn(
                    'Batch of records rejected. Sleeping for %d seconds before retrying.',
                    sleep_duration)
                time.sleep(sleep_duration)
            else:
                batch_written_successfully = False
                break

        return batch_written_successfully

    def document_generator(self, lines):
        """
        Given lines of raw text, generates structured documents that will be indexed by elasticsearch.

        The returned document should have roughly the following structure:

            {
                "_id": "(optional) your custom identifier for the document",
                "_source": {
                    "prop0": "you should have one key-value pair for each property and its value"
                }
            }

        Note that you can also specify other "special" fields other than "_id":

            - _index
            - _parent
            - _percolate
            - _routing
            - _timestamp
            - _ttl
            - _type
            - _version
            - _version_type
            - _retry_on_conflict

        The "_source" field is required.

        Arguments:
            lines (iterable of unicode strings): This is the raw data to be indexed.

        Yields:
            dict: The document to index in the format expected by the elasticsearch bulk loading process.
        """
        raise NotImplementedError

    @property
    def doc_type(self):
        """
        Elasticsearch `document type <https://www.elastic.co/guide/en/elasticsearch/guide/current/mapping.html>`_.
        """
        raise NotImplementedError

    def extra_modules(self):
        """Return the elasticsearch and urllib3 modules as the extra modules for this job."""
        import urllib3

        packages = [elasticsearch, urllib3]

        return packages

    def jobconfs(self):
        """Return the parent's job configuration with speculative execution of reduce tasks disabled."""
        jcs = super(ElasticsearchIndexTask, self).jobconfs()
        jcs.append('mapred.reduce.tasks.speculative.execution=false')
        return jcs

    def update_id(self):
        """A unique identifier for this task instance that is used to determine if it should be run again."""
        return self.task_id

    def output(self):
        """Return the ElasticsearchTarget for the alias, document type and update id of this task."""
        return ElasticsearchTarget(client=self.create_elasticsearch_client(),
                                   index=self.alias,
                                   doc_type=self.doc_type,
                                   update_id=self.update_id())

    def commit(self):
        """
        If all documents have been loaded successfully, make the changes visible to users.
        """
        # The ordering of operations here is sensitive.

        elasticsearch_client = self.create_elasticsearch_client()

        # First "refresh" the newly loaded index. We disable refreshes during the load to keep throughput high. This
        # step is necessary to ensure all of the documents are properly indexed and user-visible.
        elasticsearch_client.indices.refresh(index=self.index)

        # Perform an atomic swap of the alias.
        actions = []
        # Only consider old indexes that still exist at this point.
        old_indexes = [
            ix for ix in self.indexes_for_alias
            if elasticsearch_client.indices.exists(index=ix)
        ]
        for old_index in old_indexes:
            actions.append(
                {"remove": {
                    "index": old_index,
                    "alias": self.alias
                }})
        actions.append({"add": {"index": self.index, "alias": self.alias}})
        # All removals and the addition are sent in a single update_aliases request.
        elasticsearch_client.indices.update_aliases({"actions": actions})

        # Update the luigi metadata to indicate that the task ran successfully.
        self.output().touch()

        # Attempt to remove any old indexes that are now no longer user-visible.
        for old_index in old_indexes:
            elasticsearch_client.indices.delete(index=old_index)

    def rollback(self):
        """
        If something goes wrong during the load, attempt to clean up the partially loaded index.

        Failures during the cleanup are logged and swallowed.
        """
        elasticsearch_client = self.create_elasticsearch_client()
        try:
            if elasticsearch_client.indices.exists(index=self.index):
                elasticsearch_client.indices.delete(index=self.index)
        except Exception:  # pylint: disable=broad-except
            log.exception("Unable to rollback the elasticsearch load.")

    def run(self):
        """Run the map-reduce load; roll back and re-raise on any failure, otherwise commit."""
        try:
            super(ElasticsearchIndexTask, self).run()
        except Exception:  # pylint: disable=broad-except
            self.rollback()
            raise
        else:
            self.commit()
global_bool_param = luigi.BoolParameter(is_global=True, default=False) def run(self): self.complete = lambda: True def complete(self): return False class HasGlobalParamDep(luigi.Task): x = luigi.Parameter() def requires(self): return HasGlobalParam(self.x) _shared_global_param = luigi.Parameter(is_global=True, default='123') class SharedGlobalParamA(luigi.Task): shared_global_param = _shared_global_param class SharedGlobalParamB(luigi.Task): shared_global_param = _shared_global_param class BananaDep(luigi.Task): x = luigi.Parameter() y = luigi.Parameter(default='def') def output(self):
class RayTracingLoop(QueenbeeTask):
    """Run ray-tracing and post-process the results for a point-in-time simulation."""

    # DAG Input parameters
    _input_params = luigi.DictParameter()

    # Task inputs
    @property
    def radiance_parameters(self):
        """The ``radiance_parameters`` entry of the DAG input parameters."""
        return self._input_params['radiance_parameters']

    @property
    def metric(self):
        """The ``metric`` entry of the DAG input parameters."""
        return self._input_params['metric']

    fixed_radiance_parameters = luigi.Parameter(default='-h')

    @property
    def grid(self):
        """Posix path of this item's grid file inside the ``SplitGrid`` output folder.

        Relative paths are resolved against the initiation folder.
        """
        split_folder = self.input()['SplitGrid']['output_folder'].path
        grid_path = pathlib.Path(split_folder, self.item['path'])
        if grid_path.is_absolute():
            return grid_path.as_posix()
        return pathlib.Path(self.initiation_folder, grid_path).resolve().as_posix()

    @property
    def scene_file(self):
        """Posix path of the octree file; relative paths are resolved against the initiation folder."""
        octree_path = pathlib.Path(self._input_params['octree_file'])
        if octree_path.is_absolute():
            return octree_path.as_posix()
        return pathlib.Path(self.initiation_folder, octree_path).resolve().as_posix()

    # get item for loop
    try:
        item = luigi.DictParameter()
    except Exception:
        item = luigi.Parameter()

    @property
    def execution_folder(self):
        """Resolved posix path of the ``results`` folder inside the simulation folder."""
        results_dir = pathlib.Path(self._input_params['simulation_folder'], 'results')
        return results_dir.resolve().as_posix()

    @property
    def initiation_folder(self):
        """Posix path of the simulation folder."""
        return pathlib.Path(self._input_params['simulation_folder']).as_posix()

    @property
    def params_folder(self):
        """Resolved posix path of the params folder inside the execution folder."""
        params_dir = pathlib.Path(self.execution_folder, self._input_params['params_folder'])
        return params_dir.resolve().as_posix()

    def command(self):
        """Return the ``honeybee-radiance raytrace point-in-time`` command line for this item."""
        template = 'honeybee-radiance raytrace point-in-time scene.oct grid.pts --rad-params "{radiance_parameters}" --rad-params-locked "{fixed_radiance_parameters}" --metric {metric} --output grid.res'
        return template.format(
            radiance_parameters=self.radiance_parameters,
            fixed_radiance_parameters=self.fixed_radiance_parameters,
            metric=self.metric)

    def requires(self):
        """Depend on the ``SplitGrid`` task built from the same input parameters."""
        return {'SplitGrid': SplitGrid(_input_params=self._input_params)}

    def output(self):
        """Return the ``<item name>.res`` result target inside the execution folder."""
        result_name = '{item_name}.res'.format(item_name=self.item['name'])
        result_path = pathlib.Path(self.execution_folder, result_name).resolve().as_posix()
        return {'result': luigi.LocalTarget(result_path)}

    @property
    def input_artifacts(self):
        """Files copied in before the command runs: the grid and the scene octree, both required."""
        grid_artifact = {
            'name': 'grid',
            'to': 'grid.pts',
            'from': self.grid,
            'optional': False
        }
        scene_artifact = {
            'name': 'scene_file',
            'to': 'scene.oct',
            'from': self.scene_file,
            'optional': False
        }
        return [grid_artifact, scene_artifact]

    @property
    def output_artifacts(self):
        """Files copied out after the command runs: ``grid.res`` to ``<item name>.res``."""
        result_name = '{item_name}.res'.format(item_name=self.item['name'])
        destination = pathlib.Path(self.execution_folder, result_name).resolve().as_posix()
        return [{
            'name': 'result',
            'from': 'grid.res',
            'to': destination
        }]
class PrepareMovielensData(luigi.Task):
    '''
    Splits the data into training, validation and testing.
    Reindexes it according to the okapi needs.

    The output contains 4 files: testing, training, info, and validation.
    The info file contains info about the training data set (#users, #items, #entries).
    It is used for testing the model.
    '''

    fraction = luigi.Parameter(description="The fraction of data we want to use", default=1.0)

    # remaking ids for okapi: starts 1 (items -1), no gaps, no new items in the test/validation set
    # These class-level maps only act as defaults: run() replaces them with fresh
    # per-instance copies so that ids do not leak between task instances or retries.
    training_users = {"original_id": 0}
    training_items = {"original_id": 0}

    # File name templates, in the order: testing, training, info, validation.
    _OUTPUT_TEMPLATES = (
        'movielens.testing_{}',
        'movielens.training_{}',
        'movielens.training.info_{}',
        'movielens.validation_{}',
    )

    def requires(self):
        return DownloadMovielens()

    def output(self):
        '''HDFS targets, in the order: testing, training, info, validation.'''
        return [luigi.hdfs.HdfsTarget(template.format(self.fraction))
                for template in self._OUTPUT_TEMPLATES]

    def local_output(self):
        '''Local staging files in the temp dir, in the same order as output().'''
        tmp_dir = tempfile.gettempdir()
        return [luigi.file.File(tmp_dir + '/' + template.format(self.fraction))
                for template in self._OUTPUT_TEMPLATES]

    def _get_id(self, original_id, dictionary):
        '''Return the id mapped to original_id, assigning the next free one (len(dictionary)) if it is new.'''
        return dictionary.setdefault(original_id, len(dictionary))

    def run(self):
        '''
        1. 70% entries go to training, others go to memory
        2. from memory, entries whose item and user are both in training:
           33% go into validation, 66% go to testing
        '''
        import random

        frac = float(self.fraction)
        random.seed(123)  # just so that all users would have the same data sets

        # Start every run from fresh id maps: mutating the shared class-level dicts would make
        # the ids depend on whatever other instances (or an earlier attempt) already inserted.
        self.training_users = {"original_id": 0}
        self.training_items = {"original_id": 0}

        testing_file, training_file, info_file, validation_file = self.local_output()
        hdfs_client = luigi.hdfs.HdfsClient()

        # lets first write training set and store in memory user and item indexes
        testing_validation = []
        cnt = 0
        # the input is a file stream that reads from movielens ratings.dat
        with self.input().open('r') as ratings:
            with training_file.open('w') as training:
                for line in ratings:
                    # sub-sample the input down to the requested fraction
                    if random.random() > frac:
                        continue
                    user, item, rating, _timestamp = line.split("::")
                    rating = int(float(rating))
                    if random.random() < 0.7:
                        # write to training
                        userid = self._get_id(user, self.training_users)
                        itemid = self._get_id(item, self.training_items)
                        training.write("{0} {1} {2}\n".format(userid, itemid, rating))
                        cnt += 1
                    else:
                        testing_validation.append((user, item, rating))

        # now lets write out the testing and validation
        with testing_file.open('w') as testing:
            with validation_file.open('w') as validation:
                for u, i, rating in testing_validation:
                    # only keep entries whose user and item were both seen in training
                    if u in self.training_users and i in self.training_items:
                        out_line = '{0} {1} {2}\n'.format(
                            self.training_users[u], self.training_items[i], rating)
                        if random.random() < 0.33:
                            validation.write(out_line)
                        else:
                            testing.write(out_line)

        with info_file.open('w') as info:
            info.write('n_users: {0}, n_items: {1}, n_entries: {2}\n'.format(
                len(self.training_users), len(self.training_items), cnt))

        # push the local staging files to their HDFS destinations
        for local_target, hdfs_target in zip(self.local_output(), self.output()):
            hdfs_client.put(local_target.path, hdfs_target.path)
class OkapiTrainModelTask(luigi.hadoop_jar.HadoopJarJobTask):
    '''Trains a model'''

    fraction = luigi.Parameter(description="The fraction of data we want to use", default=1.0)
    model_name = luigi.Parameter(description="The model: {"+" | ".join(methods)+"}")
    out_hdfs = luigi.Parameter(description="Output dir for the task")

    def requires(self):
        return PrepareMovielensData(self.fraction)

    def output(self):
        return luigi.hdfs.HdfsTarget(self.out_hdfs)

    def _get_conf(self, section, name):
        '''Read a single value from the luigi configuration.'''
        return luigi.configuration.get_config().get(section, name)

    def get_computation_class(self):
        '''
        Return the computation class registered for model_name in `methods`.

        Raises NotImplementedError for an unknown model name.
        '''
        if self.model_name in methods:
            return methods[self.model_name]
        # Raising a plain string is not a valid exception; raise a real one with the same text.
        raise NotImplementedError(
            "Not implemented method. Please choose from {" + " | ".join(methods.keys()) + "}")

    def get_input_format(self):
        return 'ml.grafos.okapi.cf.CfLongIdFloatTextInputFormat'

    def get_output_format(self):
        return 'org.apache.giraph.io.formats.IdWithValueTextOutputFormat'

    def get_input(self):
        '''Path of the training file (second output of the required task).'''
        return self.input()[1].path

    def get_output(self):
        return self.out_hdfs

    def run(self):
        self.set_hadoop_classpath()
        # we need to delete a special zookeeper dir because of some strange behaviour
        DeleteDir(self._get_conf("hadoop", "zookeeper-dir")).run()
        super(OkapiTrainModelTask, self).run()

    def get_libjars(self):
        return [self.giraph_jar(), self.okapi_jar()]

    def set_hadoop_classpath(self):
        '''we need to put our jars into the classpath of the hadoop'''
        hadoop_cp = ':'.join(filter(None, self.get_libjars()))
        current_cp = os.environ.get('HADOOP_CLASSPATH', None)
        if current_cp:
            # append only when our jars are not already part of the classpath
            if hadoop_cp not in current_cp:
                os.environ['HADOOP_CLASSPATH'] = current_cp + ":" + hadoop_cp
        else:
            os.environ['HADOOP_CLASSPATH'] = hadoop_cp
        logger.debug("HADOOP_CLASSPATH={0}".format(os.environ['HADOOP_CLASSPATH']))

    def jar(self):
        return self.giraph_jar()

    def main(self):
        return 'org.apache.giraph.GiraphRunner'

    def get_jar(self, group, jarname):
        '''
        Look up a jar path in the luigi configuration and check that it exists.

        Raises RuntimeError when the setting is empty or the file is missing.
        '''
        config = luigi.configuration.get_config()
        jar = config.get(group, jarname)
        if not jar:
            message = "You must specify {0} in client.cfg".format(jarname)
            logger.error(message)
            # A bare `raise` has no active exception here; raise an explicit error instead.
            raise RuntimeError(message)
        if not os.path.exists(jar):
            # include the configured path, which the original message left out
            message = "Can't find {0} jar: {1}".format(jarname, jar)
            logger.error(message)
            raise RuntimeError(message)
        return jar

    def okapi_jar(self):
        return self.get_jar("okapi", "okapi-jar")

    def giraph_jar(self):
        return self.get_jar("okapi", "giraph-jar")

    def get_custom_arguments(self, info_filename):
        '''
        Build the minItemId / maxItemId arguments from the info file.

        The second comma-separated field of the first line is parsed as "<label>: <count>".
        '''
        # we check how many items there are in the training set
        with info_filename.open() as f:
            lines = f.readlines()
        max_items = int(lines[0].split(",")[1].split(':')[1])
        return ['-ca', 'minItemId=1', '-ca', 'maxItemId=' + str(max_items - 1)]

    def get_custom_method_params(self, model_name):
        '''Return the extra command line arguments for the given model ([] for unknown models).'''
        if model_name == "SGD":
            return ['-mc', 'ml.grafos.okapi.cf.sgd.Sgd$MasterCompute',
                    '-ca', 'iterations=20',
                    '-ca', 'gamma=0.005',
                    '-ca', 'lambda=0.01',
                    '-ca', 'dim=20',
                    '-ca', 'debug=true']
        elif model_name == "ALS":
            return ['-mc', 'ml.grafos.okapi.cf.als.Als$MasterCompute',
                    '-ca', 'iterations=20',
                    '-ca', 'lambda=0.01',
                    '-ca', 'dim=20',
                    '-ca', 'debug=true']
        elif model_name == "SVD":
            return ['-mc', 'ml.grafos.okapi.cf.svd.Svdpp$MasterCompute',
                    '-ca', 'iterations=20',
                    '-ca', 'dim=20',
                    '-ca', 'debug=true']
        else:
            return []

    def args(self):
        '''Assemble the full GiraphRunner argument list.'''
        return [
            "-libjars", ",".join(self.get_libjars()),
            "-Dmapred.child.java.opts="+self._get_conf('hadoop', 'hadoop-mem'),
            "-Dgiraph.zkManagerDirectory="+self._get_conf('hadoop', 'zookeeper-dir'),
            "-Dgiraph.useSuperstepCounters=false",
            self.get_computation_class(),
            '-eif', self.get_input_format(),
            '-eip', self.get_input(),
            '-vof', self.get_output_format(),
            '-op', self.get_output(),
            '-w', self._get_conf("okapi", "workers"),
            '-ca', "giraph.numComputeThreads="+self._get_conf('okapi', 'threads')] \
            + self.get_custom_arguments(self.input()[2]) \
            + self.get_custom_method_params(self.model_name)
def testWithDefaultAndMissing(self):
    """A parameter with both ``config_path`` and ``default='blah'`` is expected to report ``'blah'``."""
    param = luigi.Parameter(config_path={'section': "foo", 'name': "bar"},
                            default='blah')
    self.assertEqual('blah', param.value)
class A(luigi.Task):
    """Task whose only parameter ``p`` is configured via config section ``foo``, name ``bar``."""

    p = luigi.Parameter(config_path={'section': "foo", 'name': "bar"})
def testGlobalAndMissing(self):
    """A global parameter reports its default first, then the value given to ``set_global``."""
    param = luigi.Parameter(config_path={'section': "foo", 'name': "bar"},
                            is_global=True,
                            default='blah')
    self.assertEqual('blah', param.value)

    param.set_global('meh')
    self.assertEqual('meh', param.value)
class WithDefault(luigi.Task):
    """Task with a single parameter ``x`` that defaults to ``'xyz'``."""

    x = luigi.Parameter(default='xyz')
class HasGlobalParamDep(luigi.Task):
    """Task that depends on ``HasGlobalParam`` built from its own ``x``."""

    x = luigi.Parameter()

    def requires(self):
        """Return the ``HasGlobalParam`` task for ``self.x``."""
        dependency = HasGlobalParam(self.x)
        return dependency
class Foo(luigi.Task):
    """Task with a plain, an integer and a list parameter, plus one ordinary class attribute."""

    bar = luigi.Parameter()
    p2 = luigi.IntParameter()
    multi = luigi.Parameter(is_list=True)

    # a plain string attribute, not a luigi parameter
    not_a_param = "lol"
class ExtractHkma(ExtractHttp):
    """HTTP extraction task whose ``domain`` defaults to the HKMA monthly statistical bulletin URL."""

    domain = luigi.Parameter(
        default="http://www.hkma.gov.hk/media/eng/doc/market-data-and-statistics/monthly-statistical-bulletin")
def testHasDefaultNoValue(self):
    """``has_value`` is expected to be false for a parameter configured only via ``config_path``."""
    param = luigi.Parameter(config_path={'section': "foo", 'name': "bar"})
    self.assertFalse(param.has_value)
class QueryDb(_utils.DataPreparationTask):
    """Run an SQL query against the DB and write the result rows to a CSV file."""

    query = luigi.Parameter(
        description="The SQL query to perform on the DB"
    )

    args = _utils.ObjectParameter(
        default=(),
        description="The SQL query's positional arguments"
    )

    kwargs = _utils.ObjectParameter(
        default={},
        description="The SQL query's named arguments"
    )

    limit = luigi.parameter.IntParameter(
        default=-1,
        description="The maximum number of rows to fetch. Optional. If -1, "
                    "all rows will be fetched.")

    shuffle = luigi.BoolParameter(
        default=False,
        description="If True, all rows will be shuffled. For debugging and "
                    "exploration purposes. Might impact performance.")

    def output(self):
        """Return the UTF-8 CSV target named after the task id inside ``output_dir``."""
        csv_path = f'{self.output_dir}/{self.task_id}.csv'
        return luigi.LocalTarget(csv_path, format=UTF8)

    def run(self):
        """Execute the query, pass the resulting frame through ``transform`` and write it out."""
        sql = self.build_query()
        rows, columns = self.db_connector.query_with_header(
            sql, *self.args, **self.kwargs)
        frame = self.transform(pd.DataFrame(rows, columns=columns))
        self.write_output(frame)

    def build_query(self):
        """Return the query with the optional ORDER BY RANDOM() and LIMIT clauses appended.

        Note: in minimal mode with no explicit limit (-1), ``self.limit`` is set to 50.
        """
        clauses = [self.query]
        if self.shuffle:
            clauses.append(' ORDER BY RANDOM()')

        # minimal mode caps the row count unless a limit was given explicitly
        if self.minimal_mode and self.limit == -1:
            self.limit = 50

        if self.limit and self.limit != -1:
            clauses.append(f' LIMIT {self.limit}')
        return ''.join(clauses)

    def transform(self, df):
        """Hook for subclasses; returns the frame unchanged by default."""
        return df

    def write_output(self, df):
        """Write the frame to the output target as CSV with a header row and no index column."""
        with self.output().open('w') as csv_stream:
            df.to_csv(csv_stream, index=False, header=True)
def testHasDefaultWithBoth(self):
    """A parameter bound to config entry foo/bar must report a value."""
    param = luigi.Parameter(config_path=dict(section="foo", name="bar"))
    self.assertTrue(param.has_value)
class MortarProjectTask(MortarTask):
    """
    Luigi Task to run a job on the Mortar platform.
    If the job fails, the task will exit with an error.

    To use this class, define the following section in your Luigi
    client configuration file:

    ::[mortar]
    ::email: ${MORTAR_EMAIL}
    ::api_key: ${MORTAR_API_KEY}
    ::host: api.mortardata.com
    ::project_name: ${MORTAR_PROJECT_NAME}

    see also:: https://help.mortardata.com/technologies/luigi/mortar_tasks
    """

    # A cluster size of 2 or greater will use a Hadoop cluster. If there
    # is an idle cluster of cluster_size or greater that cluster will be used.
    # Otherwise a new cluster will be started.
    # A cluster size of 0 will run the Mortar job directly on the Mortar Pig
    # server in local mode (no cluster).
    # All other cluster_size values are invalid.
    cluster_size = luigi.IntParameter(default=2)

    # A single use cluster will be terminated immediately after this
    # Mortar job completes. Otherwise it will be terminated automatically
    # after being idle for one hour.
    # This option does not apply when running the Mortar job in local mode
    # (cluster_size = 0).
    run_on_single_use_cluster = luigi.BooleanParameter(False)

    # If False, this task will only run on an idle cluster or will
    # start up a new cluster if no idle clusters are found. If True,
    # this task may run on a cluster that has other jobs already running on it.
    # If run_on_single_use_cluster is True, this parameter will be ignored.
    share_running_cluster = luigi.BooleanParameter(False)

    # Whether a launched Hadoop cluster will take advantage of AWS
    # Spot Pricing (https://help.mortardata.com/technologies/hadoop/spot_instance_clusters)
    # This option does not apply when running in local mode (cluster_size = 0).
    use_spot_instances = luigi.BooleanParameter(True)

    # The Git reference (commit hash or branch name) to use when running
    # this Mortar job. The default value NO_GIT_REF_FLAG is a flag value
    # that indicates no value was entered as a parameter. If no value
    # is passed as a parameter the environment value "MORTAR_LUIGI_GIT_REF"
    # is used. If that is not set then "master" is used.
    git_ref = luigi.Parameter(default=NO_GIT_REF_FLAG)

    # Set to true to receive an email upon completion
    # of this Mortar job.
    notify_on_job_finish = luigi.BooleanParameter(default=False)

    # Interval (in seconds) to poll for job status.
    job_polling_interval = luigi.IntParameter(default=5)

    # Number of retries before giving up on polling.
    num_polling_retries = luigi.IntParameter(default=3)

    # Version of Pig to use.
    pig_version = luigi.Parameter(default='0.12')

    def project(self):
        """
        Override this method to provide the name of
        the Mortar Project.

        :rtype: str:
        :returns: Your project name, e.g. my-mortar-recsys
        :raises RuntimeError: if not overridden and the `mortar` config
            section has no `project_name` option
        """
        config = luigi.configuration.get_config()
        if config.has_option('mortar', 'project_name'):
            return config.get('mortar', 'project_name')
        raise RuntimeError(
            "Please implement the project method or provide a project_name configuration item to return your project name"
        )

    @abc.abstractmethod
    def script(self):
        """
        Override this method to provide the name of
        the script to run.

        :rtype: str:
        :returns: Script name, e.g. my_pig_script
        """
        raise RuntimeError(
            "Please implement the script method to return your script name")

    @abc.abstractmethod
    def is_control_script(self):
        """
        [DEPRECATED] Whether this job should run a control script.

        :rtype: bool:
        :returns: [DEPRECATED] whether this job should run a control script
        """
        raise RuntimeError("Please implement the is_control_script method")

    def parameters(self):
        """
        This method defines the parameters that Mortar will pass to
        your script when it runs.

        :rtype: dict:
        :returns: dict of parameters to pass, e.g. {'my-param': 'my-value'}. Default: {}
        """
        return {}

    def output(self):
        """
        The output for this Task. Returns the `success_token`
        by default, so the Task only runs if a token indicating success
        has not been previously written.

        :rtype: list of Target:
        :returns: list containing one output, the `success_token`
        """
        return [self.success_token()]

    def token_path(self):
        """
        The MortarProjectTask writes out several "tokens" as it executes,
        indicating whether it is Running and then when it is Complete.
        These tokens are used to ensure that the task is not rerun once
        it has already completed.

        This method provides the base path where those tokens are written.
        By default, tokens are written to a temporary directory on the
        file system. However, for running in a cluster setting, you should
        override this method to use an S3 path
        (e.g. s3://my-bucket/my-token-path), ensuring that tokens will be
        available from any machine.

        :rtype: str:
        :returns: default token path on file system - file://tempdirectory
        """
        # override with S3 path for usage across machines or on clusters
        return 'file://%s' % tempfile.gettempdir()

    @abc.abstractmethod
    def script_output(self):
        """
        List of locations where your script writes output. If your script
        fails, Luigi will clear any output from these locations to ensure
        that the next run of your Task is idempotent.

        :rtype: list of Target:
        :returns: list of Target to clear in case of Task failure
        """
        raise RuntimeError("Please implement the script_output method")

    def running_token(self):
        """
        The MortarProjectTask writes out several "tokens" as it executes
        to ensure idempotence. This method provides the token file that
        indicates that the job is running.

        By default, it is stored underneath the path provided by the
        `token_path` method, and is named after your class name. So, if
        your `token_path` is set to `s3://my-bucket/my-folder` and your
        Task is named FooTask, the token will be:

        `s3://my-bucket/my-folder/FooTask-Running`

        This token will contain the Mortar job_id of the job that is running.

        :rtype: Target:
        :returns: Target for the token that indicates job is running.
        """
        return target_factory.get_target(
            '%s/%s-%s' % (self.token_path(), self.__class__.__name__, 'Running'))

    def success_token(self):
        """
        The MortarProjectTask writes out several "tokens" as it executes
        to ensure idempotence. This method provides the token file that
        indicates that the job has finished successfully. If this token
        exists, the Task will not be rerun.

        By default, it is stored underneath the path provided by the
        `token_path` method, and is named after your class name. So, if
        your `token_path` is set to `s3://my-bucket/my-folder` and your
        Task is named FooTask, the token will be:

        `s3://my-bucket/my-folder/FooTask`

        If you want this Task to be rerun, you should delete that token.

        :rtype: Target:
        :returns: Target for the token that indicates that this Task has succeeded.
        """
        return target_factory.get_target(
            '%s/%s' % (self.token_path(), self.__class__.__name__))

    def run(self):
        """
        Run a Mortar job using the Mortar API.

        This method writes out several "tokens" as it executes to ensure
        idempotence:

        * `running_token`: This token indicates that the job is currently
          running. If a token exists at this path, Luigi will poll the
          currently running job instead of starting a new one.
        * `success_token`: This token indicates that the job has already
          completed successfully. If this token exists, Luigi will not
          rerun the task.

        :raises Exception: if the job finishes with a status other than
            `jobs.STATUS_SUCCESS` (after removing every `script_output`)
        """
        api = self._get_api()
        if self.running_token().exists():
            # Resume polling the job recorded by an earlier run; close the
            # token file explicitly instead of leaking the handle.
            token_file = self.running_token().open()
            try:
                job_id = token_file.read().strip()
            finally:
                token_file.close()
        else:
            job_id = self._run_job(api)
            # to guarantee idempotence, record that the job is running
            target_factory.write_file(self.running_token(), text=job_id)

        job = self._poll_job_completion(api, job_id)
        final_job_status_code = job.get('status_code')

        # record that the job has finished
        self.running_token().remove()

        if final_job_status_code != jobs.STATUS_SUCCESS:
            for out in self.script_output():
                logger.info(
                    'Mortar script failed: removing incomplete data in %s' % out)
                out.remove()
            raise Exception(
                'Mortar job_id [%s] failed with status_code: [%s], error details: %s'
                % (job_id, final_job_status_code, job.get('error')))
        else:
            target_factory.write_file(self.success_token())
            logger.info('Mortar job_id [%s] completed successfully' % job_id)

    def _git_ref(self):
        """
        Figure out value to use for git ref. Order of precedence is:

        1. git_ref parameter is set.
        2. environment variable MORTAR_LUIGI_GIT_REF is set
        3. master
        """
        if self.git_ref != NO_GIT_REF_FLAG:
            return self.git_ref

        import os
        env_git_ref = os.environ.get('MORTAR_LUIGI_GIT_REF')
        if env_git_ref:
            return env_git_ref
        return 'master'

    def _run_job(self, api):
        """
        Submit the job to Mortar and return its job_id.

        Runs in local mode when `cluster_size` is 0. Otherwise, unless a
        single-use cluster was requested, the largest usable running
        cluster is reused; if none is found a new cluster is started.
        """
        if self.run_on_single_use_cluster:
            cluster_type = clusters.CLUSTER_TYPE_SINGLE_JOB
        else:
            cluster_type = clusters.CLUSTER_TYPE_PERSISTENT

        cluster_id = None
        if self.cluster_size == 0:
            # Use local cluster
            cluster_id = clusters.LOCAL_CLUSTER_ID
        elif not self.run_on_single_use_cluster:
            # search for a suitable cluster
            usable_clusters = self._get_usable_clusters(
                api, min_size=self.cluster_size)
            if usable_clusters:
                # grab the largest usable cluster (first one wins on ties)
                largest_cluster = max(usable_clusters,
                                      key=lambda c: int(c['size']))
                logger.info('Using largest running usable cluster with cluster_id [%s], size [%s]' %
                            (largest_cluster['cluster_id'], largest_cluster['size']))
                cluster_id = largest_cluster['cluster_id']

        if cluster_id:
            job_id = jobs.post_job_existing_cluster(
                api, self.project(), self.script(), cluster_id,
                git_ref=self._git_ref(),
                parameters=self.parameters(),
                notify_on_job_finish=self.notify_on_job_finish,
                is_control_script=self.is_control_script(),
                pig_version=self.pig_version,
                pipeline_job_id=self._get_pipeline_job_id())
        else:
            job_id = jobs.post_job_new_cluster(
                api, self.project(), self.script(), self.cluster_size,
                cluster_type=cluster_type,
                git_ref=self._git_ref(),
                parameters=self.parameters(),
                notify_on_job_finish=self.notify_on_job_finish,
                is_control_script=self.is_control_script(),
                pig_version=self.pig_version,
                use_spot_instances=self.use_spot_instances,
                pipeline_job_id=self._get_pipeline_job_id())
        logger.info('Submitted new job to mortar with job_id [%s]' % job_id)
        return job_id

    def _get_usable_clusters(self, api, min_size=0):
        """
        Return the running, non-single-job clusters of at least `min_size`
        that are idle, or that have running jobs when
        `share_running_cluster` is set.
        """
        return [cluster for cluster in clusters.get_clusters(api)['clusters']
                if ((cluster.get('status_code') == clusters.CLUSTER_STATUS_RUNNING)
                    and (cluster.get('cluster_type_code') != clusters.CLUSTER_TYPE_SINGLE_JOB)
                    and (int(cluster.get('size')) >= min_size)
                    and (len(cluster.get('running_jobs')) == 0
                         or self.share_running_cluster))]

    def _poll_job_completion(self, api, job_id):
        """
        Poll the job until its status is one of `jobs.COMPLETE_STATUSES`
        and return the final job record.

        Status and progress changes are logged. Up to `num_polling_retries`
        consecutive failures are tolerated; the next failure is re-raised.
        """
        current_job_status = None
        current_progress = None
        exception_count = 0
        while True:
            try:
                # fetch job
                job = jobs.get_job(api, job_id)
                new_job_status = job.get('status_code')

                # check for updated status
                if new_job_status != current_job_status:
                    current_job_status = new_job_status
                    logger.info('Mortar job_id [%s] switched to status_code [%s], description: %s' %
                                (job_id, new_job_status, self._get_job_status_description(job)))

                # check for updated progress on running job
                if (new_job_status == jobs.STATUS_RUNNING) and (
                        job.get('progress') != current_progress):
                    current_progress = job.get('progress')
                    logger.info('Mortar job_id [%s] progress: [%s%%]' %
                                (job_id, current_progress))

                # final state
                if current_job_status in jobs.COMPLETE_STATUSES:
                    return job
                else:
                    # reset exception count on successful loop
                    exception_count = 0

                # sleep and continue polling
                time.sleep(self.job_polling_interval)
            except Exception as e:
                # "except Exception, e" is Python 2-only syntax; the "as"
                # form is accepted by Python 2.6+ and Python 3 alike.
                if exception_count < self.num_polling_retries:
                    exception_count += 1
                    logger.info('Failure to get job status for job %s: %s' %
                                (job_id, str(e)))
                    time.sleep(self.job_polling_interval)
                else:
                    raise
def testDefaultList(self):
    """A list parameter bound to config entry foo/bar yields a 3-tuple."""
    expected = ('one', 'two', 'three')
    p = luigi.Parameter(
        is_list=True,
        config_path=dict(section="foo", name="bar"),
    )
    self.assertEqual(expected, p.value)
class MakeClickTrainData(gokart.TaskOnKart):
    """Build click training data with sampled negative examples.

    The input frame is split by ``service_column_name``.  Each group is
    filtered by item/user frequency, marked with ``click = 1`` and then
    extended with randomly drawn (user, item) pairs that do not occur in
    the group, marked with ``click = 0``.
    """

    task_namespace = 'redshells.word_item_similarity'
    click_data_task = gokart.TaskInstanceParameter()
    min_user_count = luigi.IntParameter(default=100)  # type: int
    min_item_count = luigi.IntParameter(default=100)  # type: int
    max_item_frequency = luigi.FloatParameter(default=0.05)  # type: float
    user_column_name = luigi.Parameter()  # type: str
    item_column_name = luigi.Parameter()  # type: str
    service_column_name = luigi.Parameter()  # type: str
    output_file_path = luigi.Parameter(default='app/word_item_similarity/clicks_train_data.pkl')  # type: str

    def requires(self):
        """Return the task providing the raw click data."""
        return self.click_data_task

    def output(self):
        """Return the target at ``output_file_path``."""
        return self.make_target(self.output_file_path)

    def run(self):
        """Load the clicks, build per-service training data and dump it."""
        data = self.load_data_frame(
            required_columns={self.user_column_name, self.item_column_name, self.service_column_name})
        data = pd.concat(
            [self._make_click_data(grouped) for _, grouped in data.groupby(self.service_column_name)])

        logger.info('dumping...')
        self.dump(data)

    def _make_click_data(self, data: pd.DataFrame):
        """Return positives (``click = 1``) plus sampled negatives for one group."""
        logger.info(f'filtering... size={data.shape}')
        data = self._filter_data(data)
        logger.info(f'size={data.shape}')
        # assign() returns a new frame, so the filtered frame is not
        # modified through a possibly shared view.
        data = data.assign(click=1)
        logger.info(f'data size is {data.shape}.')

        logger.info('sampling...')
        negative = self._sample_negative_examples(data)
        logger.info(f'negative samples size is {negative.shape}.')

        logger.info('concatenating...')
        data = pd.concat([data, negative], sort=False)
        return data

    def _sample_negative_examples(self, df: pd.DataFrame) -> pd.DataFrame:
        """Draw (user, item) pairs absent from ``df`` and mark them ``click = 0``.

        Each pair is encoded as ``user_index + item_index * n_users``.
        Twice as many codes as there are positive pairs are drawn, the
        positive codes are removed, and at most as many negatives as
        positives are kept.  The result uses the configured user, item and
        service column names.
        """
        logger.info('preprocessing...')
        user_ids = df[self.user_column_name].unique()
        item_ids = df[self.item_column_name].unique()
        item2service = dict(zip(df[self.item_column_name].tolist(), df[self.service_column_name].tolist()))
        user2index = dict(zip(user_ids, range(len(user_ids))))
        item2index = dict(zip(item_ids, range(len(item_ids))))
        n_users = len(user_ids)
        n_items = len(item_ids)
        positive_examples = set(
            df[self.user_column_name].apply(user2index.get).values
            + df[self.item_column_name].apply(item2index.get).values * n_users)
        n_positive_examples = len(positive_examples)

        logger.info('negative sampling...')
        negative_examples = set(np.random.randint(low=0, high=n_users * n_items, size=n_positive_examples * 2))

        logger.info('making unique list...')
        negative_examples = np.array(list(negative_examples - positive_examples))

        logger.info('shuffling...')
        negative_examples = sklearn.utils.shuffle(negative_examples)
        negative_examples = negative_examples[:n_positive_examples]

        logger.info('making data frame...')
        # Decode straight into the configured column names.  Building the
        # frame with fixed 'user_id' / 'item_id' columns and then reading
        # the configured names raised KeyError whenever they differed.
        examples = pd.DataFrame({
            self.user_column_name: user_ids[negative_examples % n_users],
            self.item_column_name: item_ids[negative_examples // n_users],
            'click': 0,
        })
        examples[self.service_column_name] = examples[self.item_column_name].apply(item2service.get)
        examples = examples.drop_duplicates()
        return examples

    def _filter_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return de-duplicated rows restricted by item and user frequency.

        Items are kept when their row count lies between ``min_item_count``
        and ``n_users * max_item_frequency``; afterwards users are kept when
        they have at least ``min_user_count`` rows.  The input frame is left
        untouched.
        """
        # Work on a new frame instead of de-duplicating the caller's
        # group in place.
        df = df.drop_duplicates()
        n_users = len(set(df[self.user_column_name]))
        max_item_count = n_users * self.max_item_frequency
        logger.info(f'max_item_count={max_item_count}')
        logger.info(f'min_item_count={self.min_item_count}')
        logger.info(f'min_user_count={self.min_user_count}')
        df = df.groupby(self.item_column_name).filter(lambda xs: self.min_item_count <= len(xs) <= max_item_count)
        df = df.groupby(self.user_column_name).filter(lambda xs: self.min_user_count <= len(xs))
        return df
def testWithDefault(self):
    """The value from config entry foo/bar wins over the explicit default."""
    p = luigi.Parameter(
        config_path=dict(section="foo", name="bar"),
        default='blah',
    )
    # config overrides default
    self.assertEqual('baz', p.value)
class DictFile(luigi.ExternalTask):
    """External task whose output is the HDFS target at ``hdfs_path``."""

    hdfs_path = luigi.Parameter()

    def output(self):
        """Return an ``HdfsTarget`` for ``self.hdfs_path``."""
        target = luigi.contrib.hdfs.HdfsTarget(self.hdfs_path)
        return target
class StateFiles(luigi.WrapperTask):
    """Wrapper task requiring the code tables and one state/year's data tables."""

    year = luigi.IntParameter()
    state = luigi.Parameter()

    def requires(self):
        """Return ``(CodeTables(), DataTables(year, state))`` as a tuple."""
        code_tables = CodeTables()
        data_tables = DataTables(year=self.year, state=self.state)
        return code_tables, data_tables