A wrapper of the data pipeline library "luigi".
Please use gokart.TaskOnKart instead of luigi.Task to define your tasks.
import gokart
class BasicTask(gokart.TaskOnKart):
    """Minimal example task: load the output of ``TaskA`` and dump results as CSV."""

    def requires(self):
        """Declare ``TaskA`` as the upstream dependency of this task."""
        return TaskA()

    def output(self):
        """Return the target this task writes its results to."""
        # please use TaskOnKart.make_target to make Target.
        return self.make_target('basic_task.csv')

    def run(self):
        """Load the upstream data, process it, and dump the results."""
        # load data which TaskA output
        texts = self.load()

        # do something with texts, and make results.
        # NOTE: `results` is a placeholder for whatever your processing produces.

        # save results with the file path {self.workspace_directory}/basic_task_{unique_id}.csv
        self.dump(results)
`TaskOnKart.make_target` determines the `Target` type from the extension of the path passed to it. The following extensions are supported.
- pkl
- txt
- csv
- tsv
- gz
`TaskOnKart.make_model_target` and `TaskOnKart.dump` are designed to save and load models such as `gensim.models.Word2Vec`.
class TrainWord2Vec(gokart.TaskOnKart):
    """Example task that stores a trained Word2Vec model in a model target."""

    def output(self):
        """Return a model target that saves and loads with gensim's Word2Vec functions."""
        # please use 'zip'.
        return self.make_model_target(
            'model.zip',
            save_function=gensim.models.Word2Vec.save,
            load_function=gensim.models.Word2Vec.load,
        )

    def run(self):
        """Train the model and dump it to the model target."""
        # make word2vec
        # NOTE: `word2vec` is a placeholder for the model trained in this step.
        self.dump(word2vec)
def requires(self):
    """Declare several upstream tasks, each addressed by a dictionary key."""
    return dict(data=LoadItemData(), model=LoadModel())
def run(self):
    """Load each required input individually by its key."""
    # pass a key in the dictionary `self.requires()`
    data = self.load('data')
    model = self.load('model')
def run(self):
    """Load all required inputs with a single call."""
    # Calling load() without a key is equivalent to the following:
    #   input_data = dict(data=self.load('data'), model=self.load('model'))
    input_data = self.load()
def requires(self):
    """Declare ``LoadDataFrame`` as the single upstream dependency."""
    return LoadDataFrame()
def run(self):
    """Load the upstream output through ``load_data_frame``."""
    # NOTE(review): presumably `required_columns` names the columns the loaded
    # DataFrame must contain -- confirm against the gokart documentation.
    data = self.load_data_frame(required_columns={'id', 'name'})