class TestTaskBase(TestBaseTask):
    different_grandchild = BoolParameter()
    use_dynamic_dependency = BoolParameter()

    def _build_child_task(self, use_dynamic_dependency):
        index_for_grandchild = 0
        child_a = self.create_child_task(task_class=TestTaskChildA,
                                         index_for_grandchild=index_for_grandchild,
                                         use_dynamic_dependency=use_dynamic_dependency)
        if self.different_grandchild:
            index_for_grandchild = 1
        child_b = self.create_child_task(task_class=TestTaskChildB,
                                         index_for_grandchild=index_for_grandchild,
                                         use_dynamic_dependency=use_dynamic_dependency)
        return child_a, child_b

    def register_required(self):
        if not self.use_dynamic_dependency:
            child_a, child_b = self._build_child_task(False)
            self.child_tasks = self.register_dependencies([child_a, child_b])

    def run_task(self):
        if self.use_dynamic_dependency:
            child_a, child_b = self._build_child_task(True)
            yield from self.run_dependencies([child_a, child_b])
class PlotSearchLines(FigureMaker):
    num_delays: Sequence[int]
    labels: List[str]
    legend_title: Optional[str] = CustomParameter(default=None)
    sequential_colors = BoolParameter(default=False)
    with_reference = BoolParameter(default=False)
    legend_outside = True
    linestyle = ".-"
    reference_color = "silver"

    @property
    def colors(self) -> List:
        num_colors = len(self.labels)
        if self.sequential_colors:
            cmap = get_cmap("viridis", num_colors)
            return list(cmap.colors)
        else:
            return [f"C{i}" for i in range(num_colors)]

    def work(self):
        fig, axes = subplots(nrows=2, sharex=True, figsize=paperfig())
        ax_top, ax_btm = axes
        ax_btm.set_xlabel("Number of delays")
        self.plot_on_axes(ax_top, ax_btm)
        self.add_legend(ax_top)
        fig.tight_layout()
        self.output().write(fig)

    def add_legend(self, ax):
        if self.legend_outside:
            loc_kwargs = dict(loc="center left", bbox_to_anchor=(0.98, 0.5))
        else:
            loc_kwargs = dict(loc="best")
        # Copy before extending so the `labels` attribute is not mutated in place.
        labels = list(self.labels)
        colors = self.colors
        if self.with_reference:
            labels += ["Proposed online BPF"]
            colors += [self.reference_color]
        add_colored_legend(
            parent=ax,
            labels=labels,
            colors=colors,
            title=self.legend_title,
            **loc_kwargs,
        )

    @abstractmethod
    def plot_on_axes(self, ax_top: Axes, ax_btm: Axes):
        ...
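# Illustrative sketch only: a hypothetical concrete subclass of PlotSearchLines.
# The `results` attribute (one curve per label, evaluated at `num_delays`) is an
# assumption made for illustration and is not part of the original code.
class PlotSearchResults(PlotSearchLines):
    results: Sequence[Sequence[float]]

    def plot_on_axes(self, ax_top: Axes, ax_btm: Axes):
        # Draw one colored line per label on the top axes; the legend is added
        # separately by PlotSearchLines.add_legend.
        for label, color, curve in zip(self.labels, self.colors, self.results):
            ax_top.plot(self.num_delays, curve, self.linestyle,
                        color=color, label=label)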
class CleanedReviews(Task):
    subset = BoolParameter(default=True)

    # Output should be a local ParquetTarget in ./data, ideally a salted output,
    # and with the subset parameter either reflected via salted output or
    # as part of the directory structure
    requires = csci_utils_req.Requires()
    other = csci_utils_req.Requirement(YelpReviews)

    path = os.path.abspath('data/subset') + '/'
    output = csci_utils_req.TargetOutput(
        file_pattern=path,
        target_class=csci_task.ParquetTarget,
        ext="",
        storage_options=dict(requester_pays=True))

    def run(self):
        numcols = ["funny", "cool", "useful", "stars"]
        # Read the upstream reviews through the registered requirement.
        dsk = self.input()['other'].read_dask()
        if self.subset:
            dsk = dsk.get_partition(0)
        out = ...  # cleaning step left unimplemented here; see the sketch below
        self.output().write_dask(out, compression='gzip')
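# Hedged sketch of the elided cleaning step above ("out = ..."): it mirrors the
# cleaning done by the other CleanedReviews variants in this collection (keep
# 22-character review ids; read counts as floats, fill missing values, cast to
# int). It is not a confirmed implementation of this particular task.
def _clean_reviews_sketch(dsk, numcols=("funny", "cool", "useful", "stars")):
    numcols = list(numcols)
    # Keep only well-formed 22-character review ids.
    dsk = dsk[dsk["review_id"].str.len() == 22]
    # Read the count columns as floats, fill missing values, then cast to int.
    dsk[numcols] = dsk[numcols].astype("float64").fillna(0).astype("int64")
    return dsk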
class SyncAllData(WrapperTask):
    '''
    Sync all data to the linked CARTO account.
    '''

    force = BoolParameter(default=False, significant=False)

    def requires(self):
        existing_table_versions = dict([
            (r['tablename'], r['version'])
            for r in query_cartodb('SELECT * FROM obs_table').json()['rows']
        ])
        tables = dict([(k, v) for k, v in current_session().execute('''
            SELECT tablename, t.version
            FROM observatory.obs_table t,
                 observatory.obs_column_table ct,
                 observatory.obs_column c
            WHERE t.id = ct.table_id
              AND c.id = ct.column_id
              AND t.tablename NOT IN ('obs_ffebc3eb689edab4faa757f75ca02c65d7db7327')
              AND c.weight > 0
        ''')])

        for tablename, version in tables.items():
            # Tables not yet on CARTO default to version 0, so they get forced.
            if version > existing_table_versions.get(tablename, 0):
                force = True
            else:
                force = self.force
            yield TableToCartoViaImportAPI(table=tablename, force=force)
class PipelineTask(BatchTask):
    local = BoolParameter(default=False, significant=False)

    @property
    def job_name(self):
        return '{}-{}'.format(self.task_family, self.sample_id)
class DumpS3(Task):
    '''
    Uploads ``observatory`` schema dumped from :class:`~.carto.Dump` to
    `Amazon S3 <https://aws.amazon.com/s3/>`_, using credentials from ``.env``.

    Automatically updates :class:`~.meta.OBSDumpVersion`.

    :param timestamp: Optional date parameter, defaults to today.
    '''
    timestamp = DateParameter(default=date.today())
    force = BoolParameter(default=False, significant=False)

    def requires(self):
        return Dump(timestamp=self.timestamp)

    def run(self):
        shell('aws s3 cp {input} {output}'.format(input=self.input().path,
                                                  output=self.output().path))

    def output(self):
        path = self.input().path.replace('tmp/carto/Dump_', 'do-release-')
        path = path.replace('.dump', '/obs.dump')
        path = 's3://cartodb-observatory-data/{path}'.format(path=path)
        LOGGER.info(path)
        target = S3Target(path)
        if self.force:
            shell('aws s3 rm {output}'.format(output=path))
            self.force = False
        return target
class AccountDimLoad(Task):
    """
    This task outputs a local ParquetTarget in ./data/accountdim/ for further
    load into the database.
    """

    LOCAL_ROOT = "./data/accountdim/"
    subset = BoolParameter(default=True)

    requires = Requires()
    account_dim = Requirement(AccountDimVer)
    output = TargetOutput(target_class=ParquetTarget, file_pattern=LOCAL_ROOT, ext="")

    def run(self):
        dtype_dic = {
            "CUST_ACCT": "object",
            "SEGMENT_NAME": "object",
            "SVC_PLAN": "object",
        }
        dsk = self.input()["account_dim"].read_dask(dtype=dtype_dic)
        if self.subset:
            dsk = dsk.get_partition(0)
        self.output().write_dask(dsk, compression="gzip", compute=True)
class LineDimLoad(Task):
    """
    This task outputs a local ParquetTarget in ./data/linedim/ for further
    load into the database.
    """

    LOCAL_ROOT = "./data/linedim/"
    subset = BoolParameter(default=True)

    requires = Requires()
    line_dim = Requirement(LineDimVer)
    output = TargetOutput(target_class=ParquetTarget, file_pattern=LOCAL_ROOT, ext="")

    def run(self):
        dtype_dic = {
            "MDN": "object",
            "DEVICE_GROUPING": "object",
            "SALES_CHANNEL": "object",
        }
        dsk = self.input()["line_dim"].read_dask(dtype=dtype_dic)
        if self.subset:
            dsk = dsk.get_partition(0)
        self.output().write_dask(dsk, compression="gzip", compute=True)
class LimitFactLoad(Task):
    """
    This task outputs a local ParquetTarget in ./data/limitfact/ for further
    load into the database.
    """

    LOCAL_ROOT = "./data/limitfact/"
    subset = BoolParameter(default=True)

    requires = Requires()
    limit_fact = Requirement(LimitFactVer)
    output = TargetOutput(target_class=ParquetTarget, file_pattern=LOCAL_ROOT, ext="")

    def run(self):
        dtype_dic = {
            "MTN": "object",
            "CUST_ACCT": "object",
            "LIMITING_DT": "object",
            "LIMIT_TYPE": "object",
        }
        dsk = self.input()["limit_fact"].read_dask(dtype=dtype_dic)
        if self.subset:
            dsk = dsk.get_partition(0)
        self.output().write_dask(dsk, compression="gzip", compute=True)
class ETLAnalysisPrint(Task):
    """
    ETL analysis task that reads the analysis output with read_dask, computes
    it, and prints it. Subclassed by the different analysis tasks to do just
    that: the last bit of work for each task is the computation and the print.

    Parameters:
        subset: bool, used to subset true or false, default: True
        analysis_path: str, final results are stored as parquet files here

    Output:
        print the analysis dataframe for visual inspection
    """

    # Default parameters
    subset = BoolParameter(default=True)
    analysis_path = Parameter(default="./data/vaccine/")

    requires = Requires()

    def complete(self):
        """Always returns False, so the task is never considered complete and
        the print runs every time."""
        return False

    def run(self):
        """Read the dask dataframe, compute it and print it."""
        analysis_output_dataframe = self.input()["input_data"].read_dask()
        logging.info(analysis_output_dataframe.compute())
class CleanedReviews(Task):
    subset = BoolParameter(default=True)

    # Output should be a local ParquetTarget in ./data, ideally a salted output,
    # and with the subset parameter either reflected via salted output or
    # as part of the directory structure
    requires = Requires()
    reviews = Requirement(YelpReviews)

    output = TargetOutput(file_pattern='data/',
                          target_class=ParquetTarget,
                          glob='*.parquet')

    def run(self):
        numcols = ["funny", "cool", "useful", "stars"]
        strcols = ['review_id', 'user_id', 'business_id', 'text']

        # Read the upstream reviews through the registered requirement.
        dsk = self.input()['reviews'].read_dask()
        dsk[numcols] = dsk[numcols].astype("float64")
        dsk = dsk.fillna(0)
        dsk['date'] = dsk['date'].astype('datetime64')
        dsk = dsk[dsk.review_id.apply(lambda x: len(str(x)) == 22)]
        # dsk = dsk.set_index(dsk.review_id)
        dsk[numcols] = dsk[numcols].astype("int64")

        if self.subset:
            dsk = dsk.get_partition(0)

        dsk[strcols] = dsk[strcols].astype("str")
        self.output().write_dask(collection=dsk, compression='gzip')
class PDFCatalogToS3(Task):
    timestamp = DateParameter(default=date.today())
    force = BoolParameter(significant=False)

    def __init__(self, **kwargs):
        if kwargs.get('force'):
            try:
                shell('aws s3 rm s3://data-observatory/observatory.pdf')
            except:
                pass
        super(PDFCatalogToS3, self).__init__(**kwargs)

    def run(self):
        for target in self.output():
            shell('aws s3 cp catalog/build/observatory.pdf {output} '
                  '--acl public-read'.format(output=target.path))

    def output(self):
        return [
            S3Target('s3://data-observatory/observatory.pdf'),
            S3Target(
                's3://data-observatory/observatory-{timestamp}.pdf'.format(
                    timestamp=self.timestamp)),
        ]
class BseBetMask(Task):
    bse_betmask_prefix = Parameter(default='')
    bet_threshold = FloatParameter(default=float(BET_THRESHOLD))
    slicer_exec = Parameter(default='')
    mask_qc = BoolParameter(default=False)

    def run(self):
        bet_mask = self.bse_betmask_prefix._path + '_mask.nii.gz'

        if not isfile(bet_mask):
            cmd = (' ').join([
                'bet_mask.py',
                '-i', self.input(),
                '-o', self.bse_betmask_prefix,
                f'-f {self.bet_threshold}' if self.bet_threshold else ''
            ])
            p = Popen(cmd, shell=True)
            p.wait()

            if p.returncode:
                return

            # mask the baseline image
            cmd = (' ').join([
                'ImageMath', '3', self.output()['bse'], 'm',
                self.output()['bse'], bet_mask
            ])
            p = Popen(cmd, shell=True)
            p.wait()

        if self.slicer_exec or self.mask_qc:
            print('\n\n** Check quality of created mask {} . '
                  'Once you are done, save the (edited) mask as {} **\n\n'
                  .format(bet_mask, self.output()['mask']))

        if self.slicer_exec:
            cmd = (' ').join([
                self.slicer_exec, '--python-code',
                '\"slicer.util.loadVolume(\'{}\'); '
                'slicer.util.loadLabelVolume(\'{}\')\"'.format(
                    self.input(), bet_mask)
            ])
            p = Popen(cmd, shell=True)
            p.wait()
        elif self.mask_qc:
            while 1:
                sleep(QC_POLL)
                if isfile(self.bse_betmask_prefix._path + 'Qc_mask.nii.gz'):
                    break

    def output(self):
        mask = _mask_name(self.bse_betmask_prefix, self.slicer_exec,
                          self.mask_qc)
        return dict(bse=self.input(), mask=mask)
class ETLAnalysis(Task):
    """Abstract Luigi task for analyzing covid data at different levels:
    by country, by year, by month and by week. It is sub-classed by the
    different covid data analysis tasks.

    The abstract class requires Cleanup and the parquet files for performing
    the analysis and display, and has one analysis method for subclasses to
    override / implement in their respective tasks.

    Each analysis should be a separate Luigi task, which computes its analysis
    and writes the result to parquet. To display to the terminal or answer a
    quiz, the output should be read back from the written parquet file.

    Parameters:
        subset: bool, True to process just one partition, False to process
            the entire dataset, default: True
        analysis_path: str, base directory to store output files

    Output:
        Dataframe stored in compressed Parquet format in
        {task.analysis_path}/{task.sub_dir}/subset-{task.subset}/
    """

    subset = BoolParameter(default=True)
    analysis_path = Parameter(default="./data/covid/")

    requires = Requires()
    input_data = Requirement(CovidDataGlobalCleanupTask)

    # the output references a "sub_dir" parameter, which is expected to be
    # defined in a subclass
    output = TargetOutput(
        "{task.analysis_path}{task.sub_dir}",
        ext="subset-{task.subset}/",
        target_class=ParquetTarget,
        flag="_SUCCESS",
    )

    def perform_analysis(self, df):
        """This method is implemented by sub-classes."""
        raise NotImplementedError

    def run(self):
        """
        Uses the data points we need for analysis (Country_Region and Date)
        and calls the implemented perform_analysis method to do the calculations.
        """
        analysis_dataframe = self.input()["input_data"].read_dask()

        # invoke perform_analysis from the implemented sub-classes;
        # only gets the aggregated analysis column and the calculated column
        output_dataframe = self.perform_analysis(analysis_dataframe)

        # write_dask parquet file output with gzip compression.
        self.output().write_dask(output_dataframe, write_index=True, compression="gzip")
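# Illustrative sketch only: a hypothetical concrete subclass of ETLAnalysis.
# The class name, sub_dir value, and grouping column are assumptions based on
# the docstring above (the cleaned frame is assumed to carry Country_Region
# plus numeric case-count columns); they are not taken from the original code.
class ETLAnalysisByCountry(ETLAnalysis):
    sub_dir = Parameter(default="by_country/")

    def perform_analysis(self, df):
        # Aggregate the numeric case columns per country.
        return df.groupby("Country_Region").sum()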
class ProcessImages(Task):
    subset = BoolParameter(default=True)

    requires = Requires()
    cleaned_review = Requirement(DownloadImage)

    LOCAL_ROOT = os.path.abspath("data")
    SHARED_RELATIVE_PATH = "small"
    height = 100
    width = 100

    # Note: this method shadows the class-level `requires = Requires()`
    # descriptor above, so the `cleaned_review` requirement goes unused.
    def requires(self):
        """Requires DownloadImage from the data file; returns it as a dict
        under the "image" key."""
        return {"image": DownloadImage()}

    def output(self):
        return SuffixPreservingLocalTarget(
            self.LOCAL_ROOT + "/" + self.SHARED_RELATIVE_PATH + "/")

    def run(self):
        os.makedirs(self.output().path)
        for img in glob.glob("data/images/*.jpg"):
            image_pil = Image.open(img, 'r')
            ratio_w = self.width / image_pil.width
            ratio_h = self.height / image_pil.height
            if ratio_w < ratio_h:
                # It must be fixed by width
                resize_width = self.width
                resize_height = round(ratio_w * image_pil.height)
            else:
                # Fixed by height
                resize_width = round(ratio_h * image_pil.width)
                resize_height = self.height
            image_resize = image_pil.resize((resize_width, resize_height),
                                            Image.ANTIALIAS)
            background = Image.new('RGB', (self.width, self.height),
                                   (255, 255, 255))
            background.paste(image_resize,
                             (round((self.width - resize_width) / 2),
                              round((self.height - resize_height) / 2)))
            with self.output().atomic_provider(
                    self.output().path + img.split('/')[-1]) as outfile:
                background.save(outfile, "JPEG", quality=80, optimize=True,
                                progressive=True)
class EasyDockerTask(Task):
    # default is False, so it starts a container by default.
    runlocal = BoolParameter(significant=False)

    def _cmd_params(self):
        params = []
        for k, v in self.to_str_params().items():
            if k == 'runlocal':
                continue
            params.append("--{}".format(k).replace("_", "-"))
            params.append(v)
        return params

    @property
    def command(self):
        cmd = [
            "luigi",
            "--local-scheduler",
            self.task_family,
            "--module", self.task_module,
            "--runlocal",
            *self._cmd_params()
        ]
        return cmd

    def get_whole_env(self):
        return {k: v for k, v in os.environ.items()}

    def run(self):
        if self.runlocal:
            print("been there inside a container", self)
            Task.run(self)
        else:
            print("been there before calling docker stuff", self.command)
            docker_environment = self.get_whole_env()
            # TODO: this could be set directly in the image instead
            docker_environment["PYTHONPATH"] = "."
            print("docker environment\n", docker_environment)

            import docker
            client = docker.from_env()
            client.containers.run(
                self.image,
                self.command,
                environment=docker_environment,
                stream=True,
                # detach=True,
            )
class Catalog(Task):
    force = BoolParameter(default=False)
    format = Parameter(default='html')
    parallel_workers = Parameter(default=4)

    def requires(self):
        return GenerateRST(force=self.force, format=self.format)

    def complete(self):
        return getattr(self, '_complete', False)

    def run(self):
        shell("cd catalog && make SPHINXOPTS='-j {0}' {1}".format(
            self.parallel_workers, self.format))
        self._complete = True
class PnlEddyEpi(Task):
    eddy_epi_prefix = Parameter(default='')
    eddy_epi_bse_masked_prefix = Parameter(default='')
    eddy_epi_bse_betmask_prefix = Parameter(default='')
    debug = BoolParameter(default=False)
    epi_nproc = IntParameter(default=N_PROC)

    def requires(self):
        return dict(eddy=self.clone(PnlEddy), t2=self.clone(StructMask))

    def run(self):
        for name in ['dwi', 'bval', 'bvec']:
            if not self.output()[name].exists():
                cmd = (' ').join([
                    'pnl_epi.py',
                    '--dwi', self.input()['eddy']['dwi'],
                    '--bvals', self.input()['eddy']['bval'],
                    '--bvecs', self.input()['eddy']['bvec'],
                    '--dwimask', self.input()['eddy']['mask'],
                    '--bse', self.input()['eddy']['bse'],
                    '--t2', self.input()['t2']['aligned'],
                    '--t2mask', self.input()['t2']['mask'],
                    '-o', self.eddy_epi_prefix,
                    '-d' if self.debug else '',
                    f'-n {self.epi_nproc}' if self.epi_nproc else ''
                ])
                p = Popen(cmd, shell=True)
                p.wait()
                break

        self.dwi = self.output()['dwi']
        self.bse_prefix = self.eddy_epi_bse_masked_prefix
        self.bse_betmask_prefix = self.eddy_epi_bse_betmask_prefix
        yield self.clone(BseBetMask)

    def output(self):
        dwi = self.eddy_epi_prefix.with_suffix('.nii.gz')
        bval = self.eddy_epi_prefix.with_suffix('.bval')
        bvec = self.eddy_epi_prefix.with_suffix('.bvec')
        bse = self.eddy_epi_bse_masked_prefix.with_suffix('.nii.gz')
        mask = _mask_name(self.eddy_epi_bse_betmask_prefix, self.slicer_exec,
                          self.mask_qc)
        return dict(dwi=dwi, bval=bval, bvec=bvec, bse=bse, mask=mask)
class TestTaskChildB(TestBaseTask):
    index_for_grandchild = IntParameter()
    use_dynamic_dependency = BoolParameter()

    def register_required(self):
        if not self.use_dynamic_dependency:
            grandchild = self.create_child_task(
                task_class=TestTaskGrandchild,
                index_for_grandchild=self.index_for_grandchild)
            self.register_dependency(grandchild)

    def run_task(self):
        if self.use_dynamic_dependency:
            grandchild = self.create_child_task(
                task_class=TestTaskGrandchild,
                index_for_grandchild=self.index_for_grandchild)
            yield from self.run_dependencies(grandchild)

    def cleanup_task(self, success: bool):
        pass
class SyncMetadata(WrapperTask):
    no_force = BoolParameter(default=False, significant=False)

    def requires(self):
        for table in (
                'obs_table',
                'obs_column',
                'obs_column_table',
                'obs_tag',
                'obs_column_tag',
                'obs_dump_version',
                'obs_column_to_column',
                'obs_meta',
                'obs_meta_numer',
                'obs_meta_denom',
                'obs_meta_geom',
                'obs_meta_timespan',
                'obs_meta_geom_numer_timespan',
                'obs_column_table_tile',
        ):
            if table == 'obs_meta':
                yield TableToCartoViaImportAPI(
                    columns=[
                        'numer_id', 'denom_id', 'geom_id', 'numer_name',
                        'denom_name', 'geom_name', 'numer_description',
                        'denom_description', 'geom_description',
                        'numer_aggregate', 'denom_aggregate', 'geom_aggregate',
                        'numer_type', 'denom_type', 'geom_type',
                        'numer_colname', 'denom_colname', 'geom_colname',
                        'numer_geomref_colname', 'denom_geomref_colname',
                        'geom_geomref_colname', 'numer_tablename',
                        'denom_tablename', 'geom_tablename', 'numer_timespan',
                        'denom_timespan', 'numer_weight', 'denom_weight',
                        'geom_weight', 'geom_timespan', 'numer_tags',
                        'denom_tags', 'geom_tags', 'timespan_tags',
                        'section_tags', 'subsection_tags', 'unit_tags',
                        'numer_extra', 'numer_ct_extra', 'denom_extra',
                        'denom_ct_extra', 'geom_extra', 'geom_ct_extra'
                    ],
                    table=table,
                    force=not self.no_force)
            else:
                yield TableToCartoViaImportAPI(table=table,
                                               force=not self.no_force)
class MergeKallisto(Task):
    expt_id = Parameter()
    annot = BoolParameter(default=False)

    def requires(self):
        return {
            sample_id: Kallisto(sample_id=sample_id)
            for sample_id in get_samples(self.expt_id)
        }

    def output(self):
        prefix = '{}/{}/'.format(cfg['S3_BUCKET'], self.expt_id)
        out_dict = {
            'est_counts': S3Target(prefix + 'est_counts.csv'),
            'tpm': S3Target(prefix + 'tpm.csv')
        }
        if self.annot:
            out_dict['annotations'] = S3Target(prefix + 'annotations.csv')
        return out_dict

    def run(self):
        # Gather input filepaths and labels
        tgt_dict = self.input()
        sample_ids = list(tgt_dict.keys())
        fpaths = [
            tgt_dict[sample_id]['abundance'].path for sample_id in sample_ids
        ]

        # Merge columns
        annotations, est_counts = merge_column(fpaths, sample_ids,
                                               data_col='est_counts',
                                               annot=self.annot)
        annotations, tpm = merge_column(fpaths, sample_ids,
                                        data_col='tpm',
                                        annot=self.annot)

        if self.annot:
            csv_to_s3(annotations, self.output()['annotations'].path)
        csv_to_s3(est_counts, self.output()['est_counts'].path)
        csv_to_s3(tpm, self.output()['tpm'].path)
class Fs2Dwi(Task):
    fs_in_dwi = Parameter()
    debug = BoolParameter(default=False)
    mode = Parameter(default='direct')

    def requires(self):
        if self.mode == 'direct':
            return dict(fs_dir=self.clone(Freesurfer),
                        corrected=self.clone(PnlEddy))
        elif self.mode == 'witht2':
            fs_dir = self.clone(Freesurfer)

            self.struct_template = self.t2_template
            self.struct_align_prefix = self.t2_align_prefix
            self.mabs_mask_prefix = self.t2_mask_prefix
            self.csvFile = self.t2_csvFile
            self.model_img = self.t2_model_img
            self.model_mask = self.t2_model_mask

            corrected = self.clone(PnlEddyEpi)
            t2_attr = self.clone(StructMask)

            return dict(fs_dir=fs_dir, corrected=corrected, t2_attr=t2_attr)

    def run(self):
        cmd = (' ').join([
            'fs2dwi.py',
            '-f', self.input()['fs_dir'],
            '--bse', self.input()['corrected']['bse'],
            '--dwimask', self.input()['corrected']['mask'],
            '-o', self.fs_in_dwi.dirname,
            '-d' if self.debug else '',
            self.mode,
            '--t2 {} --t2mask {}'.format(self.input()['t2_attr']['aligned'],
                                         self.input()['t2_attr']['mask'])
            if self.mode == 'witht2' else ''
        ])
        p = Popen(cmd, shell=True)
        p.wait()

    def output(self):
        return self.fs_in_dwi
class CleanedReviews(Task):
    __version__ = "1.0.0"

    subset = BoolParameter(default=True)

    requires = Requires()
    task2 = Requirement(YelpReviews)

    parquet_data = "./yelpdata/"
    output = TargetOutput(file_pattern=parquet_data, ext="",
                          target_class=ParquetTarget)

    def run(self):
        df = self.input()["task2"].read_dask(check_complete=True)
        df = df[(df.user_id.notnull()) & (df.review_id.str.len() == 22)]
        values = {"funny": 0, "cool": 0, "useful": 0, "stars": 0}
        df = df.fillna(value=values)
        df = df.astype({"funny": int, "cool": int, "useful": int, "stars": int})
        self.output().write_dask(collection=df, compression="gzip")
class OutputStorage(Task):
    LOCAL_ROOT = os.path.abspath("data")
    SHARED_RELATIVE_PATH = "storage"

    subset = BoolParameter(default=True)

    requires = Requires()
    reviews = Requirement(ContentImage)

    def output(self):
        return SuffixPreservingLocalTarget(
            self.LOCAL_ROOT + "/" + self.SHARED_RELATIVE_PATH + "/")

    def run(self):
        os.makedirs(self.output().path)
        filename_pattern = os.path.join('./data/images', '*.jpg')
        dsk_images = dask_image.imread.imread(filename_pattern)
        da.to_hdf5('data/storage' + "/" + 'stored.hdf5', {'/x': dsk_images[0]})
class PnlEddy(Task):
    eddy_prefix = Parameter()
    eddy_bse_masked_prefix = Parameter()
    eddy_bse_betmask_prefix = Parameter()
    debug = BoolParameter(default=False)
    eddy_nproc = IntParameter(default=int(N_PROC))

    def run(self):
        for name in ['dwi', 'bval', 'bvec']:
            if not self.output()[name].exists():
                cmd = (' ').join([
                    'pnl_eddy.py',
                    '-i', self.input()['dwi'],
                    '--bvals', self.input()['bval'],
                    '--bvecs', self.input()['bvec'],
                    '-o', self.eddy_prefix,
                    '-d' if self.debug else '',
                    f'-n {self.eddy_nproc}' if self.eddy_nproc else ''
                ])
                p = Popen(cmd, shell=True)
                p.wait()
                break

        self.dwi = self.output()['dwi']
        self.bse_prefix = self.eddy_bse_masked_prefix
        self.bse_betmask_prefix = self.eddy_bse_betmask_prefix
        yield self.clone(BseBetMask)

    def output(self):
        dwi = self.eddy_prefix.with_suffix('.nii.gz')
        bval = self.eddy_prefix.with_suffix('.bval')
        bvec = self.eddy_prefix.with_suffix('.bvec')
        bse = self.eddy_bse_masked_prefix.with_suffix('.nii.gz')
        mask = _mask_name(self.eddy_bse_betmask_prefix, self.slicer_exec,
                          self.mask_qc)
        return dict(dwi=dwi, bval=bval, bvec=bvec, bse=bse, mask=mask)
class SyncData(WrapperTask):
    '''
    Upload a single OBS table to cartodb by fuzzy ID
    '''
    force = BoolParameter(default=True, significant=False)
    id = Parameter(default=None)
    exact_id = Parameter(default=None)
    tablename = Parameter(default=None)

    def requires(self):
        session = current_session()
        if self.exact_id:
            table = session.query(OBSTable).get(self.exact_id)
        elif self.tablename:
            table = session.query(OBSTable).filter(
                OBSTable.tablename == self.tablename).one()
        elif self.id:
            table = session.query(OBSTable).filter(
                OBSTable.id.ilike('%' + self.id + '%')).one()
        else:
            raise Exception('Need id or exact_id for SyncData')
        return TableToCarto(table=table.tablename, force=self.force)
class BySegment(Task):
    LOCAL_ROOT = os.path.abspath("data")
    SHARED_RELATIVE_PATH = "output"

    subset = BoolParameter(default=True)

    requires = Requires()
    reviews = Requirement(ProcessImages)

    def output(self):
        return SuffixPreservingLocalTarget(
            self.LOCAL_ROOT + "/" + self.SHARED_RELATIVE_PATH,
            format=luigi.format.Nop)

    def run(self):
        os.makedirs(self.output().path)
        for img in glob.glob("data/images/*.jpg"):
            # cv_img = cv2.imread(img)
            gray = cv2.cvtColor(cv2.imread(img), cv2.COLOR_BGR2GRAY)
            binary = cv2.adaptiveThreshold(cv2.GaussianBlur(gray, (5, 5), 0),
                                           255,
                                           cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                           cv2.THRESH_BINARY, 11, 2)
            output = self.output().path + "/" + img.split('/')[-1]
            cv2.imwrite(output, binary)
class VaccineDataGlobalCleanupTask(Task):
    """Luigi Task to clean Vaccine time series data.

    The input is from an External Task that specifies files in GIT. The
    cleaning below removes rows with a null country or date and coerces the
    doses-administered columns to integers. The default parameters can be
    overridden for testing and are overridden in all test cases.

    Parameters:
        subset: bool, True to process one partition, False to process the
            entire dataset, default: True
        data_root: str, base directory to store cleaned output files

    Output:
        Dataframe stored in compressed Parquet format
    """

    # default parameters
    subset = BoolParameter(default=True)
    data_root = Parameter(default="./data/vaccine/")

    # External task completion is required, to work with GIT / CSVTarget
    requires = Requires()
    input_data = Requirement(VaccineDataGlobalTask)

    # TargetOutput returns ParquetTarget
    output = TargetOutput(
        "{task.data_root}",
        ext="subset-{task.subset}/",
        target_class=ParquetTarget,
        flag="_SUCCESS",
        storage_options=None,
    )

    def run(self):
        """Clean Vaccine data from the Task input and store the dataframe in
        Parquet format under the data directory.
        """
        # The columns ["Doses_admin", "People_partially_vaccinated",
        # "People_fully_vaccinated"] are all integers. However, given there are
        # missing values, you must first read them as floats, fill nan's as 0,
        # then convert to int. You can provide a dict of {col: dtype} when
        # providing the dtype arg in places like read_parquet and astype.
        number_columns = [
            "Doses_admin",
            "People_partially_vaccinated",
            "People_fully_vaccinated",
        ]

        # Ensure that the date column is parsed as a pandas datetime using parse_dates
        vdg_dask = self.input()["input_data"].read_dask(
            parse_dates=["Date"], dtype={c: "float" for c in number_columns})

        if self.subset:
            vdg_dask = vdg_dask.get_partition(0)

        # perform data cleaning
        # Remove any blank countries
        vdg_dask = vdg_dask[~vdg_dask.Country_Region.isnull()]
        # Filter out invalid dates
        vdg_dask = vdg_dask[~vdg_dask.Date.isnull()]

        # You should set the index to Country_Region and ensure the output
        # reads back with meaningful divisions
        # vdg_dask = vdg_dask.set_index("Country_Region")

        vdg_dask[number_columns] = vdg_dask[number_columns].fillna(0).astype(int)

        # write_dask parquet file output with gzip compression.
        vdg_output = vdg_dask
        self.output().write_dask(vdg_output, compression="gzip")
class CovidDataGlobalCleanupTask(Task):
    """Luigi Task to clean Covid time series data.

    The input is from an External Task that specifies files in GIT. The
    cleaning below removes rows with a null country and coerces the daily
    confirmed-case columns to integers. The default parameters can be
    overridden for testing and are overridden in all test cases.

    Parameters:
        subset: bool, True to process one partition, False to process the
            entire dataset, default: True
        data_root: str, base directory to store cleaned output files

    Output:
        Dataframe stored in compressed Parquet format
    """

    # default parameters
    subset = BoolParameter(default=True)
    data_root = Parameter(default="./data/covid/")

    # External task completion is required, to work with GIT / CSVTarget
    requires = Requires()
    input_data = Requirement(CovidDataGlobalTask)

    # TargetOutput returns ParquetTarget
    output = TargetOutput(
        "{task.data_root}",
        ext="subset-{task.subset}/",
        target_class=ParquetTarget,
        flag="_SUCCESS",
        storage_options=None,
    )

    def run(self):
        """Clean Covid data from the Task input and store the dataframe in
        Parquet format under the data directory.
        """
        # The 460+ daily date columns contain confirmed covid case counts and
        # are all integers. However, given there are missing values, you must
        # first read them as floats, fill nan's as 0, then convert to int.
        # You can provide a dict of {col: dtype} when providing the dtype arg
        # in places like read_parquet and astype.
        est = timezone("EST")
        cur_date = datetime.datetime.now(est)
        logging.info(cur_date)

        number_of_days = (
            cur_date
            - datetime.datetime.strptime("1/22/20", "%m/%d/%y").astimezone(est)
        ).days
        logging.info(number_of_days)

        # Build the list of daily column names, newest first, back to 1/22/20.
        number_columns = list()
        for days in range(1, number_of_days):
            number_columns.append(
                (datetime.datetime.now(est) - datetime.timedelta(days=days)).strftime(
                    "%-m/%-d/%y"
                )
            )
        logging.info(number_columns)

        cdg_dask = self.input()["input_data"].read_dask(
            dtype={c: "float" for c in number_columns}
        )

        if self.subset:
            cdg_dask = cdg_dask.get_partition(0)

        # perform data cleaning
        # Remove any blank countries
        cdg_dask = cdg_dask[~cdg_dask[cdg_dask.columns[1]].isnull()]

        # You should set the index to Country_Region and ensure the output
        # reads back with meaningful divisions
        # cdg_dask = cdg_dask.set_index("Country_Region")

        cdg_dask[number_columns] = cdg_dask[number_columns].fillna(0).astype(int)

        # write_dask parquet file output with gzip compression.
        cdg_output = cdg_dask
        self.output().write_dask(cdg_output, compression="gzip")
class GenerateRST(Task):
    force = BoolParameter(default=False)
    format = Parameter()
    section = Parameter(default=None)

    def __init__(self, *args, **kwargs):
        super(GenerateRST, self).__init__(*args, **kwargs)
        if self.force:
            shell('rm -rf catalog/source/*/*')

    def requires(self):
        requirements = {'meta': OBSMetaToLocal(force=True)}
        return requirements

    def output(self):
        tables = ['obs_meta', 'obs_meta_geom']
        if not all([
                PostgresTarget('observatory', t, non_empty=False).exists()
                for t in tables
        ]):
            return []
        targets = {}
        session = current_session()
        resp = session.execute('''
            WITH subquery AS (
                SELECT foo.geom_id,
                       CASE WHEN foo.key LIKE 'section%' THEN foo.key ELSE NULL END section,
                       CASE WHEN foo.key LIKE 'subsection%' THEN foo.key ELSE NULL END subsection
                FROM observatory.obs_meta_geom,
                     LATERAL (SELECT geom_id, * FROM jsonb_each(geom_tags)) foo),
            subquery2 AS (
                SELECT geom_id,
                       REPLACE(MAX(section), 'section/', '') section,
                       REPLACE(MAX(subsection), 'subsection/', '') subsection
                FROM subquery
                GROUP BY geom_id)
            SELECT DISTINCT UNNEST(section_tags), unnested.subsection_tags
            FROM observatory.obs_meta,
                 LATERAL (SELECT UNNEST(subsection_tags) AS subsection_tags) unnested
            UNION ALL
            SELECT DISTINCT section, subsection
            FROM subquery2
            WHERE section IS NOT NULL AND subsection IS NOT NULL
        ''')
        for section_id, subsection_id in resp:
            if self.section:
                if not section_id.startswith(self.section):
                    continue
            targets[(section_id, subsection_id)] = LocalTarget(
                'catalog/source/{section}/{subsection}.rst'.format(
                    section=strip_tag_id(section_id),
                    subsection=strip_tag_id(subsection_id)))
        targets[('licenses', None)] = LocalTarget('catalog/source/licenses.rst')
        targets[('sources', None)] = LocalTarget('catalog/source/sources.rst')
        return targets

    def template_globals(self):
        return {}

    def build_licenses(self, target):
        session = current_session()
        fhandle = target.open('w')
        fhandle.write(
            LICENSES_TEMPLATE.render(
                licenses=session.query(OBSTag).filter(
                    OBSTag.type == 'license').order_by(OBSTag.name),
                **self.template_globals()))
        fhandle.close()

    def build_sources(self, target):
        session = current_session()
        fhandle = target.open('w')
        fhandle.write(
            SOURCES_TEMPLATE.render(
                sources=session.query(OBSTag).filter(
                    OBSTag.type == 'source').order_by(OBSTag.name),
                **self.template_globals()))
        fhandle.close()

    def run(self):
        session = current_session()
        for section_subsection, target in self.output().items():
            section_id, subsection_id = section_subsection

            if section_id == 'licenses':
                self.build_licenses(target)
                continue
            elif section_id == 'sources':
                self.build_sources(target)
                continue

            section = session.query(OBSTag).get(section_id)
            subsection = session.query(OBSTag).get(subsection_id)

            LOGGER.info('%s:', section_subsection)

            if subsection_id == 'tags.boundary':
                column_tree, all_columns = self._boundaries_tree(
                    section_id, subsection_id)
            else:
                column_tree, all_columns = self._numerators_tree(
                    section_id, subsection_id)

            subsection_path = 'catalog/source/{section}/{subsection}/'.format(
                section=strip_tag_id(section_id),
                subsection=strip_tag_id(subsection_id))
            if not os.path.exists(subsection_path):
                os.makedirs(subsection_path)

            with open('catalog/source/{}.rst'.format(strip_tag_id(section_id)), 'w') \
                    as section_fhandle:
                section_fhandle.write(
                    SECTION_TEMPLATE.render(section=section,
                                            **self.template_globals()))

            target.makedirs()
            fhandle = target.open('w')
            fhandle.write(
                SUBSECTION_TEMPLATE.render(subsection=subsection,
                                           format=self.format,
                                           **self.template_globals()))
            fhandle.close()

            if not all_columns:
                continue

            self._write_column_tree(
                [strip_tag_id(section_id), strip_tag_id(subsection_id)],
                column_tree, all_columns)

    def _boundaries_tree(self, section_id, subsection_id):
        boundaries_list_result = current_session().execute('''
            SELECT DISTINCT c.id
            FROM observatory.obs_tag section_t,
                 observatory.obs_column_tag section_ct,
                 observatory.obs_tag subsection_t,
                 observatory.obs_column_tag subsection_ct,
                 observatory.obs_column c
            WHERE section_t.id = section_ct.tag_id
              AND subsection_t.id = subsection_ct.tag_id
              AND c.id = section_ct.column_id
              AND c.id = subsection_ct.column_id
              AND subsection_t.id = '{subsection_id}'
              AND section_t.id = '{section_id}'
              AND subsection_t.type = 'subsection'
              AND section_t.type = 'section'
            GROUP BY c.id
            ORDER BY c.id
        '''.format(section_id=section_id, subsection_id=subsection_id))
        boundary_ids = [row[0] for row in boundaries_list_result.fetchall()]

        boundaries_detail_result = current_session().execute('''
            SELECT c.id,
                   FIRST(c.name),
                   FIRST(c.description),
                   FIRST(c.type),
                   FIRST(ctab.extra),
                   FIRST(c.aggregate),
                   JSONB_Object_Agg(t.type || '/' || t.id, t.name),
                   'name' suggested_name,
                   ARRAY_AGG(DISTINCT tab.timespan) timespan,
                   ARRAY[]::Text[] denoms,
                   ARRAY[]::Text[],
                   ST_AsText(ST_Envelope(FIRST(tab.the_geom))) envelope
            FROM observatory.obs_column c,
                 observatory.obs_column_tag ct,
                 observatory.obs_tag t,
                 observatory.obs_column_table ctab,
                 observatory.obs_table tab
            WHERE c.id = ANY(ARRAY['{}'])
              AND ct.column_id = c.id
              AND ct.tag_id = t.id
              AND c.id = ctab.column_id
              AND tab.id = ctab.table_id
            GROUP BY 1, 8
        '''.format("', '".join(boundary_ids)))
        boundary_data = self._parse_columns(boundaries_detail_result)
        return {k: {} for k in list(boundary_data.keys())}, boundary_data

    def _numerators_tree(self, section_id, subsection_id):
        numerator_paths_result = current_session().execute('''
            WITH RECURSIVE children(numer_id, path) AS (
                SELECT numer_id, ARRAY[]::Text[]
                FROM observatory.obs_meta_numer children
                WHERE numer_tags ? 'subsection/{subsection_id}'
                  AND numer_tags ? 'section/{section_id}'
                  AND numer_weight > 0
                UNION
                SELECT parent.denom_id, children.numer_id || children.path
                FROM observatory.obs_meta parent, children
                WHERE parent.numer_id = children.numer_id
            )
            SELECT path FROM children WHERE numer_id IS NULL;
        '''.format(section_id=section_id, subsection_id=subsection_id))
        numerator_tree = {}
        numerator_ids = set()
        for row in numerator_paths_result.fetchall():
            node = numerator_tree
            for mid in row[0]:
                numerator_ids.add(mid)
                if mid not in node:
                    node[mid] = {}
                node = node[mid]

        numerator_details_result = current_session().execute('''
            SELECT numer_id,
                   numer_name,
                   numer_description,
                   numer_type,
                   numer_extra,
                   numer_aggregate,
                   numer_tags,
                   numer_colname suggested_name,
                   ARRAY[numer_timespan] timespan,
                   ARRAY_AGG(DISTINCT ARRAY[
                       denom_reltype, denom_id, denom_name
                   ]) denoms,
                   ARRAY_AGG(DISTINCT ARRAY[
                       geom_id, geom_name, numer_timespan, geom_tags::Text
                   ]) geom_timespans,
                   FIRST(ST_AsText(ST_Envelope(the_geom))) envelope
            FROM observatory.obs_meta
            WHERE numer_id = ANY (ARRAY['{}'])
            GROUP BY 1, 2, 3, 4, 5, 6, 7, 8, 9
        '''.format("', '".join(numerator_ids)))
        return numerator_tree, self._parse_columns(numerator_details_result)

    def _parse_columns(self, all_columns_result):
        all_columns = {}
        for col in all_columns_result:
            geom_timespans = {}
            for gt in col[10]:
                if gt[0] in geom_timespans:
                    geom_timespans[gt[0]]['timespans'].append(gt[2])
                else:
                    geom_timespans[gt[0]] = {
                        'geom_id': gt[0],
                        'geom_name': gt[1],
                        'timespans': [gt[2]],
                        'geom_tags': json.loads(gt[3])
                    }
            all_columns[col[0]] = {
                'id': col[0],
                'latlng': catalog_latlng(col[0]),
                'name': col[1],
                'description': col[2],
                'type': col[3],
                'extra': col[4],
                'aggregate': col[5],
                'tags': col[6],
                'suggested_name': col[7],
                'timespan': col[8],
                'timespan_sluggified': underscore_slugify('_'.join(col[8])),
                'licenses': [
                    tag_id.split('/')[1]
                    for tag_id, tag_name in col[6].items()
                    if tag_id.startswith('license/')
                ],
                'sources': [
                    tag_id.split('/')[1]
                    for tag_id, tag_name in col[6].items()
                    if tag_id.startswith('source/')
                ],
                'denoms': col[9],
                'geom_timespans': geom_timespans,
                'envelope': col[11]
            }
        return all_columns

    def _write_column_tree(self, path, tree, all_columns):
        for column_id, subtree in tree.items():
            column_path = path + [column_id]
            self._write_column(column_path, all_columns[column_id],
                               len(subtree))
            if subtree:
                os.makedirs('catalog/source/' + '/'.join(column_path))
                self._write_column_tree(column_path, subtree, all_columns)

    def _write_column(self, path, column, numchildren):
        with open('catalog/source/{path}.rst'.format(path='/'.join(path)),
                  'w') as column_file:
            column_file.write(
                COLUMN_TEMPLATE.render(intermediate_path='/'.join(path[:-1]),
                                       numchildren=numchildren,
                                       col=column,
                                       **self.template_globals()))