Example #1
class TestTaskBase(TestBaseTask):
    different_grandchild = BoolParameter()
    use_dynamic_dependency = BoolParameter()

    def _build_child_task(self, use_dynamic_dependency):
        index_for_grandchild = 0
        child_a = self.create_child_task(task_class=TestTaskChildA,
                                         index_for_grandchild=index_for_grandchild,
                                         use_dynamic_dependency=use_dynamic_dependency)
        if self.different_grandchild:
            index_for_grandchild = 1
        child_b = self.create_child_task(task_class=TestTaskChildB,
                                         index_for_grandchild=index_for_grandchild,
                                         use_dynamic_dependency=use_dynamic_dependency)
        return child_a, child_b

    def register_required(self):
        if not self.use_dynamic_dependency:
            child_a, child_b = self._build_child_task(False)
            self.child_tasks = self.register_dependencies([child_a, child_b])

    def run_task(self):
        if self.use_dynamic_dependency:
            child_a, child_b = self._build_child_task(True)
            yield from self.run_dependencies([child_a, child_b])
Example #2
class PlotSearchLines(FigureMaker):
    num_delays: Sequence[int]
    labels: List[str]
    legend_title: Optional[str] = CustomParameter(default=None)
    sequential_colors = BoolParameter(default=False)
    with_reference = BoolParameter(default=False)
    legend_outside = True
    linestyle = ".-"
    reference_color = "silver"

    @property
    def colors(self) -> List:
        num_colors = len(self.labels)
        if self.sequential_colors:
            cmap = get_cmap("viridis", num_colors)
            return list(cmap.colors)
        else:
            return [f"C{i}" for i in range(num_colors)]

    def work(self):
        fig, axes = subplots(nrows=2, sharex=True, figsize=paperfig())
        ax_top, ax_btm = axes
        ax_btm.set_xlabel("Number of delays")
        self.plot_on_axes(ax_top, ax_btm)
        self.add_legend(ax_top)
        fig.tight_layout()
        self.output().write(fig)

    def add_legend(self, ax):
        if self.legend_outside:
            loc_kwargs = dict(loc="center left", bbox_to_anchor=(0.98, 0.5))
        else:
            loc_kwargs = dict(loc="best")
        # Copy so the task's `labels` parameter is not mutated in place.
        labels = list(self.labels)
        colors = self.colors
        if self.with_reference:
            labels += ["Proposed online BPF"]
            colors += [self.reference_color]
        add_colored_legend(
            parent=ax,
            labels=labels,
            colors=colors,
            title=self.legend_title,
            **loc_kwargs,
        )

    @abstractmethod
    def plot_on_axes(self, ax_top: Axes, ax_btm: Axes):
        ...
Example #3
class CleanedReviews(Task):
    subset = BoolParameter(default=True)

    # Output should be a local ParquetTarget in ./data, ideally a salted output,
    # and with the subset parameter either reflected via salted output or
    # as part of the directory structure
    requires = csci_utils_req.Requires()
    other = csci_utils_req.Requirement(YelpReviews)
    path = os.path.abspath('data/subset') + '/'

    output = csci_utils_req.TargetOutput(
        file_pattern=path,
        target_class=csci_task.ParquetTarget,
        ext="",
        storage_options=dict(requester_pays=True))

    def run(self):

        numcols = ["funny", "cool", "useful", "stars"]
        dsk = self.input()["other"].read_dask()

        if self.subset:
            dsk = dsk.get_partition(0)

        # Keep rows with a valid 22-character review_id, then fill missing
        # numeric values and cast them to int (as in Examples #11 and #23).
        out = dsk[dsk["review_id"].str.len() == 22]
        out[numcols] = out[numcols].fillna(0).astype(int)
        self.output().write_dask(out, compression='gzip')
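The comment in CleanedReviews above asks for a salted ParquetTarget whose path reflects the subset parameter. A minimal sketch of the non-salted variant, folding the parameter into the directory structure and assuming TargetOutput interpolates "{task....}" placeholders into the file pattern the same way Examples #14 and #28 use them:

    # Hypothetical output declaration: the subset flag becomes part of the path.
    output = csci_utils_req.TargetOutput(
        file_pattern="data/CleanedReviews/subset-{task.subset}/",
        target_class=csci_task.ParquetTarget,
        ext="",
    )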
Example #4
class SyncAllData(WrapperTask):
    '''
    Sync all data to the linked CARTO account.
    '''

    force = BoolParameter(default=False, significant=False)

    def requires(self):
        existing_table_versions = dict([
            (r['tablename'], r['version'])
            for r in query_cartodb('SELECT * FROM obs_table').json()['rows']
        ])
        tables = dict([(k, v) for k, v in current_session().execute('''
            SELECT tablename, t.version
            FROM observatory.obs_table t,
                 observatory.obs_column_table ct,
                 observatory.obs_column c
            WHERE t.id = ct.table_id
              AND c.id = ct.column_id
              AND t.tablename NOT IN ('obs_ffebc3eb689edab4faa757f75ca02c65d7db7327')
              AND c.weight > 0
            ''')])

        for tablename, version in tables.items():
            if version > existing_table_versions.get(tablename, 0):
                force = True
            else:
                force = self.force
            yield TableToCartoViaImportAPI(table=tablename, force=force)
Example #5
class PipelineTask(BatchTask):

    local = BoolParameter(False, significant=False)

    @property
    def job_name(self):
        return '{}-{}'.format(self.task_family, self.sample_id)
Example #6
class DumpS3(Task):
    '''
    Uploads ``observatory`` schema dumped from :class:`~.carto.Dump` to
    `Amazon S3 <https://aws.amazon.com/s3/>`_, using credentials from ``.env``.

    Automatically updates :class:`~.meta.OBSDumpVersion`.

    :param timestamp: Optional date parameter, defaults to today.
    '''
    timestamp = DateParameter(default=date.today())
    force = BoolParameter(default=False, significant=False)

    def requires(self):
        return Dump(timestamp=self.timestamp)

    def run(self):
        shell('aws s3 cp {input} {output}'.format(input=self.input().path,
                                                  output=self.output().path))

    def output(self):
        path = self.input().path.replace('tmp/carto/Dump_', 'do-release-')
        path = path.replace('.dump', '/obs.dump')
        path = 's3://cartodb-observatory-data/{path}'.format(path=path)
        LOGGER.info(path)
        target = S3Target(path)
        if self.force:
            shell('aws s3 rm {output}'.format(output=path))
            self.force = False
        return target
Example #7
class AccountDimLoad(Task):
    """
    This task outputs a local ParquetTarget in ./data/accountdim/ for further load into the database.
    """
    LOCAL_ROOT = "./data/accountdim/"

    subset = BoolParameter(default=True)
    requires = Requires()
    account_dim = Requirement(AccountDimVer)
    output = TargetOutput(target_class=ParquetTarget,
                          file_pattern=LOCAL_ROOT,
                          ext="")

    def run(self):
        dtype_dic = {
            "CUST_ACCT": "object",
            "SEGMENT_NAME": "object",
            "SVC_PLAN": "object",
        }
        dsk = self.input()["account_dim"].read_dask(dtype=dtype_dic)

        if self.subset:
            dsk = dsk.get_partition(0)

        self.output().write_dask(dsk, compression="gzip", compute=True)
Example #8
class LineDimLoad(Task):
    """
    This task outputs a local ParquetTarget in ./data/linedim/ for further load into the database.
    """
    LOCAL_ROOT = "./data/linedim/"

    subset = BoolParameter(default=True)
    requires = Requires()
    line_dim = Requirement(LineDimVer)
    output = TargetOutput(target_class=ParquetTarget,
                          file_pattern=LOCAL_ROOT,
                          ext="")

    def run(self):
        dtype_dic = {
            "MDN": "object",
            "DEVICE_GROUPING": "object",
            "SALES_CHANNEL": "object",
        }
        dsk = self.input()["line_dim"].read_dask(dtype=dtype_dic)

        if self.subset:
            dsk = dsk.get_partition(0)

        self.output().write_dask(dsk, compression="gzip", compute=True)
Example #9
class LimitFactLoad(Task):
    """
    This task outputs a local ParquetTarget in ./data/limitfact/ for further load into the database.
    """
    LOCAL_ROOT = "./data/limitfact/"

    subset = BoolParameter(default=True)
    requires = Requires()
    limit_fact = Requirement(LimitFactVer)
    output = TargetOutput(target_class=ParquetTarget,
                          file_pattern=LOCAL_ROOT,
                          ext="")

    def run(self):
        dtype_dic = {
            "MTN": "object",
            "CUST_ACCT": "object",
            "LIMITING_DT": "object",
            "LIMIT_TYPE": "object",
        }
        dsk = self.input()["limit_fact"].read_dask(dtype=dtype_dic)

        if self.subset:
            dsk = dsk.get_partition(0)

        self.output().write_dask(dsk, compression="gzip", compute=True)
Example #10
class ETLAnalysisPrint(Task):
    """
    Final ETL analysis step: read the analysis parquet back with read_dask, compute it,
    and print it.  Each analysis task implements this as its last bit of work, so the
    computed dataframe can be inspected on the terminal.

    Parameters:
        subset: bool, used to subset true or false, default: True
        analysis_path: str, final results are stored as parquet files here

    Output:
        print the analysis dataframe for visual
    """

    # Default parameters
    subset = BoolParameter(default=True)
    analysis_path = Parameter(default="./data/vaccine/")

    requires = Requires()

    def complete(self):
        """
        Always returns False so the task is never considered complete and re-runs (and re-prints) every time.
        """
        return False

    def run(self):
        """
        Read the dask, compute and print.
        """
        analysis_output_dataframe = self.input()["input_data"].read_dask()
        logging.info(analysis_output_dataframe.compute())
Example #11
class CleanedReviews(Task):
    subset = BoolParameter(default=True)

    # Output should be a local ParquetTarget in ./data, ideally a salted output,
    # and with the subset parameter either reflected via salted output or
    # as part of the directory structure

    requires = Requires()
    reviews = Requirement(YelpReviews)

    output = TargetOutput(file_pattern='data/',
                          target_class=ParquetTarget,
                          glob='*.parquet')

    def run(self):

        numcols = ["funny", "cool", "useful", "stars"]
        strcols = ['review_id', 'user_id', 'business_id', 'text']
        dsk = self.input()["reviews"].read_dask()
        dsk[numcols] = dsk[numcols].astype("float64")
        dsk = dsk.fillna(0)
        dsk['date'] = dsk['date'].astype('datetime64[ns]')
        dsk = dsk[dsk.review_id.str.len() == 22]

        #dsk = dsk.set_index(dsk.review_id)
        dsk[numcols] = dsk[numcols].astype("int64")

        if self.subset:
            dsk = dsk.get_partition(0)

        dsk[strcols] = dsk[strcols].astype("str")

        self.output().write_dask(collection=dsk, compression='gzip')
Example #12
class PDFCatalogToS3(Task):

    timestamp = DateParameter(default=date.today())
    force = BoolParameter(significant=False)

    def __init__(self, **kwargs):
        if kwargs.get('force'):
            try:
                shell('aws s3 rm s3://data-observatory/observatory.pdf')
            except:
                pass
        super(PDFCatalogToS3, self).__init__(**kwargs)

    def run(self):
        for target in self.output():
            shell('aws s3 cp catalog/build/observatory.pdf {output} '
                  '--acl public-read'.format(output=target.path))

    def output(self):
        return [
            S3Target('s3://data-observatory/observatory.pdf'),
            S3Target(
                's3://data-observatory/observatory-{timestamp}.pdf'.format(
                    timestamp=self.timestamp)),
        ]
Example #13
class BseBetMask(Task):
    bse_betmask_prefix = Parameter(default='')
    bet_threshold = FloatParameter(default=float(BET_THRESHOLD))
    slicer_exec = Parameter(default='')
    mask_qc = BoolParameter(default=False)

    def run(self):

        bet_mask = self.bse_betmask_prefix._path + '_mask.nii.gz'

        if not isfile(bet_mask):
            cmd = (' ').join([
                'bet_mask.py', '-i',
                self.input(), '-o', self.bse_betmask_prefix,
                f'-f {self.bet_threshold}' if self.bet_threshold else ''
            ])
            p = Popen(cmd, shell=True)
            p.wait()

            if p.returncode:
                return

        # mask the baseline image
        cmd = (' ').join([
            'ImageMath', '3',
            self.output()['bse'], 'm',
            self.output()['bse'], bet_mask
        ])
        p = Popen(cmd, shell=True)
        p.wait()

        if self.slicer_exec or self.mask_qc:
            print(
                '\n\n** Check quality of created mask {} . Once you are done, save the (edited) mask as {} **\n\n'
                .format(bet_mask,
                        self.output()['mask']))

        if self.slicer_exec:
            cmd = (' ').join([
                self.slicer_exec, '--python-code',
                '\"slicer.util.loadVolume(\'{}\'); '
                'slicer.util.loadLabelVolume(\'{}\')\"'.format(
                    self.input(), bet_mask)
            ])

            p = Popen(cmd, shell=True)
            p.wait()

        elif self.mask_qc:
            while 1:
                sleep(QC_POLL)
                if isfile(self.bse_betmask_prefix._path + 'Qc_mask.nii.gz'):
                    break

    def output(self):
        mask = _mask_name(self.bse_betmask_prefix, self.slicer_exec,
                          self.mask_qc)
        return dict(bse=self.input(), mask=mask)
Example #14
class ETLAnalysis(Task):
    """Created an abstract class for conducting analysis of covid data
    at different levels - by country, by year, by month and by week.  This is a luigi
    task and sub-classed by the different levels of covid data analysis tasks.  The analysis
    abstract class requires Cleanup and the parquet files for performing
    the analysis and display.

    This abstract class has one analysis method to override / implement in their
    respective tasks.

    Each analysis should be a separate Luigi task, which computes its analysis and writes
    the result to parquet. To display to the terminal or answer a quiz, the output should
    be read back from the written parquet file.

    Parameters:
        subset: bool, True to process just one partition, False to process
            the entire dataset, default: True
        analysis_path: str, base directory to store output files

    Output:
        Dataframe stored in compressed Parquet format in
            {task.analysis_path}/{task.sub_dir}/subset-{task.subset}/
    """

    subset = BoolParameter(default=True)
    analysis_path = Parameter(default="./data/covid/")

    requires = Requires()
    input_data = Requirement(CovidDataGlobalCleanupTask)

    # the output references a "sub_dir" parameter, which is expected to be defined
    # in a subclass
    output = TargetOutput(
        "{task.analysis_path}{task.sub_dir}",
        ext="subset-{task.subset}/",
        target_class=ParquetTarget,
        flag="_SUCCESS",
    )

    def perform_analysis(self, df):
        """ this method will be implemented by sub-classes. """
        raise NotImplementedError

    def run(self):
        """
        Uses the data points we need for analysis -> Country_Region and Date
        calls the implemented perform_analysis method to do the calculations
        """
        analysis_dataframe = self.input()["input_data"].read_dask()

        # invoke perform_analysis from the implemented sub-classes
        # only gets the aggregated analysis column and the calculated column
        output_dataframe = self.perform_analysis(analysis_dataframe)
        # write_dask parquet file output with gzip compression.
        self.output().write_dask(output_dataframe, write_index=True, compression="gzip")
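The docstring above expects each analysis to live in its own subclass that defines sub_dir and implements perform_analysis. A hypothetical sketch of such a subclass (the class name, the sub_dir value and the aggregation are illustrative, not taken from the original project):

class ByCountryAnalysis(ETLAnalysis):
    """Hypothetical example: total confirmed cases per country."""

    sub_dir = "by_country/"  # consumed by the TargetOutput pattern in ETLAnalysis

    def perform_analysis(self, df):
        # Aggregate the cleaned data by country; the base class run() writes
        # the returned dask dataframe to parquet.
        return df.groupby("Country_Region").sum()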
Example #15
class ProcessImages(Task):
    subset = BoolParameter(default=True)
    requires = Requires()
    cleaned_review = Requirement(DownloadImage)
    LOCAL_ROOT = os.path.abspath("data")
    SHARED_RELATIVE_PATH = "small"
    height = 100
    width = 100

    def requires(self):
        """:param image - requires DownloadImage function from data file
        :returns image and model dict"""
        return {"image": DownloadImage()}

    def output(self):

        return SuffixPreservingLocalTarget(self.LOCAL_ROOT + "/" +
                                           self.SHARED_RELATIVE_PATH + "/")

    def run(self):

        os.makedirs(self.output().path)
        for img in glob.glob("data/images/*.jpg"):
            image_pil = Image.open(img, 'r')
            ratio_w = self.width / image_pil.width
            ratio_h = self.height / image_pil.height
            if ratio_w < ratio_h:
                # It must be fixed by width
                resize_width = self.width
                resize_height = round(ratio_w * image_pil.height)
            else:
                # Fixed by height
                resize_width = round(ratio_h * image_pil.width)
                resize_height = self.height
            image_resize = image_pil.resize((resize_width, resize_height),
                                            Image.ANTIALIAS)
            background = Image.new('RGB', (self.width, self.height),
                                   (255, 255, 255))
            background.paste(image_resize,
                             (round((self.width - resize_width) / 2),
                              round((self.height - resize_height) / 2)))
            with self.output().atomic_provider(self.output().path +
                                               img.split('/')[-1]) as outfile:
                background.save(outfile,
                                "JPEG",
                                quality=80,
                                optimize=True,
                                progressive=True)
Example #16
class EasyDockerTask(Task):
    runlocal = BoolParameter(
        significant=False
    )  # default is False. So it starts a container by default.

    def _cmd_params(self):
        l = []
        for k, v in self.to_str_params().items():
            if k == 'runlocal':
                continue
            l.append("--{}".format(k).replace("_", "-"))
            l.append(v)
        return l

    @property
    def command(self):
        # Re-invoke this very task inside the container with --runlocal set, so
        # the containerized run executes the task body instead of trying to
        # start yet another container.
        cmd = [
            "luigi", "--local-scheduler", self.task_family, "--module",
            self.task_module, "--runlocal", *self._cmd_params()
        ]
        return cmd

    def get_whole_env(self):
        return {k: v for k, v in os.environ.items()}

    def run(self):
        if self.runlocal:
            print("been there inside a container", self)

            Task.run(self)
        else:
            print("been there before calling docker stuff", self.command)

            docker_environment = self.get_whole_env()
            docker_environment[
                "PYTHONPATH"] = "."  # TODO: this could be baked directly into the image
            print("docker environment\n", docker_environment)

            import docker
            client = docker.from_env()

            client.containers.run(
                self.image,
                self.command,
                environment=docker_environment,
                stream=True,
                # detach=True,
            )
Example #17
class Catalog(Task):

    force = BoolParameter(default=False)
    format = Parameter(default='html')
    parallel_workers = Parameter(default=4)

    def requires(self):
        return GenerateRST(force=self.force, format=self.format)

    def complete(self):
        return getattr(self, '_complete', False)

    def run(self):
        shell("cd catalog && make SPHINXOPTS='-j {0}' {1}".format(
            self.parallel_workers, self.format))
        self._complete = True
Example #18
class PnlEddyEpi(Task):
    eddy_epi_prefix = Parameter(default='')
    eddy_epi_bse_masked_prefix = Parameter(default='')
    eddy_epi_bse_betmask_prefix = Parameter(default='')
    debug = BoolParameter(default=False)
    epi_nproc = IntParameter(default=N_PROC)

    def requires(self):
        return dict(eddy=self.clone(PnlEddy), t2=self.clone(StructMask))

    def run(self):

        for name in ['dwi', 'bval', 'bvec']:
            if not self.output()[name].exists():
                cmd = (' ').join([
                    'pnl_epi.py', '--dwi',
                    self.input()['eddy']['dwi'], '--bvals',
                    self.input()['eddy']['bval'], '--bvecs',
                    self.input()['eddy']['bvec'], '--dwimask',
                    self.input()['eddy']['mask'], '--bse',
                    self.input()['eddy']['bse'], '--t2',
                    self.input()['t2']['aligned'], '--t2mask',
                    self.input()['t2']['mask'], '-o', self.eddy_epi_prefix,
                    '-d' if self.debug else '',
                    f'-n {self.epi_nproc}' if self.epi_nproc else ''
                ])
                p = Popen(cmd, shell=True)
                p.wait()

                break

        self.dwi = self.output()['dwi']
        self.bse_prefix = self.eddy_epi_bse_masked_prefix
        self.bse_betmask_prefix = self.eddy_epi_bse_betmask_prefix
        yield self.clone(BseBetMask)

    def output(self):
        dwi = self.eddy_epi_prefix.with_suffix('.nii.gz')
        bval = self.eddy_epi_prefix.with_suffix('.bval')
        bvec = self.eddy_epi_prefix.with_suffix('.bvec')
        bse = self.eddy_epi_bse_masked_prefix.with_suffix('.nii.gz')

        mask = _mask_name(self.eddy_epi_bse_betmask_prefix, self.slicer_exec,
                          self.mask_qc)

        return dict(dwi=dwi, bval=bval, bvec=bvec, bse=bse, mask=mask)
Example #19
class TestTaskChildB(TestBaseTask):
    index_for_grandchild = IntParameter()
    use_dynamic_dependency = BoolParameter()

    def register_required(self):
        if not self.use_dynamic_dependency:
            grandchild = self.create_child_task(task_class=TestTaskGrandchild,
                                                index_for_grandchild=self.index_for_grandchild)
            self.register_dependency(grandchild)

    def run_task(self):
        if self.use_dynamic_dependency:
            grandchild = self.create_child_task(task_class=TestTaskGrandchild,
                                                index_for_grandchild=self.index_for_grandchild)
            yield from self.run_dependencies(grandchild)

    def cleanup_task(self, success: bool):
        pass
Example #20
class SyncMetadata(WrapperTask):

    no_force = BoolParameter(default=False, significant=False)

    def requires(self):
        for table in (
                'obs_table',
                'obs_column',
                'obs_column_table',
                'obs_tag',
                'obs_column_tag',
                'obs_dump_version',
                'obs_column_to_column',
                'obs_meta',
                'obs_meta_numer',
                'obs_meta_denom',
                'obs_meta_geom',
                'obs_meta_timespan',
                'obs_meta_geom_numer_timespan',
                'obs_column_table_tile',
        ):
            if table == 'obs_meta':
                yield TableToCartoViaImportAPI(columns=[
                    'numer_id', 'denom_id', 'geom_id', 'numer_name',
                    'denom_name', 'geom_name', 'numer_description',
                    'denom_description', 'geom_description', 'numer_aggregate',
                    'denom_aggregate', 'geom_aggregate', 'numer_type',
                    'denom_type', 'geom_type', 'numer_colname',
                    'denom_colname', 'geom_colname', 'numer_geomref_colname',
                    'denom_geomref_colname', 'geom_geomref_colname',
                    'numer_tablename', 'denom_tablename', 'geom_tablename',
                    'numer_timespan', 'denom_timespan', 'numer_weight',
                    'denom_weight', 'geom_weight', 'geom_timespan',
                    'numer_tags', 'denom_tags', 'geom_tags', 'timespan_tags',
                    'section_tags', 'subsection_tags', 'unit_tags',
                    'numer_extra', 'numer_ct_extra', 'denom_extra',
                    'denom_ct_extra', 'geom_extra', 'geom_ct_extra'
                ],
                                               table=table,
                                               force=not self.no_force)
            else:
                yield TableToCartoViaImportAPI(table=table,
                                               force=not self.no_force)
Example #21
class MergeKallisto(Task):

    expt_id = Parameter()
    annot = BoolParameter(False)

    def requires(self):
        return {
            sample_id: Kallisto(sample_id=sample_id)
            for sample_id in get_samples(self.expt_id)
        }

    def output(self):
        prefix = '{}/{}/'.format(cfg['S3_BUCKET'], self.expt_id)
        out_dict = {
            'est_counts': S3Target(prefix + 'est_counts.csv'),
            'tpm': S3Target(prefix + 'tpm.csv')
        }
        if self.annot:
            out_dict['annotations'] = S3Target(prefix + 'annotations.csv')
        return out_dict

    def run(self):
        # Gather input filepaths and labels
        tgt_dict = self.input()
        sample_ids = list(tgt_dict.keys())
        fpaths = [
            tgt_dict[sample_id]['abundance'].path for sample_id in sample_ids
        ]

        # Merge columns
        annotations, est_counts = merge_column(fpaths,
                                               sample_ids,
                                               data_col='est_counts',
                                               annot=self.annot)
        annotations, tpm = merge_column(fpaths,
                                        sample_ids,
                                        data_col='tpm',
                                        annot=self.annot)

        if self.annot:
            csv_to_s3(annotations, self.output()['annotations'].path)
        csv_to_s3(est_counts, self.output()['est_counts'].path)
        csv_to_s3(tpm, self.output()['tpm'].path)
Example #22
class Fs2Dwi(Task):

    fs_in_dwi = Parameter()
    debug = BoolParameter(default=False)
    mode = Parameter(default='direct')

    def requires(self):
        if self.mode == 'direct':
            return dict(fs_dir=self.clone(Freesurfer),
                        corrected=self.clone(PnlEddy))

        elif self.mode == 'witht2':
            fs_dir = self.clone(Freesurfer)

            self.struct_template = self.t2_template
            self.struct_align_prefix = self.t2_align_prefix
            self.mabs_mask_prefix = self.t2_mask_prefix
            self.csvFile = self.t2_csvFile
            self.model_img = self.t2_model_img
            self.model_mask = self.t2_model_mask

            corrected = self.clone(PnlEddyEpi)
            t2_attr = self.clone(StructMask)

            return dict(fs_dir=fs_dir, corrected=corrected, t2_attr=t2_attr)

    def run(self):
        cmd = (' ').join([
            'fs2dwi.py', '-f',
            self.input()['fs_dir'], '--bse',
            self.input()['corrected']['bse'], '--dwimask',
            self.input()['corrected']['mask'], '-o', self.fs_in_dwi.dirname,
            '-d' if self.debug else '', self.mode,
            '--t2 {} --t2mask {}'.format(self.input()['t2_attr']['aligned'],
                                         self.input()['t2_attr']['mask'])
            if self.mode == 'witht2' else ''
        ])
        p = Popen(cmd, shell=True)
        p.wait()

    def output(self):
        return self.fs_in_dwi
Example #23
class CleanedReviews(Task):
    __version__ = "1.0.0"
    subset = BoolParameter(default=True)
    requires = Requires()
    task2 = Requirement(YelpReviews)
    parquet_data = "./yelpdata/"

    output = TargetOutput(file_pattern=parquet_data, ext="", target_class=ParquetTarget)

    def run(self):

        df = self.input()["task2"].read_dask(check_complete=True)


        df = df[(df.user_id.notnull()) & (df.review_id.str.len() == 22)]
        values = {"funny": 0, "cool": 0, "useful": 0, "stars": 0}
        df = df.fillna(value=values)
        df = df.astype({"funny": int, "cool": int, "useful": int, "stars": int})

        self.output().write_dask(collection=df, compression="gzip")
Example #24
class OutputStorage(Task):

    LOCAL_ROOT = os.path.abspath("data")
    SHARED_RELATIVE_PATH = "storage"
    subset = BoolParameter(default=True)
    requires = Requires()
    reviews = Requirement(ContentImage)

    def output(self):

        return SuffixPreservingLocalTarget(self.LOCAL_ROOT + "/" +
                                           self.SHARED_RELATIVE_PATH + "/")

    def run(self):
        os.makedirs(self.output().path)
        filename_pattern = os.path.join('./data/images', '*.jpg')

        dsk_images = dask_image.imread.imread(filename_pattern)

        da.to_hdf5('data/storage' + "/" + 'stored.hdf5', {'/x': dsk_images[0]})
Example #25
class PnlEddy(Task):
    eddy_prefix = Parameter()
    eddy_bse_masked_prefix = Parameter()
    eddy_bse_betmask_prefix = Parameter()
    debug = BoolParameter(default=False)
    eddy_nproc = IntParameter(default=int(N_PROC))

    def run(self):

        for name in ['dwi', 'bval', 'bvec']:
            if not self.output()[name].exists():
                cmd = (' ').join([
                    'pnl_eddy.py', '-i',
                    self.input()['dwi'], '--bvals',
                    self.input()['bval'], '--bvecs',
                    self.input()['bvec'], '-o', self.eddy_prefix,
                    '-d' if self.debug else '',
                    f'-n {self.eddy_nproc}' if self.eddy_nproc else ''
                ])
                p = Popen(cmd, shell=True)
                p.wait()

                break

        self.dwi = self.output()['dwi']
        self.bse_prefix = self.eddy_bse_masked_prefix
        self.bse_betmask_prefix = self.eddy_bse_betmask_prefix
        yield self.clone(BseBetMask)

    def output(self):
        dwi = self.eddy_prefix.with_suffix('.nii.gz')
        bval = self.eddy_prefix.with_suffix('.bval')
        bvec = self.eddy_prefix.with_suffix('.bvec')
        bse = self.eddy_bse_masked_prefix.with_suffix('.nii.gz')

        mask = _mask_name(self.eddy_bse_betmask_prefix, self.slicer_exec,
                          self.mask_qc)

        return dict(dwi=dwi, bval=bval, bvec=bvec, bse=bse, mask=mask)
Example #26
class SyncData(WrapperTask):
    '''
    Upload a single OBS table to cartodb by fuzzy ID
    '''
    force = BoolParameter(default=True, significant=False)
    id = Parameter(default=None)
    exact_id = Parameter(default=None)
    tablename = Parameter(default=None)

    def requires(self):
        session = current_session()
        if self.exact_id:
            table = session.query(OBSTable).get(self.exact_id)
        elif self.tablename:
            table = session.query(OBSTable).filter(
                OBSTable.tablename == self.tablename).one()
        elif self.id:
            table = session.query(OBSTable).filter(
                OBSTable.id.ilike('%' + self.id + '%')).one()
        else:
            raise Exception('Need id or exact_id for SyncData')
        return TableToCarto(table=table.tablename, force=self.force)
Example #27
class BySegment(Task):
    LOCAL_ROOT = os.path.abspath("data")
    SHARED_RELATIVE_PATH = "output"
    subset = BoolParameter(default=True)
    requires = Requires()
    reviews = Requirement(ProcessImages)

    def output(self):
        return SuffixPreservingLocalTarget(self.LOCAL_ROOT + "/" +
                                           self.SHARED_RELATIVE_PATH,
                                           format=luigi.format.Nop)

    def run(self):

        os.makedirs(self.output().path)

        for img in glob.glob("data/images/*.jpg"):
            # cv_img = cv2.imread(img)
            gray = cv2.cvtColor(cv2.imread(img), cv2.COLOR_BGR2GRAY)
            binary = cv2.adaptiveThreshold(cv2.GaussianBlur(gray, (5, 5), 0),
                                           255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                           cv2.THRESH_BINARY, 11, 2)
            output = self.output().path + "/" + img.split('/')[-1]
            cv2.imwrite(output, binary)
Example #28
class VaccineDataGlobalCleanupTask(Task):
    """Luigi Task to clean Vaccine time series data. The input is from
    External Task that specifies files in GIT. The cleaning from below code handles
    removing rows with null date and doses administered values are non-zero.
    The default parameters can be overridden for testing and I have overridden for
    all test cases.

    Parameters:
        subset: bool, True to process one partition, False to process the entire dataset
                    default: True
        data_root: str, base directory to store cleaned output files

    Output:
        Dataframe stored in compressed Parquet format

    """

    # default parameters
    subset = BoolParameter(default=True)
    data_root = Parameter(default="./data/vaccine/")

    # External task completion is required, to work with GIT / CSVTarget
    requires = Requires()
    input_data = Requirement(VaccineDataGlobalTask)

    # TargetOutput returns ParquetTarget
    output = TargetOutput(
        "{task.data_root}",
        ext="subset-{task.subset}/",
        target_class=ParquetTarget,
        flag="_SUCCESS",
        storage_options=None,
    )

    def run(self):
        """
        Clean Vaccine data from Task input and stores dataframe in Parquet format.

        :return:
            File content is stored in the data directory
        """

        # The columns ["Doses_admin", "People_partially_vaccinated", "People_fully_vaccinated"]
        # are all integers. However, given there are missing values, you must first
        # read them as floats, fill nan's as 0, then convert to int.
        # You can provide a dict of {col: dtype} when providing the dtype arg in places like
        # read_parquet and astype.
        number_columns = [
            "Doses_admin",
            "People_partially_vaccinated",
            "People_fully_vaccinated",
        ]
        # Ensure that the date column is parsed as a pandas datetime using parse_dates
        vdg_dask = self.input()["input_data"].read_dask(
            parse_dates=["Date"], dtype={c: "float"
                                         for c in number_columns})

        if self.subset:
            vdg_dask = vdg_dask.get_partition(0)

        # perform data cleaning
        # Remove any blank countries
        vdg_dask = vdg_dask[~vdg_dask.Country_Region.isnull()]
        # Filter out invalid dates
        vdg_dask = vdg_dask[~vdg_dask.Date.isnull()]

        # You should set the index to Country_Region and ensure the output reads back with meaningful divisions
        # vdg_dask = vdg_dask.set_index("Country_Region")
        vdg_dask[number_columns] = vdg_dask[number_columns].fillna(0).astype(
            int)

        # write_dask parquet file output with gzip compression.
        vdg_output = vdg_dask
        self.output().write_dask(vdg_output, compression="gzip")
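The commented-out set_index above is the remaining step: indexing before the write is what gives the parquet output meaningful divisions when it is read back. A sketch of that final write, under the assumption that downstream tasks want Country_Region as the index:

        # Hypothetical variant of the final write: index by Country_Region so
        # the parquet output reads back with meaningful divisions.
        vdg_output = vdg_dask.set_index("Country_Region")
        self.output().write_dask(vdg_output, compression="gzip")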
Example #29
class CovidDataGlobalCleanupTask(Task):
    """Luigi Task to clean Covid time series data. The input is from
    External Task that specifies files in GIT. The cleaning from below code handles
    removing rows with null date and confirmed cases values are non-zero.
    The default parameters can be overridden for testing and I have overridden for
    all test cases.

    Parameters:
        subset: bool, True to process one partition, False to process the entire dataset
                    default: True
        data_root: str, base directory to store cleaned output files

    Output:
        Dataframe stored in compressed Parquet format

    """

    # default parameters
    subset = BoolParameter(default=True)
    data_root = Parameter(default="./data/covid/")

    # External task completion is required, to work with GIT / CSVTarget
    requires = Requires()
    input_data = Requirement(CovidDataGlobalTask)

    # TargetOutput returns ParquetTarget
    output = TargetOutput(
        "{task.data_root}",
        ext="subset-{task.subset}/",
        target_class=ParquetTarget,
        flag="_SUCCESS",
        storage_options=None,
    )

    def run(self):
        """
        Clean Covid data from Task input and stores dataframe in Parquet format.

        :return:
            File content is stored in the data directory
        """

        # The 460-plus date columns (each holding confirmed covid case counts)
        # are all integers. However, given there are missing values, you must first
        # read them as floats, fill nan's as 0, then convert to int.
        # You can provide a dict of {col: dtype} when providing the dtype arg in places like
        # read_parquet and astype.
        est = timezone("EST")
        cur_date = datetime.datetime.now(est)
        logging.info(cur_date)
        number_of_days = (
            cur_date - datetime.datetime.strptime("1/22/20", "%m/%d/%y").astimezone(est)
        ).days
        logging.info(number_of_days)
        number_columns = list()
        for days in range(1, number_of_days):
            number_columns.append(
                (datetime.datetime.now(est) - datetime.timedelta(days=days)).strftime(
                    "%-m/%-d/%y"
                )
            )
        logging.info(number_columns)
        # Ensure that the date column is parsed as a pandas datetime using parse_dates
        cdg_dask = self.input()["input_data"].read_dask(
            dtype={c: "float" for c in number_columns}
        )

        if self.subset:
            cdg_dask = cdg_dask.get_partition(0)

        # perform data cleaning
        # Remove any blank countries
        cdg_dask = cdg_dask[~cdg_dask[cdg_dask.columns[1]].isnull()]

        # You should set the index to Country_Region and ensure the output reads back with meaningful divisions
        # vdg_dask = vdg_dask.set_index("Country_Region")
        cdg_dask[number_columns] = cdg_dask[number_columns].fillna(0).astype(int)

        # write_dask parquet file output with gzip compression.
        cdg_output = cdg_dask
        self.output().write_dask(cdg_output, compression="gzip")
Example #30
class GenerateRST(Task):

    force = BoolParameter(default=False)
    format = Parameter()
    section = Parameter(default=None)

    def __init__(self, *args, **kwargs):
        super(GenerateRST, self).__init__(*args, **kwargs)
        if self.force:
            shell('rm -rf catalog/source/*/*')

    def requires(self):
        requirements = {'meta': OBSMetaToLocal(force=True)}
        return requirements

    def output(self):
        tables = ['obs_meta', 'obs_meta_geom']
        if not all([
                PostgresTarget('observatory', t, non_empty=False).exists()
                for t in tables
        ]):
            return []

        targets = {}
        session = current_session()

        resp = session.execute('''
          WITH subquery AS (SELECT
            foo.geom_id,
            CASE WHEN foo.key LIKE 'section%' THEN foo.key ELSE NULL END section,
            CASE WHEN foo.key LIKE 'subsection%' THEN foo.key ELSE NULL END subsection
            FROM observatory.obs_meta_geom,
               LATERAL (SELECT geom_id, * FROM jsonb_each(geom_tags)) foo),
          subquery2 as (SELECT
            geom_id,
            REPLACE(MAX(section), 'section/', '') section,
            REPLACE(MAX(subsection), 'subsection/', '') subsection
            FROM subquery GROUP BY geom_id)
          SELECT DISTINCT UNNEST(section_tags), unnested.subsection_tags
          FROM observatory.obs_meta, LATERAL (SELECT UNNEST(subsection_tags) AS subsection_tags) unnested
          UNION ALL
          SELECT DISTINCT section, subsection
          FROM subquery2
          WHERE section IS NOT NULL
            AND subsection IS NOT NULL
        ''')
        for section_id, subsection_id in resp:
            if self.section:
                if not section_id.startswith(self.section):
                    continue
            targets[(section_id, subsection_id)] = LocalTarget(
                'catalog/source/{section}/{subsection}.rst'.format(
                    section=strip_tag_id(section_id),
                    subsection=strip_tag_id(subsection_id)))

        targets[('licenses',
                 None)] = LocalTarget('catalog/source/licenses.rst')
        targets[('sources', None)] = LocalTarget('catalog/source/sources.rst')

        return targets

    def template_globals(self):
        return {}

    def build_licenses(self, target):
        session = current_session()
        fhandle = target.open('w')
        fhandle.write(
            LICENSES_TEMPLATE.render(licenses=session.query(OBSTag).filter(
                OBSTag.type == 'license').order_by(OBSTag.name),
                                     **self.template_globals()))
        fhandle.close()

    def build_sources(self, target):
        session = current_session()
        fhandle = target.open('w')
        fhandle.write(
            SOURCES_TEMPLATE.render(sources=session.query(OBSTag).filter(
                OBSTag.type == 'source').order_by(OBSTag.name),
                                    **self.template_globals()))
        fhandle.close()

    def run(self):
        session = current_session()

        for section_subsection, target in self.output().items():
            section_id, subsection_id = section_subsection

            if section_id == 'licenses':
                self.build_licenses(target)
                continue
            elif section_id == 'sources':
                self.build_sources(target)
                continue

            section = session.query(OBSTag).get(section_id)
            subsection = session.query(OBSTag).get(subsection_id)

            LOGGER.info('%s:', section_subsection)

            if subsection_id == 'tags.boundary':
                column_tree, all_columns = self._boundaries_tree(
                    section_id, subsection_id)
            else:
                column_tree, all_columns = self._numerators_tree(
                    section_id, subsection_id)

            subsection_path = 'catalog/source/{section}/{subsection}/'.format(
                section=strip_tag_id(section_id),
                subsection=strip_tag_id(subsection_id))
            if not os.path.exists(subsection_path):
                os.makedirs(subsection_path)
            with open('catalog/source/{}.rst'.format(strip_tag_id(section_id)), 'w') \
                    as section_fhandle:
                section_fhandle.write(
                    SECTION_TEMPLATE.render(section=section,
                                            **self.template_globals()))

            target.makedirs()
            fhandle = target.open('w')
            fhandle.write(
                SUBSECTION_TEMPLATE.render(subsection=subsection,
                                           format=self.format,
                                           **self.template_globals()))

            fhandle.close()

            if not all_columns:
                continue

            self._write_column_tree(
                [strip_tag_id(section_id),
                 strip_tag_id(subsection_id)], column_tree, all_columns)

    def _boundaries_tree(self, section_id, subsection_id):
        boundaries_list_result = current_session().execute('''
            SELECT DISTINCT c.id
            FROM observatory.obs_tag section_t,
                    observatory.obs_column_tag section_ct,
                    observatory.obs_tag subsection_t,
                    observatory.obs_column_tag subsection_ct,
                    observatory.obs_column c
            WHERE section_t.id = section_ct.tag_id
                AND subsection_t.id = subsection_ct.tag_id
                AND c.id = section_ct.column_id
                AND c.id = subsection_ct.column_id
                AND subsection_t.id = '{subsection_id}'
                AND section_t.id = '{section_id}'
                AND subsection_t.type = 'subsection'
                AND section_t.type = 'section'
            GROUP BY c.id
            ORDER BY c.id
        '''.format(section_id=section_id, subsection_id=subsection_id))
        boundary_ids = [row[0] for row in boundaries_list_result.fetchall()]

        boundaries_detail_result = current_session().execute('''
            SELECT c.id,
                    FIRST(c.name),
                    FIRST(c.description),
                    FIRST(c.type),
                    FIRST(ctab.extra),
                    FIRST(c.aggregate),
                    JSONB_Object_Agg(t.type || '/' || t.id, t.name),
                    'name' suggested_name,
                    ARRAY_AGG(DISTINCT tab.timespan) timespan,
                    ARRAY[]::Text[] denoms,
                    ARRAY[]::Text[],
                    ST_AsText(ST_Envelope(FIRST(tab.the_geom))) envelope
            FROM observatory.obs_column c,
                    observatory.obs_column_tag ct,
                    observatory.obs_tag t,
                    observatory.obs_column_table ctab,
                    observatory.obs_table tab
            WHERE c.id = ANY(ARRAY['{}'])
                AND ct.column_id = c.id
                AND ct.tag_id = t.id
                AND c.id = ctab.column_id
                AND tab.id = ctab.table_id
            GROUP BY 1, 8
        '''.format("', '".join(boundary_ids)))
        boundary_data = self._parse_columns(boundaries_detail_result)
        return {k: {} for k in list(boundary_data.keys())}, boundary_data

    def _numerators_tree(self, section_id, subsection_id):
        numerator_paths_result = current_session().execute('''
            WITH RECURSIVE children(numer_id, path) AS (
                SELECT numer_id, ARRAY[]::Text[]
                FROM observatory.obs_meta_numer children
                WHERE numer_tags ? 'subsection/{subsection_id}'
                    AND numer_tags ? 'section/{section_id}'
                    AND numer_weight > 0
                UNION
                SELECT parent.denom_id,
                    children.numer_id || children.path
                FROM observatory.obs_meta parent, children
                WHERE parent.numer_id = children.numer_id
                ) SELECT path from children WHERE numer_id IS NULL;
        '''.format(section_id=section_id, subsection_id=subsection_id))

        numerator_tree = {}
        numerator_ids = set()
        for row in numerator_paths_result.fetchall():
            node = numerator_tree
            for mid in row[0]:
                numerator_ids.add(mid)
                if mid not in node:
                    node[mid] = {}
                node = node[mid]

        numerator_details_result = current_session().execute('''
            SELECT numer_id,
                    numer_name,
                    numer_description,
                    numer_type,
                    numer_extra,
                    numer_aggregate,
                    numer_tags,
                    numer_colname suggested_name,
                    ARRAY[numer_timespan] timespan,
                    ARRAY_AGG(DISTINCT ARRAY[
                    denom_reltype,
                    denom_id,
                    denom_name
                    ]) denoms,
                    ARRAY_AGG(DISTINCT ARRAY[
                    geom_id, geom_name, numer_timespan,
                    geom_tags::Text
                    ]) geom_timespans,
                    FIRST(ST_AsText(ST_Envelope(the_geom))) envelope
            FROM observatory.obs_meta
            WHERE numer_id = ANY (ARRAY['{}'])
            GROUP BY 1, 2, 3, 4, 5, 6, 7, 8, 9
        '''.format("', '".join(numerator_ids)))
        return numerator_tree, self._parse_columns(numerator_details_result)

    def _parse_columns(self, all_columns_result):
        all_columns = {}
        for col in all_columns_result:
            geom_timespans = {}
            for gt in col[10]:
                if gt[0] in geom_timespans:
                    geom_timespans[gt[0]]['timespans'].append(gt[2])
                else:
                    geom_timespans[gt[0]] = {
                        'geom_id': gt[0],
                        'geom_name': gt[1],
                        'timespans': [gt[2]],
                        'geom_tags': json.loads(gt[3])
                    }
            all_columns[col[0]] = {
                'id': col[0],
                'latlng': catalog_latlng(col[0]),
                'name': col[1],
                'description': col[2],
                'type': col[3],
                'extra': col[4],
                'aggregate': col[5],
                'tags': col[6],
                'suggested_name': col[7],
                'timespan': col[8],
                'timespan_sluggified': underscore_slugify('_'.join(col[8])),
                'licenses': [
                    tag_id.split('/')[1]
                    for tag_id, tag_name in col[6].items()
                    if tag_id.startswith('license/')
                ],
                'sources': [
                    tag_id.split('/')[1]
                    for tag_id, tag_name in col[6].items()
                    if tag_id.startswith('source/')
                ],
                'denoms': col[9],
                'geom_timespans': geom_timespans,
                'envelope': col[11]
            }
        return all_columns

    def _write_column_tree(self, path, tree, all_columns):
        for column_id, subtree in tree.items():
            column_path = path + [column_id]
            self._write_column(column_path, all_columns[column_id],
                               len(subtree))
            if subtree:
                os.makedirs('catalog/source/' + '/'.join(column_path))
                self._write_column_tree(column_path, subtree, all_columns)

    def _write_column(self, path, column, numchildren):
        with open('catalog/source/{path}.rst'.format(path='/'.join(path)),
                  'w') as column_file:
            column_file.write(
                COLUMN_TEMPLATE.render(intermediate_path='/'.join(path[:-1]),
                                       numchildren=numchildren,
                                       col=column,
                                       **self.template_globals()))