Example #1
    def _targets_to_remove(self):
        outs = []
        # original data
        # TODO: check if data is local, we don't want to delete that
        download_task = DownloadExperiment(self.experiment_id, source='gemma')
        outs.extend(flatten_output(download_task))
        # any data resulting from trimming raw reads
        trim_task = TrimExperiment(self.experiment_id, source='gemma')
        outs.extend(flatten_output(trim_task))
        return outs
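
For context, a hedged sketch of how such a target list might be consumed; remove_collected_targets is a hypothetical helper (mirroring the removal logic in Example #6 below), not part of the source:

def remove_collected_targets(targets):
    # Hypothetical helper: delete whichever of the collected targets
    # currently exist and support removal.
    for out in targets:
        if hasattr(out, 'remove') and out.exists():
            out.remove()

# e.g. remove_collected_targets(task._targets_to_remove())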
Example #2
def infer_bulk_complete_from_fs(task_cls, finite_datehours):
    """
    Efficiently determines missing datehours by filesystem listing.

    The current implementation works for the common case of a task writing
    output to a FileSystemTarget whose path is built using strftime with format
    like '...%Y...%m...%d...%H...', without custom complete() or exists().

    (Eventually Luigi could have ranges of completion as first-class citizens.
    Then this listing business could be factored away/be provided for
    explicitly in target API or some kind of a history server.)

    TODO support RangeDaily
    """
    filesystems_and_globs_by_location = _get_filesystems_and_globs(task_cls)
    paths_by_datehour = [[o.path for o in flatten_output(task_cls(d))]
                         for d in finite_datehours]
    listing = set()
    for (f, g), p in zip(
            filesystems_and_globs_by_location, zip(*paths_by_datehour)
    ):  # transposed, so here we're iterating over logical outputs, not datehours
        listing |= _list_existing(f, g, p)

    # quickly learn everything that's missing
    missing_datehours = []
    for d, p in zip(finite_datehours, paths_by_datehour):
        if not set(p) <= listing:
            missing_datehours.append(d)

    return missing_datehours
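
As a hedged illustration of the task shape this function expects (MyHourlyReport and its path are invented for the sketch): a single datehour parameter and a FileSystemTarget whose path renders the datehour via strftime.

import luigi

class MyHourlyReport(luigi.Task):
    # Illustrative only: one datehour parameter, one output whose path
    # embeds %Y/%m/%d/%H, and no custom complete() or exists().
    datehour = luigi.DateHourParameter()

    def output(self):
        return luigi.LocalTarget(
            self.datehour.strftime('/data/report/%Y/%m/%d/%H.tsv'))

# missing = infer_bulk_complete_from_fs(MyHourlyReport, finite_datehours)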
Example #3
def infer_bulk_complete_from_fs(datetimes, datetime_to_task, datetime_to_re):
    """
    Efficiently determines missing datetimes by filesystem listing.

    The current implementation works for the common case of a task writing
    output to a FileSystemTarget whose path is built using strftime with format
    like '...%Y...%m...%d...%H...', without custom complete() or exists().

    (Eventually Luigi could have ranges of completion as first-class citizens.
    Then this listing business could be factored away/be provided for
    explicitly in target API or some kind of a history server.)
    """
    filesystems_and_globs_by_location = _get_filesystems_and_globs(datetime_to_task, datetime_to_re)
    paths_by_datetime = [[o.path for o in flatten_output(datetime_to_task(d))] for d in datetimes]
    listing = set()
    for (f, g), p in zip(
        filesystems_and_globs_by_location, zip(*paths_by_datetime)
    ):  # transposed, so here we're iterating over logical outputs, not datetimes
        listing |= _list_existing(f, g, p)

    # quickly learn everything that's missing
    missing_datetimes = []
    for d, p in zip(datetimes, paths_by_datetime):
        if not set(p) <= listing:
            missing_datetimes.append(d)

    return missing_datetimes
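
A hedged sketch of the two callables this generalized signature expects; the names are illustrative, and the regex shape is the one Example #5 builds inline:

def datetime_to_task(d):
    # Maps a datetime to the task instance parameterized by it
    # (MyHourlyReport is the illustrative task sketched above).
    return MyHourlyReport(datehour=d)

def datetime_to_re(d):
    # Maps a datetime to a regex matching its digit groups in a path.
    return '(%04d).*(%02d).*(%02d).*(%02d)' % (d.year, d.month, d.day, d.hour)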
Example #4
def _get_filesystems_and_globs(datetime_to_task, datetime_to_re):
    """
    Yields a (filesystem, glob) tuple for every output location of the task.

    The task can have one or several FileSystemTarget outputs.

    For convenience, the task can be a luigi.WrapperTask,
    in which case outputs of all its dependencies are considered.
    """
    # Probe some scattered datetimes that are unlikely to all occur in paths except as genuine renderings of the datetime parameter
    # TODO limit to [self.start, self.stop) so messages are less confusing? Done trivially it can kill correctness
    sample_datetimes = [datetime(y, m, d, h) for y in range(2000, 2050, 10) for m in range(1, 4) for d in range(5, 8) for h in range(21, 24)]
    regexes = [re.compile(datetime_to_re(d)) for d in sample_datetimes]
    sample_tasks = [datetime_to_task(d) for d in sample_datetimes]
    sample_outputs = [flatten_output(t) for t in sample_tasks]

    for o, t in zip(sample_outputs, sample_tasks):
        if len(o) != len(sample_outputs[0]):
            raise NotImplementedError("Outputs must be consistent over time, sorry; was %r for %r and %r for %r" % (o, t, sample_outputs[0], sample_tasks[0]))
            # TODO fall back on requiring last couple of days? to avoid astonishing blocking when changes like that are deployed
            # erm, actually it's not hard to test entire hours_back..hours_forward and split into consistent subranges FIXME?
        for target in o:
            if not isinstance(target, FileSystemTarget):
                raise NotImplementedError("Output targets must be instances of FileSystemTarget; was %r for %r" % (target, t))

    for o in zip(*sample_outputs):  # transposed, so here we're iterating over logical outputs, not datetimes
        glob = _get_per_location_glob(sample_tasks, o, regexes)
        yield o[0].fs, glob
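
Consuming the generator might look like this (illustrative only, reusing the hypothetical callables sketched after Example #3); each yielded pair covers one logical output location across all datetimes:

for fs, glob in _get_filesystems_and_globs(datetime_to_task, datetime_to_re):
    # fs is the target's filesystem client; glob matches every datetime's
    # rendering of this output location's path.
    print(fs, glob)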
Example #5
def _get_filesystems_and_globs(task_cls):
    """
    Yields a (filesystem, glob) tuple for every output location of task_cls.

    task_cls can have one or several FileSystemTarget outputs.

    For convenience, task_cls can be a wrapper task,
    in which case outputs of all its dependencies are considered.
    """
    # Probe some scattered datehours that are unlikely to all occur in paths except as genuine renderings of the datehour parameter
    # TODO limit to [self.start, self.stop) so messages are less confusing? Done trivially it can kill correctness
    sample_datehours = [datetime(y, m, d, h) for y in range(2000, 2050, 10) for m in range(1, 4) for d in range(5, 8) for h in range(21, 24)]
    regexes = [re.compile('(%04d).*(%02d).*(%02d).*(%02d)' % (d.year, d.month, d.day, d.hour)) for d in sample_datehours]
    sample_tasks = [task_cls(d) for d in sample_datehours]
    sample_outputs = [flatten_output(t) for t in sample_tasks]

    for o, t in zip(sample_outputs, sample_tasks):
        if len(o) != len(sample_outputs[0]):
            raise NotImplementedError("Outputs must be consistent over time, sorry; was %r for %r and %r for %r" % (o, t, sample_outputs[0], sample_tasks[0]))
            # TODO fall back on requiring last couple of days? to avoid astonishing blocking when changes like that are deployed
            # erm, actually it's not hard to test entire hours_back..hours_forward and split into consistent subranges FIXME?
        for target in o:
            if not isinstance(target, FileSystemTarget):
                raise NotImplementedError("Output targets must be instances of FileSystemTarget; was %r for %r" % (target, t))

    for o in zip(*sample_outputs):  # transposed, so here we're iterating over logical outputs, not datehours
        glob = _get_per_location_glob(sample_tasks, o, regexes)
        yield o[0].fs, glob
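
_get_per_location_glob itself is not shown in this listing; as a rough, hypothetical sketch of the idea, the datehour's digit groups are located in one sample path with the same regex and each matched group is wildcarded in full:

import re
from datetime import datetime

def per_location_glob_sketch(sample_path, d):
    # Hypothetical sketch, not the listing's _get_per_location_glob:
    # find the year/month/day/hour groups in a sample path, then
    # replace each matched group with [0-9] wildcards.
    m = re.search('(%04d).*(%02d).*(%02d).*(%02d)'
                  % (d.year, d.month, d.day, d.hour), sample_path)
    glob = list(sample_path)
    for i in range(4, 0, -1):  # right to left so earlier indices stay valid
        glob[m.start(i):m.end(i)] = '[0-9]' * (m.end(i) - m.start(i))
    return ''.join(glob)

# per_location_glob_sketch('/data/2000/01/05/21.tsv', datetime(2000, 1, 5, 21))
# -> '/data/[0-9][0-9][0-9][0-9]/[0-9][0-9]/[0-9][0-9]/[0-9][0-9].tsv'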
Example #6
def remove_task_output(task):
    logger.info('Cleaning up %s...', repr(task))
    for out in flatten_output(task):
        if hasattr(out, 'remove') and out.exists():
            try:
                out.remove()
                logger.info('Removed %s.', repr(out))
            except Exception:  # one failed removal shouldn't abort the sweep
                logger.exception('Failed to remove %s.', repr(out))
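
The snippet assumes a module-level logger; a minimal setup making it runnable:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# e.g. remove_task_output(MyHourlyReport(datehour=datetime(2024, 1, 1, 0)))
# (MyHourlyReport is the illustrative task from the sketch above.)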
Example #7
    def on_failure(self, err):
        logger.info('Removing task output of %s due to failure.', repr(self))
        for out in flatten_output(self):
            if out.exists() and hasattr(out, 'remove'):
                try:
                    out.remove()
                except Exception:
                    logger.exception('Failed to remove output %s while cleaning up %s.', repr(out), repr(self))
        return super(RemoveTaskOutputOnFailureMixin, self).on_failure(err)
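
A hedged sketch of mixing this in; AtomicReport and its output path are illustrative, not from the source:

import luigi

class AtomicReport(RemoveTaskOutputOnFailureMixin, luigi.Task):
    # List the mixin first so its on_failure() takes precedence and
    # cleans up partial outputs before the default failure handling.
    date = luigi.DateParameter()

    def output(self):
        return luigi.LocalTarget(self.date.strftime('/data/report-%Y%m%d.tsv'))

    def run(self):
        with self.output().open('w') as f:
            f.write('...')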
Example #8
    def missing_datehours(self, task_cls, finite_datehours):
        """Infers them by listing the task output target(s) filesystem.
        """
        filesystems_and_globs_by_location = self._get_filesystems_and_globs(task_cls)
        paths_by_datehour = [[o.path for o in flatten_output(task_cls(d))] for d in finite_datehours]
        listing = set()
        for (f, g), p in zip(filesystems_and_globs_by_location, zip(*paths_by_datehour)):  # transposed, so here we're iterating over logical outputs, not datehours
            listing |= self._list_existing(f, g, p)

        # quickly learn everything that's missing
        missing_datehours = []
        for d, p in zip(finite_datehours, paths_by_datehour):
            if not set(p) <= listing:
                missing_datehours.append(d)

        return missing_datehours
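
The completeness test set(p) <= listing counts a datehour as done only when every one of its output paths appears in the listing; a tiny self-contained illustration with invented paths:

listing = {'/out/a/2024-01-01T00.tsv', '/out/b/2024-01-01T00.tsv',
           '/out/a/2024-01-01T01.tsv'}  # the 'b' output for T01 is absent
paths_t00 = ['/out/a/2024-01-01T00.tsv', '/out/b/2024-01-01T00.tsv']
paths_t01 = ['/out/a/2024-01-01T01.tsv', '/out/b/2024-01-01T01.tsv']
assert set(paths_t00) <= listing      # all present: not missing
assert not set(paths_t01) <= listing  # one absent: reported missing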
Example #9
    def missing_datehours(self, task_cls, finite_datehours):
        """Infers them by listing the task output target(s) filesystem.
        """
        filesystems_and_globs_by_location = self._get_filesystems_and_globs(
            task_cls)
        paths_by_datehour = [[o.path for o in flatten_output(task_cls(d))]
                             for d in finite_datehours]
        listing = set()
        for (f, g), p in zip(
                filesystems_and_globs_by_location, zip(*paths_by_datehour)
        ):  # transposed, so here we're iterating over logical outputs, not datehours
            listing |= self._list_existing(f, g, p)

        # quickly learn everything that's missing
        missing_datehours = []
        for d, p in zip(finite_datehours, paths_by_datehour):
            if not set(p) <= listing:
                missing_datehours.append(d)

        return missing_datehours
Example #10
    def run(self):
        # Ensure the parent directories of every output exist, then
        # delegate to the wrapped task's run().
        for out in flatten_output(self):
            out.makedirs()
        return super(CreateTaskOutputDirectoriesBeforeRunMixin, self).run()
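
And a similarly hedged sketch for the directory-creation mixin; ExportDump and its path are invented:

import luigi

class ExportDump(CreateTaskOutputDirectoriesBeforeRunMixin, luigi.Task):
    # The mixin's run() calls makedirs() on every output before
    # delegating, so run() can assume the parent directories exist.
    def output(self):
        return luigi.LocalTarget('/data/exports/dump.csv')

    def run(self):
        with self.output().open('w') as f:
            f.write('id,value\n')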