Example 1
 def clone(self, target=_UNSET, via_column=_UNSET, name=_UNSET, bounds=_UNSET, **coax_kwargs):
   kwargs = dict(self.coax_kwargs)  # copy first so the update can't mutate self.coax_kwargs
   kwargs.update(coax_kwargs)
   return How(
     target=pdutils.coalesce(target, self.target, unset=_UNSET),
     via_column=pdutils.coalesce(via_column, self.via_column, unset=_UNSET),
     name=pdutils.coalesce(name, self._name, unset=_UNSET),
     bounds=pdutils.coalesce(bounds, self.bounds, unset=_UNSET),
     **kwargs,
   )
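These clone() helpers lean on pdutils.coalesce to pick the first argument that is actually set. The library's implementation isn't shown on this page; a minimal sketch consistent with these call sites (the sentinel object and the unset keyword are inferred from usage here, not taken from pdutils) might be:

_UNSET = object()  # hypothetical stand-in for the library's sentinel

def coalesce(*values, unset=None):
  """Return the first element of values that is not the unset sentinel,
  or the sentinel itself if every value is unset."""
  for value in values:
    if value is not unset:
      return value
  return unset

With the default unset=None this reads as "first non-None value", which matches the simpler call sites below such as coalesce(head, cls.HEAD).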
Example 2
 def __repr__(self):
   # NOTE: Do not change this to use any values that require additional arrays to be fetched!
   num_chunks = pdutils.coalesce(self._matched_num_chunks, '?')
   num_rows = pdutils.coalesce(self._matched_num_rows, '?')
   csv_bytes = pdutils.coalesce(self._matched_csv_bytes, '?')
   gz_bytes = pdutils.coalesce(self._matched_gz_bytes, '?')
   parts = [
     'chunks=%s' % (num_chunks,),
     'rows=%s' % (strutils.format_number(num_rows, units=strutils.NUMBER_UNITS),),
     'csv_bytes=%s' % (strutils.format_number(csv_bytes, units=strutils.SI_UNITS),),
     'gz_bytes=%s' % (strutils.format_number(gz_bytes, units=strutils.SI_UNITS),),
   ]
   return '<%s %r %s>' % (self.__class__.__name__, str(self.layout),
                          ' '.join(x for x in parts if x))
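Because this repr must not fetch anything, missing stats stay as the '?' placeholder, which implies strutils.format_number tolerates non-numeric input. A hypothetical sketch of such a formatter (the unit ladders and base are assumptions, not strutils' actual values):

NUMBER_UNITS = ('', 'k', 'M', 'B')   # assumed ladder, base 1000
SI_UNITS = ('', 'K', 'M', 'G', 'T')

def format_number(value, units, base=1000.0):
  if not isinstance(value, (int, float)):
    return str(value)                # pass the '?' placeholder through untouched
  for unit in units[:-1]:
    if abs(value) < base:
      return '%.3g%s' % (value, unit)
    value /= base
  return '%.3g%s' % (value, units[-1])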
Example 3
def weighted_tqdm(items, weights=None, default_weight=0, total=None, **kwargs):
    weights = pdutils.coalesce(weights, {})
    total = total if not pdutils.is_empty(total) else (
        sum(weights.values()) if weights else None)
    if not weights and not default_weight:
        yield from items
    else:
        # Progress is driven by the explicit weight updates below, so the
        # iterable is not handed to tqdm (it would infer a count-based total).
        with tqdm.tqdm(total=total, **kwargs) as tq:
            for item in items:
                yield item
                weight = weights.get(item, default_weight)
                tq.update(weight)
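A hypothetical call, with the paths and processing function invented for illustration; the bar then advances by each file's byte size rather than by item count:

import os

sizes = {path: os.path.getsize(path) for path in csv_paths}
for path in weighted_tqdm(csv_paths, weights=sizes, desc='reading'):
    process(path)  # process() and csv_paths are placeholders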
Example 4
def allocate(csv_path_size, num_groups):
    def insert_chunk(allocation, chunk):
        # The smallest group is in groups[0].
        group = allocation.groups.pop(0)
        group_size = allocation.group_size.pop(0)
        # Insert chunk into the smallest group in the correct sort order (smallest-largest chunks).
        chunk_index = bisect.bisect_left(group.chunk_size, chunk.total_size)
        group.chunks.insert(chunk_index, chunk)
        group.chunk_size.insert(chunk_index, chunk.total_size)
        # Reinsert group back into the allocation in the correct sort order (smallest-largest groups).
        group_size = group_size + chunk.total_size
        group_index = bisect.bisect_left(allocation.group_size, group_size)
        allocation.groups.insert(group_index, group)
        allocation.group_size.insert(group_index, group_size)

    def optimize(allocation, threshold_percent=0.1):
        while True:
            diff_size = (allocation.group_size[-1] - allocation.group_size[0])
            if diff_size == 0:
                break  # already perfectly balanced; also avoids 0/0 below
            diff_percent = diff_size / (allocation.group_size[0] or diff_size)
            if diff_percent <= threshold_percent:
                break
            # The largest group is in groups[-1].
            largest_group = allocation.groups.pop(-1)
            largest_group_size = allocation.group_size.pop(-1)
            # Remove the largest chunk from the largest group.
            largest_chunk = largest_group.chunks.pop(-1)
            _ = largest_group.chunk_size.pop(-1)
            # Re-insert the (formerly) largest_group back where it belongs.
            # TODO(rob): insert_chunk pointlessly pops and re-inserts this again immediately.
            largest_group_size = largest_group_size - largest_chunk.total_size
            group_index = bisect.bisect_left(allocation.group_size,
                                             largest_group_size)
            allocation.groups.insert(group_index, largest_group)
            allocation.group_size.insert(group_index, largest_group_size)
            # Cut the largest chunk in two, and re-insert both halves.
            head_chunk, bomb_chunk = largest_chunk.split()
            insert_chunk(allocation=allocation, chunk=head_chunk)
            insert_chunk(allocation=allocation, chunk=bomb_chunk)

    # Create initial allocation of whole csv_path to groups.
    num_groups = pdutils.coalesce(num_groups, NUM_CPUS)
    allocation = Allocation(num_groups=num_groups, num_files=0)
    for csv_path, csv_size in csv_path_size:
        allocation.num_files += 1
        csv_range = (0, csv_size)
        chunk = Chunk(path=csv_path, size_range=csv_range, total_size=csv_size)
        insert_chunk(allocation=allocation, chunk=chunk)
    # Optimize groups by halving the largest chunk from the largest group.
    optimize(allocation=allocation)
    return allocation
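insert_chunk keeps chunks/chunk_size (and groups/group_size) as parallel lists sorted ascending, so bisect finds each insertion point in O(log n). The same pattern in isolation, with standalone names for illustration:

import bisect

items, keys = [], []  # parallel lists, kept sorted by key

def sorted_insert(item, key):
    index = bisect.bisect_left(keys, key)
    keys.insert(index, key)
    items.insert(index, item)

for word in ('pear', 'fig', 'apple'):
    sorted_insert(word, len(word))
assert items == ['fig', 'pear', 'apple']

On Python 3.10+, the key= argument to bisect would let a single list replace the parallel pair.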
Example 5
 def how_snug_labeler(how, layouts):
   if how.target.labeler is not None:
     # How.target has a labeler; may be LabelColumnTarget or GTypeLabelerTarget.
     gtype = how.target.gtype
     snug_step = how.target.labeler.step
     snug_head = how.target.labeler.head
   elif layouts:
     # Extract the minimum value of step for this how across all layouts.
     layout_gtypes = set()
     layout_steps = set()
     layout_heads = set()
     for layout in layouts:
       # Bind how to layout if possible; ignore this how if it can't be bound to layout (DROP).
       # This will legitimately happen with the 2 layouts: ('lat_lng', 'year_month').
       bound_how = how.bind_layout(layout=layout, errors_missing=lib_errors.DROP)
       if bound_how is not None:
         layout_gtypes.add(bound_how.target.gtype)
         layout_steps.add(bound_how.target.labeler.step)
         layout_heads.add(bound_how.target.labeler.head)
     if len(layout_gtypes) > 1:
       # Since a GTypeLabeler can label exactly one gtype, crash if we found multiple.
       raise lib_errors.MultipleLabelColumnsError('found %d gtype matches in layouts: %r' %
                                                  (len(layout_gtypes), how))
     if len(layout_gtypes) == 0:
       # A spurious how clause (no matches with any layouts) also cannot create a labeler.
       raise lib_errors.NoLabelColumnsError('found 0 gtype matches in layouts: %r' % (how,))
     # We were provided with at least one layout; use min(step/head) for their labeler.
     gtype = more_itertools.one(layout_gtypes)
     snug_step = min(layout_steps)
     snug_head = min(layout_heads)
   else:
     # With no provided layouts, how.target *must* have a gtype or we can't create a labeler.
     gtype = how.target.gtype
     snug_step = gtype.SANE_STEP
     snug_head = pdutils.coalesce(gtype.HEAD, 0)
   return gtype.labeler(step=snug_step, head=snug_head)
Example 6
def balance(csv_paths,
            out_dir,
            num_groups=None,
            max_workers=None,
            relative_to=None,
            symlink_ok=True,
            csv_path_weights=None,
            tqdm_desc='splitting'):
    """Computes an allocation and materializes it by splitting large files and symlinking others.

  This function creates roughly equal-sized directories of csvs inside `out_dir`, in preparation for
  processing them in parallel.

  Confusingly, the `num_groups` parameter specifies the number of directories to allocate csvs into,
  while the `max_workers` parameter specifies the number of workers to use while materializing the
  allocation.

  csv_paths: iterable of csv file paths
  out_dir: non-existent directory to write new files into
  num_groups: number of groups to allocate files into; default max_workers
  max_workers: max number of parallel processes to run; default NUM_CPUS
  relative_to: drop this prefix when writing files into out_dir; default csv_paths common prefix
  symlink_ok: if True, symlink non-split files into out_dir; if False, copy files
  csv_path_weights {path: int}: optional mapping of path to allocation weight; default csv bytes
  """
    max_workers = pdutils.coalesce(max_workers, NUM_CPUS)
    num_groups = pdutils.coalesce(num_groups, max_workers)
    # Weigh each csv file (byte size unless csv_path_weights is given) and allocate into num_groups.
    csv_paths = tuple(csv_paths)
    if csv_path_weights is None:
        csv_path_weights = dict(csv_path_bytes(csv_paths=csv_paths))
    logging.info('allocating csvs into %s groups...', num_groups)
    allocation = allocate(csv_path_size=tuple(csv_path_weights.items()),
                          num_groups=num_groups)
    logging.info(allocation)
    # Perform all required splits into their split_num directory.
    relative_to = relative_to or os.path.commonpath(csv_paths)
    file_splits = allocation.file_splits()
    job_kw = {
        'out_dir': str(out_dir),
        'relative_to': str(relative_to),
        'num_groups': num_groups,
        'symlink_ok': symlink_ok
    }
    jobs = ((_write_groups, dict(path=str(path), file_splits=fs, **job_kw))
            for path, fs in file_splits.items())
    with tqdm.tqdm(desc=tqdm_desc,
                   total=sum(csv_path_weights.values()),
                   mininterval=1,
                   maxinterval=1) as tq:
        with futures.ProcessPoolExecutor(max_workers=max_workers) as pool:
            pending = set(
                pool.submit(fn, **kw)
                for fn, kw in itertools.islice(jobs, max_workers * 2))
            while pending:
                done, pending = futures.wait(
                    pending, return_when=futures.FIRST_COMPLETED)
                for job in done:
                    finished_path = pathlib.Path(job.result())
                    tq.update(csv_path_weights[finished_path])
                for fn, kw in itertools.islice(jobs, len(done)):
                    pending.add(pool.submit(fn, **kw))
    return allocation
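The submit loop above is a bounded-window pattern: at most max_workers * 2 futures are in flight, and each completion admits one replacement from the jobs generator, so the full job list is never materialized. Stripped of the tqdm bookkeeping (names are illustrative):

import itertools
from concurrent import futures

def bounded_run(fn, args_iter, max_workers=4):
    args_iter = iter(args_iter)
    with futures.ProcessPoolExecutor(max_workers=max_workers) as pool:
        pending = {pool.submit(fn, arg)
                   for arg in itertools.islice(args_iter, max_workers * 2)}
        while pending:
            done, pending = futures.wait(pending, return_when=futures.FIRST_COMPLETED)
            for job in done:
                yield job.result()
            for arg in itertools.islice(args_iter, len(done)):  # refill the window
                pending.add(pool.submit(fn, arg))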
Example 7
 def clone(self, target=_UNSET, head_bombs=_UNSET):
   return SequenceFilter(
     target=pdutils.coalesce(target, self.target, unset=_UNSET),
     head_bombs=pdutils.coalesce(head_bombs, self.head_bombs, unset=_UNSET),
   )
Example 8
 def labeler(cls, step=None, head=None, depth=None):
   if (depth is None) == (step is None):
     raise ValueError('must provide exactly one of depth or step: depth=%r, step=%r' % (depth, step))
   # Exactly one of depth/step is set here; derive step from depth when step is absent.
   step = pdutils.coalesce(step, (depth is not None) and cls.depth_step(depth=depth))
   head = pdutils.coalesce(head, cls.HEAD)
   return super().labeler(step=step, head=head)
Example 9
 def depth_step(cls, depth, head=None, bomb=None):
   head = pdutils.coalesce(head, cls.HEAD)
   bomb = pdutils.coalesce(bomb, cls.BOMB, cls.TAIL)
   return (bomb - head) / 2**depth
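depth_step halves the span (bomb - head) once per level of depth, so depth d yields a step of (bomb - head) / 2**d. For example, with an assumed, purely illustrative span of head=0 and bomb=360, depth=3 gives 360 / 2**3 = 45.0.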
Example 10
 def bucket(cls, val, step, head=None):
   head = pdutils.coalesce(head, cls.HEAD, 0)
   if is_scalar(val):
     return cls._bucket_scalar(val, step=step, head=head)
   return cls._bucket_pd(val, step=step, head=head)
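_bucket_scalar isn't shown on this page. A plausible implementation consistent with the step/head parameters is floor-snapping to the lower edge of a grid that starts at head and advances by step; this is purely an assumption from the interface, not the library's actual code:

import math

def _bucket_scalar(val, step, head=0):
  # Hypothetical: snap val down to its bucket's lower boundary.
  return head + math.floor((val - head) / step) * step

assert _bucket_scalar(7.3, step=2) == 6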