Example #1
def init_db(chanjo_db, bed_stream, overwrite=False):
    """Build a new database instance from the Chanjo BED stream.

    Args:
      chanjo_db (Store): initialized Store class instance
      bed_stream (sequence): Chanjo-style BED-stream
      overwrite (bool, optional): whether to automatically overwrite an
        existing database, defaults to False
    """
    # check if the database already exists (expect 'mysql' to exist)
    # 'dialect' is in the form of '<db_type>+<connector>'
    if chanjo_db.dialect == 'mysql' or path(chanjo_db.uri).exists():
        if overwrite:
            # wipe the database clean with a warning
            chanjo_db.tear_down()
        elif chanjo_db.dialect == 'sqlite':
            # avoid accidentally wiping an existing database
            raise OSError(errno.EEXIST, os.strerror(errno.EEXIST),
                          chanjo_db.uri)

    # set up new tables
    chanjo_db.set_up()

    superblocks = pipe(bed_stream, map(text_type.rstrip), map(split(sep='\t')),
                       map(lambda row: bed_to_interval(*row)),
                       map(build_interval(chanjo_db)), concat, aggregate,
                       map(build_block(chanjo_db)), aggregate,
                       map(build_superblock(chanjo_db)))

    # reduce the superblocks and commit every contig
    reduce(commit_per_contig(chanjo_db), superblocks, 'chr0')

    # commit also the last contig
    chanjo_db.save()
Example #2
def set_common_materials(*universes) -> tp.NoReturn:
    universes_collection = toolz.reduce(
        set.union, map(mk.Universe.get_universes, universes))
    common_materials = toolz.reduce(
        set.union, map(mk.Universe.get_compositions, universes_collection))
    for u in universes_collection:
        u.set_common_materials(common_materials)
Example #3
    def fold(self, binop, combine=None, initial=no_default, split_every=None):
        """ Parallelizable reduction

        Fold is like the builtin function ``reduce`` except that it works in
        parallel.  Fold takes two binary operator functions, one to reduce each
        partition of our dataset and another to combine results between
        partitions

        1.  ``binop``: Binary operator to reduce within each partition
        2.  ``combine``:  Binary operator to combine results from binop

        Sequentially this would look like the following:

        >>> intermediates = [reduce(binop, part) for part in partitions]  # doctest: +SKIP
        >>> final = reduce(combine, intermediates)  # doctest: +SKIP

        If only one function is given then it is used for both functions
        ``binop`` and ``combine`` as in the following example to compute the
        sum:

        >>> def add(x, y):
        ...     return x + y

        >>> b = from_sequence(range(5))
        >>> b.fold(add).compute()  # doctest: +SKIP
        10

        In full form we provide both binary operators as well as their default
        arguments

        >>> b.fold(binop=add, combine=add, initial=0).compute()  # doctest: +SKIP
        10

        More complex binary operators are also doable

        >>> def add_to_set(acc, x):
        ...     ''' Add new element x to set acc '''
        ...     return acc | set([x])
        >>> b.fold(add_to_set, set.union, initial=set()).compute()  # doctest: +SKIP
        {0, 1, 2, 3, 4}

        See Also
        --------

        Bag.foldby
        """
        token = tokenize(self, binop, combine, initial)
        combine = combine or binop
        a = 'foldbinop-{0}-{1}'.format(funcname(binop), token)
        b = 'foldcombine-{0}-{1}'.format(funcname(combine), token)
        initial = quote(initial)
        if initial is not no_default:
            return self.reduction(curry(_reduce, binop, initial=initial),
                                  curry(_reduce, combine),
                                  split_every=split_every)
        else:
            from toolz.curried import reduce
            return self.reduction(reduce(binop), reduce(combine),
                                  split_every=split_every)
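The docstring above describes fold's semantics as a per-partition reduce followed by a combining reduce. Here is a minimal plain-Python sketch of that sequential equivalent (the partitioning below is hypothetical, not dask's own):

from functools import reduce
from operator import add

partitions = [[0, 1], [2, 3], [4]]           # stand-in for a bag's partitions
intermediates = [reduce(add, part) for part in partitions]
final = reduce(add, intermediates)
assert final == sum(range(5))                # 10, matching b.fold(add)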
Example #4
    def fold(self, binop, combine=None, initial=no_default, split_every=None):
        """ Parallelizable reduction

        Fold is like the builtin function ``reduce`` except that it works in
        parallel.  Fold takes two binary operator functions, one to reduce each
        partition of our dataset and another to combine results between
        partitions

        1.  ``binop``: Binary operator to reduce within each partition
        2.  ``combine``:  Binary operator to combine results from binop

        Sequentially this would look like the following:

        >>> intermediates = [reduce(binop, part) for part in partitions]  # doctest: +SKIP
        >>> final = reduce(combine, intermediates)  # doctest: +SKIP

        If only one function is given then it is used for both functions
        ``binop`` and ``combine`` as in the following example to compute the
        sum:

        >>> def add(x, y):
        ...     return x + y

        >>> b = from_sequence(range(5))
        >>> b.fold(add).compute()  # doctest: +SKIP
        10

        In full form we provide both binary operators as well as their default
        arguments

        >>> b.fold(binop=add, combine=add, initial=0).compute()  # doctest: +SKIP
        10

        More complex binary operators are also doable

        >>> def add_to_set(acc, x):
        ...     ''' Add new element x to set acc '''
        ...     return acc | set([x])
        >>> b.fold(add_to_set, set.union, initial=set()).compute()  # doctest: +SKIP
        {0, 1, 2, 3, 4}

        See Also
        --------

        Bag.foldby
        """
        combine = combine or binop
        initial = quote(initial)
        if initial is not no_default:
            return self.reduction(curry(_reduce, binop, initial=initial),
                                  curry(_reduce, combine),
                                  split_every=split_every)
        else:
            from toolz.curried import reduce
            return self.reduction(reduce(binop),
                                  reduce(combine),
                                  split_every=split_every)
Example #5
def test_fold():
    assert fold(add, range(10), 0) == reduce(add, range(10), 0)
    assert fold(add, range(10), 0, chunksize=2) == reduce(add, range(10), 0)
    assert fold(add, range(10)) == fold(add, range(10), 0)

    def setadd(s, item):
        s = s.copy()
        s.add(item)
        return s

    assert fold(setadd, [1, 2, 3], set()) == set((1, 2, 3))
    assert (fold(setadd, [1, 2, 3], set(), chunksize=2,
                 combine=set.union) == set((1, 2, 3)))
Example #6
def test_fold():
    assert fold(add, range(10), 0) == reduce(add, range(10), 0)
    assert fold(add, range(10), 0, chunksize=2) == reduce(add, range(10), 0)
    assert fold(add, range(10)) == fold(add, range(10), 0)

    def setadd(s, item):
        s = s.copy()
        s.add(item)
        return s

    assert fold(setadd, [1, 2, 3], set()) == set((1, 2, 3))
    assert (fold(setadd, [1, 2, 3], set(), chunksize=2, combine=set.union)
            == set((1, 2, 3)))
Example #7
    def add_dict(self, line):
        hex_dict = {}  # dict holding the parsed fields of this record
        if line[0] != ":":
            # print(line[0])
            return 1
        hex_dict["data_len"] = int(line[1:3], 16)
        if len(line) != 2 * hex_dict["data_len"] + 11 or hex_dict["data_len"] == 0:
            # print(hex_dict["data_len"], len(line))
            return 2  # last record, or data length does not match
        hex_dict["data_type"] = int(line[7:9], 16)
        if hex_dict["data_type"] not in (0, 1, 2, 4):
            return 1
        if hex_dict["data_type"] == 2:
            self.addr_offset = int(line[9:13], 16) << 4
            return 0
        elif hex_dict["data_type"] == 4:
            self.addr_offset = int(line[9:13], 16) << 16
            return 0
        hex_dict["data_addr"] = int(line[3:7], 16) + self.addr_offset
        data = re.sub(r"(?<=\w)(?=(?:\w\w)+$)", " ",
                      line[9:9 + hex_dict["data_len"] * 2])
        hex_dict["data"] = hexStringB2Hex(data)
        # verify the checksum of the hex record below
        line = char2hex(line[1:])
        check_sum = (0x100 - (reduce(lambda x, y: x + y, line[:-1]) % 256)) % 256
        if check_sum == line[-1]:
            hex_dict["check"] = line[-1]
            # print(hex_dict)
            self.hex_dicts.append(hex_dict)
            return 0
        else:
            return 3
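The checksum logic at the end follows the Intel HEX rule: summing every byte of the record, including the trailing checksum byte, must give 0 modulo 256. A small standalone sketch of that rule (hedged; it uses a well-known sample record and does not depend on the char2hex/hexStringB2Hex helpers above):

from functools import reduce

record = ":10010000214601360121470136007EFE09D2190140"  # sample Intel HEX record
data = bytes.fromhex(record[1:])
assert reduce(lambda x, y: x + y, data) % 256 == 0       # record is valid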
Example #8
def column_map(tables, columns):
    """
    Take a list of tables and a list of column names and resolve which
    columns come from which table.

    Parameters
    ----------
    tables : sequence of _DataFrameWrapper or _TableFuncWrapper
        Could also be sequence of modified pandas.DataFrames, the important
        thing is that they have ``.name`` and ``.columns`` attributes.
    columns : sequence of str
        The column names of interest.

    Returns
    -------
    col_map : dict
        Maps table names to lists of column names.

    """
    if not columns:
        return {t.name: None for t in tables}

    columns = set(columns)
    colmap = {t.name: list(set(t.columns).intersection(columns)) for t in tables}
    foundcols = toolz.reduce(lambda x, y: x.union(y), (set(v) for v in colmap.values()))
    if foundcols != columns:
        raise RuntimeError('Not all required columns were found. '
                           'Missing: {}'.format(list(columns - foundcols)))
    return colmap
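A hedged usage sketch: column_map only relies on the .name and .columns attributes, so lightweight stand-ins are enough to see the mapping it returns (the table and column names below are made up).

from collections import namedtuple

Table = namedtuple('Table', ['name', 'columns'])
households = Table('households', ['income', 'tenure'])
zones = Table('zones', ['area', 'density'])

# column_map([households, zones], ['income', 'area'])
# -> {'households': ['income'], 'zones': ['area']}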
Example #9
def column_map(tables, columns):
    """
    Take a list of tables and a list of column names and resolve which
    columns come from which table.

    Parameters
    ----------
    tables : sequence of _DataFrameWrapper or _TableFuncWrapper
        Could also be sequence of modified pandas.DataFrames, the important
        thing is that they have ``.name`` and ``.columns`` attributes.
    columns : sequence of str
        The column names of interest.

    Returns
    -------
    col_map : dict
        Maps table names to lists of column names.

    """
    if not columns:
        return {t.name: None for t in tables}

    columns = set(columns)
    colmap = {
        t.name: list(set(t.columns).intersection(columns))
        for t in tables
    }
    foundcols = toolz.reduce(lambda x, y: x.union(y),
                             (set(v) for v in colmap.values()))
    if foundcols != columns:
        raise RuntimeError('Not all required columns were found. '
                           'Missing: {}'.format(list(columns - foundcols)))
    return colmap
Example #10
def test_fold():
    assert fold(add, range(10), 0) == reduce(add, range(10), 0)
    assert fold(add, range(10), 0, map=Pool().map) == reduce(add, range(10), 0)
    assert fold(add, range(10), 0, chunksize=2) == reduce(add, range(10), 0)
    assert fold(add, range(10)) == fold(add, range(10), 0)

    def setadd(s, item):
        s = s.copy()
        s.add(item)
        return s

    assert fold(setadd, [1, 2, 3], set()) == {1, 2, 3}
    assert (fold(setadd, [1, 2, 3], set(), chunksize=2,
                 combine=set.union) == {1, 2, 3})

    assert fold(add, range(10), default=no_default2) == fold(add, range(10))
Example #11
def test_batch_faithful():
    "joining the batches must result the unbatched data"
    X = list(range(11))
    the_batches = batch(X, batchsize=3)
    X_debatched = toolz.reduce(lambda l1, l2: l1 + l2, the_batches)

    assert X_debatched == X, 'different output produced'
Example #12
def stack(*imgs, **kwargs):
    """Combine images together, overlaying later images onto earlier ones.

    Parameters
    ----------
    imgs : iterable of Image
        The images to combine.
    how : str, optional
        The compositing operator to combine pixels. Default is `'over'`.
    """
    if not imgs:
        raise ValueError("No images passed in")
    shapes = []
    for i in imgs:
        if not isinstance(i, Image):
            raise TypeError("Expected `Image`, got: `{0}`".format(type(i)))
        elif not shapes:
            shapes.append(i.shape)
        elif shapes and i.shape not in shapes:
            raise ValueError("The stacked images must have the same shape.")

    name = kwargs.get('name', None)
    op = composite_op_lookup[kwargs.get('how', 'over')]
    if len(imgs) == 1:
        return imgs[0]
    imgs = xr.align(*imgs, copy=False, join='outer')
    with np.errstate(divide='ignore', invalid='ignore'):
        out = tz.reduce(tz.flip(op), [i.data for i in imgs])
    return Image(out, coords=imgs[0].coords, dims=imgs[0].dims, name=name)
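The tz.flip(op) in the reduce above swaps the operator's arguments, so each later image becomes the left operand of the compositing function. A tiny sketch of the same idiom with a plain arithmetic operator in place of a compositing op:

import toolz as tz
from operator import sub

# flip(sub)(a, b) == sub(b, a): the accumulator ends up as the right operand
assert tz.reduce(tz.flip(sub), [1, 2, 3]) == sub(3, sub(2, 1))  # == 2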
Example #13
def estimate_graph_size(old_chunks, new_chunks):
    """ Estimate the graph size during a rechunk computation.
    """
    # Estimate the number of intermediate blocks that will be produced
    # (we don't use intersect_chunks() which is much more expensive)
    crossed_size = reduce(mul, (len(oc) + len(nc)
                                for oc, nc in zip(old_chunks, new_chunks)))
    return crossed_size
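A worked example of the estimate: with 2 and 3 old chunks and 4 and 1 new chunks along the two axes, the result is the product of the per-axis sums, (2 + 4) * (3 + 1) = 24 (the chunk sizes below are made up).

from functools import reduce
from operator import mul

old_chunks = ((5, 5), (4, 3, 3))       # 2 chunks and 3 chunks per axis
new_chunks = ((3, 3, 2, 2), (10,))     # 4 chunks and 1 chunk per axis
estimate = reduce(mul, (len(oc) + len(nc)
                        for oc, nc in zip(old_chunks, new_chunks)))
assert estimate == 24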
Example #14
def estimate_graph_size(old_chunks, new_chunks):
    """ Estimate the graph size during a rechunk computation.
    """
    # Estimate the number of intermediate blocks that will be produced
    # (we don't use intersect_chunks() which is much more expensive)
    crossed_size = reduce(mul, (len(oc) + len(nc)
                                for oc, nc in zip(old_chunks, new_chunks)))
    return crossed_size
Example #15
def gamma_product(m_tuple, n):
    sum_of_indexed_entries_by_position = [
        la.sum_of_entries_indexed_by_i(m_tuple, n, i) for i in range(n)
    ]
    product_of_gammas = reduce(
        lambda x, y: x * y,
        map(mem.gamma_n_plus_1_over_2, sum_of_indexed_entries_by_position))
    return Decimal(product_of_gammas)
Example #16
def wrap(call, middleware=None):
    if middleware is None:
        middleware = []
    return reduce(
        lambda acc, m: lambda ctx: m(ctx, acc),
        reversed(middleware),
        lambda ctx: call(ctx),
    )
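Because the middleware list is reversed before the reduce, the first middleware in the list becomes the outermost wrapper and therefore runs first. A hedged sketch, assuming wrap above is in scope; the two middleware below are made up and only record the call order:

def logger(ctx, nxt):
    return ['logger'] + nxt(ctx)

def auth(ctx, nxt):
    return ['auth'] + nxt(ctx)

handler = wrap(lambda ctx: ['handler'], [logger, auth])
assert handler({}) == ['logger', 'auth', 'handler']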
Example #17
    def _set_central_entries(self, central_entries):
        append_and_sum = partial(_append_and_sum_central_entry, self)
        allocation = reduce(append_and_sum, central_entries, {
            'labor_allocation': 0.00,
            'cost_allocation': 0.00
        })
        self.labor_allocation = allocation['labor_allocation']
        self.cost_allocation = allocation['cost_allocation']
        self.total_allocation = self.labor_allocation + self.cost_allocation
Example #18
def cross(dists, f=None):
    if f is None:
        f = lambda *x: x
    outcomes = Counter()
    for outcome_probs in it.product(*dists):
        o, p = zip(*outcome_probs)
        outcomes[f(*o)] += reduce(lambda x, y: x * y, p)

    return Categorical(outcomes.keys(), outcomes.values())
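The reduce inside the loop simply multiplies the per-distribution probabilities of one joint outcome. A minimal sketch of that step on its own (the probabilities below are made up):

from functools import reduce

probs = (0.5, 0.5, 1.0 / 6)                  # e.g. two coins and one die face
joint = reduce(lambda x, y: x * y, probs)
assert abs(joint - 1.0 / 24) < 1e-12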
Example #19
def partition_before(
    predicate: Callable[[Any], bool],
    seq: Sequence,
) -> Sequence[Sequence]:
    return toolz.reduce(
        lambda a, b: (*a, (b, ))
        if not a or predicate(b) else (*a[:-1], (*a[-1], b)),
        seq,
        (),
    )
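A hedged usage sketch, assuming partition_before above is importable: a new group starts at the first element and at every element that satisfies the predicate.

groups = partition_before(lambda x: x == 0, [0, 1, 2, 0, 3])
assert groups == ((0, 1, 2), (0, 3))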
Example #20
def posterior(prior, data, samples):
    """
    Returns Gaussian posterior based on prior, data and samples

    Args:
        prior: prior Gaussian distribution (e.g. Gaussian(0, sigma0))
        data: distribution of data (e.g. Gaussian(mu, sigma))
        samples: list of samples from data
    """
    return toolz.reduce(lambda prior, sample: prior.update(data, sample),
                        samples, prior)
Example #21
def put_in(keys, coll, val):
    """Inverse of get_in, but does type promotion in the case of lists"""
    if keys:
        holder = reduce(operator.getitem, keys[:-1], coll)
        # print("Holder: ", holder)
        if isinstance(holder, tuple):
            holder = list(holder)
            coll = put_in(keys[:-1], coll, holder)
        holder[keys[-1]] = val
    else:
        coll = val
    return coll
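A hedged usage sketch, assuming put_in above is in scope: the reduce walks down all keys but the last, then the final key is assigned on the resolved container.

doc = {'a': {'b': [1, 2, 3]}}
doc = put_in(['a', 'b', 1], doc, 99)
assert doc == {'a': {'b': [1, 99, 3]}}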
Example #22
def init_db(chanjo_db, bed_stream, overwrite=False):
  """Build a new database instance from the Chanjo BED stream.

  Args:
    chanjo_db (Store): initialized Store class instance
    bed_stream (sequence): Chanjo-style BED-stream
    overwrite (bool, optional): whether to automatically overwrite an
      existing database, defaults to False
  """
  # check if the database already exists (expect 'mysql' to exist)
  # 'dialect' is in the form of '<db_type>+<connector>'
  if chanjo_db.dialect == 'mysql' or path(chanjo_db.uri).exists():
    if overwrite:
      # wipe the database clean with a warning
      chanjo_db.tear_down()
    elif chanjo_db.dialect == 'sqlite':
      # avoid accidentally wiping an existing database
      raise OSError(errno.EEXIST, chanjo_db.uri)

  # set up new tables
  chanjo_db.set_up()

  superblocks = pipe(
    bed_stream,
    map(text_type.rstrip),
    map(split(sep='\t')),
    map(lambda row: bed_to_interval(*row)),
    map(build_interval(chanjo_db)),
    concat,
    aggregate,
    map(build_block(chanjo_db)),
    aggregate,
    map(build_superblock(chanjo_db))
  )

  # reduce the superblocks and commit every contig
  reduce(commit_per_contig(chanjo_db), superblocks, 'chr0')

  # commit also the last contig
  chanjo_db.save()
Example #23
    def _set_projects(self, projects):
        total_cost = reduce(lambda x, y: x + y, projects.values())
        for project, cost in projects.items():
            ratio = cost / total_cost
            allocation = self.cost_allocation * ratio
            labor_allocation = self.labor_allocation * ratio
            self.append(
                'projects', {
                    'project': project,
                    'cost': cost,
                    'ratio': ratio,
                    'allocation': allocation,
                    'labor_allocation': labor_allocation
                })
        self.total_cost = total_cost
Example #24
  def get(self, dot_key, default=None, scope=None):
    """Get nested value using a dot separated key.

    Args:
      dot_key (str): key on the format "section.subsection.key"
      default (object, optional): default unless key exists
      scope (dict, optional): nested dict to descend into

    Returns:
      object: value for the key or the default object
    """
    if scope is None:
      scope = self

    return reduce(rget(default=default), dot_key.split('.'), scope)
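The same idea in a minimal standalone form: a dotted key is a left fold of lookups over the nested scopes (rget above is assumed to be a curried getter with a default; plain dict.get stands in for it here).

from functools import reduce

scope = {'section': {'subsection': {'key': 42}}}
value = reduce(lambda acc, k: acc.get(k, {}),
               'section.subsection.key'.split('.'), scope)
assert value == 42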
Example #25
def smax(dists, default=__no_default__):
    if len(dists) == 0:
        if default is not __no_default__:
            return default
        else:
            raise ValueError('smax() arg is an empty sequence')
    elif len(dists) == 1:
        return dists[0]
    elif len(dists) == 2:
        a, b = dists[0]._samples, dists[1]._samples
        if a[0] == b[0]:  # the same samples
            b = np.random.permutation(b)
        return SampleDist(np.maximum(a, b))
    else:
        raise NotImplementedError()
        return SampleDist(reduce(np.maximum, [d._samples for d in dists]))
Example #26
def crt(busses):
    # Problem I want to solve:
    # Find x such that for all i:
    #      x + offset_i = 0 (mod busID_i)
    #   => x = -offset_i (mod busID_i) = busID_i + offset_i (mod busID_i)
    #
    # All bus IDs are prime, so use chinese remainder theorem to solve:
    #      x = sum_i  (m_i-r_i) * N_i * s_i
    # where m_i = busID_i (the modulus), r_i = offset_i, N = m_1 * m_2 * ... * m_n,
    # N_i = N / m_i and finally s_i is the inverse of N_i mod m_i, i.e. s_i * N_i = 1 (mod m_i)

    N = toolz.reduce(lambda a, b: a * b,
                     map(toolz.last,
                         busses))  # product of all moduli (the bus numbers)

    return sum((m - r) * N // m * pow(N // m, -1, m) for r, m in busses) % N
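A hedged usage sketch, assuming crt above (and toolz) are importable; note that pow(x, -1, m) for modular inverses requires Python 3.8+. The input is a list of (offset, bus_id) pairs; for the well-known timetable 7,13,x,x,59,x,31,19 the earliest matching timestamp is 1068781.

busses = [(0, 7), (1, 13), (4, 59), (6, 31), (7, 19)]
assert crt(busses) == 1068781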
Example #27
def _sum_employee_timesheets(employee_timesheets):
    sorted_keys = _get_sorted_keys(employee_timesheets.keys())

    timesheets_data = []
    for employee in sorted_keys:
        timesheets = employee_timesheets[employee]

        timesheet_row = reduce(_sum_timesheets, timesheets,
                               _new_timesheet_row())

        first_timesheet = timesheets[0]

        timesheet_row['employee'] = first_timesheet.get('employee')
        timesheet_row['employee_name'] = first_timesheet.get('employee_name')

        timesheets_data.append(timesheet_row)

    return timesheets_data
Example #28
def stack(features):
    """
    Stack features: take in a list containing tuples of subsets and
    histogram-based features and stack them all into a single pair.

    Parameters
    ----------
    features : list
        List of type ``[([subsets], [features]), ...]``

    Returns
    -------
    tuple
        tuple of type ``([all_subsets],[all_features])``
    """
    def _stack(entry1, entry2):
        return (entry1[0] + entry2[0], entry1[1] + entry2[1])

    return fp.reduce(_stack, features)
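A hedged usage sketch of the pairwise step: each entry is a ([subsets], [features]) tuple and the reduce concatenates both lists across all entries (the values below are made up).

from functools import reduce

def _stack(entry1, entry2):
    return (entry1[0] + entry2[0], entry1[1] + entry2[1])

entries = [(['s1'], [0.1]), (['s2'], [0.2]), (['s3'], [0.3])]
assert reduce(_stack, entries) == (['s1', 's2', 's3'], [0.1, 0.2, 0.3])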
Example #29
    def test_basic(self):
        from deepmerge import always_merger
        import toolz

        d = {
            "a/n/m": {
                "x": 1,
                "y/hola": 2
            },
            "a/n/m/x/t": 10,
            "b": {
                "z": 3,
                "k": 5
            }
        }
        ds = list(utils.split(d))

        dn = toolz.reduce(always_merger.merge, ds, {})

        print(ds)
        print(dn)
Example #30
def stack(*imgs, **kwargs):
    """Combine images together, overlaying later images onto earlier ones.

    Parameters
    ----------
    imgs : iterable of Image
        The images to combine.
    how : str, optional
        The compositing operator to combine pixels. Default is `'over'`.
    """
    if not imgs:
        raise ValueError("No images passed in")
    for i in imgs:
        if not isinstance(i, Image):
            raise TypeError("Expected `Image`, got: `{0}`".format(type(i)))
    op = composite_op_lookup[kwargs.get('how', 'over')]
    if len(imgs) == 1:
        return imgs[0]
    imgs = xr.align(*imgs, copy=False, join='outer')
    out = tz.reduce(tz.flip(op), [i.data for i in imgs])
    return Image(out, coords=imgs[0].coords, dims=imgs[0].dims)
Example #31
def stack(*imgs, **kwargs):
    """Combine images together, overlaying later images onto earlier ones.

    Parameters
    ----------
    imgs : iterable of Image
        The images to combine.
    how : str, optional
        The compositing operator to combine pixels. Default is `'over'`.
    """
    if not imgs:
        raise ValueError("No images passed in")
    for i in imgs:
        if not isinstance(i, Image):
            raise TypeError("Expected `Image`, got: `{0}`".format(type(i)))
    op = composite_op_lookup[kwargs.get('how', 'over')]
    if len(imgs) == 1:
        return imgs[0]
    imgs = xr.align(*imgs, copy=False, join='outer')
    out = tz.reduce(tz.flip(op), [i.data for i in imgs])
    return Image(out, coords=imgs[0].coords, dims=imgs[0].dims)
Example #32
def column_list(tables, columns):
    """
    Take a list of tables and a list of column names and return the columns
    that are present in the tables.

    Parameters
    ----------
    tables : sequence of _DataFrameWrapper or _TableFuncWrapper
        Could also be sequence of modified pandas.DataFrames, the important
        thing is that they have ``.name`` and ``.columns`` attributes.
    columns : sequence of str
        The column names of interest.

    Returns
    -------
    cols : list
        List of column names available in the tables.

    """
    columns = set(columns)
    foundcols = toolz.reduce(lambda x, y: x.union(y), (set(t.columns) for t in tables))
    return list(columns.intersection(foundcols))
Example #33
def compute_one(expr, c, **kwargs):
    c = iter(c)
    n = 0
    cs = []
    for chunk in c:
        cs.append(chunk)
        n += len(chunk)
        if n >= expr.n:
            break

    if not cs:
        return []

    if len(cs) == 1:
        return compute_one(expr, cs[0])

    t1 = TableSymbol('t1', expr.schema)
    t2 = TableSymbol('t2', expr.schema)
    binop = lambda a, b: compute(union(t1, t2), {t1: a, t2: b})
    u = reduce(binop, cs)

    return compute_one(expr, u)
Example #34
def column_list(tables, columns):
    """
    Take a list of tables and a list of column names and return the columns
    that are present in the tables.

    Parameters
    ----------
    tables : sequence of _DataFrameWrapper or _TableFuncWrapper
        Could also be sequence of modified pandas.DataFrames, the important
        thing is that they have ``.name`` and ``.columns`` attributes.
    columns : sequence of str
        The column names of interest.

    Returns
    -------
    cols : list
        List of column names available in the tables.

    """
    columns = set(columns)
    foundcols = toolz.reduce(lambda x, y: x.union(y),
                             (set(t.columns) for t in tables))
    return list(columns.intersection(foundcols))
Example #35
def test_qsgd_and_terngrad():
    n = 50
    x = np.random.rand(n)
    x = torch.Tensor(x)
    code = codings.QSGD()
    codes = [codings.QSGD(scheme=scheme) for scheme in ['terngrad', 'qsgd']]
    for code in codes:
        repeats = int(10e3)
        codes = [code.encode(x, scheme=code.scheme) for _ in range(repeats)]
        code.codes = codes

        approxs = [code.decode(x).cpu().numpy() for x in codes]

        data = map(lambda arg: {'y': arg[1], 'norm(y)**2': LA.norm(arg[1])**2,
                                'len(signs)': len(arg[0]['signs'])},
                   zip(codes, approxs))
        sums = reduce(lambda x, y: {k: x[k] + y[k] for k in x}, data)
        avg = {k: v / len(codes) for k, v in sums.items()}
        assert avg['norm(y)**2'] <= np.sqrt(n) * LA.norm(x)**2
        if code.scheme == 'qsgd':
            assert avg['len(signs)'] <= np.sqrt(n)
        rel_error = LA.norm(avg['y'] - x) / LA.norm(x)
        print(code.scheme, rel_error)
        assert rel_error < 0.25
Example #36
def find_merge_rechunk(old_chunks, new_chunks, block_size_limit):
    """
    Find an intermediate rechunk that would merge some adjacent blocks
    together in order to get us nearer the *new_chunks* target, without
    violating the *block_size_limit* (in number of elements).
    """
    ndim = len(old_chunks)

    old_largest_width = [max(c) for c in old_chunks]
    new_largest_width = [max(c) for c in new_chunks]

    graph_size_effect = {
        dim: len(nc) / len(oc)
        for dim, (oc, nc) in enumerate(zip(old_chunks, new_chunks))
    }

    block_size_effect = {
        dim: new_largest_width[dim] / old_largest_width[dim]
        for dim in range(ndim)
    }

    # Our goal is to reduce the number of nodes in the rechunk graph
    # by merging some adjacent chunks, so consider dimensions where we can
    # reduce the # of chunks
    merge_candidates = [
        dim for dim in range(ndim) if graph_size_effect[dim] <= 1.0
    ]

    # Merging along each dimension reduces the graph size by a certain factor
    # and increases memory largest block size by a certain factor.
    # We want to optimize the graph size while staying below the given
    # block_size_limit.  This is in effect a knapsack problem, except with
    # multiplicative values and weights.  Just use a greedy algorithm
    # by trying dimensions in decreasing value / weight order.
    def key(k):
        gse = graph_size_effect[k]
        bse = block_size_effect[k]
        if bse == 1:
            bse = 1 + 1e-9
        return np.log(gse) / np.log(bse)

    sorted_candidates = sorted(merge_candidates, key=key)

    largest_block_size = reduce(mul, old_largest_width)

    chunks = list(old_chunks)
    memory_limit_hit = False

    for dim in sorted_candidates:
        # Examine this dimension for possible graph reduction
        new_largest_block_size = (largest_block_size *
                                  new_largest_width[dim] //
                                  old_largest_width[dim])
        if new_largest_block_size <= block_size_limit:
            # Full replacement by new chunks is possible
            chunks[dim] = new_chunks[dim]
            largest_block_size = new_largest_block_size
        else:
            # Try a partial rechunk, dividing the new chunks into
            # smaller pieces
            largest_width = old_largest_width[dim]
            chunk_limit = int(block_size_limit * largest_width /
                              largest_block_size)
            c = divide_to_width(new_chunks[dim], chunk_limit)
            if len(c) <= len(old_chunks[dim]):
                # We manage to reduce the number of blocks, so do it
                chunks[dim] = c
                largest_block_size = largest_block_size * max(
                    c) // largest_width

            memory_limit_hit = True

    assert largest_block_size == _largest_block_size(chunks)
    assert largest_block_size <= block_size_limit
    return tuple(chunks), memory_limit_hit
Example #37
def _largest_block_size(chunks):
    return reduce(mul, map(max, chunks))
Example #38
def _number_of_blocks(chunks):
    return reduce(mul, map(len, chunks))
Example #39
@pytest.mark.parametrize("string,expected",
                         [("foo-bar", []),
                          ("foobazbar", []),
                          ("foo*bar*baz", ["foo_bar_baz"]),
                          ])
def test__tri_gram(string, expected):
    assert(list(tkn.tri_gram(string)) == expected)


sum_tally_tuples = lambda tpls: reduce_c(lambda x, y: x+y[1], tpls, 0)


@pytest.mark.parametrize("string,length,total,parser",
                         [(tlz.reduce(lambda x, y: x+y,
                                      ["aaa " * 20,
                                       "bbb " * 10,
                                       "ccc " * 3,
                                       "ddd " * 1],
                                      ), 4, 34, tkn.uni_gram),

                          ])
def test___bag_of_words(string, length, total, parser):
    bow = tkn.bag_of_words(parser, string)
    assert(len(bow) == length)
    assert(sum_tally_tuples(bow) == total)


@pytest.mark.parametrize("string,length,total",
                         [(tlz.reduce(lambda x, y: x+y,
                                      ["aaa " * 20,
                                       "bbb " * 10,
                                       "ccc " * 3,
Example #40
def Filter(t, *conditions):
    return t[reduce(and_, conditions)]
Example #41
def sparse_sum(l):
    return reduce(lambda a, b: tf.sparse_add(a, b), l)
Example #42
def test__bi_gram(string, expected):
    assert(list(tkn.bi_gram(string)) == expected)


@pytest.mark.parametrize("string,expected",
                         [("foo-bar", []),
                          ("foobazbar", []),
                          ("foo*bar*baz", ["foo_bar_baz"]),
                          ])
def test__tri_gram(string, expected):
    assert(list(tkn.tri_gram(string)) == expected)


sum_tally_tuples = lambda tpls: reduce_c(lambda x, y: x+y[1], tpls, 0)
extext = tlz.reduce(lambda x, y: x+y, ["aaa " * 20,
                                       "bbb " * 10,
                                       "ccc " * 3,
                                       "ddd " * 1])


@pytest.mark.parametrize("string,length,total,parser",
                         [(extext, 4, 34, tkn.uni_gram),

                          ])
def test___gram_counts(string, length, total, parser):
    bow = tkn.gram_counts(parser, string)
    assert(len(bow) == length)
    assert(sum_tally_tuples(bow) == total)


@pytest.mark.parametrize("string,length,total",
                         [(extext, 4, 34),
Example #43
def _number_of_blocks(chunks):
    return reduce(mul, map(len, chunks))
Example #44
def _largest_block_size(chunks):
    return reduce(mul, map(max, chunks))
Example #45
def find_merge_rechunk(old_chunks, new_chunks, block_size_limit):
    """
    Find an intermediate rechunk that would merge some adjacent blocks
    together in order to get us nearer the *new_chunks* target, without
    violating the *block_size_limit* (in number of elements).
    """
    ndim = len(old_chunks)

    old_largest_width = [max(c) for c in old_chunks]
    new_largest_width = [max(c) for c in new_chunks]

    graph_size_effect = {
        dim: len(nc) / len(oc)
        for dim, (oc, nc) in enumerate(zip(old_chunks, new_chunks))
    }

    block_size_effect = {
        dim: new_largest_width[dim] / old_largest_width[dim]
        for dim in range(ndim)
    }

    # Our goal is to reduce the number of nodes in the rechunk graph
    # by merging some adjacent chunks, so consider dimensions where we can
    # reduce the # of chunks
    merge_candidates = [dim for dim in range(ndim)
                        if graph_size_effect[dim] <= 1.0]

    # Merging along each dimension reduces the graph size by a certain factor
    # and increases memory largest block size by a certain factor.
    # We want to optimize the graph size while staying below the given
    # block_size_limit.  This is in effect a knapsack problem, except with
    # multiplicative values and weights.  Just use a greedy algorithm
    # by trying dimensions in decreasing value / weight order.
    def key(k):
        gse = graph_size_effect[k]
        bse = block_size_effect[k]
        if bse == 1:
            bse = 1 + 1e-9
        return np.log(gse) / np.log(bse)

    sorted_candidates = sorted(merge_candidates, key=key)

    largest_block_size = reduce(mul, old_largest_width)

    chunks = list(old_chunks)
    memory_limit_hit = False

    for dim in sorted_candidates:
        # Examine this dimension for possible graph reduction
        new_largest_block_size = (
            largest_block_size * new_largest_width[dim] // old_largest_width[dim])
        if new_largest_block_size <= block_size_limit:
            # Full replacement by new chunks is possible
            chunks[dim] = new_chunks[dim]
            largest_block_size = new_largest_block_size
        else:
            # Try a partial rechunk, dividing the new chunks into
            # smaller pieces
            largest_width = old_largest_width[dim]
            chunk_limit = int(block_size_limit * largest_width / largest_block_size)
            c = divide_to_width(new_chunks[dim], chunk_limit)
            if len(c) <= len(old_chunks[dim]):
                # We manage to reduce the number of blocks, so do it
                chunks[dim] = c
                largest_block_size = largest_block_size * max(c) // largest_width

            memory_limit_hit = True

    assert largest_block_size == _largest_block_size(chunks)
    assert largest_block_size <= block_size_limit
    return tuple(chunks), memory_limit_hit
Example #46
def _reduce(binop, sequence, initial=no_default):
    if initial is not no_default:
        return reduce(binop, sequence, initial)
    else:
        return reduce(binop, sequence)
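A hedged usage sketch, assuming _reduce and no_default above are in scope: with an initial value the three-argument form of reduce is used, otherwise the two-argument form.

from operator import add

assert _reduce(add, [1, 2, 3], initial=10) == 16
assert _reduce(add, [1, 2, 3]) == 6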