Example #1
def from_imperative(dfs, metadata=None, divisions=None, columns=None):
    """ Create DataFrame from many imperative objects

    Parameters
    ----------
    dfs: list of Values
        An iterable of dask.imperative.Value objects, such as those produced
        by dask.do.  These comprise the individual partitions of the resulting
        dataframe.
    metadata: list or string of column names or empty dataframe
    divisions: list or None
    """
    if columns is not None:
        print("Deprecation warning: Use metadata argument, not columns")
        metadata = columns
    from dask.imperative import Value
    if isinstance(dfs, Value):
        dfs = [dfs]
    dsk = merge(df.dask for df in dfs)

    name = 'from-imperative-' + tokenize(*dfs)
    names = [(name, i) for i in range(len(dfs))]
    values = [df.key for df in dfs]
    dsk2 = dict(zip(names, values))

    if divisions is None:
        divisions = [None] * (len(dfs) + 1)

    if isinstance(metadata, str):
        return Series(merge(dsk, dsk2), name, metadata, divisions)
    else:
        return DataFrame(merge(dsk, dsk2), name, metadata, divisions)
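All of these snippets lean on ``toolz.merge`` to combine per-partition task dictionaries into one graph. As a minimal standalone sketch of its behavior: dicts may be passed as separate arguments or as a single iterable, and later dicts win on duplicate keys.

from toolz import merge

# Separate dict arguments; the right-most dict wins on duplicate keys.
assert merge({'a': 1}, {'b': 2}) == {'a': 1, 'b': 2}
assert merge({'a': 1}, {'a': 2}) == {'a': 2}

# A single iterable of dicts also works, which is why
# ``merge(df.dask for df in dfs)`` above combines all partition graphs.
graphs = [{('x', 0): 1}, {('x', 1): 2}]
assert merge(graphs) == {('x', 0): 1, ('x', 1): 2}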
Example #2
def test_to_task_dask():
    a = delayed(1, name='a')
    b = delayed(2, name='b')
    task, dask = to_task_dask([a, b, 3])
    assert task == ['a', 'b', 3]

    task, dask = to_task_dask((a, b, 3))
    assert task == (tuple, ['a', 'b', 3])
    assert dict(dask) == merge(a.dask, b.dask)

    task, dask = to_task_dask({a: 1, b: 2})
    assert (task == (dict, [['b', 2], ['a', 1]]) or
            task == (dict, [['a', 1], ['b', 2]]))
    assert dict(dask) == merge(a.dask, b.dask)

    f = namedtuple('f', ['x', 'y'])
    x = f(1, 2)
    task, dask = to_task_dask(x)
    assert task == x
    assert dict(dask) == {}

    # Issue https://github.com/dask/dask/issues/2107
    class MyClass(dict):
        pass

    task, dask = to_task_dask(MyClass())
    assert type(task) is MyClass
    assert dict(dask) == {}
Example #3
 def __getitem__(self, key):
     if isinstance(key, (str, unicode)):
         name = self._name + '.' + key
         if key in self.columns:
             dsk = dict(((name, i), (operator.getitem, (self._name, i), key))
                         for i in range(self.npartitions))
             return Series(merge(self.dask, dsk), name,
                           key, self.divisions)
     if isinstance(key, list):
         name = '%s[%s]' % (self._name, str(key))
         if all(k in self.columns for k in key):
             dsk = dict(((name, i), (operator.getitem,
                                      (self._name, i),
                                      (list, key)))
                         for i in range(self.npartitions))
             return DataFrame(merge(self.dask, dsk), name,
                              key, self.divisions)
     if isinstance(key, Series) and self.divisions == key.divisions:
         name = next(names)
         dsk = dict(((name, i), (operator.getitem, (self._name, i),
                                                    (key._name, i)))
                     for i in range(self.npartitions))
         return DataFrame(merge(self.dask, key.dask, dsk), name,
                          self.columns, self.divisions)
     raise NotImplementedError()
Example #4
    def _loc(self, ind):
        """ Helper function for the .loc accessor """
        if not self.known_divisions:
            raise ValueError(
                "Can not use loc on DataFrame without known divisions")
        name = next(names)
        if not isinstance(ind, slice):
            part = self._partition_of_index_value(ind)
            dsk = {(name, 0): (lambda df: df.loc[ind], (self._name, part))}
            return type(self)(merge(self.dask, dsk), name,
                              self.column_info, [])
        else:
            assert ind.step in (None, 1)
            if ind.start:
                start = self._partition_of_index_value(ind.start)
            else:
                start = 0
            if ind.stop is not None:
                stop = self._partition_of_index_value(ind.stop)
            else:
                stop = self.npartitions - 1
            if stop == start:
                dsk = {(name, 0): (_loc, (self._name, start), ind.start, ind.stop)}
            else:
                dsk = merge(
                  {(name, 0): (_loc, (self._name, start), ind.start, None)},
                  dict(((name, i), (self._name, start + i))
                      for i in range(1, stop - start)),
                  {(name, stop - start): (_loc, (self._name, stop), None, ind.stop)})

            return type(self)(merge(self.dask, dsk), name, self.column_info,
                              self.divisions[start:stop])
Example #5
def elemwise(op, *args, **kwargs):
    """ Elementwise operation for dask.Dataframes """
    columns = kwargs.get('columns', None)
    name = kwargs.get('name', None)

    _name = next(names)

    frames = [arg for arg in args if isinstance(arg, _Frame)]
    other = [(i, arg) for i, arg in enumerate(args)
                      if not isinstance(arg, _Frame)]

    if other:
        op2 = partial_by_order(op, other)
    else:
        op2 = op

    assert all(f.divisions == frames[0].divisions for f in frames)
    assert all(f.npartitions == frames[0].npartitions for f in frames)

    dsk = dict(((_name, i), (op2,) + frs)
                for i, frs in enumerate(zip(*[f._keys() for f in frames])))

    if columns is not None:
        return DataFrame(merge(dsk, *[f.dask for f in frames]),
                         _name, columns, frames[0].divisions)
    else:
        column_name = name or consistent_name(n for f in frames
                                                 for n in f.columns)
        return Series(merge(dsk, *[f.dask for f in frames]),
                      _name, column_name, frames[0].divisions)
Example #6
def to_hdf(df, path_or_buf, key, mode='a', append=False, complevel=0,
           complib=None, fletcher32=False, get=get_sync, dask_kwargs=None,
           name_function=None, compute=True, **kwargs):
    name = 'to-hdf-' + uuid.uuid1().hex

    pd_to_hdf = getattr(df._partition_type, 'to_hdf')

    # if path_or_buf is string, format using i_name
    if isinstance(path_or_buf, str):
        if path_or_buf.count('*') + key.count('*') > 1:
            raise ValueError("A maximum of one asterisk is accepted in file path and dataset key")

        fmt_obj = lambda path_or_buf, i_name: path_or_buf.replace('*', i_name)
    else:
        if key.count('*') > 1:
            raise ValueError("A maximum of one asterisk is accepted in dataset key")

        fmt_obj = lambda path_or_buf, _: path_or_buf

    if name_function is None:
        name_function = build_name_function(df.npartitions - 1)

    # We guarantee that partition order is preserved when it is saved and read
    # back, so we require name_function to preserve the order of its input.
    if '*' in key or (isinstance(path_or_buf, str) and '*' in path_or_buf):
        formatted_names = [name_function(i) for i in range(df.npartitions)]
        if formatted_names != sorted(formatted_names):
            warn("In order to preserve order between partitions "
                 "name_function must preserve the order of its input")

    dsk = dict()
    i_name = name_function(0)
    dsk[(name, 0)] = (_link, None,
                      (apply, pd_to_hdf,
                          (tuple, [(df._name, 0), fmt_obj(path_or_buf, i_name),
                              key.replace('*', i_name)]),
                          merge(kwargs,
                            {'mode':  mode, 'format': 'table', 'append': append,
                             'complevel': complevel, 'complib': complib,
                             'fletcher32': fletcher32})))
    for i in range(1, df.npartitions):
        i_name = name_function(i)
        dsk[(name, i)] = (_link, (name, i - 1),
                          (apply, pd_to_hdf,
                           (tuple, [(df._name, i), fmt_obj(path_or_buf, i_name),
                               key.replace('*', i_name)]),
                           merge(kwargs,
                             {'mode': 'a', 'format': 'table', 'append': True,
                              'complevel': complevel, 'complib': complib,
                              'fletcher32': fletcher32})))

    dask_kwargs = dask_kwargs or {}

    dsk = merge(df.dask, dsk)
    key = (name, df.npartitions - 1)

    if compute:
        return DataFrame._get(dsk, key, get=get, **dask_kwargs)
    else:
        return Delayed(key, [dsk])
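A small standalone sketch of the path/key formatting used above: the single asterisk in ``path_or_buf`` or ``key`` is replaced with a per-partition name from ``name_function``. The zero-padding formatter below is a hypothetical stand-in for ``build_name_function``.

def name_function(i):
    # Hypothetical stand-in for build_name_function: zero-padded indices
    # sort lexicographically, so partition order is preserved on read-back.
    return '%03d' % i

path_or_buf = 'out-*.h5'
key = '/data'
targets = [(path_or_buf.replace('*', name_function(i)),
            key.replace('*', name_function(i)))
           for i in range(3)]
assert targets == [('out-000.h5', '/data'),
                   ('out-001.h5', '/data'),
                   ('out-002.h5', '/data')]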
Example #7
def set_partition(f, index, divisions, get=threaded.get, **kwargs):
    """ Set new partitioning along index given divisions """
    divisions = unique(divisions)
    name = next(names)
    if isinstance(index, Series):
        assert index.divisions == f.divisions
        dsk = dict(((name, i), (f._partition_type.set_index, block, ind))
                for i, (block, ind) in enumerate(zip(f._keys(), index._keys())))
        f2 = type(f)(merge(f.dask, index.dask, dsk), name,
                       f.column_info, f.divisions)
    else:
        dsk = dict(((name, i), (f._partition_type.set_index, block, index))
                for i, block in enumerate(f._keys()))
        f2 = type(f)(merge(f.dask, dsk), name, f.column_info, f.divisions)

    head = f2.head()
    pf = pframe(like=head, divisions=divisions, **kwargs)

    def append(block):
        pf.append(block)
        return 0

    f2.map_blocks(append).compute(get=get)
    pf.flush()

    return from_pframe(pf)
Example #8
def from_imperative(dfs, columns, divisions=None):
    """ Create DataFrame from many imperative objects

    Parameters
    ----------
    dfs: list of Values
        An iterable of dask.imperative.Value objects, such as those produced
        by dask.do.  These comprise the individual partitions of the resulting
        dataframe.
    columns: list or string
        The list of column names if the result is a DataFrame
        Or the single column name if the result is a Series
    divisions: list or None
    """
    from dask.imperative import Value
    if isinstance(dfs, Value):
        dfs = [dfs]
    dsk = merge(df.dask for df in dfs)

    name = 'from-imperative-' + tokenize(*dfs)
    names = [(name, i) for i in range(len(dfs))]
    values = [df.key for df in dfs]
    dsk2 = dict(zip(names, values))

    if divisions is None:
        divisions = [None] * (len(dfs) + 1)

    if isinstance(columns, str):
        return Series(merge(dsk, dsk2), name, columns, divisions)
    else:
        return DataFrame(merge(dsk, dsk2), name, columns, divisions)
Example #9
    def apply(self, latitude, longitude, latitude_mask, **kwargs):
        latitude = (latitude.T - data.train_gps_mean[0]) / data.train_gps_std[0]
        longitude = (longitude.T - data.train_gps_mean[1]) / data.train_gps_std[1]
        latitude_mask = latitude_mask.T

        rec_in = tensor.concatenate((latitude[:, :, None], longitude[:, :, None]),
                                    axis=2)
        path = self.rec.apply(merge(self.fwd_fork.apply(rec_in, as_dict=True),
                                    {'mask': latitude_mask}),
                              merge(self.bkwd_fork.apply(rec_in, as_dict=True),
                                    {'mask': latitude_mask}))[0]

        last_id = tensor.cast(latitude_mask.sum(axis=0) - 1, dtype='int64')
        
        path_representation = (path[0][:, -self.config.rec_state_dim:],
                path[last_id - 1, tensor.arange(last_id.shape[0])]
                    [:, :self.config.rec_state_dim])

        embeddings = tuple(self.context_embedder.apply(
                            **{k: kwargs[k] for k in self.context_embedder.inputs }))

        inputs = tensor.concatenate(path_representation + embeddings, axis=1)
        outputs = self.rec_to_output.apply(inputs)

        return outputs
Example #10
    def f(c, a, b):
        data = yield _scatter((c.ip, c.port), [1, 2, 3])

        assert c.ip in str(data[0])
        assert c.ip in repr(data[0])

        assert merge(a.data, b.data) == \
                {d.key: i for d, i in zip(data, [1, 2, 3])}

        assert set(c.who_has) == {d.key for d in data}
        assert all(len(v) == 1 for v in c.who_has.values())

        result = yield [d._get() for d in data]
        assert result == [1, 2, 3]

        yield data[0]._delete()

        assert merge(a.data, b.data) == \
                {d.key: i for d, i in zip(data[1:], [2, 3])}

        assert data[0].key not in c.who_has

        data = yield scatter_to_workers((c.ip, c.port), [a.address, b.address],
                                        [4, 5, 6])

        m = merge(a.data, b.data)

        for d, v in zip(data, [4, 5, 6]):
            assert m[d.key] == v

        result = yield _gather((c.ip, c.port), data)
        assert result == [4, 5, 6]
Example #11
 def apply(self, source_sentence, source_sentence_mask):
     """Produces source annotations, either non-recurrently or with
     a bidirectional RNN architecture.
     """
     # Time as first dimension
     source_sentence = source_sentence.T
     source_sentence_mask = source_sentence_mask.T
     embeddings = self.lookup.apply(source_sentence)
     representation = self.bidirs[0].apply(
             merge(self.fwd_forks[0].apply(embeddings, as_dict=True),
                   {'mask': source_sentence_mask}),
             merge(self.back_forks[0].apply(embeddings, as_dict=True),
                   {'mask': source_sentence_mask}))
     for i in xrange(1, self.n_layers):
         if self.skip_connections:
             inp = tensor.concatenate([representation, embeddings],
                                      axis=2)
         else:
             inp = representation
         representation = self.bidirs[i].apply(
             merge(self.fwd_forks[i].apply(inp, as_dict=True),
                   {'mask': source_sentence_mask}),
             merge(self.back_forks[i].apply(inp, as_dict=True),
                   {'mask': source_sentence_mask})
         )
     return representation, source_sentence_mask
Example #12
def to_hdf(df, path_or_buf, key, mode='a', append=False, complevel=0,
           complib=None, fletcher32=False, get=get_sync, dask_kwargs=None,
           **kwargs):
    name = 'to-hdf-' + uuid.uuid1().hex

    pd_to_hdf = getattr(df._partition_type, 'to_hdf')

    dsk = dict()
    dsk[(name, 0)] = (_link, None,
                      (apply, pd_to_hdf,
                          (tuple, [(df._name, 0), path_or_buf, key]),
                          merge(kwargs,
                            {'mode':  mode, 'format': 'table', 'append': append,
                             'complevel': complevel, 'complib': complib,
                             'fletcher32': fletcher32})))
    for i in range(1, df.npartitions):
        dsk[(name, i)] = (_link, (name, i - 1),
                          (apply, pd_to_hdf,
                           (tuple, [(df._name, i), path_or_buf, key]),
                           merge(kwargs,
                             {'mode': 'a', 'format': 'table', 'append': True,
                              'complevel': complevel, 'complib': complib,
                              'fletcher32': fletcher32})))

    dask_kwargs = dask_kwargs or {}

    DataFrame._get(merge(df.dask, dsk), (name, df.npartitions - 1),
                   get=get, **dask_kwargs)
Example #13
def elemwise(op, *args, **kwargs):
    """ Elementwise operation for dask.Dataframes """
    columns = kwargs.get('columns', None)
    name = kwargs.get('name', None)

    _name = 'elemwise' + next(tokens)

    dfs = [arg for arg in args if isinstance(arg, _Frame)]
    other = [(i, arg) for i, arg in enumerate(args)
                      if not isinstance(arg, _Frame)]

    if other:
        op2 = partial_by_order(op, other)
    else:
        op2 = op

    if not all(df.divisions == dfs[0].divisions for df in dfs):
        msg = 'All dask.Dataframe and dask.Series must have same divisions'
        raise ValueError(msg)
    if not all(df.npartitions == dfs[0].npartitions for df in dfs):
        msg = 'All dask.Dataframe and dask.Series must have same npartitions'
        raise ValueError(msg)

    dsk = dict(((_name, i), (op2,) + frs)
                for i, frs in enumerate(zip(*[df._keys() for df in dfs])))
    if columns is not None:
        return DataFrame(merge(dsk, *[df.dask for df in dfs]),
                         _name, columns, dfs[0].divisions)
    else:
        column_name = name or consistent_name(n for df in dfs
                                              for n in df.columns)
        return Series(merge(dsk, *[df.dask for df in dfs]),
                      _name, column_name, dfs[0].divisions)
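The graph construction above pairs matching partitions of every input by zipping their per-partition key lists. A minimal standalone sketch of that pattern, using hypothetical key names instead of real dask frames:

import operator

keys_a = [('a', i) for i in range(3)]   # per-partition keys of one input
keys_b = [('b', i) for i in range(3)]   # per-partition keys of another
_name = 'add-example'                   # hypothetical output name

dsk = dict(((_name, i), (operator.add,) + frs)
           for i, frs in enumerate(zip(keys_a, keys_b)))
assert dsk[(_name, 0)] == (operator.add, ('a', 0), ('b', 0))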
Example #14
    def _loc_slice(self, ind):
        name = 'loc-slice' + next(tokens)
        assert ind.step in (None, 1)
        if ind.start:
            start = _partition_of_index_value(self.divisions, ind.start)
        else:
            start = 0
        if ind.stop is not None:
            stop = _partition_of_index_value(self.divisions, ind.stop)
        else:
            stop = self.npartitions - 1
        istart = _coerce_loc_index(self.divisions, ind.start)
        istop = _coerce_loc_index(self.divisions, ind.stop)
        if stop == start:
            dsk = {(name, 0): (_loc, (self._name, start), ind.start, ind.stop)}
            divisions = [istart, istop]
        else:
            dsk = merge(
              {(name, 0): (_loc, (self._name, start), ind.start, None)},
              dict(((name, i), (self._name, start + i))
                  for i in range(1, stop - start)),
              {(name, stop - start): (_loc, (self._name, stop), None, ind.stop)})

            divisions = ((max(istart, self.divisions[start])
                          if ind.start is not None
                          else self.divisions[0],) +
                         self.divisions[start+1:stop+1] +
                         (min(istop, self.divisions[stop+1])
                          if ind.stop is not None
                          else self.divisions[-1],))

        assert len(divisions) == len(dsk) + 1
        return type(self)(merge(self.dask, dsk),
                          name, self.column_info,
                          divisions)
Example #15
    def f(c, a, b):
        keys = yield _scatter((c.ip, c.port), [1, 2, 3])

        assert merge(a.data, b.data) == \
                {k: i for k, i in zip(keys, [1, 2, 3])}

        assert set(c.who_has) == set(keys)
        assert all(len(v) == 1 for v in c.who_has.values())

        keys2, who_has, nbytes = yield scatter_to_workers([a.address, b.address],
                                                          [4, 5, 6])

        m = merge(a.data, b.data)

        for k, v in zip(keys2, [4, 5, 6]):
            assert m[k] == v

        assert isinstance(who_has, dict)
        assert set(concat(who_has.values())) == {a.address, b.address}
        assert len(who_has) == len(keys2)

        assert isinstance(nbytes, dict)
        assert set(nbytes) == set(who_has)
        assert all(isinstance(v, int) for v in nbytes.values())

        result = yield _gather((c.ip, c.port), keys2)
        assert result == [4, 5, 6]
Example #16
def from_imperative(values):
    """ Create bag from many imperative objects

    Parameters
    ----------
    values: list of Values
        An iterable of dask.imperative.Value objects, such as those produced
        by dask.do.  These comprise the individual partitions of the resulting
        bag.

    Returns
    -------
    Bag

    Examples
    --------
    >>> b = from_imperative([x, y, z])  # doctest: +SKIP
    """
    from dask.imperative import Value
    if isinstance(values, Value):
        values = [values]
    dsk = merge(v.dask for v in values)

    name = 'bag-from-imperative-' + tokenize(*values)
    names = [(name, i) for i in range(len(values))]
    values = [v.key for v in values]
    dsk2 = dict(zip(names, values))

    return Bag(merge(dsk, dsk2), name, len(values))
Example #17
def read_csv(fn, *args, **kwargs):
    chunksize = kwargs.pop('chunksize', 2**16)
    categorize = kwargs.pop('categorize', None)
    index = kwargs.pop('index', None)
    if index and categorize is None:
        categorize = True
    header = kwargs.get('header', 1)

    nlines = linecount(fn) - header
    nchunks = int(ceil(1.0 * nlines / chunksize))

    read = next(read_csv_names)

    blockdivs = tuple(range(chunksize, nlines, chunksize))

    one_chunk = pd.read_csv(fn, *args, nrows=100, **kwargs)

    cols = []

    if categorize or index:
        if categorize:
            category_columns = [c for c in one_chunk.dtypes.index
                                   if one_chunk.dtypes[c] == 'O']
        else:
            category_columns = []
        cols = category_columns + ([index] if index else [])
        d = read_csv(fn, *args, **merge(kwargs,
                                        dict(chunksize=chunksize,
                                             usecols=cols,
                                             categorize=False,
                                             parse_dates=None)))
        categories = [d[c].drop_duplicates() for c in category_columns]
        if index:
            quantiles = d[index].quantiles(np.linspace(0, 100, nchunks + 1)[1:-1])
            result = compute(quantiles, *categories)
            quantiles, categories = result[0], result[1:]
        else:
            categories = compute(*categories)
        categories = dict(zip(category_columns, categories))

    kwargs['chunksize'] = chunksize
    load = {(read, -1): (partial(pd.read_csv, *args, **kwargs), fn)}
    load.update(dict(((read, i), (get_chunk, (read, i-1), chunksize*i))
                     for i in range(nchunks)))

    name = next(names)

    dsk = dict(((name, i), (getitem, (read, i), 0))
                for i in range(nchunks))

    result = DataFrame(merge(dsk, load), name, one_chunk.columns, blockdivs)

    if categorize:
        func = partial(categorize_block, categories=categories)
        result = result.map_blocks(func, columns=result.columns)

    if index:
        result = set_partition(result, index, quantiles)

    return result
Example #18
def compute(*args, **kwargs):
    """Compute several dask collections at once.

    Examples
    --------
    >>> import dask.array as da
    >>> a = da.arange(10, chunks=2).sum()
    >>> b = da.arange(10, chunks=2).mean()
    >>> compute(a, b)
    (45, 4.5)
    """
    groups = groupby(attrgetter('_optimize'), args)
    get = kwargs.pop('get', None) or _globals['get']

    if not get:
        get = args[0]._default_get
        if not all(a._default_get == get for a in args):
            raise ValueError("Compute called on multiple collections with "
                             "differing default schedulers. Please specify a "
                             "scheduler `get` function using either "
                             "the `get` kwarg or globally with `set_options`.")

    dsk = merge([opt(merge([v.dask for v in val]), [v._keys() for v in val])
                for opt, val in groups.items()])
    keys = [arg._keys() for arg in args]
    results = get(dsk, keys, **kwargs)
    return tuple(a._finalize(a, r) for a, r in zip(args, results))
Example #19
    def persist(self, collections):
        """ Persist dask collections on cluster

        Starts computation of the collection on the cluster in the background.
        Provides a new dask collection that is semantically identical to the
        previous one, but now based off of futures currently in execution.

        Parameters
        ----------
        collections: sequence or single dask object
            Collections like dask.array or dataframe or dask.value objects

        Returns
        -------
        List of collections, or single collection, depending on type of input.

        Examples
        --------
        >>> xx = executor.persist(x)  # doctest: +SKIP
        >>> xx, yy = executor.persist([x, y])  # doctest: +SKIP

        See Also
        --------
        Executor.compute
        """
        if isinstance(collections, (tuple, list, set, frozenset)):
            singleton = False
        else:
            singleton = True
            collections = [collections]

        assert all(isinstance(c, Base) for c in collections)

        groups = groupby(lambda x: x._optimize, collections)
        dsk = merge([opt(merge([v.dask for v in val]),
                         [v._keys() for v in val])
                    for opt, val in groups.items()])

        d = {k: unpack_remotedata(v) for k, v in dsk.items()}
        dsk2 = {k: v[0] for k, v in d.items()}
        dependencies = {k: v[1] for k, v in d.items()}

        for k, v in dsk2.items():
            dependencies[k] |= set(_deps(dsk, v))

        names = list({k for c in collections for k in flatten(c._keys())})

        self._send_to_scheduler({'op': 'update-graph',
                                 'tasks': valmap(dumps_task, dsk2),
                                 'dependencies': dependencies,
                                 'keys': names,
                                 'client': self.id})
        result = [redict_collection(c, {k: Future(k, self)
                                        for k in flatten(c._keys())})
                for c in collections]
        if singleton:
            return first(result)
        else:
            return result
Example #20
    def compute(self, *args, **kwargs):
        """ Compute dask collections on cluster

        Parameters
        ----------
        args: iterable of dask objects
            Collections like dask.array or dataframe or dask.value objects
        sync: bool (optional)
            Returns Futures if False (default) or concrete values if True

        Returns
        -------
        Tuple of Futures or concrete values

        Examples
        --------

        >>> from dask import do, value
        >>> from operator import add
        >>> x = do(add)(1, 2)
        >>> y = do(add)(x, x)
        >>> xx, yy = executor.compute(x, y)  # doctest: +SKIP
        >>> xx  # doctest: +SKIP
        <Future: status: finished, key: add-8f6e709446674bad78ea8aeecfee188e>
        >>> xx.result()  # doctest: +SKIP
        3
        >>> yy.result()  # doctest: +SKIP
        6
        """
        sync = kwargs.pop('sync', False)
        assert not kwargs
        if sync:
            return dask.compute(*args, get=self.get)

        variables = [a for a in args if isinstance(a, Base)]

        groups = groupby(lambda x: x._optimize, variables)
        dsk = merge([opt(merge([v.dask for v in val]),
                         [v._keys() for v in val])
                    for opt, val in groups.items()])
        names = ['finalize-%s' % tokenize(v) for v in variables]
        dsk2 = {name: (v._finalize, v, v._keys()) for name, v in zip(names, variables)}

        self.loop.add_callback(self.scheduler_queue.put_nowait,
                                {'op': 'update-graph',
                                'dsk': merge(dsk, dsk2),
                                'keys': names})

        i = 0
        futures = []
        for arg in args:
            if isinstance(arg, Base):
                futures.append(Future(names[i], self))
                i += 1
            else:
                futures.append(arg)

        return futures
Example #21
def from_dask_array(x, columns=None):
    """ Convert dask Array to dask DataFrame

    Converts a 2d array into a DataFrame and a 1d array into a Series.

    Parameters
    ----------
    x: da.Array
    columns: list or string
        list of column names if DataFrame, single string if Series

    Example
    -------

    >>> import dask.array as da
    >>> import dask.dataframe as dd
    >>> x = da.ones((4, 2), chunks=(2, 2))
    >>> df = dd.io.from_dask_array(x, columns=['a', 'b'])
    >>> df.compute()
       a  b
    0  1  1
    1  1  1
    2  1  1
    3  1  1
    """
    name = "from-dask-array" + next(tokens)
    divisions = [0]
    for c in x.chunks[0]:
        divisions.append(divisions[-1] + c)

    index = [(range, a, b) for a, b in zip(divisions[:-1], divisions[1:])]

    divisions[-1] -= 1

    if x.ndim == 1:
        dsk = dict(
            ((name, i), (pd.Series, chunk, ind, x.dtype, columns))
            for i, (chunk, ind) in enumerate(zip(x._keys(), index))
        )
        return Series(merge(x.dask, dsk), name, columns, divisions)

    elif x.ndim == 2:
        if columns is None:
            raise ValueError("Must provide columns for DataFrame")
        if len(columns) != x.shape[1]:
            raise ValueError(
                "Columns must be the same length as array width\n"
                "  columns: %s\n  width: %d" % (str(columns), x.shape[1])
            )
        if len(x.chunks[1]) > 1:
            x = x.rechunk({1: x.shape[1]})
        dsk = dict(
            ((name, i), (pd.DataFrame, chunk[0], ind, columns)) for i, (chunk, ind) in enumerate(zip(x._keys(), index))
        )
        return DataFrame(merge(x.dask, dsk), name, columns, divisions)

    else:
        raise ValueError("Array must have one or two dimensions.  Had %d" % x.ndim)
    def apply(self, source_sentence_tbf, source_sentence_mask_tb=None):

        representation_tbf = self.bidir.apply(
            merge(self.fwd_fork.apply(source_sentence_tbf, as_dict=True),
                  {'mask': source_sentence_mask_tb}),
            merge(self.back_fork.apply(source_sentence_tbf, as_dict=True),
                  {'mask': source_sentence_mask_tb})
        )
        return representation_tbf
Example #23
def compute(*args, **kwargs):
    """Compute several dask collections at once.

    Parameters
    ----------
    args : object
        Any number of objects. If the object is a dask collection, it's
        computed and the result is returned. Otherwise it's passed through
        unchanged.
    get : callable, optional
        A scheduler ``get`` function to use. If not provided, the default is
        to check the global settings first, and then fall back to defaults for
        the collections.
    optimize_graph : bool, optional
        If True [default], the optimizations for each collection are applied
        before computation. Otherwise the graph is run as is. This can be
        useful for debugging.
    kwargs
        Extra keywords to forward to the scheduler ``get`` function.

    Examples
    --------
    >>> import dask.array as da
    >>> a = da.arange(10, chunks=2).sum()
    >>> b = da.arange(10, chunks=2).mean()
    >>> compute(a, b)
    (45, 4.5)
    """
    variables = [a for a in args if isinstance(a, Base)]
    if not variables:
        return args

    get = kwargs.pop('get', None) or _globals['get']

    if not get:
        get = variables[0]._default_get
        if not all(a._default_get == get for a in variables):
            raise ValueError("Compute called on multiple collections with "
                             "differing default schedulers. Please specify a "
                             "scheduler `get` function using either "
                             "the `get` kwarg or globally with `set_options`.")

    if kwargs.get('optimize_graph', True):
        groups = groupby(attrgetter('_optimize'), variables)
        dsk = merge([opt(merge([v.dask for v in val]),
                         [v._keys() for v in val], **kwargs)
                    for opt, val in groups.items()])
    else:
        dsk = merge(var.dask for var in variables)
    keys = [var._keys() for var in variables]
    results = get(dsk, keys, **kwargs)

    results_iter = iter(results)
    return tuple(a if not isinstance(a, Base)
                 else a._finalize(next(results_iter))
                 for a in args)
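The grouping step above uses ``toolz.groupby``, which buckets a sequence by a key function; collections sharing an ``_optimize`` routine then have their graphs merged and optimized together. A standalone illustration of the helper itself:

from toolz import groupby

# groupby(key, seq) -> {key(item): [matching items]}
assert groupby(len, ['a', 'bb', 'cc', 'ddd']) == {1: ['a'],
                                                  2: ['bb', 'cc'],
                                                  3: ['ddd']}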
Example #24
def from_delayed(dfs, meta=None, divisions=None, prefix='from-delayed',
                 metadata=None):
    """ Create Dask DataFrame from many Dask Delayed objects

    Parameters
    ----------
    dfs : list of Delayed
        An iterable of ``dask.delayed.Delayed`` objects, such as those produced
        by ``dask.delayed``.  These comprise the individual partitions of the
        resulting dataframe.
    $META
    divisions : tuple, str, optional
        Partition boundaries along the index.
        For tuple, see http://dask.pydata.io/en/latest/dataframe-partitions.html
        If the string 'sorted', the delayed values will be computed to find the
        index values; this assumes the indexes are mutually sorted.
        If None, no index information is used.
    prefix : str, optional
        Prefix to prepend to the keys.
    """
    if metadata is not None and meta is None:
        warn("Deprecation warning: Use meta keyword, not metadata")
        meta = metadata
    from dask.delayed import Delayed
    if isinstance(dfs, Delayed):
        dfs = [dfs]
    dsk = merge(df.dask for df in dfs)

    name = prefix + '-' + tokenize(*dfs)
    names = [(name, i) for i in range(len(dfs))]
    values = [df.key for df in dfs]
    dsk2 = dict(zip(names, values))
    dsk3 = merge(dsk, dsk2)

    if meta is None:
        meta = dfs[0].compute()
    if isinstance(meta, (str, pd.Series)):
        Frame = Series
    else:
        Frame = DataFrame

    if divisions is None or divisions == 'sorted':
        divs = [None] * (len(dfs) + 1)
    else:
        divs = tuple(divisions)
        if len(divs) != len(dfs) + 1:
            raise ValueError("divisions should be a tuple of len(dfs) + 1")

    df = Frame(dsk3, name, meta, divs)

    if divisions == 'sorted':
        from ..core import compute_divisions
        divisions = compute_divisions(df)
        df.divisions = divisions

    return df
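A hedged usage sketch matching the signature above, assuming the public ``dask.dataframe.from_delayed`` entry point and that ``meta`` may be omitted, in which case (as in the code) the first partition is computed to infer it:

import pandas as pd
from dask import delayed
import dask.dataframe as dd

parts = [delayed(pd.DataFrame)({'x': [i, i + 1]}) for i in range(3)]
ddf = dd.from_delayed(parts)   # meta inferred from the first partition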
Example #25
def read_csv(fn, *args, **kwargs):
    chunkbytes = kwargs.pop('chunkbytes', 2**25)  # 50 MB
    categorize = kwargs.pop('categorize', None)
    index = kwargs.pop('index', None)
    if index and categorize is None:
        categorize = True

    kwargs = fill_kwargs(fn, args, kwargs)

    # Handle glob strings
    if '*' in fn:
        return concat([read_csv(f, *args, **kwargs) for f in sorted(glob(fn))])

    token = tokenize(os.path.getmtime(fn), args, kwargs)
    name = 'read-csv-%s-%s' % (fn, token)

    columns = kwargs.pop('columns')
    header = kwargs.pop('header')

    if 'nrows' in kwargs:  # Just create single partition
        dsk = {(name, 0): (apply, pd.read_csv, (fn,),
                                  assoc(kwargs, 'header', header))}
        result = DataFrame(dsk, name, columns, [None, None])

    else:
        # Chunk sizes and numbers
        total_bytes = file_size(fn, kwargs['compression'])
        nchunks = int(ceil(total_bytes / chunkbytes))
        divisions = [None] * (nchunks + 1)

        first_kwargs = merge(kwargs, dict(header=header, compression=None))
        rest_kwargs = merge(kwargs, dict(header=None, compression=None))

        # Create dask graph
        dsk = dict(((name, i), (_read_csv, fn, i, chunkbytes,
                                           kwargs['compression'], rest_kwargs))
                   for i in range(1, nchunks))

        dsk[(name, 0)] = (_read_csv, fn, 0, chunkbytes, kwargs['compression'],
                                     first_kwargs)

        result = DataFrame(dsk, name, columns, divisions)

    if categorize or index:
        categories, quantiles = categories_and_quantiles(fn, args, kwargs,
                                                         index, categorize,
                                                         chunkbytes=chunkbytes)

    if categorize:
        func = partial(categorize_block, categories=categories)
        result = result.map_partitions(func, columns=columns)

    if index:
        result = set_partition(result, index, quantiles)

    return result
Example #26
def gather_from_workers(who_has, rpc=rpc, close=True, permissive=False):
    """ Gather data directly from peers

    Parameters
    ----------
    who_has: dict
        Dict mapping keys to sets of workers that may have that key

    Returns dict mapping key to value

    See Also
    --------
    gather
    _gather
    """
    bad_addresses = set()
    who_has = {k: set(v) for k, v in who_has.items()}
    results = dict()
    all_bad_keys = set()

    while len(results) + len(all_bad_keys) < len(who_has):
        d = defaultdict(list)
        rev = dict()
        bad_keys = set()
        for key, addresses in who_has.items():
            if key in results:
                continue
            try:
                addr = random.choice(list(addresses - bad_addresses))
                d[addr].append(key)
                rev[key] = addr
            except IndexError:
                bad_keys.add(key)
        if bad_keys:
            if permissive:
                all_bad_keys |= bad_keys
            else:
                raise KeyError(*bad_keys)

        rpcs = {addr: rpc(addr) for addr in d}
        try:
            coroutines = [rpcs[address].get_data(keys=keys, close=close)
                          for address, keys in d.items()]
            response = yield ignore_exceptions(coroutines, EnvironmentError)
        finally:
            for r in rpcs.values():
                r.close_rpc()

        response = merge(response)
        bad_addresses |= {v for k, v in rev.items() if k not in response}
        results.update(merge(response))

    if permissive:
        raise Return((results, all_bad_keys))
    else:
        raise Return(results)
Example #27
def _construct_dask_df_with_divisions(df):
    """Construct the new task graph and make a new dask.dataframe around it"""
    divisions = _get_divisions(df)
    name = 'csv-index' + df._name
    dsk = {(name, i): (_add_to_index, (df._name, i), divisions[i]) for i in range(df.npartitions)}
    from toolz import merge
    if isinstance(df, dd.DataFrame):
        return dd.DataFrame(merge(dsk, df.dask), name, df.columns, divisions)
    elif isinstance(df, dd.Series):
        return dd.Series(merge(dsk, df.dask), name, df.name, divisions)
Example #28
def annotate_fusions_for_assembly(
        fusions,  # type: Iterable[Fusion]
        reference,  # type: TranscriptReference
        assembly,  # type: TranscriptReference
        skip_annotated=True  # type: bool
):  # type: (...) -> Iterable[Fusion]
    """Annotates fusions using the assembled GTF."""

    def _exon_region(exon):
        return (exon.chromosome, exon.start, exon.end)

    for fusion in fusions:
        if skip_annotated and 'gene_id' in fusion.metadata:
            # Already annotated
            yield fusion
        else:
            # Identify overlapped transcripts.
            transcripts = assembly.overlap_transcripts(fusion.genome_region)

            if len(transcripts) > 0:
                for transcript in transcripts:
                    # Lookup genes that overlap with exons.
                    exons = assembly.get_exons(transcript.id)

                    genes = set(
                        itertools.chain.from_iterable(
                            reference.overlap_genes(_exon_region(exon))
                            for exon in exons))

                    if len(genes) > 0:
                        for gene in genes:
                            # Yield with information from overlapping genes.
                            new_meta = {
                                'gene_name': gene.name,
                                'gene_strand': gene.strand,
                                'gene_id': gene.id,
                                'novel_transcript': transcript.id
                            }
                            yield fusion._replace(metadata=toolz.merge(
                                fusion.metadata, new_meta))
                    else:
                        # No gene overlap, yield with transcript info.
                        new_meta = {
                            'gene_name': transcript.id,
                            'gene_id': transcript.id,
                            'gene_strand': transcript.strand,
                            'novel_transcript': transcript.id
                        }
                        yield fusion._replace(
                            metadata=toolz.merge(fusion.metadata, new_meta))
            else:
                # No overlap.
                yield fusion
Example #29
def _construct_dask_df_with_divisions(df):
  """Construct the new task graph and make a new dask.dataframe around it."""
  divisions = _get_divisions(df)
  # pylint: disable=protected-access
  name = 'csv-index' + df._name
  dsk = {(name, i): (_add_to_index, (df._name, i), divisions[i])
         for i in range(df.npartitions)}
  # pylint: enable=protected-access
  from toolz import merge  # pylint: disable=g-import-not-at-top
  if isinstance(df, dd.DataFrame):
    return dd.DataFrame(merge(dsk, df.dask), name, df.columns, divisions)
  elif isinstance(df, dd.Series):
    return dd.Series(merge(dsk, df.dask), name, df.name, divisions)
Example #30
    def reduction(self, perpartition, aggregate, split_every=None,
                  out_type=Item, name=None):
        """ Reduce collection with reduction operators

        Parameters
        ----------
        perpartition: function
            reduction to apply to each partition
        aggregate: function
            reduction to apply to the results of all partitions
        split_every: int (optional)
            Group partitions into groups of this size while performing reduction
            Defaults to 8
        out_type: {Bag, Item}
            The out type of the result, Item if a single element, Bag if a list
            of elements.  Defaults to Item.

        Examples
        --------
        >>> b = from_sequence(range(10))
        >>> b.reduction(sum, sum).compute()
        45
        """
        if split_every is None:
            split_every = 8
        if split_every is False:
            split_every = self.npartitions
        token = tokenize(self, perpartition, aggregate, split_every)
        a = '%s-part-%s' % (name or funcname(perpartition), token)
        dsk = dict(((a, i), (perpartition, (self.name, i)))
                   for i in range(self.npartitions))
        k = self.npartitions
        b = a
        fmt = '%s-aggregate-%s' % (name or funcname(aggregate), token)
        depth = 0
        while k > 1:
            c = fmt + str(depth)
            dsk2 = dict(((c, i), (aggregate, [(b, j) for j in inds]))
                        for i, inds in enumerate(partition_all(split_every,
                                                               range(k))))
            dsk.update(dsk2)
            k = len(dsk2)
            b = c
            depth += 1

        if out_type is Item:
            dsk[b] = dsk.pop((b, 0))
            return Item(merge(self.dask, dsk), b)
        else:
            return Bag(merge(self.dask, dsk), b, 1)
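The tree reduction above repeatedly aggregates batches of at most ``split_every`` intermediate results until one remains; the batching uses ``toolz.partition_all``. A standalone illustration:

from toolz import partition_all

# Batch 20 partition indices into groups of split_every=8.
assert list(partition_all(8, range(20))) == [
    (0, 1, 2, 3, 4, 5, 6, 7),
    (8, 9, 10, 11, 12, 13, 14, 15),
    (16, 17, 18, 19),
]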
Example #31
    def __init__(self,
                 n_workers=None,
                 threads_per_worker=None,
                 processes=True,
                 loop=None,
                 start=None,
                 host=None,
                 ip=None,
                 scheduler_port=0,
                 silence_logs=logging.WARN,
                 dashboard_address=":8787",
                 worker_dashboard_address=None,
                 diagnostics_port=None,
                 services=None,
                 worker_services=None,
                 service_kwargs=None,
                 asynchronous=False,
                 security=None,
                 protocol=None,
                 blocked_handlers=None,
                 interface=None,
                 worker_class=None,
                 scheduler_kwargs=None,
                 **worker_kwargs):
        if ip is not None:
            # In the future we should warn users about this move
            # warnings.warn("The ip keyword has been moved to host")
            host = ip

        if diagnostics_port is not None:
            warnings.warn("diagnostics_port has been deprecated. "
                          "Please use `dashboard_address=` instead")
            dashboard_address = diagnostics_port

        if threads_per_worker == 0:
            warnings.warn(
                "Setting `threads_per_worker` to 0 is discouraged. "
                "Please set to None or to a specific int to get best behavior."
            )
            threads_per_worker = None

        if "dashboard" in worker_kwargs:
            warnings.warn(
                "Setting `dashboard` is discouraged. "
                "Please set `dashboard_address` to affect the scheduler (more common) "
                "and `worker_dashboard_address` for the worker (less common).")

        self.status = None
        self.processes = processes

        if security is None:
            # Falsey values load the default configuration
            security = Security()
        elif security is True:
            # True indicates self-signed temporary credentials should be used
            security = Security.temporary()
        elif not isinstance(security, Security):
            raise TypeError("security must be a Security object")

        if protocol is None:
            if host and "://" in host:
                protocol = host.split("://")[0]
            elif security and security.require_encryption:
                protocol = "tls://"
            elif not self.processes and not scheduler_port:
                protocol = "inproc://"
            else:
                protocol = "tcp://"
        if not protocol.endswith("://"):
            protocol = protocol + "://"

        if host is None and not protocol.startswith(
                "inproc") and not interface:
            host = "127.0.0.1"

        services = services or {}
        worker_services = worker_services or {}
        if n_workers is None and threads_per_worker is None:
            if processes:
                n_workers, threads_per_worker = nprocesses_nthreads()
            else:
                n_workers = 1
                threads_per_worker = CPU_COUNT
        if n_workers is None and threads_per_worker is not None:
            n_workers = max(1, CPU_COUNT // threads_per_worker)
        if n_workers and threads_per_worker is None:
            # Overcommit threads per worker, rather than undercommit
            threads_per_worker = max(1, int(math.ceil(CPU_COUNT / n_workers)))
        if n_workers and "memory_limit" not in worker_kwargs:
            worker_kwargs["memory_limit"] = parse_memory_limit(
                "auto", 1, n_workers)

        worker_kwargs.update({
            "nthreads": threads_per_worker,
            "services": worker_services,
            "dashboard_address": worker_dashboard_address,
            "dashboard": worker_dashboard_address is not None,
            "interface": interface,
            "protocol": protocol,
            "security": security,
            "silence_logs": silence_logs,
        })

        scheduler = {
            "cls":
            Scheduler,
            "options":
            toolz.merge(
                dict(
                    host=host,
                    services=services,
                    service_kwargs=service_kwargs,
                    security=security,
                    port=scheduler_port,
                    interface=interface,
                    protocol=protocol,
                    dashboard=dashboard_address is not None,
                    dashboard_address=dashboard_address,
                    blocked_handlers=blocked_handlers,
                ),
                scheduler_kwargs or {},
            ),
        }

        worker = {
            "cls": worker_class or (Worker if not processes else Nanny),
            "options": worker_kwargs,
        }

        workers = {i: worker for i in range(n_workers)}

        super(LocalCluster, self).__init__(
            scheduler=scheduler,
            workers=workers,
            worker=worker,
            loop=loop,
            asynchronous=asynchronous,
            silence_logs=silence_logs,
            security=security,
        )
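The scheduler options above are built with ``toolz.merge`` so that a user-supplied ``scheduler_kwargs`` dict overrides the computed defaults, because the right-most dict wins on key collisions. A tiny standalone illustration of that override order:

import toolz

defaults = dict(port=0, dashboard_address=':8787')
overrides = {'port': 8786}   # e.g. what a user passes as scheduler_kwargs
assert toolz.merge(defaults, overrides) == {'port': 8786,
                                            'dashboard_address': ':8787'}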
Example #32
def cluster(nworkers=2,
            nanny=False,
            worker_kwargs={},
            active_rpc_timeout=0,
            scheduler_kwargs={}):
    with pristine_loop() as loop:
        with check_active_rpc(loop, active_rpc_timeout):
            if nanny:
                _run_worker = run_nanny
            else:
                _run_worker = run_worker

            # The scheduler queue will receive the scheduler's address
            scheduler_q = mp_context.Queue()

            # Launch scheduler
            scheduler = mp_context.Process(target=run_scheduler,
                                           args=(scheduler_q, nworkers + 1),
                                           kwargs=scheduler_kwargs)
            scheduler.daemon = True
            scheduler.start()

            # Launch workers
            workers = []
            for i in range(nworkers):
                q = mp_context.Queue()
                fn = '_test_worker-%s' % uuid.uuid1()
                kwargs = merge({'ncores': 1, 'local_dir': fn}, worker_kwargs)
                proc = mp_context.Process(target=_run_worker,
                                          args=(q, scheduler_q),
                                          kwargs=kwargs)
                workers.append({'proc': proc, 'queue': q, 'dir': fn})

            for worker in workers:
                worker['proc'].start()
            for worker in workers:
                worker['address'] = worker['queue'].get()

            saddr = scheduler_q.get()

            start = time()
            try:
                with rpc(saddr) as s:
                    while True:
                        ncores = loop.run_sync(s.ncores)
                        if len(ncores) == nworkers:
                            break
                        if time() - start > 5:
                            raise Exception("Timeout on cluster creation")

                yield {'proc': scheduler, 'address': saddr}, workers
            finally:
                logger.debug("Closing out test cluster")

                loop.run_sync(lambda: disconnect_all(
                    [w['address'] for w in workers], timeout=0.5))
                loop.run_sync(lambda: disconnect(saddr, timeout=0.5))

                scheduler.terminate()
                for proc in [w['proc'] for w in workers]:
                    with ignoring(EnvironmentError):
                        proc.terminate()

                scheduler.join(timeout=2)
                for proc in [w['proc'] for w in workers]:
                    proc.join(timeout=2)

                for q in [w['queue'] for w in workers]:
                    q.close()
                for fn in glob('_test_worker-*'):
                    shutil.rmtree(fn)
Example #33
 def mkdict(row,
            symbols=self._lookup_most_recent_symbols(sids)):
     return merge(row, symbols[row['sid']])
Example #34
def to_textfiles(b,
                 path,
                 name_function=str,
                 compression='infer',
                 encoding=system_encoding,
                 compute=True):
    """ Write bag to disk, one filename per partition, one line per element

    **Paths**: This will create one file for each partition in your bag. You
    can specify the filenames in a variety of ways.

    Use a globstring

    >>> b.to_textfiles('/path/to/data/*.json.gz')  # doctest: +SKIP

    The * will be replaced by the increasing sequence 0, 1, 2, ...

    ::

        /path/to/data/0.json.gz
        /path/to/data/1.json.gz

    Use a globstring and a ``name_function=`` keyword argument.  The
    ``name_function`` callable should accept an integer and return a string.

    >>> from datetime import date, timedelta
    >>> def name(i):
    ...     return str(date(2015, 1, 1) + i * timedelta(days=1))

    >>> name(0)
    '2015-01-01'
    >>> name(15)
    '2015-01-16'

    >>> b.to_textfiles('/path/to/data/*.json.gz', name_function=name)  # doctest: +SKIP

    ::

        /path/to/data/2015-01-01.json.gz
        /path/to/data/2015-01-02.json.gz
        ...

    You can also provide an explicit list of paths.

    >>> paths = ['/path/to/data/alice.json.gz', '/path/to/data/bob.json.gz', ...]  # doctest: +SKIP
    >>> b.to_textfiles(paths) # doctest: +SKIP

    **Compression**: Filenames with extensions corresponding to known
    compression algorithms (gz, bz2) will be compressed accordingly.
    """
    if isinstance(path, (str, unicode)):
        if '*' in path:
            paths = [
                path.replace('*', name_function(i))
                for i in range(b.npartitions)
            ]
        else:
            paths = [
                os.path.join(path, '%s.part' % name_function(i))
                for i in range(b.npartitions)
            ]
    elif isinstance(path, (tuple, list, set)):
        assert len(path) == b.npartitions
        paths = path
    else:
        raise ValueError("""Path should be either
1.  A list of paths -- ['foo.json', 'bar.json', ...]
2.  A directory -- 'foo/'
3.  A path with a * in it -- 'foo.*.json'""")

    def get_compression(path, compression=compression):
        if compression == 'infer':
            compression = infer_compression(path)
        return compression

    name = 'to-textfiles-' + uuid.uuid4().hex
    dsk = dict(((name, i), (write, (b.name, i), path, get_compression(path),
                            encoding)) for i, path in enumerate(paths))

    result = Bag(merge(b.dask, dsk), name, b.npartitions)
    if compute:
        result.compute()
    else:
        return result
Example #35
    def foldby(self,
               key,
               binop,
               initial=no_default,
               combine=None,
               combine_initial=no_default):
        """ Combined reduction and groupby

        Foldby provides a combined groupby and reduce for efficient parallel
        split-apply-combine tasks.

        The computation

        >>> b.foldby(key, binop, init)                        # doctest: +SKIP

        is equivalent to the following:

        >>> def reduction(group):                               # doctest: +SKIP
        ...     return reduce(binop, group, init)               # doctest: +SKIP

        >>> b.groupby(key).map(lambda (k, v): (k, reduction(v)))# doctest: +SKIP

        But uses minimal communication and so is *much* faster.

        >>> b = from_sequence(range(10))
        >>> iseven = lambda x: x % 2 == 0
        >>> add = lambda x, y: x + y
        >>> dict(b.foldby(iseven, add))                         # doctest: +SKIP
        {True: 20, False: 25}

        **Key Function**

        The key function determines how to group the elements in your bag.
        In the common case where your bag holds dictionaries then the key
        function often gets out one of those elements.

        >>> def key(x):
        ...     return x['name']

        This case is so common that it is special cased, and if you provide a
        key that is not a callable function then dask.bag will turn it into one
        automatically.  The following are equivalent:

        >>> b.foldby(lambda x: x['name'], ...)  # doctest: +SKIP
        >>> b.foldby('name', ...)  # doctest: +SKIP

        **Binops**

        It can be tricky to construct the right binary operators to perform
        analytic queries.  The ``foldby`` method accepts two binary operators,
        ``binop`` and ``combine``.  Binary operators two inputs and output must
        have the same type.

        Binop takes a running total and a new element and produces a new total:

        >>> def binop(total, x):
        ...     return total + x['amount']

        Combine takes two totals and combines them:

        >>> def combine(total1, total2):
        ...     return total1 + total2

        Each of these binary operators may have a default first value for
        total, before any other value is seen.  For addition binary operators
        like above this is often ``0`` or the identity element for your
        operation.

        >>> b.foldby('name', binop, 0, combine, 0)  # doctest: +SKIP

        See Also
        --------

        toolz.reduceby
        pyspark.combineByKey
        """
        token = tokenize(self, key, binop, initial, combine, combine_initial)
        a = 'foldby-a-' + token
        b = 'foldby-b-' + token
        if combine is None:
            combine = binop
        if initial is not no_default:
            dsk = dict(
                ((a, i), (reduceby, key, binop, (self.name, i), initial))
                for i in range(self.npartitions))
        else:
            dsk = dict(((a, i), (reduceby, key, binop, (self.name, i)))
                       for i in range(self.npartitions))

        def combine2(acc, x):
            return combine(acc, x[1])

        if combine_initial is not no_default:
            dsk2 = {
                (b, 0): (dictitems, (reduceby, 0, combine2,
                                     (toolz.concat, (map, dictitems,
                                                     list(dsk.keys()))),
                                     combine_initial))
            }
        else:
            dsk2 = {
                (b, 0): (dictitems, (merge_with, (partial, reduce, combine),
                                     list(dsk.keys())))
            }
        return type(self)(merge(self.dask, dsk, dsk2), b, 1)
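
A small end-to-end sketch of the binop/combine pattern described in the docstring (records and totals are illustrative):

import dask.bag as db

records = [{'name': 'Alice', 'amount': 100},
           {'name': 'Bob', 'amount': 200},
           {'name': 'Alice', 'amount': 300}]
b = db.from_sequence(records, npartitions=2)

def binop(total, x):
    # fold one record into a per-key running total
    return total + x['amount']

def combine(total1, total2):
    # merge running totals coming from different partitions
    return total1 + total2

result = dict(b.foldby('name', binop, 0, combine, 0).compute())
# result == {'Alice': 400, 'Bob': 200}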
Example #36
def set_partition(df, index, divisions, compute=False, drop=True, **kwargs):
    """ Group DataFrame by index

    Sets a new index and partitions data along that index according to
    divisions.  Divisions are often found by computing approximate quantiles.
    The function ``set_index`` will do both of these steps.

    Parameters
    ----------
    df: DataFrame/Series
        Data that we want to re-partition
    index: string or Series
        Column to become the new index
    divisions: list
        Values to form new divisions between partitions
    drop: bool, default True
        Whether to delete columns to be used as the new index

    See Also
    --------
    set_index
    shuffle
    partd
    """
    if isinstance(index, Series):
        assert df.divisions == index.divisions
        metadata = df._pd.set_index(index._pd, drop=drop)
    elif np.isscalar(index):
        metadata = df._pd.set_index(index, drop=drop)
    else:
        raise ValueError('index must be Series or scalar, {0} given'.format(
            type(index)))

    token = tokenize(df, index, divisions)
    always_new_token = uuid.uuid1().hex
    import partd

    p = ('zpartd-' + always_new_token, )

    # Get Categories
    catname = 'set-partition--get-categories-old-' + always_new_token
    catname2 = 'set-partition--get-categories-new-' + always_new_token

    dsk1 = {
        catname: (get_categories, df._keys()[0]),
        p:
        (partd.PandasBlocks, (partd.Buffer, (partd.Dict, ), (partd.File, ))),
        catname2: (new_categories, catname,
                   index.name if isinstance(index, Series) else index)
    }

    # Partition data on disk
    name = 'set-partition--partition-' + always_new_token
    if isinstance(index, _Frame):
        dsk2 = dict(
            ((name, i), (_set_partition, part, ind, divisions, p, drop))
            for i, (part, ind) in enumerate(zip(df._keys(), index._keys())))
    else:
        dsk2 = dict(
            ((name, i), (_set_partition, part, index, divisions, p, drop))
            for i, part in enumerate(df._keys()))

    # Barrier
    barrier_token = 'barrier-' + always_new_token
    dsk3 = {barrier_token: (barrier, list(dsk2))}

    if compute:
        dsk = merge(df.dask, dsk1, dsk2, dsk3)
        if isinstance(index, _Frame):
            dsk.update(index.dask)
        p, barrier_token, categories = df._get(dsk,
                                               [p, barrier_token, catname],
                                               **kwargs)
        dsk4 = {catname2: categories}
    else:
        dsk4 = {}

    # Collect groups
    name = 'set-partition--collect-' + token
    if compute and not categories:
        dsk4.update(
            dict(((name, i), (_set_collect, i, p, barrier_token, df.columns))
                 for i in range(len(divisions) - 1)))
    else:
        dsk4.update(
            dict(((name, i), (_categorize, catname2,
                              (_set_collect, i, p, barrier_token, df.columns)))
                 for i in range(len(divisions) - 1)))

    dsk = merge(df.dask, dsk1, dsk2, dsk3, dsk4)

    if isinstance(index, Series):
        dsk.update(index.dask)

    if compute:
        dsk = cull(dsk, list(dsk4.keys()))

    return DataFrame(dsk, name, metadata, divisions)
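
set_partition is an internal helper; the public entry point mentioned above is set_index, which either computes divisions from approximate quantiles or accepts them explicitly. A sketch, assuming pandas and dask.dataframe (column names and divisions are illustrative):

import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({'x': [3, 1, 2, 5, 4], 'y': list('abcde')})
ddf = dd.from_pandas(pdf, npartitions=2)

# Let dask estimate divisions from approximate quantiles ...
a = ddf.set_index('x')

# ... or supply them explicitly, as set_partition receives them above.
b = ddf.set_index('x', divisions=[1, 3, 5])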
Example #37
 def __repr__(self):
     return "lazy_dict({})".format(
         t.merge(t.valmap(lambda _: "...", self.thunks), self.realized))
Example #38
See Also:
    toolz.functoolz.curry
"""

import toolz
import toolz.curried_exceptions
from .functoolz import curry
import inspect


def _nargs(f):
    try:
        return len(inspect.getargspec(f).args)
    except TypeError:
        return None


def _should_curry(f):
    do_curry = set((toolz.map, toolz.filter, toolz.sorted, toolz.reduce))
    return (callable(f) and _nargs(f) and _nargs(f) > 1 or f in do_curry)


_d = dict((name, curry(f) if _should_curry(f) else f)
          for name, f in toolz.__dict__.items() if '__' not in name)

_exceptions = dict((name, curry(f) if callable(f) else f)
                   for name, f in toolz.curried_exceptions.__dict__.items()
                   if '__' not in name)

locals().update(toolz.merge(_d, _exceptions))
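
The effect of the curried namespace, sketched against the current toolz.curried module (the code above is an older variant of it):

from toolz.curried import filter, map, pipe  # curried versions shadow builtins

# Supplying only the function returns a partial that still waits for data.
evens = filter(lambda x: x % 2 == 0)
doubled = map(lambda x: x * 2)

result = pipe(range(10), evens, doubled, list)
# result == [0, 4, 8, 12, 16]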
Example #39
 def extra(self):
     return merge({"prefix": self.prefix}, template_variables)
Example #40
def slice_wrap_lists(out_name, in_name, blockdims, index):
    """
    Fancy indexing along blocked array dasks

    Handles index of type list.  Calls slice_slices_and_integers for the rest

    See Also
    --------

    take - handle slicing with lists ("fancy" indexing)
    slice_slices_and_integers - handle slicing with slices and integers
    """
    assert all(
        isinstance(i, (slice, list, Integral, np.ndarray)) for i in index)
    if not len(blockdims) == len(index):
        raise IndexError("Too many indices for array")

    # Do we have more than one list in the index?
    where_list = [
        i for i, ind in enumerate(index)
        if isinstance(ind, np.ndarray) and ind.ndim > 0
    ]
    if len(where_list) > 1:
        raise NotImplementedError("Don't yet support nd fancy indexing")
    # Is the single list an empty list? In this case just treat it as a zero
    # length slice
    if where_list and not index[where_list[0]].size:
        index = list(index)
        index[where_list.pop()] = slice(0, 0, 1)
        index = tuple(index)

    # No lists, hooray! just use slice_slices_and_integers
    if not where_list:
        return slice_slices_and_integers(out_name, in_name, blockdims, index)

    # Replace all lists with full slices  [3, 1, 0] -> slice(None, None, None)
    index_without_list = tuple(
        slice(None, None, None) if isinstance(i, np.ndarray) else i
        for i in index)

    # lists and full slices.  Just use take
    if all(
            isinstance(i, np.ndarray) or i == slice(None, None, None)
            for i in index):
        axis = where_list[0]
        blockdims2, dsk3 = take(out_name,
                                in_name,
                                blockdims,
                                index[where_list[0]],
                                axis=axis)
    # Mixed case. Both slices/integers and lists. slice/integer then take
    else:
        # Do first pass without lists
        tmp = 'slice-' + tokenize((out_name, in_name, blockdims, index))
        dsk, blockdims2 = slice_slices_and_integers(tmp, in_name, blockdims,
                                                    index_without_list)

        # After collapsing some axes due to int indices, adjust axis parameter
        axis = where_list[0]
        axis2 = axis - sum(1 for i, ind in enumerate(index)
                           if i < axis and isinstance(ind, Integral))

        # Do work
        blockdims2, dsk2 = take(out_name,
                                tmp,
                                blockdims2,
                                index[axis],
                                axis=axis2)
        dsk3 = merge(dsk, dsk2)

    return dsk3, blockdims2
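
A sketch of the kind of indexing this function services, using the public dask.array API (array contents and chunk sizes are illustrative):

import numpy as np
import dask.array as da

x = da.from_array(np.arange(64).reshape(8, 8), chunks=(4, 4))

# One list plus a slice: the list is handled by take() along axis 0.
y = x[[7, 0, 3], ::2]
assert y.compute().shape == (3, 4)

# Two lists in one index would hit the NotImplementedError branch above.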
Example #41
def _get_data(clauses, values, keys):
    result = frappe.db.sql(
        """
            SELECT
                e.bank_name AS bank_name,
                e.bank_ac_no AS bank_ac_no,
                e.employee_name AS employee_name,
                sl.name AS salary_slip,
                sl.start_date AS start_date,
                a.account_number AS account_number
            FROM `tabSalary Slip` AS sl
            LEFT JOIN `tabEmployee` AS e ON e.name = sl.employee
            LEFT JOIN `tabPayroll Entry` AS pe ON pe.name = sl.payroll_entry
            LEFT JOIN `tabAccount` AS a ON a.name = pe.payment_account
            WHERE {clauses}
        """.format(
            clauses=clauses
        ),
        values=values,
        as_dict=1,
    )

    get_amounts = compose(
        partial(groupby, "salary_slip"),
        lambda type: frappe.db.sql(
            """
                SELECT
                    sl.name AS salary_slip,
                    SUM(sd.amount) AS amount
                FROM `tabSalary Detail` AS sd
                LEFT JOIN `tabSalary Slip` AS sl ON sl.name = sd.parent
                WHERE
                    sd.parentfield = %(parentfield)s AND
                    sd.parent IN %(salary_slips)s AND
                    sd.salary_component IN %(components)s
                GROUP BY sl.name
            """,
            values=merge(
                values,
                {
                    "salary_slips": [x.get("salary_slip") for x in result],
                    "parentfield": type,
                },
            ),
            as_dict=1,
        )
        if result
        else {},
    )

    get_amount = compose(
        lambda x: x.get("amount", 0),
        excepts(StopIteration, first, lambda _: {}),
        lambda col, key: col.get(key, []),
    )

    earnings = get_amounts("earnings")
    deductions = get_amounts("deductions")

    def add_remarks(row):
        start_date = row.get("start_date")
        return merge(
            row, {"remarks": "{} SAL".format(start_date.strftime("%b").upper())}
        )

    def set_amounts(row):
        salary_slip = row.get("salary_slip")
        amount = get_amount(earnings, salary_slip) - get_amount(deductions, salary_slip)
        return merge(row, {"amount": amount})

    make_row = compose(partial(pick, keys), add_remarks, set_amounts)
    return with_report_generation_time([make_row(x) for x in result], keys)
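
The row-building idiom used here, reduced to its toolz core. Note that pick in the snippet appears to be a project helper, so keyfilter stands in for it below; field names and amounts are illustrative:

from toolz import compose, keyfilter, merge

def pick_keys(row):
    # keep only the columns the report needs
    return keyfilter(lambda k: k in ('name', 'amount', 'remarks'), row)

def add_remarks(row):
    return merge(row, {'remarks': 'MAR SAL'})

def set_amount(row):
    return merge(row, {'amount': 1200 - 200})

make_row = compose(pick_keys, add_remarks, set_amount)  # applied right to left
make_row({'name': 'Alice', 'ignored': True})
# {'name': 'Alice', 'amount': 1000, 'remarks': 'MAR SAL'}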
Example #42
 def set_amounts(row):
     salary_slip = row.get("salary_slip")
     amount = get_amount(earnings, salary_slip) - get_amount(deductions, salary_slip)
     return merge(row, {"amount": amount})
 def fn(row):
     sales_order = row.get("sales_order")
     return merge(row,
                  {"outstanding": outstanding_amounts.get(sales_order, 0)})
Example #44
def cluster(nworkers=2, nanny=False, worker_kwargs={}):
    if nanny:
        _run_worker = run_nanny
    else:
        _run_worker = run_worker
    scheduler_q = Queue()
    scheduler = Process(target=run_scheduler, args=(scheduler_q, ))
    scheduler.daemon = True
    scheduler.start()
    sport = scheduler_q.get()

    workers = []
    for i in range(nworkers):
        q = Queue()
        fn = '_test_worker-%s' % uuid.uuid1()
        proc = Process(target=_run_worker,
                       args=(q, sport),
                       kwargs=merge({
                           'ncores': 1,
                           'local_dir': fn
                       }, worker_kwargs))
        workers.append({'proc': proc, 'queue': q, 'dir': fn})

    for worker in workers:
        worker['proc'].start()

    for worker in workers:
        worker['port'] = worker['queue'].get()

    loop = IOLoop()
    s = rpc(ip='127.0.0.1', port=sport)
    start = time()
    try:
        while True:
            ncores = loop.run_sync(s.ncores)
            if len(ncores) == nworkers:
                break
            if time() - start > 5:
                raise Exception("Timeout on cluster creation")

        yield {'proc': scheduler, 'port': sport}, workers
    finally:
        logger.debug("Closing out test cluster")
        with ignoring(socket.error, TimeoutError, StreamClosedError):
            loop.run_sync(lambda: disconnect('127.0.0.1', sport), timeout=0.5)
        scheduler.terminate()
        scheduler.join(timeout=2)

        for port in [w['port'] for w in workers]:
            with ignoring(socket.error, TimeoutError, StreamClosedError):
                loop.run_sync(lambda: disconnect('127.0.0.1', port),
                              timeout=0.5)
        for proc in [w['proc'] for w in workers]:
            with ignoring(Exception):
                proc.terminate()
                proc.join(timeout=2)
        for q in [w['queue'] for w in workers]:
            q.close()
        for fn in glob('_test_worker-*'):
            shutil.rmtree(fn)
        loop.close(all_fds=True)
Example #45
    def test_bundle(self):
        url_map = merge(
            {
                format_wiki_url(
                    self.api_key,
                    symbol,
                    self.start_date,
                    self.end_date,
                ): test_resource_path('quandl_samples', symbol + '.csv.gz')
                for symbol in self.symbols
            },
            {
                format_metadata_url(self.api_key, n): test_resource_path(
                    'quandl_samples',
                    'metadata-%d.csv.gz' % n,
                )
                for n in (1, 2)
            },
        )
        zipline_root = self.enter_instance_context(tmp_dir()).path
        environ = {
            'ZIPLINE_ROOT': zipline_root,
            'QUANDL_API_KEY': self.api_key,
        }

        with patch_read_csv(url_map, strict=True):
            ingest('quandl', environ=environ)

        bundle = load('quandl', environ=environ)
        sids = 0, 1, 2, 3
        assert_equal(set(bundle.asset_finder.sids), set(sids))

        for equity in bundle.asset_finder.retrieve_all(sids):
            assert_equal(equity.start_date, self.asset_start, msg=equity)
            assert_equal(equity.end_date, self.asset_end, msg=equity)

        sessions = self.calendar.all_sessions
        actual = bundle.equity_daily_bar_reader.load_raw_arrays(
            self.columns,
            sessions[sessions.get_loc(self.asset_start, 'bfill')],
            sessions[sessions.get_loc(self.asset_end, 'ffill')],
            sids,
        )
        expected_pricing, expected_adjustments = self._expected_data(
            bundle.asset_finder, )
        assert_equal(actual, expected_pricing, array_decimal=2)

        adjustments_for_cols = bundle.adjustment_reader.load_adjustments(
            self.columns,
            sessions,
            pd.Index(sids),
        )

        for column, adjustments, expected in zip(self.columns,
                                                 adjustments_for_cols,
                                                 expected_adjustments):
            assert_equal(
                adjustments,
                expected,
                msg=column,
            )
Example #46
def main(mode, config, use_bokeh=False):

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'], config['enc_nhids'])
    topical_transformer = topicalq_transformer(
        config['source_topic_vocab_size'], config['topical_embedding_dim'],
        config['enc_nhids'], config['topical_word_num'], config['batch_size'])
    decoder = Decoder(vocab_size=config['trg_vocab_size'],
                      topicWord_size=config['trg_topic_vocab_size'],
                      embedding_dim=config['dec_embed'],
                      topical_dim=config['topical_embedding_dim'],
                      state_dim=config['dec_nhids'],
                      representation_dim=config['enc_nhids'] * 2,
                      match_function=config['match_function'],
                      use_doubly_stochastic=config['use_doubly_stochastic'],
                      lambda_ds=config['lambda_ds'],
                      use_local_attention=config['use_local_attention'],
                      window_size=config['window_size'],
                      use_step_decay_cost=config['use_step_decay_cost'],
                      use_concentration_cost=config['use_concentration_cost'],
                      lambda_ct=config['lambda_ct'],
                      use_stablilizer=config['use_stablilizer'],
                      lambda_st=config['lambda_st'])
    # here attended dim (representation_dim) of decoder is 2*enc_nhinds
    # because the context given by the encoder is a bidirectional context

    if mode == "train":

        # Create Theano variables
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        target_topic_sentence = tensor.lmatrix('target_topic')
        target_topic_binary_sentence = tensor.lmatrix('target_binary_topic')
        #target_topic_sentence_mask=tensor.lmatrix('target_topic_mask');
        sampling_input = tensor.lmatrix('input')
        source_topical_word = tensor.lmatrix('source_topical')
        source_topical_mask = tensor.matrix('source_topical_mask')

        topic_embedding = topical_transformer.apply(source_topical_word)

        # Get training and development set streams
        tr_stream = get_tr_stream_with_topic_target(**config)
        #dev_stream = get_dev_tr_stream_with_topic_target(**config)

        # Get cost of the model
        representations = encoder.apply(source_sentence, source_sentence_mask)
        tw_representation = topical_transformer.look_up.apply(
            source_topical_word.T)
        content_embedding = representations[0, :,
                                            (representations.shape[2] / 2):]
        cost = decoder.cost(representations, source_sentence_mask,
                            tw_representation, source_topical_mask,
                            target_sentence, target_sentence_mask,
                            target_topic_sentence,
                            target_topic_binary_sentence, topic_embedding,
                            content_embedding)

        logger.info('Creating computational graph')
        perplexity = tensor.exp(cost)
        perplexity.name = 'perplexity'

        cg = ComputationGraph(cost)
        costs_computer = function([
            target_sentence, target_sentence_mask, source_sentence,
            source_sentence_mask, source_topical_word, target_topic_sentence,
            target_topic_binary_sentence
        ], (perplexity),
                                  on_unused_input='ignore')

        # Initialize model
        logger.info('Initializing model')
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
        encoder.biases_init = decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        encoder.bidir.prototype.weights_init = Orthogonal()
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()

        topical_transformer.weights_init = IsotropicGaussian(
            config['weight_scale'])
        topical_transformer.biases_init = Constant(0)
        topical_transformer.push_allocation_config()
        #don't know whether the initialize is for
        topical_transformer.look_up.weights_init = Orthogonal()
        topical_transformer.transformer.weights_init = Orthogonal()
        topical_transformer.initialize()
        word_topical_embedding = cPickle.load(
            open(config['topical_embeddings'], 'rb'))
        np_word_topical_embedding = numpy.array(word_topical_embedding,
                                                dtype='float32')
        topical_transformer.look_up.W.set_value(np_word_topical_embedding)
        topical_transformer.look_up.W.tag.role = []

        # apply dropout for regularization
        if config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in ghog
            logger.info('Applying dropout')
            dropout_inputs = [
                x for x in cg.intermediary_variables
                if x.name == 'maxout_apply_output'
            ]
            cg = apply_dropout(cg, dropout_inputs, config['dropout'])

        # Apply weight noise for regularization
        if config['weight_noise_ff'] > 0.0:
            logger.info('Applying weight noise to ff layers')
            enc_params = Selector(encoder.lookup).get_params().values()
            enc_params += Selector(encoder.fwd_fork).get_params().values()
            enc_params += Selector(encoder.back_fork).get_params().values()
            dec_params = Selector(
                decoder.sequence_generator.readout).get_params().values()
            dec_params += Selector(
                decoder.sequence_generator.fork).get_params().values()
            dec_params += Selector(decoder.state_init).get_params().values()
            cg = apply_noise(cg, enc_params + dec_params,
                             config['weight_noise_ff'])

        # Print shapes
        shapes = [param.get_value().shape for param in cg.parameters]
        logger.info("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            logger.info('    {:15}: {}'.format(shape, count))
        logger.info("Total number of parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(
            Selector(encoder).get_parameters(),
            Selector(decoder).get_parameters())
        logger.info("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logger.info('    {:15}: {}'.format(value.get_value().shape, name))
        logger.info("Total number of parameters: {}".format(
            len(enc_dec_param_dict)))

        # Set up training model
        logger.info("Building model")
        training_model = Model(cost)

        # Set extensions
        logger.info("Initializing extensions")
        extensions = [
            FinishAfter(after_n_batches=config['finish_after']),
            TrainingDataMonitoring([perplexity], after_batch=True),
            CheckpointNMT(config['saveto'],
                          config['model_name'],
                          every_n_batches=config['save_freq'])
        ]

        # # Set up beam search and sampling computation graphs if necessary
        # if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
        #     logger.info("Building sampling model")
        #     sampling_representation = encoder.apply(
        #         sampling_input, tensor.ones(sampling_input.shape))
        #     generated = decoder.generate(
        #         sampling_input, sampling_representation)
        #     search_model = Model(generated)
        #     _, samples = VariableFilter(
        #         bricks=[decoder.sequence_generator], name="outputs")(
        #             ComputationGraph(generated[1]))
        #
        # # Add sampling
        # if config['hook_samples'] >= 1:
        #     logger.info("Building sampler")
        #     extensions.append(
        #         Sampler(model=search_model, data_stream=tr_stream,
        #                 model_name=config['model_name'],
        #                 hook_samples=config['hook_samples'],
        #                 every_n_batches=config['sampling_freq'],
        #                 src_vocab_size=config['src_vocab_size']))
        #
        # # Add early stopping based on bleu
        # if False:
        #     logger.info("Building bleu validator")
        #     extensions.append(
        #         BleuValidator(sampling_input, samples=samples, config=config,
        #                       model=search_model, data_stream=dev_stream,
        #                       normalize=config['normalized_bleu'],
        #                       every_n_batches=config['bleu_val_freq'],
        #                       n_best=3,
        #                       track_n_models=6))
        #
        # logger.info("Building perplexity validator")
        # extensions.append(
        #         pplValidation( config=config,
        #                 model=costs_computer, data_stream=dev_stream,
        #                 model_name=config['model_name'],
        #                 every_n_batches=config['sampling_freq']))

        # Plot cost in bokeh if necessary
        if use_bokeh and BOKEH_AVAILABLE:
            extensions.append(
                Plot('Cs-En',
                     channels=[['decoder_cost_cost']],
                     after_batch=True))

        # Reload model if necessary
        if config['reload']:
            extensions.append(LoadNMT(config['saveto']))

        initial_learning_rate = config['initial_learning_rate']
        log_path = os.path.join(config['saveto'], 'log')
        if config['reload'] and os.path.exists(log_path):
            with open(log_path, 'rb') as source:
                log = cPickle.load(source)
                last = max(log.keys()) - 1
                if 'learning_rate' in log[last]:
                    initial_learning_rate = log[last]['learning_rate']

        # Set up training algorithm
        logger.info("Initializing training algorithm")
        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    step_rule=CompositeRule([
                                        Scale(initial_learning_rate),
                                        StepClipping(config['step_clipping']),
                                        eval(config['step_rule'])()
                                    ]),
                                    on_unused_sources='ignore')

        _learning_rate = algorithm.step_rule.components[0].learning_rate
        if config['learning_rate_decay']:
            extensions.append(
                LearningRateHalver(record_name='validation_cost',
                                   comparator=lambda x, y: x > y,
                                   learning_rate=_learning_rate,
                                   patience_default=3))
        else:
            extensions.append(OldModelRemover(saveto=config['saveto']))

        if config['learning_rate_grow']:
            extensions.append(
                LearningRateDoubler(record_name='validation_cost',
                                    comparator=lambda x, y: x < y,
                                    learning_rate=_learning_rate,
                                    patience_default=3))

        extensions.append(
            SimplePrinting(config['model_name'], after_batch=True))

        # Initialize main loop
        logger.info("Initializing main loop")
        main_loop = MainLoop(model=training_model,
                             algorithm=algorithm,
                             data_stream=tr_stream,
                             extensions=extensions)

        # Train!
        main_loop.run()

    elif mode == 'translate':

        logger.info('Creating theano variables')
        sampling_input = tensor.lmatrix('source')
        source_topical_word = tensor.lmatrix('source_topical')
        tw_vocab_overlap = tensor.lmatrix('tw_vocab_overlap')
        tw_vocab_overlap_matrix = cPickle.load(
            open(config['tw_vocab_overlap'], 'rb'))
        tw_vocab_overlap_matrix = numpy.array(tw_vocab_overlap_matrix,
                                              dtype='int32')
        #tw_vocab_overlap=shared(tw_vocab_overlap_matrix);

        topic_embedding = topical_transformer.apply(source_topical_word)

        sutils = SamplingBase()
        unk_idx = config['unk_id']
        src_eos_idx = config['src_vocab_size'] - 1
        trg_eos_idx = config['trg_vocab_size'] - 1
        trg_vocab = _ensure_special_tokens(cPickle.load(
            open(config['trg_vocab'], 'rb')),
                                           bos_idx=0,
                                           eos_idx=trg_eos_idx,
                                           unk_idx=unk_idx)
        trg_ivocab = {v: k for k, v in trg_vocab.items()}

        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        topic_embedding = topical_transformer.apply(source_topical_word)
        tw_representation = topical_transformer.look_up.apply(
            source_topical_word.T)
        content_embedding = sampling_representation[0, :, (
            sampling_representation.shape[2] / 2):]
        generated = decoder.generate(sampling_input,
                                     sampling_representation,
                                     tw_representation,
                                     topical_embedding=topic_embedding,
                                     content_embedding=content_embedding)

        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs
        beam_search = BeamSearch(samples=samples)

        logger.info("Loading the model..")
        model = Model(generated)
        #loader = LoadNMT(config['saveto'])
        loader = LoadNMT(config['validation_load'])
        loader.set_model_parameters(model, loader.load_parameters_default())

        logger.info("Started translation: ")
        test_stream = get_dev_stream_with_topicalq(**config)
        ts = test_stream.get_epoch_iterator()
        rts = open(config['val_set_source']).readlines()
        ftrans_original = open(config['val_output_orig'], 'w')
        saved_weights = []
        total_cost = 0.0

        pbar = ProgressBar(max_value=len(rts)).start()
        for i, (line, line_raw) in enumerate(zip(ts, rts)):
            trans_in = line_raw.split()
            seq = sutils._oov_to_unk(line[0], config['src_vocab_size'],
                                     unk_idx)
            seq1 = line[1]
            input_topical = numpy.tile(seq1, (config['beam_size'], 1))
            input_ = numpy.tile(seq, (config['beam_size'], 1))

            # draw sample, checking to ensure we don't get an empty string back
            trans, costs, attendeds, weights = \
                beam_search.search(
                    input_values={sampling_input: input_,source_topical_word:input_topical,tw_vocab_overlap:tw_vocab_overlap_matrix},
                    tw_vocab_overlap=tw_vocab_overlap_matrix,
                    max_length=3*len(seq), eol_symbol=trg_eos_idx,
                    ignore_first_eol=True)

            # normalize costs according to the sequence lengths
            if config['normalized_bleu']:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths

            best = numpy.argsort(costs)[0]
            try:
                total_cost += costs[best]
                trans_out = trans[best]
                weight = weights[best][:, :len(trans_in)]
                trans_out = sutils._idx_to_word(trans_out, trg_ivocab)
            except ValueError:
                logger.info(
                    "Can NOT find a translation for line: {}".format(i + 1))
                trans_out = '<UNK>'

            saved_weights.append(weight)
            print(' '.join(trans_out), file=ftrans_original)
            pbar.update(i + 1)

        pbar.finish()
        logger.info("Total cost of the test: {}".format(total_cost))
        cPickle.dump(saved_weights, open(config['attention_weights'], 'wb'))
        ftrans_original.close()
        # ap = afterprocesser(config)
        # ap.main()

    elif mode == 'score':
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        target_topic_sentence = tensor.lmatrix('target_topic')
        target_topic_binary_sentence = tensor.lmatrix('target_binary_topic')
        source_topical_word = tensor.lmatrix('source_topical')

        topic_embedding = topical_transformer.apply(source_topical_word)
        # Get cost of the model
        representations = encoder.apply(source_sentence, source_sentence_mask)
        costs = decoder.cost(representations, source_sentence_mask,
                             target_sentence, target_sentence_mask,
                             target_topic_sentence,
                             target_topic_binary_sentence, topic_embedding)

        config['batch_size'] = 1
        config['sort_k_batches'] = 1
        # Get test set stream
        test_stream = get_tr_stream_with_topic_target(**config)

        logger.info("Building sampling model")

        logger.info("Loading the model..")
        model = Model(costs)
        loader = LoadNMT(config['validation_load'])
        loader.set_model_parameters(model, loader.load_parameters_default())

        costs_computer = function([
            target_sentence, target_sentence_mask, source_sentence,
            source_sentence_mask, source_topical_word, target_topic_sentence,
            target_topic_binary_sentence
        ], (costs),
                                  on_unused_input='ignore')

        iterator = test_stream.get_epoch_iterator()

        scores = []
        att_weights = []
        for i, (src, src_mask, trg, trg_mask, te, te_mask, tt, tt_mask, tb,
                tb_mask) in enumerate(iterator):
            costs = costs_computer(*[trg, trg_mask, src, src_mask, te, tt, tb])
            cost = costs.sum()
            print(i, cost)
            scores.append(cost)

        print(sum(scores) / 10007)
Example #47
 def load_adjusted_array(self, columns, dates, sids, mask):
     n, p = self.split_next_and_previous_event_columns(columns)
     return merge(
         self.load_next_events(n, dates, sids, mask),
         self.load_previous_events(p, dates, sids, mask),
     )
Example #48
def slice_wrap_lists(out_name, in_name, blockdims, index):
    """
    Fancy indexing along blocked array dasks

    Handles index of type list.  Calls slice_slices_and_integers for the rest

    See Also
    --------

    take - handle slicing with lists ("fancy" indexing)
    slice_slices_and_integers - handle slicing with slices and integers
    """
    shape = tuple(map(sum, blockdims))
    assert all(isinstance(i, (slice, list, int, long)) for i in index)
    assert len(blockdims) == len(index)
    for bd, i in zip(blockdims, index):
        check_index(i, sum(bd))

    # Change indices like -1 to 9
    index2 = posify_index(shape, index)

    # Do we have more than one list in the index?
    where_list = [i for i, ind in enumerate(index) if isinstance(ind, list)]
    if len(where_list) > 1:
        raise NotImplementedError("Don't yet support nd fancy indexing")

    # Replace all lists with full slices  [3, 1, 0] -> slice(None, None, None)
    index_without_list = tuple(
        slice(None, None, None) if isinstance(i, list) else i for i in index2)

    # No lists, hooray! just use slice_slices_and_integers
    if index2 == index_without_list:
        return slice_slices_and_integers(out_name, in_name, blockdims, index2)

    # lists and full slices.  Just use take
    if all(
            isinstance(i, list) or i == slice(None, None, None)
            for i in index2):
        axis = where_list[0]
        blockdims2, dsk3 = take(out_name,
                                in_name,
                                blockdims,
                                index2[where_list[0]],
                                axis=axis)
    # Mixed case. Both slices/integers and lists. slice/integer then take
    else:
        # Do first pass without lists
        tmp = 'slice-' + tokenize((out_name, in_name, blockdims, index))
        dsk, blockdims2 = slice_slices_and_integers(tmp, in_name, blockdims,
                                                    index_without_list)

        # After collapsing some axes due to int indices, adjust axis parameter
        axis = where_list[0]
        axis2 = axis - sum(1 for i, ind in enumerate(index2)
                           if i < axis and isinstance(ind, (int, long)))

        # Do work
        blockdims2, dsk2 = take(out_name,
                                tmp,
                                blockdims2,
                                index2[axis],
                                axis=axis2)
        dsk3 = merge(dsk, dsk2)

    return dsk3, blockdims2
Example #49
 def apply(self, func):
     name = 'apply-{0}-{1}'.format(funcname(func), tokenize(self, func))
     dsk = {name: (func, self.key)}
     return Item(merge(self.dask, dsk), name)
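
Item.apply defers one final transformation on an already-reduced value; a small sketch with dask.bag (numbers are illustrative):

import dask.bag as db

b = db.from_sequence(range(10), npartitions=2)
total = b.sum()                        # a lazy Item wrapping a single key
shifted = total.apply(lambda x: x + 1)
assert shifted.compute() == 46         # sum(range(10)) == 45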
Example #50
def gen_cluster(
    nthreads=[("127.0.0.1", 1), ("127.0.0.1", 2)],
    ncores=None,
    scheduler="127.0.0.1",
    timeout=10,
    security=None,
    Worker=Worker,
    client=False,
    scheduler_kwargs={},
    worker_kwargs={},
    client_kwargs={},
    active_rpc_timeout=1,
    config={},
    check_new_threads=True,
):
    from distributed import Client

    """ Coroutine test with small cluster

    @gen_cluster()
    def test_foo(scheduler, worker1, worker2):
        yield ...  # use tornado coroutines

    See also:
        start
        end
    """
    if ncores is not None:
        warnings.warn("ncores= has moved to nthreads=")
        nthreads = ncores

    worker_kwargs = merge(
        {"memory_limit": TOTAL_MEMORY, "death_timeout": 5}, worker_kwargs
    )

    def _(func):
        if not iscoroutinefunction(func):
            func = gen.coroutine(func)

        def test_func():
            result = None
            workers = []
            with clean(threads=check_new_threads, timeout=active_rpc_timeout) as loop:

                @gen.coroutine
                def coro():
                    with dask.config.set(config):
                        s = False
                        for i in range(5):
                            try:
                                s, ws = yield start_cluster(
                                    nthreads,
                                    scheduler,
                                    loop,
                                    security=security,
                                    Worker=Worker,
                                    scheduler_kwargs=scheduler_kwargs,
                                    worker_kwargs=worker_kwargs,
                                )
                            except Exception as e:
                                logger.error(
                                    "Failed to start gen_cluster, retrying",
                                    exc_info=True,
                                )
                            else:
                                workers[:] = ws
                                args = [s] + workers
                                break
                        if s is False:
                            raise Exception("Could not start cluster")
                        if client:
                            c = yield Client(
                                s.address,
                                loop=loop,
                                security=security,
                                asynchronous=True,
                                **client_kwargs
                            )
                            args = [c] + args
                        try:
                            future = func(*args)
                            if timeout:
                                future = gen.with_timeout(
                                    timedelta(seconds=timeout), future
                                )
                            result = yield future
                            if s.validate:
                                s.validate_state()
                        finally:
                            if client and c.status not in ("closing", "closed"):
                                yield c._close(fast=s.status == "closed")
                            yield end_cluster(s, workers)
                            yield gen.with_timeout(
                                timedelta(seconds=1), cleanup_global_workers()
                            )

                        try:
                            c = yield default_client()
                        except ValueError:
                            pass
                        else:
                            yield c._close(fast=True)

                        for i in range(5):
                            if all(c.closed() for c in Comm._instances):
                                break
                            else:
                                yield gen.sleep(0.05)
                        else:
                            L = [c for c in Comm._instances if not c.closed()]
                            Comm._instances.clear()
                            # raise ValueError("Unclosed Comms", L)
                            print("Unclosed Comms", L)

                        raise gen.Return(result)

                result = loop.run_sync(
                    coro, timeout=timeout * 2 if timeout else timeout
                )

            for w in workers:
                if getattr(w, "data", None):
                    try:
                        w.data.clear()
                    except EnvironmentError:
                        # zict backends can fail if their storage directory
                        # was already removed
                        pass
                    del w.data

            return result

        return test_func

    return _
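
Typical use of the decorator, mirroring its docstring (tornado-coroutine style of this distributed version; the computation and assertion are illustrative):

@gen_cluster(client=True)
def test_inc(c, s, a, b):
    # c: Client, s: Scheduler, a and b: the two default workers
    future = c.submit(lambda x: x + 1, 10)
    result = yield future
    assert result == 11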
Example #51
 def get(self):
     with log_errors():
         self.render("workers.html",
                     title="Workers",
                     scheduler=self.server,
                     **toolz.merge(self.server.__dict__, ns, self.extra))
Example #52
def cluster(
    nworkers=2, nanny=False, worker_kwargs={}, active_rpc_timeout=1, scheduler_kwargs={}
):
    ws = weakref.WeakSet()
    enable_proctitle_on_children()

    with clean(timeout=active_rpc_timeout, threads=False) as loop:
        if nanny:
            _run_worker = run_nanny
        else:
            _run_worker = run_worker

        # The scheduler queue will receive the scheduler's address
        scheduler_q = mp_context.Queue()

        # Launch scheduler
        scheduler = mp_context.Process(
            name="Dask cluster test: Scheduler",
            target=run_scheduler,
            args=(scheduler_q, nworkers + 1),
            kwargs=scheduler_kwargs,
        )
        ws.add(scheduler)
        scheduler.daemon = True
        scheduler.start()

        # Launch workers
        workers = []
        for i in range(nworkers):
            q = mp_context.Queue()
            fn = "_test_worker-%s" % uuid.uuid4()
            kwargs = merge(
                {"nthreads": 1, "local_dir": fn, "memory_limit": TOTAL_MEMORY},
                worker_kwargs,
            )
            proc = mp_context.Process(
                name="Dask cluster test: Worker",
                target=_run_worker,
                args=(q, scheduler_q),
                kwargs=kwargs,
            )
            ws.add(proc)
            workers.append({"proc": proc, "queue": q, "dir": fn})

        for worker in workers:
            worker["proc"].start()
        try:
            for worker in workers:
                worker["address"] = worker["queue"].get(timeout=5)
        except Empty:
            raise pytest.xfail.Exception("Worker failed to start in test")

        saddr = scheduler_q.get()

        start = time()
        try:
            try:
                security = scheduler_kwargs["security"]
                rpc_kwargs = {"connection_args": security.get_connection_args("client")}
            except KeyError:
                rpc_kwargs = {}

            with rpc(saddr, **rpc_kwargs) as s:
                while True:
                    nthreads = loop.run_sync(s.ncores)
                    if len(nthreads) == nworkers:
                        break
                    if time() - start > 5:
                        raise Exception("Timeout on cluster creation")

            # avoid sending processes down to function
            yield {"address": saddr}, [
                {"address": w["address"], "proc": weakref.ref(w["proc"])}
                for w in workers
            ]
        finally:
            logger.debug("Closing out test cluster")

            loop.run_sync(
                lambda: disconnect_all(
                    [w["address"] for w in workers], timeout=0.5, rpc_kwargs=rpc_kwargs
                )
            )
            loop.run_sync(lambda: disconnect(saddr, timeout=0.5, rpc_kwargs=rpc_kwargs))

            scheduler.terminate()
            scheduler_q.close()
            scheduler_q._reader.close()
            scheduler_q._writer.close()

            for w in workers:
                w["proc"].terminate()
                w["queue"].close()
                w["queue"]._reader.close()
                w["queue"]._writer.close()

            scheduler.join(2)
            del scheduler
            for proc in [w["proc"] for w in workers]:
                proc.join(timeout=2)

            with ignoring(UnboundLocalError):
                del worker, w, proc
            del workers[:]

            for fn in glob("_test_worker-*"):
                with ignoring(OSError):
                    shutil.rmtree(fn)

        try:
            client = default_client()
        except ValueError:
            pass
        else:
            client.close()

    start = time()
    while any(proc.is_alive() for proc in ws):
        text = str(list(ws))
        sleep(0.2)
        assert time() < start + 5, ("Workers still around after five seconds", text)
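
Assuming the generator above is wrapped as a context manager (as it is in distributed.utils_test), tests consume it roughly like this (the submitted function is illustrative):

from distributed import Client

def test_simple():
    with cluster(nworkers=2) as (scheduler, workers):
        with Client(scheduler["address"]) as client:
            assert client.submit(lambda x: x + 1, 10).result() == 11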
Example #53

def _nargs(f):
    try:
        return len(inspect.getargspec(f).args)
    except TypeError:
        return 0


def _should_curry(f):
    do_curry = frozenset((toolz.map, toolz.filter, toolz.sorted, toolz.reduce))
    return (callable(f) and _nargs(f) > 1 or f in do_curry)


def _curry_namespace(ns):
    return dict((name, toolz.curry(f) if _should_curry(f) else f)
                for name, f in ns.items() if '__' not in name)


locals().update(
    toolz.merge(
        _curry_namespace(vars(toolz)),
        _curry_namespace(vars(exceptions)),
    ))

# Clean up the namespace.
del _nargs
del _should_curry
del exceptions
del toolz
Example #54
def compile(fn_graph, get=dask.get):
    fn_param_info = t.valmap(_param_info, fn_graph)
    global_param_info = {}
    for param_info in fn_param_info.values():
        for kw, value in param_info.items():
            if kw in global_param_info and global_param_info[kw] != value:
                global_param_info[kw] = _AMBIGUOUS
            else:
                global_param_info[kw] = value
    computed_args = set(fn_graph.keys())
    required_params, defaulted = u.split_keys_by_val(_is_required,
                                                     global_param_info)
    required_params = required_params - computed_args

    all_params = required_params.union(defaulted)
    default_args = u.select_keys(global_param_info, defaulted)

    def to_task(res_key, param_info):
        fn = fn_graph[res_key]
        dask_args = tuple(param_info.keys())
        if _is_curry_func(fn):
            # wrap the fn but persist the args, and kargs on it
            args = tuple([default_args.get(p, p) for p in param_info.keys()])
            set_varargs, set_kargs = _partial_inputs(fn)

            def wrapper(*args):
                kwargs = t.merge(set_kargs, dict(zip(param_info.keys(), args)))
                return fn(*set_varargs, **kwargs)

            wrapper.__name__ = _func_name(fn)
            # we maintain the curry/partial func info
            wrapper.func = _func_name(fn)
            wrapper.keywords = fn.keywords
            wrapper.args = fn.args
            return (wrapper, ) + dask_args

        return (fn, ) + dask_args

    base_dask = {
        k: to_task(k, param_info)
        for k, param_info in fn_param_info.items()
    }

    outputs = list(fn_graph.keys())

    def funk(get=get, **kargs):
        param_keys = set(kargs.keys())
        missing_keys = required_params - param_keys
        if missing_keys:
            raise TypeError(
                'missing these keyword arguments: {}'.format(missing_keys))
        extra_keys = param_keys - all_params
        if extra_keys:
            raise TypeError(
                'unexpected keyword arguments passed in: {}'.format(
                    extra_keys))

        dsk = t.merge(base_dask, default_args, kargs)
        res = get(dsk, outputs)
        return dict(zip(outputs, res))

    funk.required = required_params
    funk.defaults = default_args
    funk.base_dask = base_dask
    funk.full_dask = t.merge(base_dask,
                             dict(zip(all_params, repeat(_UNSPECIFIED))))

    # TODO: use bolton's FunctionBuilder to set kargs so it has a useful function signature
    return funk
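
A toy usage sketch of the compile function above. The functions are hypothetical, and the sketch assumes the snippet's helpers (t, u, dask.get) are importable alongside it:

def add(x, y):
    return x + y

def double(total):
    return total * 2

funk = compile({'total': add, 'doubled': double})
funk(x=1, y=2)
# {'total': 3, 'doubled': 6}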
Example #55
def unpack_as_lists_of_keys(*args):
    parts, dsks = zip(*map(_unpack_keys_dask, args))
    if len(set(map(len, parts))) != 1:
        raise ValueError("inputs must all have the same number "
                         "of partitions along the first dimension")
    return tuple(parts) + (merge(dsks), )
Example #56
 def wrapper(*args):
     kwargs = t.merge(set_kargs, dict(zip(param_info.keys(), args)))
     return fn(*set_varargs, **kwargs)
def _set_period_columns(sales, periods):
    def groupby_filter(sl):
        def fn(p):
            return p.get("start_date") <= sl.get("posting_date") <= p.get("end_date")

        return fn

    groupby_fn = compose(
        partial(get, "key", default=None),
        excepts(StopIteration, first, lambda __: {}),
        partial(flip, filter, periods),
        groupby_filter,
    )

    sales_grouped = groupby(groupby_fn, sales)

    def summer(key):
        return compose(sum, partial(pluck, key))

    def seg_filter(x):
        return lambda sale: sale.get("item_code") == x

    def seger(sum_fn, x):
        return compose(
            sum_fn,
            partial(flip, filter, get(x.get("key"), sales_grouped, [])),
            seg_filter,
        )

    def total_fn(sum_fn):
        return compose(sum_fn, partial(flip, filter, sales), seg_filter)

    summer_qty = summer("qty")
    summer_amount = summer("amount")

    segregator_fns = [
        merge(
            x,
            {
                "seger_qty": seger(summer_qty, x),
                "seger_amount": seger(summer_amount, x),
            },
        )
        for x in periods
    ]

    def seg_reducer(item_code):
        def fn(a, p):
            key = get("key", p, None)
            seger_qty = get("seger_qty", p, lambda __: None)
            seger_amount = get("seger_amount", p, lambda __: None)
            return merge(
                a,
                {
                    key: seger_qty(item_code),
                    "{}_amount".format(key): seger_amount(item_code),
                },
            )

        return fn

    def fn(item):
        item_code = item.get("item_code")
        total_qty = total_fn(summer_qty)
        total_amount = total_fn(summer_amount)
        return merge(
            item,
            reduce(seg_reducer(item_code), segregator_fns, {}),
            {
                "total_qty": total_qty(item_code),
                "total_amount": total_amount(item_code),
            },
        )

    return fn
Example #58
def execute_aggregation_dataframe(op, data, scope=None, **kwargs):
    assert op.metrics, 'no metrics found during aggregation execution'

    if op.sort_keys:
        raise NotImplementedError(
            'sorting on aggregations not yet implemented'
        )

    predicates = op.predicates
    if predicates:
        predicate = functools.reduce(
            operator.and_,
            (execute(p, scope=scope, **kwargs) for p in predicates),
        )
        data = data.loc[predicate]

    columns = {}

    if op.by:
        grouping_key_pairs = list(
            zip(op.by, map(operator.methodcaller('op'), op.by))
        )
        grouping_keys = [
            by_op.name
            if isinstance(by_op, ops.TableColumn)
            else execute(by, scope=scope, **kwargs).rename(by.get_name())
            for by, by_op in grouping_key_pairs
        ]
        columns.update(
            (by_op.name, by.get_name())
            for by, by_op in grouping_key_pairs
            if hasattr(by_op, 'name')
        )
        source = data.groupby(grouping_keys)
    else:
        source = data

    new_scope = toolz.merge(scope, {op.table.op(): source})
    pieces = [
        pd.Series(
            execute(metric, scope=new_scope, **kwargs), name=metric.get_name()
        )
        for metric in op.metrics
    ]

    # group by always needs a reset to get the grouping key back as a column
    result = pd.concat(pieces, axis=1).reset_index()
    result.columns = [columns.get(c, c) for c in result.columns]

    if op.having:
        # .having(...) is only accessible on groupby, so this should never
        # raise
        if not op.by:
            raise ValueError(
                'Filtering out aggregation values is not allowed without at '
                'least one grouping key'
            )

        # TODO(phillipc): Don't recompute identical subexpressions
        predicate = functools.reduce(
            operator.and_,
            (
                execute(having, scope=new_scope, **kwargs)
                for having in op.having
            ),
        )
        assert len(predicate) == len(
            result
        ), 'length of predicate does not match length of DataFrame'
        result = result.loc[predicate.values]
    return result
Example #59
 def add_remarks(row):
     start_date = row.get("start_date")
     return merge(
         row, {"remarks": "{} SAL".format(start_date.strftime("%b").upper())}
     )
 def fn(row):
     docname = row.get("sales_order")
     return merge(row, {"Draft": row.get("creation")},
                  comments.get(docname, {}))