Example #1
def test_publish_bag(s, a, b):
    db = pytest.importorskip('dask.bag')
    c = Client((s.ip, s.port), start=False)
    yield c._start()
    f = Client((s.ip, s.port), start=False)
    yield f._start()

    bag = db.from_sequence([0, 1, 2])
    bagp = c.persist(bag)

    assert len(futures_of(bagp)) == 3
    keys = {f.key for f in futures_of(bagp)}
    assert keys == set(bag.dask)

    yield c._publish_dataset(data=bagp)

    # check that serialization didn't affect original bag's dask
    assert len(futures_of(bagp)) == 3

    result = yield f._get_dataset('data')
    assert set(result.dask.keys()) == set(bagp.dask.keys())
    assert {f.key for f in result.dask.values()} == {f.key for f in bagp.dask.values()}

    out = yield f.compute(result)._result()
    assert out == [0, 1, 2]
    yield c._shutdown()
    yield f._shutdown()
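The test above uses the old coroutine-style Client methods (_start, _publish_dataset, _get_dataset, _shutdown). For reference, a minimal sketch of the same publish/get round trip with the public blocking API; the scheduler address here is an assumption:

import dask.bag as db
from distributed import Client

c = Client('tcp://127.0.0.1:8786')   # assumed scheduler address
f = Client('tcp://127.0.0.1:8786')

bagp = c.persist(db.from_sequence([0, 1, 2]))
c.publish_dataset(data=bagp)            # publish under the name 'data'

result = f.get_dataset('data')          # fetch the same bag from a second client
assert f.compute(result).result() == [0, 1, 2]

c.close()
f.close()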
Example #2
def build_cli(args=None):
    if not args:
        args = parse_args()
    else:
        args = parse_args(args)
    filter_dirty = any(args.packages) or not args._all

    outputs = get_dask_outputs(args.path,
                               packages=args.packages,
                               filter_dirty=filter_dirty,
                               git_rev=args.git_rev,
                               stop_rev=args.stop_rev,
                               steps=args.steps,
                               max_downstream=args.max_downstream,
                               visualize=args.visualize,
                               test=args.test)

    if args.visualize:
        # setattr(nx.drawing, 'graphviz_layout', nx.nx_pydot.graphviz_layout)
        # graphviz_graph = nx.draw_graphviz(graph, 'dot')
        # graphviz_graph.draw(args.visualize)
        visualize(*outputs,
                  filename=args.visualize)  # create neat looking graph.
    else:
        # many threads, because this is just the dispatch.  Takes very little compute.
        # Only waiting for build complete.
        cluster = LocalCluster(n_workers=1,
                               threads_per_worker=args.threads,
                               nanny=False)
        client = Client(cluster)

        futures = client.persist(outputs)
        progress(futures)
Example #3
class SchedulerComputeDepsInMemory(object):

    def setup(self):
        self.client = Client()

        # Generate 10 independent tasks
        x = [delayed(random.random)() for _ in range(10)]
        # Generate lots of interrelated dependent tasks
        n = 200
        for _ in range(10, n):
            random_subset = [random.choice(x) for _ in range(5)]
            random_max = delayed(max)(random_subset)
            x.append(random_max)

        # Persist tasks into distributed memory and wait to finish
        y = self.client.persist(x)
        wait(y)

        self.x = x

    def teardown(self):
        self.client.close()

    def time_compute_deps_already_in_memory(self):
        """
        Measure compute time when dependent tasks are already in memory.
        xref https://github.com/dask/distributed/pull/3293
        """
        compute(*self.x, scheduler=self.client)
Example #5
def test_persist(dsf):
    correct = dsf.compute().todense()
    client = Client()
    persisted = client.persist(dsf)

    res = persisted.compute().todense()

    pdt.assert_frame_equal(res, correct)
Example #6
def dask_evaluate(outputs):
    utils.port_increment += 2
    scheduler_port = 8786 + utils.port_increment
    diagnostics_port = 8787 + utils.port_increment

    cluster = LocalCluster(n_workers=1, threads_per_worker=10, nanny=False,
                           scheduler_port=scheduler_port, diagnostics_port=diagnostics_port)
    client = Client(cluster)
    futures = client.persist(outputs)
    return client.gather(futures)
Example #7
def distribute_dask_df(dask_df):
    """
    Distribute a dask dataframe over a client that's accessible via
    the global DASK_CLIENT
    """
    from distributed import Client

    global DASK_CLIENT
    DASK_CLIENT = Client()

    dask_df = DASK_CLIENT.persist(dask_df)

    return dask_df
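A brief usage sketch for the helper above; the parquet path is hypothetical:

import dask.dataframe as dd

ddf = dd.read_parquet('/path/to/example.parquet')   # hypothetical input path
ddf = distribute_dask_df(ddf)                       # now backed by futures on DASK_CLIENT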
Example #8
        self.cluster = Cluster(
            [self.MORPHL_SERVER_IP_ADDRESS], auth_provider=self.auth_provider)
        self.session = self.cluster.connect(self.MORPHL_CASSANDRA_KEYSPACE)

        self.prep_stmt = self.session.prepare(self.CQL_STMT)

    def save_prediction(self, client_id, prediction):
        bind_list = [client_id, prediction]
        self.session.execute(self.prep_stmt, bind_list, timeout=self.CASS_REQ_TIMEOUT)

def batch_inference_on_partition(partition_df):
    churn_model_file = f'/opt/models/{DAY_AS_STR}_{UNIQUE_HASH}_churn_model.h5'
    churn_model = load_model(churn_model_file)
    prediction = churn_model.predict(partition_df.drop(['client_id'], axis=1))[0][0]
    return prediction

def persist_partition(partition_df):
    def persist_one_prediction(series_obj):
        cassandra.save_prediction(series_obj.client_id, series_obj.prediction)
    cassandra = Cassandra()
    partition_df.apply(persist_one_prediction, axis=1)
    return 0

if __name__ == '__main__':
    client = Client()
    dask_df = client.persist(dd.read_parquet(HDFS_DIR_INPUT))
    dask_df.client_id.count().compute()
    dask_df['prediction'] = dask_df.map_partitions(batch_inference_on_partition, meta=('prediction', float))
    dask_df['token'] = dask_df.map_partitions(persist_partition, meta=('token', int))
    dask_df.token.compute()
Example #9
# -*- Encoding: UTF-8 -*-

import numpy as np
from distributed import Client, progress
import dask.array as da
dask_client = Client(
    scheduler_file="/global/cscratch1/sd/rkube/scheduler.json")

with np.load("../dask_fft_data_s0000.npz") as df:
    num_channels, num_fft = df["fft_data"].shape
    print(num_channels, num_fft)

    fft_data = da.from_array(df["fft_data"], chunks=(1, num_fft))
    fft_data = dask_client.persist(fft_data)  # reassign so the persisted futures are kept

# Calculate the crosspower using the array interface
res1 = (fft_data[:2, :] * fft_data[-2:, :].conj()).mean(axis=1)
print("type res1 = ", type(res1))
res2 = da.arctan2(res1.real, res1.imag).real
print("type res2 = ", type(res2))
print("result res2 = ", res2.compute())


# Calculate the crosspower using the distributed interface
def cross_phase(ft_data, ch1, ch2):

    _tmp1 = (ft_data[ch1, :] * ft_data[ch2, :].conj()).mean().compute()
    print("** crosspower: type(tmp1) =", type(_tmp1))
    _tmp2 = np.arctan2(_tmp1.real, _tmp1.imag).real
    #_tmp2 = _tmp1.real + _tmp1.imag
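The snippet is cut off before cross_phase returns a value. A self-contained sketch of the same calculation, assuming the intent is to hand back the phase angle (the arctan2 argument order mirrors the code above):

def cross_phase_sketch(ft_data, ch1, ch2):
    # Mean cross-power of two channels; .compute() pulls the scalar to the client.
    xpower = (ft_data[ch1, :] * ft_data[ch2, :].conj()).mean().compute()
    # Same arctan2(real, imag) convention as the snippet above.
    return np.arctan2(xpower.real, xpower.imag)

phase_01 = cross_phase_sketch(fft_data, 0, 1)
print("cross phase of channels 0 and 1 =", phase_01)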
Example #10
#
# - Display head of the dataframe
# - Display number of rows of this dataframe.
# - Compute the total number of passengers.
# - Count occurrences in the payment_type column both for the full dataset, and filtered by zero tip (tip_amount == 0).
# - Create a new column, tip_fraction
# - Plot the average of the new column tip_fraction grouped by day of week.
# - Plot the average of the new column tip_fraction grouped by hour of day.
#
# [Dask dataframe documentation](http://docs.dask.org/en/latest/dataframe.html)
#

# +
import dask.dataframe as dd
from distributed import Client, progress

c = Client('127.0.0.1:8786')
nyc2014 = dd.read_csv('hdfs://svmass2.mass.uhb.fr:54310/user/datasets/nyc-tlc/2014/yellow*.csv',
                      parse_dates=['pickup_datetime', 'dropoff_datetime'],
                      skipinitialspace=True)

nyc2015 = dd.read_csv('hdfs://svmass2.mass.uhb.fr:54310/user/datasets/nyc-tlc/2015/yellow*.csv',
                      parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])
nyc2014, nyc2015 = c.persist([nyc2014, nyc2015])

progress(nyc2014, nyc2015)
# -
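
The notebook's remaining cells are not shown here; a hedged sketch of the bulleted tasks above, using the persisted 2015 frame and assuming the standard TLC column names (passenger_count, payment_type, tip_amount, fare_amount, tpep_pickup_datetime):

# +
# Hedged sketch of the tasks listed above; column names are assumed from the
# standard NYC TLC schema, and the .plot() calls require matplotlib.
nyc2015.head()
n_rows = len(nyc2015)
total_passengers = nyc2015.passenger_count.sum().compute()

payment_counts = nyc2015.payment_type.value_counts().compute()
zero_tip_counts = nyc2015[nyc2015.tip_amount == 0].payment_type.value_counts().compute()

nyc2015['tip_fraction'] = nyc2015.tip_amount / nyc2015.fare_amount

by_day = nyc2015.groupby(nyc2015.tpep_pickup_datetime.dt.dayofweek).tip_fraction.mean()
by_hour = nyc2015.groupby(nyc2015.tpep_pickup_datetime.dt.hour).tip_fraction.mean()
by_day.compute().plot(kind='bar')
by_hour.compute().plot()
# -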
Example #11
    client = Client(LocalCluster())

    print(client._repr_html_())

    network_graph, meta_graph = create_graph(
        expression_matrix.as_matrix(),
        gene_names,
        tf_names,
        "GBM",
        SGBM_KWARGS,
        client=client,  # broadcast!
        early_stop_window_length=25,
        include_meta=True)

    # Good!
    a, b = client.persist([network_graph, meta_graph])
    network_df = a.compute(sync=True)
    meta_df = b.compute(sync=True)

    # Bad!
    # network_df, meta_df = client.compute([network_graph, meta_graph], sync=True)

    if client:
        client.close()

    network_df.to_csv(net_out_path, sep='\t', index=False)
    meta_df.to_csv(meta_out_path, sep='\t', index=False)

    end_time = time.time()

    print('wall time: {} seconds'.format(end_time - start_time))
Example #12
def main():
    client = Client()
    dask_df = client.persist(dd.read_parquet(HDFS_DIR_INPUT))
    ModelGenerator(dask_df).generate_and_save_model()
Example #13
        raise ValueError("n_files = {0} > n_workers {1}".format(
            no_of_files, n_workers))

    # start script
    h5filelist = sorted(utils.getFileList(data_dir, 'h5'))
    # print(h5filelist[:no_of_files])
    file_size = utils.getFileSizeInGB(h5filelist[:no_of_files])
    print("File size to read in is {0:.0f} GB".format(file_size))

    cluster = LocalCluster(n_workers=n_workers, threads_per_worker=n_threads)
    client = Client(cluster)
    print("Using {0} workers and {1} thread(s) per worker".format(
        n_workers, n_threads))
    df = ddf.read_hdf(h5filelist[:no_of_files], 'csv', columns=columns)
    start_time = time.time()
    df = client.persist(df)
    print(df.index.compute())
    end_time = time.time()
    read_time = end_time - start_time
    # print("lines read in =", df.shape)
    print("Reading in {0} files in {1:.0f}s ".format(no_of_files, read_time))

    df_size = np.sum(df.memory_usage().compute()) / 1e9

    if args.output_json:
        from collections import OrderedDict
        import json
        timing_info = OrderedDict({})

        timing_info['git_commit'] = utils.capture_multiline_output(
            'git show | head -1')[0]
Example #14
# In[2]:


cluster=LocalCluster(n_workers=10)
client=Client(cluster)


# In[3]:


fs=gcsfs.GCSFileSystem(token='anon')
f=fs.open("anaconda-public-data/nyc-taxi/nyc.parquet/part.0.parquet")
pf=ParquetFile(f)
df1=pf.to_pandas()
df2=dd.from_pandas(data=df1,npartitions=3)
df_clust=client.persist(df2)


# In[41]:


df_clust.head(5)


# In[40]:


df3=df_clust.groupby(df_clust.passenger_count).trip_distance.mean().compute()


# In[39]: