Example #1
def test_publish_bag(s, a, b):
    db = pytest.importorskip('dask.bag')
    c = Client((s.ip, s.port), start=False)
    yield c._start()
    f = Client((s.ip, s.port), start=False)
    yield f._start()

    bag = db.from_sequence([0, 1, 2])
    bagp = c.persist(bag)

    assert len(futures_of(bagp)) == 3
    keys = {f.key for f in futures_of(bagp)}
    assert keys == set(bag.dask)

    yield c._publish_dataset(data=bagp)

    # check that serialization didn't affect original bag's dask
    assert len(futures_of(bagp)) == 3

    result = yield f._get_dataset('data')
    assert set(result.dask.keys()) == set(bagp.dask.keys())
    assert {f.key for f in result.dask.values()} == {f.key for f in bagp.dask.values()}

    out = yield f.compute(result)._result()
    assert out == [0, 1, 2]
    yield c._shutdown()
    yield f._shutdown()
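The test above uses the old coroutine-style Client methods (_start, _publish_dataset, _get_dataset, _shutdown). For reference, a minimal sketch of the same publish/get round trip with the public blocking API; the scheduler address here is an assumption:

import dask.bag as db
from distributed import Client

c = Client('tcp://127.0.0.1:8786')   # assumed scheduler address
f = Client('tcp://127.0.0.1:8786')

bagp = c.persist(db.from_sequence([0, 1, 2]))
c.publish_dataset(data=bagp)            # publish under the name 'data'

result = f.get_dataset('data')          # fetch the same bag from a second client
assert f.compute(result).result() == [0, 1, 2]

c.close()
f.close()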
Example #2
def build_cli(args=None):
    if not args:
        args = parse_args()
    else:
        args = parse_args(args)
    filter_dirty = any(args.packages) or not args._all

    outputs = get_dask_outputs(args.path,
                               packages=args.packages,
                               filter_dirty=filter_dirty,
                               git_rev=args.git_rev,
                               stop_rev=args.stop_rev,
                               steps=args.steps,
                               max_downstream=args.max_downstream,
                               visualize=args.visualize,
                               test=args.test)

    if args.visualize:
        # setattr(nx.drawing, 'graphviz_layout', nx.nx_pydot.graphviz_layout)
        # graphviz_graph = nx.draw_graphviz(graph, 'dot')
        # graphviz_graph.draw(args.visualize)
        visualize(*outputs,
                  filename=args.visualize)  # create neat looking graph.
    else:
        # many threads, because this is just the dispatch.  Takes very little compute.
        # Only waiting for build complete.
        cluster = LocalCluster(n_workers=1,
                               threads_per_worker=args.threads,
                               nanny=False)
        client = Client(cluster)

        futures = client.persist(outputs)
        progress(futures)
Example #3
class SchedulerComputeDepsInMemory(object):

    def setup(self):
        self.client = Client()

        # Generate 10 independent tasks
        x = [delayed(random.random)() for _ in range(10)]
        # Generate lots of interrelated dependent tasks
        n = 200
        for _ in range(10, n):
            random_subset = [random.choice(x) for _ in range(5)]
            random_max = delayed(max)(random_subset)
            x.append(random_max)

        # Persist tasks into distributed memory and wait to finish
        y = self.client.persist(x)
        wait(y)

        self.x = x

    def teardown(self):
        self.client.close()

    def time_compute_deps_already_in_memory(self):
        """
        Measure compute time when dependent tasks are already in memory.
        xref https://github.com/dask/distributed/pull/3293
        """
        compute(*self.x, scheduler=self.client)
Example #5
def test_persist(dsf):
    correct = dsf.compute().todense()
    client = Client()
    persisted = client.persist(dsf)

    res = persisted.compute().todense()

    pdt.assert_frame_equal(res, correct)
Example #6
def dask_evaluate(outputs):
    utils.port_increment += 2
    scheduler_port = 8786 + utils.port_increment
    diagnostics_port = 8787 + utils.port_increment

    cluster = LocalCluster(n_workers=1, threads_per_worker=10, nanny=False,
                           scheduler_port=scheduler_port, diagnostics_port=diagnostics_port)
    client = Client(cluster)
    futures = client.persist(outputs)
    return client.gather(futures)
Example #7
def distribute_dask_df(dask_df):
    """
    Distribute a dask dataframe over a client that's accessible via
    the global DASK_CLIENT
    """
    from distributed import Client

    global DASK_CLIENT
    DASK_CLIENT = Client()

    dask_df = DASK_CLIENT.persist(dask_df)

    return dask_df
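A brief usage sketch for the helper above; the parquet path is hypothetical:

import dask.dataframe as dd

ddf = dd.read_parquet('/path/to/example.parquet')   # hypothetical input path
ddf = distribute_dask_df(ddf)                       # now backed by futures on DASK_CLIENT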
Example #8
        self.cluster = Cluster(
            [self.MORPHL_SERVER_IP_ADDRESS], auth_provider=self.auth_provider)
        self.session = self.cluster.connect(self.MORPHL_CASSANDRA_KEYSPACE)

        self.prep_stmt = self.session.prepare(self.CQL_STMT)

    def save_prediction(self, client_id, prediction):
        bind_list = [client_id, prediction]
        self.session.execute(self.prep_stmt, bind_list, timeout=self.CASS_REQ_TIMEOUT)

def batch_inference_on_partition(partition_df):
    churn_model_file = f'/opt/models/{DAY_AS_STR}_{UNIQUE_HASH}_churn_model.h5'
    churn_model = load_model(churn_model_file)
    prediction = churn_model.predict(partition_df.drop(['client_id'], axis=1))[0][0]
    return prediction

def persist_partition(partition_df):
    def persist_one_prediction(series_obj):
        cassandra.save_prediction(series_obj.client_id, series_obj.prediction)
    cassandra = Cassandra()
    partition_df.apply(persist_one_prediction, axis=1)
    return 0

if __name__ == '__main__':
    client = Client()
    dask_df = client.persist(dd.read_parquet(HDFS_DIR_INPUT))
    dask_df.client_id.count().compute()
    dask_df['prediction'] = dask_df.map_partitions(batch_inference_on_partition, meta=('prediction', float))
    dask_df['token'] = dask_df.map_partitions(persist_partition, meta=('token', int))
    dask_df.token.compute()
Example #9
# -*- Encoding: UTF-8 -*-

import numpy as np
from distributed import Client, progress
import dask.array as da
dask_client = Client(
    scheduler_file="/global/cscratch1/sd/rkube/scheduler.json")

with np.load("../dask_fft_data_s0000.npz") as df:
    num_channels, num_fft = df["fft_data"].shape
    print(num_channels, num_fft)

    fft_data = da.from_array(df["fft_data"], chunks=(1, num_fft))
    fft_data = dask_client.persist(fft_data)  # reassign so the persisted futures are kept

# Calculate the crosspower using the array interface
res1 = (fft_data[:2, :] * fft_data[-2:, :].conj()).mean(axis=1)
print("type res1 = ", type(res1))
res2 = da.arctan2(res1.real, res1.imag).real
print("type res2 = ", type(res2))
print("result res2 = ", res2.compute())


# Calculate the crosspower using the distributed interface
def cross_phase(ft_data, ch1, ch2):

    _tmp1 = (ft_data[ch1, :] * ft_data[ch2, :].conj()).mean().compute()
    print("** crosspower: type(tmp1) =", type(_tmp1))
    _tmp2 = np.arctan2(_tmp1.real, _tmp1.imag).real
    #_tmp2 = _tmp1.real + _tmp1.imag
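The snippet is cut off before cross_phase returns a value. A self-contained sketch of the same calculation, assuming the intent is to hand back the phase angle (the arctan2 argument order mirrors the code above):

def cross_phase_sketch(ft_data, ch1, ch2):
    # Mean cross-power of two channels; .compute() pulls the scalar to the client.
    xpower = (ft_data[ch1, :] * ft_data[ch2, :].conj()).mean().compute()
    # Same arctan2(real, imag) convention as the snippet above.
    return np.arctan2(xpower.real, xpower.imag)

phase_01 = cross_phase_sketch(fft_data, 0, 1)
print("cross phase of channels 0 and 1 =", phase_01)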
Example #10
#
# - Display head of the dataframe
# - Display number of rows of this dataframe.
# - Compute the total number of passengers.
# - Count occurrences in the payment_type column both for the full dataset, and filtered by zero tip (tip_amount == 0).
# - Create a new column, tip_fraction
# - Plot the average of the new column tip_fraction grouped by day of week.
# - Plot the average of the new column tip_fraction grouped by hour of day.
#
# [Dask dataframe documentation](http://docs.dask.org/en/latest/dataframe.html)
#

# +
import dask.dataframe as dd
from distributed import Client, progress

c = Client('127.0.0.1:8786')
nyc2014 = dd.read_csv('hdfs://svmass2.mass.uhb.fr:54310/user/datasets/nyc-tlc/2014/yellow*.csv',
                      parse_dates=['pickup_datetime', 'dropoff_datetime'],
                      skipinitialspace=True)

nyc2015 = dd.read_csv('hdfs://svmass2.mass.uhb.fr:54310/user/datasets/nyc-tlc/2015/yellow*.csv',
                      parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])
nyc2014, nyc2015 = c.persist([nyc2014, nyc2015])

progress(nyc2014, nyc2015)
# -
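
The notebook's remaining cells are not shown here; a hedged sketch of the bulleted tasks above, using the persisted 2015 frame and assuming the standard TLC column names (passenger_count, payment_type, tip_amount, fare_amount, tpep_pickup_datetime):

# +
# Hedged sketch of the tasks listed above; column names are assumed from the
# standard NYC TLC schema, and the .plot() calls require matplotlib.
nyc2015.head()
n_rows = len(nyc2015)
total_passengers = nyc2015.passenger_count.sum().compute()

payment_counts = nyc2015.payment_type.value_counts().compute()
zero_tip_counts = nyc2015[nyc2015.tip_amount == 0].payment_type.value_counts().compute()

nyc2015['tip_fraction'] = nyc2015.tip_amount / nyc2015.fare_amount

by_day = nyc2015.groupby(nyc2015.tpep_pickup_datetime.dt.dayofweek).tip_fraction.mean()
by_hour = nyc2015.groupby(nyc2015.tpep_pickup_datetime.dt.hour).tip_fraction.mean()
by_day.compute().plot(kind='bar')
by_hour.compute().plot()
# -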
Example #11
    client = Client(LocalCluster())

    print(client._repr_html_())

    network_graph, meta_graph = create_graph(
        expression_matrix.as_matrix(),
        gene_names,
        tf_names,
        "GBM",
        SGBM_KWARGS,
        client=client,  # broadcast!
        early_stop_window_length=25,
        include_meta=True)

    # Good!
    a, b = client.persist([network_graph, meta_graph])
    network_df = a.compute(sync=True)
    meta_df = b.compute(sync=True)

    # Bad!
    # network_df, meta_df = client.compute([network_graph, meta_graph], sync=True)

    if client:
        client.close()

    network_df.to_csv(net_out_path, sep='\t', index=False)
    meta_df.to_csv(meta_out_path, sep='\t', index=False)

    end_time = time.time()

    print('wall time: {} seconds'.format(end_time - start_time))
Example #12
def main():
    client = Client()
    dask_df = client.persist(dd.read_parquet(HDFS_DIR_INPUT))
    ModelGenerator(dask_df).generate_and_save_model()
Example #13
        raise ValueError("n_files = {0} > n_workers {1}".format(
            no_of_files, n_workers))

    # start script
    h5filelist = sorted(utils.getFileList(data_dir, 'h5'))
    # print(h5filelist[:no_of_files])
    file_size = utils.getFileSizeInGB(h5filelist[:no_of_files])
    print("File size to read in is {0:.0f} GB".format(file_size))

    cluster = LocalCluster(n_workers=n_workers, threads_per_worker=n_threads)
    client = Client(cluster)
    print("Using {0} workers and {1} thread(s) per worker".format(
        n_workers, n_threads))
    df = ddf.read_hdf(h5filelist[:no_of_files], 'csv', columns=columns)
    start_time = time.time()
    df = client.persist(df)
    print(df.index.compute())
    end_time = time.time()
    read_time = end_time - start_time
    # print("lines read in =", df.shape)
    print("Reading in {0} files in {1:.0f}s ".format(no_of_files, read_time))

    df_size = np.sum(df.memory_usage().compute()) / 1e9

    if args.output_json:
        from collections import OrderedDict
        import json
        timing_info = OrderedDict({})

        timing_info['git_commit'] = utils.capture_multiline_output(
            'git show | head -1')[0]
Example #14
# In[2]:


cluster=LocalCluster(n_workers=10)
client=Client(cluster)


# In[3]:


fs=gcsfs.GCSFileSystem(token='anon')
f=fs.open("anaconda-public-data/nyc-taxi/nyc.parquet/part.0.parquet")
pf=ParquetFile(f)
df1=pf.to_pandas()
df2=dd.from_pandas(data=df1,npartitions=3)
df_clust=client.persist(df2)


# In[41]:


df_clust.head(5)


# In[40]:


df3=df_clust.groupby(df_clust.passenger_count).trip_distance.mean().compute()


# In[39]: