Example #1
def benchmark(fn, args, filetype=None):
    """Benchmark when "fn" function gets called on "args" tuple.
    "args" may have a Kwargs instance at the end.
    If "filetype" is provided, it may be used to convert columns to
    categorical dtypes after reading (the "loading" is assumed).
    """
    posargs = list(args)
    kwargs = {}
    # Remove Kwargs instance at end of posargs list, if one exists
    if posargs and isinstance(posargs[-1], Kwargs):
        lastarg = posargs.pop()
        kwargs.update(lastarg)

    if DEBUG:
        printable_posargs = ', '.join([str(posarg.head()) if hasattr(posarg, 'head') else str(posarg) for posarg in posargs])
        printable_kwargs = ', '.join(['{}={}'.format(k, v) for k, v in kwargs.items()])
        print('DEBUG: {}({}{})'.format(fn.__name__, printable_posargs, ', ' + printable_kwargs if printable_kwargs else ''), flush=True)

    # Benchmark fn when run on posargs and kwargs
    start = time.time()
    res = fn(*posargs, **kwargs)

    # If we're loading data
    if filetype is not None:
        if filetype not in filetypes_storing_categories:
            opts = odict()
            if p.dftype == 'pandas':
                opts['copy'] = False
            for c in p.categories:
                res[c] = res[c].astype('category', **opts)

        # Force loading (--cache=persist was provided)
        if p.dftype == 'dask' and DD_FORCE_LOAD:
            if DASK_CLIENT is not None:
                # 2017-04-28: This combination leads to a large drop in
                #   aggregation performance (both --distributed and
                #   --cache=persist were provided)
                res = DASK_CLIENT.persist(res)
                distributed.wait(res)
            else:
                if DEBUG:
                    print("DEBUG: Force-loading Dask dataframe", flush=True)
                res = res.persist()

    end = time.time()

    return end-start, res
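
A minimal usage sketch of the benchmark helper above; it assumes Kwargs is the dict subclass defined elsewhere in this module, and the CSV path and column names are placeholders:

import pandas as pd

# Hypothetical timing of a CSV load; keyword arguments ride along in a
# trailing Kwargs instance (assumed to be a dict subclass from this module).
elapsed, df = benchmark(pd.read_csv, ('data.csv', Kwargs(usecols=['x', 'y'])))
print('read_csv took %.3fs' % elapsed)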
Example #2
        tar_spwtab.putcol(columnname="EFFECTIVE_BW", value=write_bandwidth)
        tar_spwtab.putcol(columnname="RESOLUTION", value=write_bandwidth)
        tar_spwtab.putcol(columnname="NUM_CHAN", value=[count] * nspw)
        t7 = time()
        print('Done, time consumed: %.3fs' % (t7 - t6))

    print('Finished channel-averaging on the MS')


if __name__ == '__main__':
    c = Client('172.31.99.84:8786')
    """
    This program can ONLY be run once.
    If an exception occurs, you need to delete the files in the modify_vis directory,
    re-copy the source files to that directory, and then run again.
    """
    print('cpu_count: %d' % cpu_count())
    s1 = time()

    avg_channel = [2, 4, 8, 16, 32, 64]
    length = len(avg_channel)

    dst = arl_path('source_data/modify_vis/day2_copy_avgchannel')
    dsts = [dst + str(avg) for avg in avg_channel]

    avg_ms = c.map(modify_ms, dsts, avg_channel)
    wait(avg_ms)

    s2 = time()
    print('Total time: %.3fs' % (s2 - s1))
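
The c.map(modify_ms, dsts, avg_channel) call above pairs the i-th destination directory with the i-th averaging factor, just like the built-in map(). A minimal, self-contained sketch of the same dask.distributed pattern (the add function and local cluster are purely illustrative):

from dask.distributed import Client, wait

def add(x, y):
    return x + y

client = Client()                                   # local cluster, for illustration only
futures = client.map(add, [1, 2, 3], [10, 20, 30])  # pairs the i-th elements of both lists
wait(futures)
print(client.gather(futures))                       # [11, 22, 33]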
Example #3
def _fit(
    model,
    params,
    X_train,
    y_train,
    X_test,
    y_test,
    additional_calls,
    fit_params=None,
    scorer=None,
    random_state=None,
):
    original_model = model
    fit_params = fit_params or {}
    client = default_client()
    rng = check_random_state(random_state)

    info = {}
    models = {}
    scores = {}

    for ident, param in enumerate(params):
        model = client.submit(_create_model, original_model, ident, **param)
        info[ident] = []
        models[ident] = model

    # assume everything in fit_params is small and make it concrete
    fit_params = yield client.compute(fit_params)

    # Convert testing data into a single element on the cluster
    # This assumes that it fits into memory on a single worker
    if isinstance(X_test, da.Array):
        X_test = client.compute(X_test)
    else:
        X_test = yield client.scatter(X_test)
    if isinstance(y_test, da.Array):
        y_test = client.compute(y_test)
    else:
        y_test = yield client.scatter(y_test)

    # Convert to batches of delayed objects of numpy arrays
    X_train, y_train = dask.persist(X_train, y_train)
    X_train = sorted(futures_of(X_train), key=lambda f: f.key)
    y_train = sorted(futures_of(y_train), key=lambda f: f.key)
    assert len(X_train) == len(y_train)

    # Order by which we process training data futures
    order = []

    def get_futures(partial_fit_calls):
        """ Policy to get training data futures

        Currently we compute once, and then keep in memory.
        Presumably in the future we'll want to let data drop and recompute.
        This function handles that policy internally, and also controls random
        access to training data.
        """
        # Shuffle blocks going forward to get uniform-but-random access
        while partial_fit_calls >= len(order):
            L = list(range(len(X_train)))
            rng.shuffle(L)
            order.extend(L)
        j = order[partial_fit_calls]
        return X_train[j], y_train[j]

    # Submit initial partial_fit and score computations on first batch of data
    X_future, y_future = get_futures(0)
    X_future_2, y_future_2 = get_futures(1)
    _models = {}
    _scores = {}
    _specs = {}

    d_partial_fit = dask.delayed(_partial_fit)
    d_score = dask.delayed(_score)
    for ident, model in models.items():
        model = d_partial_fit(model, X_future, y_future, fit_params)
        score = d_score(model, X_test, y_test, scorer)
        spec = d_partial_fit(model, X_future_2, y_future_2, fit_params)
        _models[ident] = model
        _scores[ident] = score
        _specs[ident] = spec
    _models, _scores, _specs = dask.persist(
        _models, _scores, _specs, priority={tuple(_specs.values()): -1})
    _models = {k: list(v.dask.values())[0] for k, v in _models.items()}
    _scores = {k: list(v.dask.values())[0] for k, v in _scores.items()}
    _specs = {k: list(v.dask.values())[0] for k, v in _specs.items()}
    models.update(_models)
    scores.update(_scores)
    speculative = _specs

    new_scores = list(_scores.values())
    history = []

    # async for future, result in seq:
    while True:
        metas = yield client.gather(new_scores)

        for meta in metas:
            ident = meta["model_id"]

            info[ident].append(meta)
            history.append(meta)

        instructions = additional_calls(info)
        bad = set(models) - set(instructions)

        # Delete the futures of bad models.  This cancels speculative tasks
        for ident in bad:
            del models[ident]
            del scores[ident]
            del info[ident]

        if not any(instructions.values()):
            break

        _models = {}
        _scores = {}
        _specs = {}
        for ident, k in instructions.items():
            start = info[ident][-1]["partial_fit_calls"] + 1
            if k:
                k -= 1
                model = speculative.pop(ident)
                for i in range(k):
                    X_future, y_future = get_futures(start + i)
                    model = d_partial_fit(model, X_future, y_future,
                                          fit_params)
                score = d_score(model, X_test, y_test, scorer)
                X_future, y_future = get_futures(start + k)
                spec = d_partial_fit(model, X_future, y_future, fit_params)
                _models[ident] = model
                _scores[ident] = score
                _specs[ident] = spec

        _models2, _scores2, _specs2 = dask.persist(
            _models, _scores, _specs, priority={tuple(_specs.values()): -1})
        _models2 = {
            k: v if isinstance(v, Future) else list(v.dask.values())[0]
            for k, v in _models2.items()
        }

        _scores2 = {k: list(v.dask.values())[0] for k, v in _scores2.items()}
        _specs2 = {k: list(v.dask.values())[0] for k, v in _specs2.items()}
        models.update(_models2)
        scores.update(_scores2)
        speculative = _specs2

        new_scores = list(_scores2.values())

    models = {
        k: client.submit(operator.getitem, v, 0)
        for k, v in models.items()
    }
    yield wait(models)
    scores = yield client.gather(scores)
    best = max(scores.items(), key=lambda x: x[1]["score"])

    info = defaultdict(list)
    for h in history:
        info[h["model_id"]].append(h)
    info = dict(info)

    raise gen.Return(Results(info, models, history, best))
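
The additional_calls argument used above is a policy callback: it receives the accumulated score metadata per model (info) and returns a dict mapping model id to how many further partial_fit calls to schedule; models missing from the dict are dropped (bad = set(models) - set(instructions)), and the loop exits once every requested count is zero. A hypothetical successive-halving-style policy, sketched for illustration only (not the library's own implementation):

def halving_policy(info):
    # info: {model_id: [meta, ...]}; each meta dict carries a "score" entry
    ranked = sorted(info, key=lambda ident: info[ident][-1]["score"], reverse=True)
    keep = ranked[: max(1, len(ranked) // 2)]   # keep the best-scoring half
    if len(keep) == 1:
        return {keep[0]: 0}                     # lone survivor: signal convergence
    return {ident: 1 for ident in keep}         # one more partial_fit call each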
Example #4
def generate_prediction(sensor_path,
                        size=11,
                        chunk_size=500,
                        classes=21,
                        savedir=".",
                        use_dask=False,
                        client=None):
    """Yield one instance of data with raster indices
    Args:
        chunk_size: number of images per tfrecord
        size: N x N image size
        savedir: directory to save tfrecords
        use_dask: optional dask client to parallelize computation
    Returns:
        filename: tfrecords path
    """
    with rasterio.open(sensor_path) as src:
        cols, rows = np.meshgrid(np.arange(src.shape[1]),
                                 np.arange(src.shape[0]))
        results = pd.DataFrame({
            "rows": np.ravel(rows),
            "cols": np.ravel(cols)
        })

    #Turn sensor pixel indices into a dataframe of coords
    print("There are {} sensor pixels in the prediction data".format(
        results.shape[0]))

    #Create chunks to write
    results["chunk"] = np.arange(len(results)) // chunk_size
    basename = os.path.splitext(os.path.basename(sensor_path))[0]
    filenames = []

    if use_dask:
        if client is None:
            raise ValueError(
                "use_dask is {} but no client specified".format(use_dask))

        for g, df in results.groupby("chunk"):
            coordinates = zip(df.rows, df.cols)
            filename = "{}/{}_{}.tfrecord".format(savedir, basename, g)

            #Submit to dask client
            fn = client.submit(_record_wrapper_,
                               sensor_path=sensor_path,
                               index_iterable=coordinates,
                               size=size,
                               classes=classes,
                               filename=filename,
                               train=False)
            filenames.append(fn)
        wait(filenames)
        filenames = [x.result() for x in filenames]

    else:
        for g, df in results.groupby("chunk"):
            filename = "{}/{}_{}.tfrecord".format(savedir, basename, g)
            coordinates = zip(df.rows, df.cols)

            #Write record
            fn = _record_wrapper_(sensor_path=sensor_path,
                                  index_iterable=coordinates,
                                  size=size,
                                  classes=classes,
                                  filename=filename,
                                  train=False)
            filenames.append(fn)

    return filenames
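
A hypothetical call of generate_prediction with Dask enabled; the scheduler address, raster path, and output directory below are placeholders:

from dask.distributed import Client

client = Client("127.0.0.1:8786")                 # placeholder scheduler address
filenames = generate_prediction("sensor.tif",     # placeholder raster path
                                size=11,
                                chunk_size=500,
                                savedir="records",
                                use_dask=True,
                                client=client)
print("Wrote {} tfrecords".format(len(filenames)))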
Example #5
        print(a)

        volume = lx*ly*lz
        print("Dataset volume is {} ({} GByte)".format(volume, (volume*8)/2**30))

        a = client.persist(a)
        a = da.concatenate([a,a], axis=1)
        a = da.concatenate([a,a], axis=1)
        a = da.concatenate([a,a], axis=1)
        a = da.concatenate([a,a], axis=1)
        a = da.concatenate([a,a], axis=1)
        # da.concatenate does not merge chunks, even if they span the entire dimension, need to rechunk:
        a = a.rechunk(chunks=(1, -1, -1))
        a = client.persist(a)
        print(a)
        wait(a)

        # Slice in Z (fast)
        start_time = timeit.default_timer()
        sliced = a[7,:,:]
        sliced = sliced.compute()
        print(timeit.default_timer() - start_time)

        # Slice in Y (fast)
        start_time = timeit.default_timer()
        sliced = a[:,7,:]
        sliced = sliced.compute()
        print(timeit.default_timer() - start_time)

        # Slice in X (extremely slow, why?)
        start_time = timeit.default_timer()
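
As the comment in the snippet notes, da.concatenate preserves the chunk boundaries of its inputs even when they span a whole dimension, so the concatenated axis must be rechunked explicitly. A small standalone illustration (array shape chosen arbitrarily):

import dask.array as da

a = da.ones((8, 16, 8), chunks=(1, -1, -1))  # one chunk per plane along axis 0
b = da.concatenate([a, a], axis=1)           # chunk boundaries on axis 1 are kept
print(b.chunks[1])                           # (16, 16)
b = b.rechunk((1, -1, -1))                   # merge axis 1 back into a single chunk
print(b.chunks[1])                           # (32,)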
Example #6
def generate_hand_annotations(DEBUG, BASE_PATH, FILEPATH, SIZE, config, dask_client):
    
    #Generate tfrecords
    dirname = "hand_annotations/"

    annotations_file = BASE_PATH + dirname + "crops/hand_annotations.csv"

    class_file = utilities.create_classes(annotations_file)

    if DEBUG:
        tfrecords.create_tfrecords(annotations_file=annotations_file,
                                   class_file=class_file,
                                   image_min_side=config["image-min-side"],
                                   backbone_model=config["backbone"],
                                   size=SIZE,
                                   savedir=FILEPATH + dirname + "tfrecords/")
    else:

        #Collect annotation files for each tile
        annotations_file = BASE_PATH + dirname + "crops/hand_annotations.csv"
        df = pd.read_csv(annotations_file, names=["image_path","xmin","ymin","xmax","ymax","label"])

        #enforce dtype, as there might be errors
        df.xmin = df.xmin.astype(pd.Int64Dtype())
        df.ymin = df.ymin.astype(pd.Int64Dtype())
        df.xmax = df.xmax.astype(pd.Int64Dtype())
        df.ymax = df.ymax.astype(pd.Int64Dtype())

        #Randomize rows
        df = df.sample(frac=1)

        #split pandas frame into chunks
        images = df.image_path.unique()
        indices = np.arange(len(images))
        size = 500

        chunk_list = [ ]

        #Split dataframe into chunks of images and write to file
        for i in range(ceil(len(indices) / size)):
            image_indices = indices[i * size:(i * size) + size]
            selected_images = images[image_indices]
            split_frame = df[df.image_path.isin(selected_images)]
            filename = BASE_PATH + dirname + "crops/hand_annotations{}.csv".format(i)
            split_frame.to_csv(filename, header=False, index=False)
            chunk_list.append(filename)

        print(" Created {} files to create tfrecords".format(len(chunk_list)))

        #Apply create tfrecords to each
        futures = dask_client.map(
            tfrecords.create_tfrecords,
            chunk_list,
            class_file=class_file,
            image_min_side=config["image-min-side"],
            backbone_model=config["backbone"],
            size=SIZE,
            savedir=FILEPATH + dirname + "tfrecords/")

        wait(futures)
        for future in futures:
            try:
                local_annotations = future.result()
            except Exception as e:
                print("future {} failed with {}".format(future, e))
Example #7
def main(client):
    import cudf
    import dask_cudf

    product_reviews_df = read_tables()

    product_reviews_df = product_reviews_df[
        ~product_reviews_df.pr_review_content.isnull()].reset_index(drop=True)

    product_reviews_df[
        "pr_review_content"] = product_reviews_df.pr_review_content.str.lower(
        )
    product_reviews_df[
        "pr_review_content"] = product_reviews_df.pr_review_content.str.replace(
            [".", "?", "!"], [eol_char], regex=False)

    sentences = product_reviews_df.map_partitions(
        create_sentences_from_reviews)
    # need the global position in the sentence tokenized df
    sentences["x"] = 1
    sentences["sentence_tokenized_global_pos"] = sentences.x.cumsum()
    del sentences["x"]

    word_df = sentences.map_partitions(
        create_words_from_sentences,
        global_position_column="sentence_tokenized_global_pos",
    )

    # These files come from the official TPCx-BB kit
    # We extracted them from bigbenchqueriesmr.jar
    neg_sent_df = load_sentiment_words("negativeSentiment.txt", "NEG")
    pos_sent_df = load_sentiment_words("positiveSentiment.txt", "POS")

    sent_df = cudf.concat([pos_sent_df, neg_sent_df])
    sent_df = dask_cudf.from_cudf(sent_df, npartitions=1)

    word_sentence_sentiment = word_df.merge(sent_df, how="inner", on="word")

    temp = word_sentence_sentiment.merge(
        sentences,
        how="left",
        left_on="sentence_idx_global_pos",
        right_on="sentence_tokenized_global_pos",
    )

    temp = temp[["review_idx_global_pos", "word", "sentiment", "sentence"]]
    product_reviews_df = product_reviews_df[["pr_item_sk", "pr_review_sk"]]
    product_reviews_df["pr_review_sk"] = product_reviews_df[
        "pr_review_sk"].astype("int32")

    final = temp.merge(
        product_reviews_df,
        how="inner",
        left_on="review_idx_global_pos",
        right_on="pr_review_sk",
    )

    final = final.rename(
        columns={
            "pr_item_sk": "item_sk",
            "sentence": "review_sentence",
            "word": "sentiment_word",
        })
    keepcols = ["item_sk", "review_sentence", "sentiment", "sentiment_word"]
    final = final[keepcols].persist()
    # with sf100, there are 3.2M positive and negative review sentences (rows)
    final = final.sort_values(by=keepcols)
    final = final.persist()
    wait(final)
    return final
Example #8
def main(client, config):
    import cudf
    import dask_cudf

    date_dim_df, store_returns_df, web_returns_df, product_reviews_df = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )

    # filter date table
    date_dim_df = date_dim_df.merge(
        date_dim_df, on=["d_week_seq"], how="outer", suffixes=("", "_r")
    )
    date_dim_df = date_dim_df[date_dim_df.d_date_r.isin(q19_returns_dates)].reset_index(
        drop=True
    )

    date_dim_df = date_dim_df[["d_date_sk"]].drop_duplicates()
    sr_merged_df = store_returns_df.merge(
        date_dim_df,
        left_on=["sr_returned_date_sk"],
        right_on=["d_date_sk"],
        how="inner",
    )
    sr_merged_df = sr_merged_df[["sr_item_sk", "sr_return_quantity"]]
    sr_grouped_df = (
        sr_merged_df.groupby(["sr_item_sk"])
        .agg({"sr_return_quantity": "sum"})
        .reset_index()
        .rename(columns={"sr_return_quantity": "sr_item_qty"})
    )
    sr_grouped_df = sr_grouped_df[sr_grouped_df["sr_item_qty"] > 0]

    wr_merged_df = web_returns_df.merge(
        date_dim_df,
        left_on=["wr_returned_date_sk"],
        right_on=["d_date_sk"],
        how="inner",
    )
    wr_merged_df = wr_merged_df[["wr_item_sk", "wr_return_quantity"]]
    wr_grouped_df = (
        wr_merged_df.groupby(["wr_item_sk"])
        .agg({"wr_return_quantity": "sum"})
        .reset_index()
        .rename(columns={"wr_return_quantity": "wr_item_qty"})
    )
    wr_grouped_df = wr_grouped_df[wr_grouped_df["wr_item_qty"] > 0].reset_index(
        drop=True
    )

    sr_wr_merged_df = sr_grouped_df.merge(
        wr_grouped_df, left_on=["sr_item_sk"], right_on=["wr_item_sk"], how="inner"
    )
    sr_wr_merged_df = sr_wr_merged_df[["sr_item_sk", "sr_item_qty", "wr_item_qty"]]

    product_reviews_df = product_reviews_df[
        ~product_reviews_df.pr_review_content.isnull()
    ].reset_index(drop=True)

    product_reviews_df["pr_item_sk"] = product_reviews_df["pr_item_sk"].astype("int32")
    sr_wr_merged_df["sr_item_sk"] = sr_wr_merged_df["sr_item_sk"].astype("int32")

    merged_df = product_reviews_df.merge(
        sr_wr_merged_df, left_on=["pr_item_sk"], right_on=["sr_item_sk"], how="inner"
    )
    cols_keep = [
        "pr_item_sk",
        "pr_review_content",
        "pr_review_sk",
        "sr_item_qty",
        "wr_item_qty",
    ]
    merged_df = merged_df[cols_keep]
    merged_df["tolerance_flag"] = (
        (merged_df["sr_item_qty"] - merged_df["wr_item_qty"])
        / ((merged_df["sr_item_qty"] + merged_df["wr_item_qty"]) / 2)
    ).abs() <= 0.1
    merged_df = merged_df[merged_df["tolerance_flag"] == True].reset_index(drop=True)
    merged_df = merged_df[["pr_item_sk", "pr_review_content", "pr_review_sk"]]
    merged_df["pr_review_content"] = merged_df.pr_review_content.str.lower()
    merged_df["pr_review_content"] = merged_df.pr_review_content.str.replace(
        [".", "?", "!"], [eol_char], regex=False
    )

    sentences = merged_df.map_partitions(create_sentences_from_reviews)

    # need the global position in the sentence tokenized df
    sentences["x"] = 1
    sentences["sentence_tokenized_global_pos"] = sentences.x.cumsum()
    del sentences["x"]

    word_df = sentences.map_partitions(
        create_words_from_sentences,
        global_position_column="sentence_tokenized_global_pos",
    )

    # This file comes from the official TPCx-BB kit
    # We extracted it from bigbenchqueriesmr.jar
    sentiment_dir = "/".join(config["data_dir"].split("/")[:-3] + ["sentiment_files"])
    with open(f"{sentiment_dir}/negativeSentiment.txt") as fh:
        negativeSentiment = list(map(str.strip, fh.readlines()))
        # dedupe for one extra record in the source file
        negativeSentiment = list(set(negativeSentiment))

    sent_df = cudf.DataFrame({"word": negativeSentiment})
    sent_df["sentiment"] = "NEG"
    sent_df = dask_cudf.from_cudf(sent_df, npartitions=1)

    word_sentence_sentiment = word_df.merge(sent_df, how="inner", on="word")

    merged_df["pr_review_sk"] = merged_df["pr_review_sk"].astype("int32")

    temp = word_sentence_sentiment.merge(
        sentences,
        how="left",
        left_on="sentence_idx_global_pos",
        right_on="sentence_tokenized_global_pos",
    )

    temp = temp[["review_idx_global_pos", "word", "sentiment", "sentence"]]
    merged_df = merged_df[["pr_item_sk", "pr_review_sk"]]

    final = temp.merge(
        merged_df, how="inner", left_on="review_idx_global_pos", right_on="pr_review_sk"
    )
    final = final.rename(
        columns={
            "pr_item_sk": "item_sk",
            "sentence": "review_sentence",
            "word": "sentiment_word",
        }
    )
    keepcols = ["item_sk", "review_sentence", "sentiment", "sentiment_word"]
    final = final[keepcols]
    final = final.persist()
    final = final.sort_values(by=keepcols)
    wait(final)
    return final
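
A quick worked example of the tolerance_flag computation above, which keeps items whose store and web return quantities differ by at most 10% of their mean (the quantities here are made up):

sr_qty, wr_qty = 105, 100                                  # made-up return quantities
rel_diff = abs(sr_qty - wr_qty) / ((sr_qty + wr_qty) / 2)  # difference relative to the mean
print(round(rel_diff, 4))                                  # 0.0488
print(rel_diff <= 0.1)                                     # True -> the row is kept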
Example #9
    def _kneighbors(self, X, k):
        """
        Internal function to query the kNN model.
        :param X: dask array or dataframe of query points
        :param k: number of neighbors to query (defaults to self.n_neighbors)
        :return: list of futures of per-worker result dataframes
        """
        client = default_client()
        if k is None:
            k = self.n_neighbors

        # Break apart Dask.array/dataframe into chunks/parts
        data_parts = X.to_delayed()

        parts = list(map(delayed, data_parts))
        parts = client.compute(parts)  # Start computation in the background
        yield wait(parts)
        for part in parts:
            if part.status == 'error':
                yield part  # trigger error locally

        # A dict in the form of { part_key: part }
        key_to_part_dict = dict([(str(part.key), part) for part in parts])

        who_has = yield client.who_has(parts)

        worker_parts = {}
        for key, workers in who_has.items():
            worker = parse_host_port(first(workers))
            if worker not in worker_parts:
                worker_parts[worker] = []
            worker_parts[worker].append(key_to_part_dict[key])

        """
        Create IP Handles on each worker hosting input data
        """
        # Format of input_devarrays = ([(X, y)..], dev)
        input_devarrays = [(worker, client.submit(input_to_device_arrays, part,
                                                  {"k": k}, workers=[worker]))
                           for worker, part in worker_parts.items()]

        yield wait(input_devarrays)

        """
        Gather IPC handles for each worker and call _fit() on each worker
        containing data.
        """
        exec_node, model = self.model

        # Need to fetch coefficient parts on worker
        on_worker = list(filter(lambda x: x[0] == exec_node, input_devarrays))
        not_on_worker = list(filter(lambda x: x[0] != exec_node,
                                    input_devarrays))

        ipc_handles = [client.submit(get_input_ipc_handles, future,
                                     workers=[a_worker])
                       for a_worker, future in not_on_worker]

        raw_arrays = [future for a_worker, future in on_worker]

        # IPC Handles are loaded in separate threads on worker so they can be
        # used to make calls through cython

        run = client.submit(_kneighbors_on_worker, (ipc_handles, raw_arrays),
                            model, {"k": k}, workers=[exec_node])
        yield wait(run)

        dfs = [client.submit(build_dask_dfs, f, {"k": k}, workers=[worker])
               for worker, f in input_devarrays]
        yield wait(dfs)

        raise gen.Return(dfs)
Example #10
def bfs(graph, start, return_distances=False):
    """
    Find the distances and predecessors for a breadth first traversal of a
    graph.
    The input graph must contain the edge list as a dask-cudf dataframe with
    one partition per GPU.

    Parameters
    ----------
    graph : cugraph.DiGraph
        cuGraph graph descriptor, should contain the connectivity information
        as a dask cudf edge list dataframe (edge weights are not used for this
        algorithm). Undirected graphs are not currently supported.
    start : Integer
        Specify starting vertex for breadth-first search; this function
        iterates over edges in the component reachable from this node.

    return_distances : bool, optional, default=False
        Indicates if distances should be returned

    Returns
    -------
    df : cudf.DataFrame
        df['vertex'][i] gives the vertex id of the i'th vertex

        df['distance'][i] gives the path distance for the i'th vertex from the
        starting vertex (Only if return_distances is True)

        df['predecessor'][i] gives for the i'th vertex the vertex it was
        reached from in the traversal

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize()
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.DiGraph()
    >>> dg.from_dask_cudf_edgelist(ddf)
    >>> df = dcg.bfs(dg, 0)
    >>> Comms.destroy()
    """

    client = default_client()

    if (graph.local_data is not None and graph.local_data['by'] == 'src'):
        data = graph.local_data['data']
    else:
        data = get_local_data(graph, by='src')

    if graph.renumbered:
        start = graph.lookup_internal_vertex_id(cudf.Series([start])).compute()
        start = start.iloc[0]

    result = dict([(data.worker_info[wf[0]]["rank"],
                    client.submit(call_bfs,
                                  Comms.get_session_id(),
                                  wf[1],
                                  data.local_data,
                                  start,
                                  return_distances,
                                  workers=[wf[0]]))
                   for idx, wf in enumerate(data.worker_to_parts.items())])
    wait(result)

    df = result[0].result()

    if graph.renumbered:
        df = graph.unrenumber(df, 'vertex').compute()
        df = graph.unrenumber(df, 'predecessor').compute()
        df["predecessor"].fillna(-1, inplace=True)

    return df
Example #11
    def _fit(self, X, _transform=False):
        """
        Fit the model with X.

        Parameters
        ----------
        X : dask cuDF input

        """

        n_cols = X.shape[1]

        data = DistributedDataHandler.create(data=X, client=self.client)
        self.datatype = data.datatype

        if "svd_solver" in self.kwargs \
                and self.kwargs["svd_solver"] == "tsqr":
            comms = Comms(comms_p2p=True)
        else:
            comms = Comms(comms_p2p=False)

        comms.init(workers=data.workers)

        data.calculate_parts_to_sizes(comms)

        worker_info = comms.worker_info(comms.worker_addresses)
        parts_to_sizes, _ = parts_to_ranks(self.client, worker_info,
                                           data.gpu_futures)

        total_rows = data.total_rows

        models = dict([(data.worker_info[wf[0]]["rank"],
                        self.client.submit(self._create_model,
                                           comms.sessionId,
                                           self._model_func,
                                           self.datatype,
                                           **self.kwargs,
                                           pure=False,
                                           workers=[wf[0]]))
                       for idx, wf in enumerate(data.worker_to_parts.items())])

        pca_fit = dict([
            (wf[0],
             self.client.submit(DecompositionSyncFitMixin._func_fit,
                                models[data.worker_info[wf[0]]["rank"]],
                                wf[1],
                                total_rows,
                                n_cols,
                                parts_to_sizes,
                                data.worker_info[wf[0]]["rank"],
                                _transform,
                                pure=False,
                                workers=[wf[0]]))
            for idx, wf in enumerate(data.worker_to_parts.items())
        ])

        wait(list(pca_fit.values()))
        raise_exception_from_futures(list(pca_fit.values()))

        comms.destroy()

        self._set_internal_model(list(models.values())[0])

        if _transform:
            out_futures = flatten_grouped_results(self.client,
                                                  data.gpu_futures, pca_fit)
            return to_output(out_futures, self.datatype)

        return self
Example #12
    def fit(self, X, y):
        """
        Fit the input data with a Random Forest regression model

        IMPORTANT: X is expected to be partitioned with at least one partition
        on each Dask worker being used by the forest (self.workers).

        When persisting data, you can use
        cuml.dask.common.utils.persist_across_workers to simplify this::

            X_dask_cudf = dask_cudf.from_cudf(X_cudf, npartitions=n_workers)
            y_dask_cudf = dask_cudf.from_cudf(y_cudf, npartitions=n_workers)
            X_dask_cudf, y_dask_cudf = persist_across_workers(dask_client,
                                                              [X_dask_cudf,
                                                               y_dask_cudf])

        (this is equivalent to calling `persist` with the data and workers)::
            X_dask_cudf, y_dask_cudf = dask_client.persist([X_dask_cudf,
                                                            y_dask_cudf],
                                                           workers={
                                                           X_dask_cudf: workers,
                                                           y_dask_cudf: workers
                                                           })
        Parameters
        ----------
        X : dask_cudf.Dataframe
            Dense matrix (floats or doubles) of shape (n_samples, n_features).
            Features of training examples.

        y : dask_cudf.Dataframe
            Dense matrix (floats or doubles) of shape (n_samples, 1)
            Labels of training examples.
            y must be partitioned the same way as X
        """
        c = default_client()

        X_futures = workers_to_parts(c.sync(extract_ddf_partitions, X))
        y_futures = workers_to_parts(c.sync(extract_ddf_partitions, y))

        X_partition_workers = [w for w, xc in X_futures.items()]
        y_partition_workers = [w for w, xc in y_futures.items()]

        if set(X_partition_workers) != set(self.workers) or \
           set(y_partition_workers) != set(self.workers):
            raise ValueError("""
              X is not partitioned on the same workers expected by RF\n
              X workers: %s\n
              y workers: %s\n
              RF workers: %s
            """ % (str(X_partition_workers), str(y_partition_workers),
                   str(self.workers)))

        futures = list()
        for w, xc in X_futures.items():
            futures.append(
                c.submit(
                    RandomForestRegressor._fit,
                    self.rfs[w],
                    xc,
                    y_futures[w],
                    random.random(),
                    workers=[w],
                ))

        wait(futures)
        raise_exception_from_futures(futures)

        return self
Example #13
    def __init__(self,
                 n_estimators=10,
                 max_depth=-1,
                 max_features="auto",
                 n_bins=8,
                 split_algo=1,
                 split_criterion=2,
                 bootstrap=True,
                 bootstrap_features=False,
                 verbose=False,
                 min_rows_per_node=2,
                 rows_sample=1.0,
                 max_leaves=-1,
                 n_streams=4,
                 accuracy_metric="mse",
                 min_samples_leaf=None,
                 min_weight_fraction_leaf=None,
                 n_jobs=None,
                 max_leaf_nodes=None,
                 min_impurity_decrease=None,
                 min_impurity_split=None,
                 oob_score=None,
                 random_state=None,
                 warm_start=None,
                 class_weight=None,
                 quantile_per_tree=False,
                 criterion=None,
                 workers=None):

        unsupported_sklearn_params = {
            "criterion": criterion,
            "min_samples_leaf": min_samples_leaf,
            "min_weight_fraction_leaf": min_weight_fraction_leaf,
            "max_leaf_nodes": max_leaf_nodes,
            "min_impurity_decrease": min_impurity_decrease,
            "min_impurity_split": min_impurity_split,
            "oob_score": oob_score,
            "n_jobs": n_jobs,
            "random_state": random_state,
            "warm_start": warm_start,
            "class_weight": class_weight,
        }

        for key, vals in unsupported_sklearn_params.items():
            if vals is not None:
                raise TypeError(
                    "The Scikit-learn variable '{}' is not supported in cuML, "
                    "please read the cuML documentation for "
                    "more information".format(key)
                )

        self.n_estimators = n_estimators
        self.n_estimators_per_worker = list()

        c = default_client()
        if workers is None:
            workers = c.has_what().keys()
        self.workers = workers
        n_workers = len(workers)
        if n_estimators < n_workers:
            raise ValueError(
                "n_estimators cannot be lower than number of dask workers.")

        n_est_per_worker = math.floor(n_estimators / n_workers)

        for i in range(n_workers):
            self.n_estimators_per_worker.append(n_est_per_worker)

        remaining_est = n_estimators - (n_est_per_worker * n_workers)

        for i in range(remaining_est):
            self.n_estimators_per_worker[i] = (
                self.n_estimators_per_worker[i] + 1)

        seeds = list()
        seeds.append(0)
        for i in range(1, len(self.n_estimators_per_worker)):
            sd = self.n_estimators_per_worker[i - 1] + seeds[i - 1]
            seeds.append(sd)

        key = str(uuid1())
        self.rfs = {
            worker: c.submit(
                RandomForestRegressor._func_build_rf,
                self.n_estimators_per_worker[n],
                max_depth,
                n_streams,
                max_features,
                n_bins,
                split_algo,
                split_criterion,
                bootstrap,
                bootstrap_features,
                verbose,
                min_rows_per_node,
                rows_sample,
                max_leaves,
                accuracy_metric,
                quantile_per_tree,
                seeds[n],
                key="%s-%s" % (key, n),
                workers=[worker],
            )
            for n, worker in enumerate(workers)
        }

        rfs_wait = list()
        for r in self.rfs.values():
            rfs_wait.append(r)

        wait(rfs_wait)
        raise_exception_from_futures(rfs_wait)
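
A worked example of the estimator split computed in the constructor above (numbers chosen for illustration): 10 estimators over 3 workers gives a base of 3 per worker, and the single leftover tree goes to the first worker.

import math

n_estimators, n_workers = 10, 3
base = math.floor(n_estimators / n_workers)          # 3 estimators per worker
per_worker = [base] * n_workers                      # [3, 3, 3]
for i in range(n_estimators - base * n_workers):     # 1 estimator left over
    per_worker[i] += 1
print(per_worker)                                    # [4, 3, 3]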
Example #14
    def __init__(self, dask_client, op_constructor, op_args, chunks, **kwargs):
        """
        Dask Operator constructor
        
        :param dask_client: [no default] - DaskClient;
            client object to use when submitting tasks (see dask_util module)
        :param op_constructor: [no default] - pointer to function or list of pointers to functions;
            Pointer to constructor(s)
        :param op_args: [no default] - list;
            List containing lists of arguments to run the constructor.
            It can instantiate the same operator on multiple workers or different ones if requested
            by passing a list of list of arguments (e.g., [(arg1,arg2,arg3,...)])
            If op_kind = blocky the order is column wise
        :param chunks: [no default] - list;
            List defining how many operators are to be instantiated on each worker.
            Note, the list must contain the same number of elements as the number of
            Dask workers present in the DaskClient.
        :param op_kind: [diag] - string;
            Mode to run the Dask Operator:
            diag = block diagonal operator
            blocky = blocky operator (note: len(op_args) must be equal to np.sum(chunks)**2)
        :param setbackground_func_name: [None] - string;
            Name of the function to set the model point on which the Jacobian is computed.
            See NonLinearOperator in operator module.
        :param spread_op: [None] - DaskSpreadOp;
            Spreading operator to distribute a model vector to the set_background functions
        :param set_aux_name: [None] - string;
            Name of the function to set the auxiliary vector. Useful for VpOperator.
        :param spread_op_aux: [None] - DaskSpreadOp;
            Spreading operator to distribute an auxiliary vector to the set_aux functions
        """
        # Client to submit tasks
        if not isinstance(dask_client, DaskClient):
            raise TypeError("Passed client is not a Dask Client object!")
        if not isinstance(op_args, list):
            raise TypeError("Passed operator arguments not a list!")
        self.dask_client = dask_client
        self.client = self.dask_client.getClient()
        wrkIds = self.dask_client.getWorkerIds()
        N_wrk = self.dask_client.getNworkers()
        # Check if number of provided chunks is the same as workers
        if len(chunks) != N_wrk:
            raise ValueError(
                "Number of provide chunks (%s) different than the number of workers (%s)"
                % (len(chunks), N_wrk))
        # Check whether it is a blocky or block diagonal Dask operator
        self.op_kind = kwargs.get("op_kind", "diag")
        if self.op_kind not in "diag blocky":
            raise ValueError("Unknown op_kind provided (%s)" % self.op_kind)
        # Check if many arguments are passed to construct different operators
        N_args = len(op_args)
        N_ops = int(np.sum(chunks)) if self.op_kind == "diag" else int(
            np.sum(chunks))**2
        if N_args > 1:
            if N_args != N_ops:
                raise ValueError(
                    "Number of lists of arguments (%s) different than the number of requested operators (%s)"
                    % (N_args, N_ops))
        else:
            if N_ops > 1:
                op_args = [op_args for ii in range(N_ops)]

        # Instantiation of the operators on each worker
        self.dask_ops = []
        self.dask_ops_adj = []
        # Check if a list of constructors has been passed
        if isinstance(op_constructor, list):
            opt_list = op_constructor
        else:
            opt_list = [op_constructor] * N_ops
        self.n_col = 1 if self.op_kind == "diag" else int(np.sum(chunks))
        # Creating list of adjoint operators
        if self.n_col > 1:
            opt_list_adj = opt_list.copy()
            op_args_adj = op_args.copy()
            # Creating adjoint operators
            for iwrk, wrkId in enumerate(wrkIds):
                for iop in range(chunks[iwrk]):
                    for i_col in range(self.n_col):
                        self.dask_ops_adj.append(
                            self.client.submit(call_constructor,
                                               opt_list_adj.pop(0),
                                               op_args_adj.pop(0),
                                               workers=[wrkId],
                                               pure=False))
        # Creating forward operators
        for i_col in range(self.n_col):
            for iwrk, wrkId in enumerate(wrkIds):
                for iop in range(chunks[iwrk]):
                    self.dask_ops.append(
                        self.client.submit(call_constructor,
                                           opt_list.pop(0),
                                           op_args.pop(0),
                                           workers=[wrkId],
                                           pure=False))
        daskD.wait(self.dask_ops)
        # Checking for errors during operators construction
        for idx, fut in enumerate(self.dask_ops):
            if fut.status == 'error':
                print("Error for dask operator %s" % idx)
                print(fut.result())
        # Creating domain and range of the Dask operator
        dom_vecs = []  # List of remote domain vectors
        rng_vecs = []  # List of remote range vectors
        op_list = self.dask_ops
        if self.n_col > 1:
            # Dealing with a blocky operator
            op_list = np.diag(
                np.asarray(self.dask_ops).reshape((self.n_col, self.n_col)).T)
        for op in op_list:
            dom_vecs.append(self.client.submit(call_getDomain, op, pure=False))
            rng_vecs.append(self.client.submit(call_getRange, op, pure=False))
        daskD.wait(dom_vecs + rng_vecs)
        _check_dask_error(dom_vecs + rng_vecs)
        self.domain = DaskVector(self.dask_client, dask_vectors=dom_vecs)
        self.range = DaskVector(self.dask_client, dask_vectors=rng_vecs)
        # Set background function name "necessary for non-linear operator Jacobian"
        self.set_background_name = kwargs.get("setbackground_func_name", None)
        if self.set_background_name:
            if self.op_kind != "diag":
                raise ValueError(
                    "Set background not currently supported for blocky operators"
                )
            if not isinstance(self.set_background_name, list):
                self.set_background_name = [self.set_background_name] * len(
                    self.dask_ops)
            # Creating a spreading operator to distribute the model vector to the set_background functions
            self.Sprd = kwargs.get("spread_op", None)
            if self.Sprd:
                if not isinstance(self.Sprd, DaskSpread):
                    raise TypeError(
                        "Provided spread_op not a DaskSpreadOp class!")
                self.model_tmp = self.Sprd.getRange().clone()
        # Set aux function name "necessary for VP operator"
        self.set_aux_name = kwargs.get("set_aux_name", None)
        if self.set_aux_name:
            if self.op_kind != "diag":
                raise ValueError(
                    "set_aux_name not currently supported for blocky operators"
                )
            if not isinstance(self.set_aux_name, list):
                self.set_aux_name = [self.set_aux_name] * len(self.dask_ops)
            # Creating a spreading operator to distribute the auxiliary vector to the set_aux functions
            self.SprdAux = kwargs.get("spread_op_aux", None)
            if self.SprdAux:
                if not isinstance(self.SprdAux, DaskSpread):
                    raise TypeError(
                        "Provided spread_op_aux not a DaskSpreadOp class!")
                self.tmp_aux = self.SprdAux.getRange().clone()
        return
Example #15
def _train(client,
           params,
           data,
           labels,
           sample_weight,
           dmatrix_kwargs={},
           **kwargs):
    """
    Asynchronous version of train

    See Also
    --------
    train
    """
    # Break apart Dask.array/dataframe into chunks/parts
    data_parts = data.to_delayed()
    label_parts = labels.to_delayed()
    if isinstance(data_parts, np.ndarray):
        assert data_parts.shape[1] == 1
        data_parts = data_parts.flatten().tolist()
    if isinstance(label_parts, np.ndarray):
        assert label_parts.ndim == 1 or label_parts.shape[1] == 1
        label_parts = label_parts.flatten().tolist()

    if sample_weight is not None:
        sample_weight_parts = sample_weight.to_delayed()
        if isinstance(sample_weight_parts, np.ndarray):
            assert sample_weight_parts.ndim == 1 or sample_weight_parts.shape[
                1] == 1
            sample_weight_parts = sample_weight_parts.flatten().tolist()

        # Arrange parts into pairs.  This enforces co-locality
        parts = list(
            map(delayed, zip(data_parts, label_parts, sample_weight_parts)))
    else:
        # Arrange parts into pairs.  This enforces co-locality
        parts = list(map(delayed, zip(data_parts, label_parts)))

    parts = client.compute(parts)  # Start computation in the background
    yield wait(parts)

    for part in parts:
        if part.status == 'error':
            yield part  # trigger error locally

    # Because XGBoost-python doesn't yet allow iterative training, we need to
    # find the locations of all chunks and map them to particular Dask workers
    key_to_part_dict = dict([(part.key, part) for part in parts])
    who_has = yield client.scheduler.who_has(keys=[part.key for part in parts])
    worker_map = defaultdict(list)
    for key, workers in who_has.items():
        worker_map[first(workers)].append(key_to_part_dict[key])

    ncores = yield client.scheduler.ncores()  # Number of cores per worker

    # Start the XGBoost tracker on the Dask scheduler
    host, port = parse_host_port(client.scheduler.address)
    env = yield client._run_on_scheduler(start_tracker, host.strip('/:'),
                                         len(worker_map))

    # Tell each worker to train on the chunks/parts that it has locally
    futures = [
        client.submit(train_part,
                      env,
                      assoc(params, 'nthread', ncores[worker]),
                      list_of_parts,
                      workers=worker,
                      dmatrix_kwargs=dmatrix_kwargs,
                      **kwargs)
        for worker, list_of_parts in worker_map.items()
    ]

    # Get the results, only one will be non-None
    results = yield client._gather(futures)
    result = [v for v in results if v][0]
    num_class = params.get("num_class")
    if num_class:
        result.set_attr(num_class=str(num_class))
    raise gen.Return(result)
Example #16
def main(client, settings):
    worldSize = MPI.COMM_WORLD.Get_size() - 2

    global start

    # Setup
    with h5py.File(settings['databasePath'], 'r') as h5pyFile:
        database = SVDatabase(h5pyFile, settings['refStruct'], args.names)
        wait(database.load(h5pyFile))

        names = list(database.attrs['structNames'])
        random.shuffle(names)

        splits = np.array_split(names, worldSize)

        from svreg.database import worker_load

        futures = client.map(
            worker_load,
            [settings['databasePath']] * worldSize,
            splits,
            [database.attrs['svNames']] * worldSize,
            [database.attrs['elements']] * worldSize,
            [settings['allSums']] * worldSize,
        )

        client.gather(client.compute(futures))

        evaluator = SVEvaluator(database, settings)

    regressor = SVRegressor(settings, database)
    archive = Archive(os.path.join(settings['outputPath'], 'archive'))

    costFxn = buildCostFunction(settings, len(database.attrs['natoms']),
                                sum(database.attrs['natoms'].values()))

    # Begin symbolic regression
    if args.trees is not None:
        with open(args.trees, 'r') as f:
            treeNames = [s.strip() for s in f.readlines()]

        regressor.trees = [
            MCTree.from_str(t, database.attrs['elements'],
                            regressor.svNodePool) for t in treeNames
        ]

    regressor.initializeTrees(elements=database.attrs['elements'])
    regressor.initializeOptimizers()

    print("Currently optimizing:")

    for pidx, t in enumerate(regressor.trees):
        print(pidx, t)

    print()
    print()

    N = settings['optimizerPopSize']

    rawPopulations = None
    errors = None
    costs = None

    population = Population(settings, regressor.svNodePool,
                            database.attrs['elements'])

    numCompletedTrees = 0
    maxNumTrees = settings['numRegressorSteps'] * settings['numberOfTrees']

    start = time.time()
    fxnEvals = 1
    while numCompletedTrees < maxNumTrees:

        # Remove any converged trees, update population, and print new results
        staleIndices, messages = regressor.checkStale()

        populationChanged = False

        # A tree has finished optimizing
        for staleIdx, staleMessage in zip(staleIndices, messages):
            candidate = regressor.trees[staleIdx]
            opt = regressor.optimizers[staleIdx]

            candidate.cost = opt.result.fbest

            # TODO: this might not agree perfectly with opt.result.xbest
            candidateParamsIdx = np.argmin(costs[staleIdx])
            # candidate.cost      = costs[staleIdx][candidateParamsIdx]
            err = errors[staleIdx][candidateParamsIdx]

            print()
            print()
            print("Completed tree {}:".format(staleIdx))
            print("\t", candidate.cost, candidate)
            print("Stopping criterion:", staleMessage)

            numCompletedTrees += 1

            # Log completed tree
            archive.update(candidate, candidate.cost, err, opt.result.xbest,
                           opt)

            archive.log()

            # Randomly insert into current population
            inserted = population.attemptInsert(candidate)

            if inserted:
                populationChanged = True

            # Replace completed tree with new tree

            # Make sure new tree isn't already in archive or active population
            currentRegNames = [md5Hash(t) for t in regressor.trees]

            newTree, parent1, parent2 = population.newIndividual()

            generatedNew = False
            while not generatedNew:

                inArchive = False
                inReg = False

                for t in regressor.trees:
                    if newTree == t:
                        inReg = True

                for tname in archive:
                    t = archive[tname].tree

                    if newTree == t:
                        inArchive = True

                if inArchive:
                    print("Already in archive:", newTree)
                elif inReg:
                    print("Already being optimized:", newTree)
                else:
                    generatedNew = True

                if not generatedNew:
                    newTree, parent1, parent2 = population.newIndividual()

            print("New tree:")
            print('\t', parent1)
            print('\t+')
            print('\t', parent2)
            print('\t=')
            print('\t', newTree)

            # Insert new tree into list of trees being optimized
            argsCopy = deepcopy(regressor.optimizerArgs)
            path = os.path.join(settings['outputPath'], 'outcmaes', '{}/')
            d = {'verb_filenameprefix': path.format(md5Hash(newTree))}
            d.update(regressor.optimizerArgs[-1])
            argsCopy[-1] = d

            newOpt = regressor.optimizer(newTree.populate(N=1)[0], *argsCopy)

            regressor.trees[staleIdx] = newTree
            regressor.optimizers[staleIdx] = newOpt

        if staleIndices:
            if populationChanged:
                # Print current population if it was updated
                print()
                print()
                print("Current population:")

                popCosts = [t.cost for t in population]
                argsort = np.argsort(popCosts)

                for idx in argsort:
                    print(population[idx].cost, population[idx])

                print()
            else:
                print()
                print()
                print("No new fitted trees were added to the population.")
                print()

        if staleIndices:
            print()
            print("Currently optimizing:")

            for pidx, t in enumerate(regressor.trees):
                print(pidx, t)
            print()
            print()

        # Continue optimization of currently active trees
        populationDict, rawPopulations = regressor.generatePopulationDict(N)

        graph, keys = evaluator.evaluate(regressor.trees,
                                         populationDict,
                                         N,
                                         worldSize,
                                         settings['allSums'],
                                         useGPU=settings['useGPU'])

        perWorkerResults = client.get(graph, keys,
                                      direct=True)  #, resources={'GPU': 1})

        perStructResults, perStructNames = zip(*perWorkerResults)

        perStructResults = list(
            itertools.chain.from_iterable(perStructResults))
        perStructNames = list(itertools.chain.from_iterable(perStructNames))

        perStructResults = [
            x for _, x in sorted(zip(perStructNames, perStructResults))
        ]

        energies = {struct: [] for struct in database.attrs['structNames']}
        forces = {struct: [] for struct in database.attrs['structNames']}

        counter = 0
        for struct in database.attrs['structNames']:
            res = perStructResults[counter]
            energies[struct] = [s[0] for s in res]
            forces[struct] = [s[1] for s in res]
            counter += 1

        # Save the (per-struct) errors and the single-value costs
        errors = computeErrors(settings['refStruct'], energies, forces,
                               database)

        costs = costFxn(errors)

        # Add ridge regression penalty
        penalties = np.array([
            np.linalg.norm(pop, axis=1) * settings['ridgePenalty']
            for pop in rawPopulations
        ])

        # Update optimizers
        regressor.updateOptimizers(rawPopulations, costs, penalties)

        printTreeCosts(fxnEvals,
                       [opt.result.fbest for opt in regressor.optimizers],
                       penalties, start)

        fxnEvals += 1

    print('Done')
Example #17
def launch_python_post():
    curDir = os.path.dirname(os.path.abspath(__file__))
    logger = PyPostTools.pyPostLogger()

    logger.write("Initializing WRF Python Post-Processing Program")
    #Step 1: Load program settings
    logger.write(" 1. Application Initalization")
    logger.write("  - Loading control file, python_post_control.txt")
    _pySet = PyPostSettings.PyPostSettings()
    logger.write("  - Success!")
    logger.write("  - Testing Environmental Variables")
    try:
        dask_nodes = os.environ["PYTHON_POST_NODES"]
        dask_threads = os.environ["PYTHON_POST_THREADS"]
        postDir = os.environ["PYTHON_POST_DIR"]
        targetDir = os.environ["PYTHON_POST_TARG_DIR"]
    except KeyError:
        logger.write(
            "***FAIL*** KeyError encountered while trying to access important environmental variables, abort."
        )
        sys.exit("")
    logger.write("  - Success!")
    logger.write("  - Initializing Dask (" + str(dask_nodes) +
                 " Nodes Requested), Collecting routines needed")
    _routines = Routines.Routines()
    # Start Dask Tasks
    #cLoop = IOLoop.current()
    #t = Thread(target = cLoop.start, daemon = True)
    #t.start()

    logger.write("   - Async IO Loop initialized...")

    async def f(port):
        s = Scheduler(port=port)
        s = await s
        await s.finished()
        return 1

    asyncio.gather(f(scheduler_port))

    #asyncio.get_event_loop().run_until_complete(f(scheduler_port))

    logger.write("   - Dask Scheduler initialized (Port " +
                 str(scheduler_port) + ")...")
    dask_client = Client("tcp://" + socket.gethostname() + ":" +
                         str(scheduler_port))
    logger.write("   - Dask Client initialized...")
    logger.write("   - Writing Dask Worker Job Files...")
    with PyPostTools.cd(targetDir):
        writeFile1 = PyPostTools.write_job_file(socket.gethostname(),
                                                scheduler_port,
                                                project_name="Nowcast",
                                                queue="default",
                                                nodes=dask_nodes,
                                                wall_time=60,
                                                nProcs=1)
        writeFile2 = PyPostTools.write_worker_file(socket.gethostname(),
                                                   scheduler_port,
                                                   nProcs=1)
        if not writeFile1 or not writeFile2:
            dask_client.close()
            logger.write(
                "   - Failed to write job files, are you missing an important parameter?"
            )
            sys.exit("")
            return
        else:
            logger.write(
                "   - Dask Worker Job File Written, Submitting to Queue.")
            PyPostTools.popen("chmod +x launch-worker.sh")
            PyPostTools.popen("chmod +x dask-worker.job")
            PyPostTools.popen("qsub dask-worker.job")
    # Wait here for workers.
    logger.write("   -> Worker Job submitted to queue, waiting for workers...")
    while len(dask_client.scheduler_info()['workers']) < int(dask_nodes):
        time.sleep(2)
    logger.write("   -> Workers are now connected.")
    #logger.write("   - Adding local packages to dask workers")
    #dask_client.upload_file("PyPostTools.py")
    #dask_client.upload_file("ArrayTools.py")
    #dask_client.upload_file("Calculation.py")
    #dask_client.upload_file("ColorMaps.py")
    #dask_client.upload_file("Conversions.py")
    #dask_client.upload_file("Plotting.py")
    #dask_client.upload_file("PyPostSettings.py")
    #dask_client.upload_file("Routines.py")
    logger.write("  - Success!")
    logger.write(" 1. Done.")
    logger.write(" 2. Start Post-Processing Calculations")
    calculation_future = start_calculations(dask_client, _routines,
                                            dask_threads)
    if calculation_future is not None:
        wait(calculation_future)
        result_calc = dask_client.gather(calculation_future)[0]
        if (result_calc != 0):
            logger.write(
                "***FAIL*** An error occured in calculations method, check worker logs for more info."
            )
            logger.close()
            sys.exit("")
    logger.write(" 2. Done.")
    logger.write(" 3. Generating Figures")
    logger.write("  - Collecting files from target directory (" + targetDir +
                 ").")
    fList3 = sorted(glob.glob(targetDir + "WRFPRS_F*"))
    logger.write("  - " + str(len(fList3)) + " files have been found.")
    logger.write(" -> Pushing run_plotting_routines() to dask.")
    fullDict = _pySet.get_full_dict()
    plotting_future = start_plotting(dask_client, fullDict, dask_threads)
    wait(plotting_future)
    result_plot = dask_client.gather(plotting_future)[0]
    if (result_plot != 0):
        logger.write(
            "***FAIL*** An error occured in plotting method, check worker logs for more info."
        )
        logger.close()
        sys.exit("")
    logger.write(" 3. Done.")
    logger.write(" 4. Final Steps")

    logger.write(" 4. Done, Closing Dask Client.")
    dask_client.retire_workers(workers=dask_client.scheduler_info()['workers'],
                               close=True)
    dask_client.close()
    logger.write("All Steps Completed.")
    logger.write("***SUCCESS*** Program execution complete.")
    logger.close()
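The polling loop above, which blocks until the scheduler reports the requested number of workers, is a common pattern when workers arrive through a batch queue (qsub here). A self-contained sketch of that idiom; the LocalCluster merely stands in for queue-launched workers, and recent dask.distributed versions also provide Client.wait_for_workers(n) for the same purpose:

import time

from dask.distributed import Client, LocalCluster

def wait_for_n_workers(client, n_workers, poll_seconds=2):
    """Block until the scheduler reports at least n_workers connected."""
    while len(client.scheduler_info()['workers']) < n_workers:
        time.sleep(poll_seconds)

# LocalCluster is a stand-in for workers submitted to a batch queue.
cluster = LocalCluster(n_workers=2, threads_per_worker=1)
client = Client(cluster)

wait_for_n_workers(client, n_workers=2)
print("Workers connected:", len(client.scheduler_info()['workers']))

client.close()
cluster.close()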
Exemple #18
0
def pagerank(input_graph,
             alpha=0.85,
             personalization=None,
             max_iter=100,
             tol=1.0e-5,
             nstart=None):
    """
    Find the PageRank values for each vertex in a graph using multiple GPUs.
    cuGraph computes an approximation of the PageRank using the power method.
    The input graph must contain an edge list as a dask_cudf dataframe with
    one partition per GPU.

    Parameters
    ----------
    input_graph : cugraph.DiGraph
        cuGraph graph descriptor; should contain the connectivity information
        as a dask_cudf edge list dataframe (edge weights are not used for this
        algorithm). Undirected graphs are not currently supported.
    alpha : float
        The damping factor alpha represents the probability of following an
        outgoing edge; the standard value is 0.85.
        Thus, 1.0 - alpha is the probability of teleporting to a random vertex.
        Alpha should be greater than 0.0 and strictly lower than 1.0.
    personalization : cudf.Dataframe
        GPU Dataframe containing the personalization information.
        Currently not supported.
        personalization['vertex'] : cudf.Series
            Subset of vertices of graph for personalization
        personalization['values'] : cudf.Series
            Personalization values for vertices
    max_iter : int
        The maximum number of iterations before an answer is returned.
        If this value is less than or equal to 0, cuGraph will use the default
        value, which is 30.
    tol : float
        Sets the tolerance of the approximation; this parameter should be a
        small value.
        The lower the tolerance, the better the approximation. If this value is
        0.0f, cuGraph will use the default value, which is 1.0E-5.
        Setting too small a tolerance can lead to non-convergence due to
        numerical roundoff. Values between 0.01 and 0.00001 are usually
        acceptable.
    nstart : not supported
        Initial guess for PageRank. Currently not supported.
    Returns
    -------
    PageRank : dask_cudf.DataFrame
        GPU data frame containing two dask_cudf.Series of size V: the
        vertex identifiers and the corresponding PageRank values.

        ddf['vertex'] : dask_cudf.Series
            Contains the vertex identifiers
        ddf['pagerank'] : dask_cudf.Series
            Contains the PageRank score

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize(p2p=True)
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.DiGraph()
    >>> dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
                                   edge_attr='value')
    >>> pr = dcg.pagerank(dg)
    >>> Comms.destroy()
    """
    from cugraph.structure.graph import null_check

    if personalization is not None:
        raise Exception("Personalization not supported")

    nstart = None

    client = default_client()

    input_graph.compute_renumber_edge_list(transposed=True)
    (ddf, num_verts, partition_row_size, partition_col_size,
     vertex_partition_offsets) = shuffle(input_graph, transposed=True)
    num_edges = len(ddf)
    data = get_distributed_data(ddf)

    if personalization is not None:
        null_check(personalization["vertex"])
        null_check(personalization["values"])
        if input_graph.renumbered is True:
            personalization = input_graph.add_internal_vertex_id(
                personalization, "vertex", "vertex").compute()

    result = [
        client.submit(call_pagerank,
                      Comms.get_session_id(),
                      wf[1],
                      num_verts,
                      num_edges,
                      vertex_partition_offsets,
                      alpha,
                      max_iter,
                      tol,
                      personalization,
                      nstart,
                      workers=[wf[0]])
        for idx, wf in enumerate(data.worker_to_parts.items())
    ]
    wait(result)
    ddf = dask_cudf.from_delayed(result)
    if input_graph.renumbered:
        return input_graph.unrenumber(ddf, 'vertex')

    return ddf
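The docstring above notes that cuGraph approximates PageRank with the power method. Purely as an illustration of that idea (a dense NumPy toy, not cuGraph's multi-GPU implementation, and with a simplified treatment of dangling vertices), the per-iteration update looks roughly like this:

import numpy as np

def pagerank_power_iteration(adj, alpha=0.85, max_iter=100, tol=1.0e-5):
    """Toy dense power iteration: pr = alpha * M @ pr + (1 - alpha) / n."""
    n = adj.shape[0]
    out_degree = adj.sum(axis=1)
    out_degree[out_degree == 0] = 1          # avoid division by zero for sinks
    M = (adj / out_degree[:, None]).T        # column-stochastic transition matrix
    pr = np.full(n, 1.0 / n)
    for _ in range(max_iter):
        new_pr = alpha * M @ pr + (1.0 - alpha) / n
        if np.abs(new_pr - pr).sum() < tol:
            break
        pr = new_pr
    return pr

# Tiny 4-vertex directed example.
adj = np.array([[0, 1, 1, 0],
                [0, 0, 1, 0],
                [1, 0, 0, 1],
                [0, 0, 1, 0]], dtype=float)
print(pagerank_power_iteration(adj))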
def test_end_to_end():

    cluster = LocalCUDACluster(threads_per_worker=1)
    client = Client(cluster)

    # NOTE: The LocalCUDACluster needs to be started before any imports that
    # could potentially create a CUDA context.

    import dask_cudf

    import cudf
    import numpy as np

    from dask_cuml.neighbors import NearestNeighbors as cumlKNN

    def create_df(f, m, n):
        X = np.random.rand(m, n)
        ret = cudf.DataFrame(
            [(i, X[:, i].astype(np.float32)) for i in range(n)],
            index=cudf.dataframe.RangeIndex(f * m, f * m + m, 1))
        return ret

    def get_meta(df):
        ret = df.iloc[:0]
        return ret

    # Per gpu/worker
    train_m = 500
    train_n = 25

    search_m = 10
    search_k = 15

    workers = client.has_what().keys()

    # Create dfs on each worker (gpu)
    dfs = [
        client.submit(create_df, n, train_m, train_n, workers=[worker])
        for worker, n in list(zip(workers, list(range(len(workers)))))
    ]

    # Wait for completion
    wait(dfs)

    meta = client.submit(get_meta, dfs[0]).result()

    X_df = dask_cudf.from_delayed(dfs, meta=meta)
    X_pd = X_df.compute().to_pandas()

    cumlNN = cumlKNN()
    cumlNN.fit(X_df)

    sklNN = NearestNeighbors(metric="sqeuclidean")
    sklNN.fit(X_pd)

    cuml_D, cuml_I = cumlNN.kneighbors(X_df[0:search_m - 1], search_k)
    sk_D, sk_I = sklNN.kneighbors(X_pd[0:search_m], search_k)

    cuml_I_nd = np.array(cuml_I.compute().as_gpu_matrix(), dtype=sk_I.dtype)
    cuml_D_nd = np.array(cuml_D.compute().as_gpu_matrix(), dtype=sk_D.dtype)

    print(str(cuml_D_nd.dtype))
    print(str(sk_D.dtype))

    assert np.array_equal(cuml_I_nd, sk_I)
    assert np.allclose(cuml_D_nd, sk_D, atol=1e-5)

    cluster.close()
Exemple #20
0
def louvain(input_graph, max_iter=100, resolution=1.0, load_balance=True):
    """
    Compute the modularity optimizing partition of the input graph using the
    Louvain method on multiple GPUs

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize()
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv('datasets/karate.csv', chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.Graph()
    >>> dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
                                   edge_attr='value')
    >>> parts, modularity_score = dcg.louvain(dg)
    """
    # FIXME: finish docstring: describe parameters, etc.

    # FIXME: import here to prevent circular import: cugraph->louvain
    # wrapper->cugraph/structure->cugraph/dask->dask/louvain->cugraph/structure
    # from cugraph.structure.graph import Graph

    # FIXME: dask methods to populate graphs from edgelists are only present on
    # DiGraph classes. Disable the Graph check for now and assume inputs are
    # symmetric DiGraphs.
    # if type(graph) is not Graph:
    #     raise Exception("input graph must be undirected")

    client = default_client()
    # Calling renumbering results in data that is sorted by degree
    input_graph.compute_renumber_edge_list(transposed=False)
    sorted_by_degree = True
    (ddf, num_verts, partition_row_size, partition_col_size,
     vertex_partition_offsets) = shuffle(input_graph, transposed=False)
    num_edges = len(ddf)
    data = get_distributed_data(ddf)

    result = dict([(data.worker_info[wf[0]]["rank"],
                    client.submit(call_louvain,
                                  Comms.get_session_id(),
                                  wf[1],
                                  num_verts,
                                  num_edges,
                                  partition_row_size,
                                  partition_col_size,
                                  vertex_partition_offsets,
                                  sorted_by_degree,
                                  max_iter,
                                  resolution,
                                  workers=[wf[0]]))
                   for idx, wf in enumerate(data.worker_to_parts.items())])

    wait(result)

    (parts, modularity_score) = result[0].result()

    if input_graph.renumbered:
        # MG renumbering is lazy, but it's safe to assume it's been called at
        # this point if renumbered=True
        parts = input_graph.unrenumber(parts, "vertex")

    return parts, modularity_score
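Both pagerank and louvain above follow the same multi-GPU pattern: one task per worker, pinned to that worker with workers=[addr], then wait and collect the results. A generic dask.distributed sketch of that submission pattern, with a dummy per-partition function standing in for the cuGraph calls and an invented worker-to-data mapping:

from dask.distributed import Client, LocalCluster, wait

def process_partition(part_id, payload):
    # Stand-in for call_pagerank / call_louvain on one worker's partition.
    return part_id, sum(payload)

cluster = LocalCluster(n_workers=2, threads_per_worker=1)
client = Client(cluster)

# Hypothetical mapping of worker address -> that worker's local data.
worker_to_parts = {addr: list(range(5))
                   for addr in client.scheduler_info()['workers']}

futures = [
    client.submit(process_partition, idx, parts, workers=[addr])
    for idx, (addr, parts) in enumerate(worker_to_parts.items())
]
wait(futures)
print([f.result() for f in futures])

client.close()
cluster.close()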
Exemple #21
0
        return dask_xgboost.predict(client, bst, X)

    # Create a context
    from dask_sql import Context, run_server
    c = Context()

    c.register_function(predict_price, "predict_price",
                        [("total_amount", np.float64),
                         ("trip_distance", np.float64),
                         ("passenger_count", np.float64)], np.float64)

    # Load the data from S3
    df = dd.read_csv("s3://nyc-tlc/trip data/yellow_tripdata_2019-01.csv",
                     dtype={
                         "payment_type": "UInt8",
                         "VendorID": "UInt8",
                         "passenger_count": "UInt8",
                         "RatecodeIDq": "UInt8",
                     },
                     storage_options={
                         "anon": True
                     }).persist()

    wait(df)

    c.create_table("nyc-taxi", df)

    c.sql("SELECT 1 + 1").compute()

    # Finally, spin up the dask-sql server
    run_server(context=c, client=client)
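Once predict_price is registered with the Context, it can be invoked from SQL like a built-in function. A hedged usage sketch against the table created above, meant to run before the blocking run_server call; the CAST is a precaution because passenger_count was read as UInt8 but registered as float64:

# Assumes the Context `c` from above, with "nyc-taxi" already registered.
result = c.sql("""
    SELECT
        predict_price(
            total_amount,
            trip_distance,
            CAST(passenger_count AS DOUBLE)
        ) AS predicted_price
    FROM "nyc-taxi"
    LIMIT 10
""")
print(result.compute())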
Exemple #22
0
def test_score(nrows, ncols, nclusters, n_parts, input_type, cluster):

    client = None

    try:

        client = Client(cluster)
        from cuml.dask.cluster import KMeans as cumlKMeans

        from cuml.dask.datasets import make_blobs

        X, y = make_blobs(n_samples=int(nrows),
                          n_features=ncols,
                          centers=nclusters,
                          n_parts=n_parts,
                          cluster_std=0.01,
                          shuffle=False,
                          random_state=10)

        wait(X)
        if input_type == "dataframe":
            X_train = to_dask_cudf(X)
            y_train = to_dask_cudf(y)
            y = y_train
        elif input_type == "array":
            X_train, y_train = X, y

        cumlModel = cumlKMeans(init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)

        cumlModel.fit(X_train)

        actual_score = cumlModel.score(X_train)

        predictions = cumlModel.predict(X_train).compute()

        if input_type == "dataframe":
            X = cp.array(X_train.compute().as_gpu_matrix())
            predictions = cp.array(predictions)

            centers = cp.array(cumlModel.cluster_centers_.as_gpu_matrix())
        elif input_type == "array":
            X = X_train.compute()
            centers = cumlModel.cluster_centers_

        expected_score = 0
        for idx, label in enumerate(predictions):

            x = X[idx]
            y = centers[label]

            dist = cp.sqrt(cp.sum((x - y)**2))
            expected_score += dist**2

        assert actual_score + SCORE_EPS \
            >= (-1 * expected_score) \
            >= actual_score - SCORE_EPS

    finally:
        client.close()
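The expected_score loop above is just the K-means inertia: the sum of squared distances from each point to its assigned center, which the assertions compare against the negated model score. A vectorized CuPy equivalent of that loop, under the same array assumptions as the test:

import cupy as cp

def kmeans_inertia(X, centers, labels):
    """Sum of squared distances from each row of X to its assigned center."""
    diffs = X - centers[labels]
    return float(cp.sum(diffs ** 2))

# In the test above, expected_score corresponds to
# kmeans_inertia(X, centers, predictions).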
Exemple #23
0
def main(client, config):
    store_sales_df, store_returns_df = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )

    n_workers = len(client.scheduler_info()["workers"])

    ### going via repartition for split_out drop duplicates
    ### see issue: https://github.com/rapidsai/tpcx-bb-internal/issues/492

    unique_sales = store_sales_df[[
        "ss_ticket_number", "ss_customer_sk"
    ]].map_partitions(lambda df: df.drop_duplicates())
    unique_sales = unique_sales.shuffle(on=["ss_customer_sk"])
    unique_sales = unique_sales.map_partitions(lambda df: df.drop_duplicates())

    unique_sales = unique_sales.persist()
    wait(unique_sales)

    orders_count = (unique_sales.groupby(by="ss_customer_sk").agg({
        "ss_ticket_number":
        "count"
    }).reset_index())

    orders_df = (store_sales_df.groupby(by="ss_customer_sk").agg({
        "ss_item_sk":
        "count",
        "ss_net_paid":
        "sum"
    }).reset_index())

    ### free up memory no longer needed
    del store_sales_df

    orders_df = orders_df.merge(orders_count, how="inner", on="ss_customer_sk")
    orders_df = orders_df.rename(
        columns={
            "ss_customer_sk": "user_sk",
            "ss_ticket_number": "orders_count",
            "ss_item_sk": "orders_items",
            "ss_net_paid": "orders_money",
        })

    orders_df = orders_df.persist()
    wait(orders_df)
    del unique_sales

    returns_count = (store_returns_df[[
        "sr_ticket_number", "sr_customer_sk"
    ]].drop_duplicates(split_out=n_workers).groupby(by="sr_customer_sk").agg({
        "sr_ticket_number":
        "count"
    }).reset_index())
    returns_df = (store_returns_df.groupby(by="sr_customer_sk").agg({
        "sr_item_sk":
        "count",
        "sr_return_amt":
        "sum"
    }).reset_index())
    ### free up memory no longer needed
    del store_returns_df

    returns_df = returns_df.merge(returns_count,
                                  how="inner",
                                  on="sr_customer_sk")

    returns_df = returns_df.rename(
        columns={
            "sr_customer_sk": "user_sk",
            "sr_ticket_number": "returns_count",
            "sr_item_sk": "returns_items",
            "sr_return_amt": "returns_money",
        })

    returns_df = returns_df.persist()
    wait(returns_df)

    final_df = orders_df.merge(returns_df, how="left", on="user_sk")

    final_df["orderRatio"] = (final_df["returns_count"] /
                              final_df["orders_count"]).round(7)
    final_df["itemsRatio"] = (final_df["returns_items"] /
                              final_df["orders_items"]).round(7)
    final_df["monetaryRatio"] = (final_df["returns_money"] /
                                 final_df["orders_money"]).round(7)

    ratio_columns = ["orderRatio", "itemsRatio", "monetaryRatio"]
    final_df = final_df.map_partitions(remove_inf_and_nulls,
                                       column_names=ratio_columns,
                                       value=0.0)

    final_df = final_df.rename(columns={"returns_count": "frequency"})

    keep_cols = [
        "user_sk", "orderRatio", "itemsRatio", "monetaryRatio", "frequency"
    ]
    final_df = final_df[keep_cols]

    final_df = final_df.fillna(0)
    final_df = final_df.repartition(npartitions=1).persist()
    wait(final_df)

    final_df = final_df.sort_values(["user_sk"]).reset_index(drop=True)
    final_df = final_df.persist()
    wait(final_df)

    feature_cols = ["orderRatio", "itemsRatio", "monetaryRatio", "frequency"]

    results_dict = get_clusters(client=client,
                                ml_input_df=final_df,
                                feature_cols=feature_cols)
    return results_dict
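The unique_sales block near the top of main shows a workaround for global deduplication on large data: deduplicate within each partition, shuffle on the key so that equal keys land in the same partition, then deduplicate once more. A minimal dask.dataframe sketch of the same pattern on toy pandas data (the column names here are invented):

import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({
    "ticket": [1, 1, 2, 2, 3, 3, 4],
    "customer": [10, 10, 10, 20, 20, 30, 30],
})
ddf = dd.from_pandas(pdf, npartitions=3)

# 1) local dedup per partition, 2) shuffle so equal keys share a partition,
# 3) dedup again, which is now globally correct.
unique = ddf[["ticket", "customer"]].map_partitions(lambda df: df.drop_duplicates())
unique = unique.shuffle(on=["customer"])
unique = unique.map_partitions(lambda df: df.drop_duplicates())

print(unique.compute())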
Exemple #24
0
def test_end_to_end(nrows, ncols, nclusters, n_parts,
                    delayed_predict, input_type, cluster):

    client = None

    try:

        client = Client(cluster)
        from cuml.dask.cluster import KMeans as cumlKMeans

        from cuml.dask.datasets import make_blobs

        X, y = make_blobs(n_samples=int(nrows),
                          n_features=ncols,
                          centers=nclusters,
                          n_parts=n_parts,
                          cluster_std=0.01,
                          random_state=10)

        wait(X)
        if input_type == "dataframe":
            X_train = to_dask_cudf(X)
            y_train = to_dask_cudf(y)
        elif input_type == "array":
            X_train, y_train = X, y

        cumlModel = cumlKMeans(init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)

        cumlModel.fit(X_train)
        cumlLabels = cumlModel.predict(X_train, delayed_predict)

        n_workers = len(list(client.has_what().keys()))

        # Verifying we are grouping partitions. This should be changed soon.
        if n_parts is not None and n_parts < n_workers:
            parts_len = n_parts
        else:
            parts_len = n_workers

        if input_type == "dataframe":
            assert cumlLabels.npartitions == parts_len
            cumlPred = cp.array(cumlLabels.compute().to_pandas().values)
            labels = cp.squeeze(y_train.compute().to_pandas().values)
        elif input_type == "array":
            assert len(cumlLabels.chunks[0]) == parts_len
            cumlPred = cp.array(cumlLabels.compute())
            labels = cp.squeeze(y_train.compute())

        assert cumlPred.shape[0] == nrows
        assert cp.max(cumlPred) == nclusters - 1
        assert cp.min(cumlPred) == 0

        score = adjusted_rand_score(labels, cumlPred)

        print(str(score))

        assert 1.0 == score

    finally:
        client.close()
def generate_training(sensor_path,
                      ground_truth_path,
                      size=11,
                      chunk_size=500,
                      classes=21,
                      savedir=".",
                      use_dask=False,
                      client=None):
    """Yield one instance of data with one hot labels
    Args:
        chunk_size: number of images per tfrecord
        size: N x N image size
        savedir: directory to save tfrecords
        use_dask: optional dask client to parallelize computation
    Returns:
        filename: tfrecords path
    """
    #turn ground truth into a dataframe of coords
    results = get_coordinates(ground_truth_path)
    print("There are {} label pixels in the labeled ground truth".format(
        results.shape[0]))

    #Remove unclassified pixels?
    results = results[~(results.label == 0)]

    #Create chunks to write based on a spatial block
    results["chunk"] = np.arange(len(results)) // chunk_size
    basename = os.path.splitext(os.path.basename(sensor_path))[0]
    filenames = []

    if use_dask:
        if client is None:
            raise ValueError(
                "use_dask is {} but no client specified".format(use_dask))

        for g, df in results.groupby("chunk"):
            coordinates = zip(df.easting, df.northing)
            filename = "{}/{}_{}.tfrecord".format(savedir, basename, g)

            #Submit to dask client
            fn = client.submit(_record_wrapper_,
                               labels=df.label.values,
                               sensor_path=sensor_path,
                               coordinates=coordinates,
                               size=size,
                               classes=classes,
                               filename=filename,
                               train=True)
            filenames.append(fn)
        wait(filenames)
        filenames = [x.result() for x in filenames]

    else:
        for g, df in results.groupby("chunk"):
            filename = "{}/{}_{}.tfrecord".format(savedir, basename, g)
            coordinates = zip(df.easting, df.northing)

            #Write record
            fn = _record_wrapper_(labels=df.label.values,
                                  sensor_path=sensor_path,
                                  coordinates=coordinates,
                                  size=size,
                                  classes=classes,
                                  filename=filename,
                                  train=True)
            filenames.append(fn)

    return filenames
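A hedged usage sketch for generate_training with the dask path enabled; the file paths and cluster setup are placeholders, and it assumes generate_training and its _record_wrapper_ dependency are importable from the module above:

from dask.distributed import Client, LocalCluster

cluster = LocalCluster(n_workers=4, threads_per_worker=1)
client = Client(cluster)

# Hypothetical input rasters; generate_training writes one tfrecord per chunk
# and returns the list of written paths.
tfrecords = generate_training(
    sensor_path="data/hyperspectral_tile.tif",
    ground_truth_path="data/ground_truth.tif",
    size=11,
    chunk_size=500,
    classes=21,
    savedir="records",
    use_dask=True,
    client=client,
)
print(len(tfrecords), "tfrecords written")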
Exemple #26
0
def main(client, config):
    (
        store_sales_df,
        date_dim_df,
        web_sales_df,
        store_returns_df,
        store_table_df,
        item_table_df,
    ) = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )

    # SELECT sr_item_sk, sr_customer_sk, sr_ticket_number, sr_return_quantity
    # FROM
    # store_returns sr,
    # date_dim d2
    # WHERE d2.d_year = ${hiveconf:q21_year}
    # AND d2.d_moy BETWEEN ${hiveconf:q21_month} AND ${hiveconf:q21_month} + 6 --which were returned in the next six months
    # AND sr.sr_returned_date_sk = d2.d_date_sk
    d2 = date_dim_df.query(
        f"d_year == {q21_year} and d_moy >= {q21_month} and d_moy <= {q21_month+6}",
        meta=date_dim_df._meta,
    ).reset_index(drop=True)

    part_sr = store_returns_df.merge(d2,
                                    left_on="sr_returned_date_sk",
                                    right_on="d_date_sk",
                                    how="inner")

    cols_2_keep = [
        "sr_item_sk",
        "sr_customer_sk",
        "sr_ticket_number",
        "sr_return_quantity",
    ]

    part_sr = part_sr[cols_2_keep]

    part_sr = part_sr.persist()
    wait(part_sr)

    # SELECT
    # ws_item_sk, ws_bill_customer_sk, ws_quantity
    # FROM
    # web_sales ws,
    # date_dim d3
    # WHERE d3.d_year BETWEEN ${hiveconf:q21_year} AND ${hiveconf:q21_year} + 2 -- in the following three years (re-purchased by the returning customer afterwards through
    # the web sales channel)
    #   AND ws.ws_sold_date_sk = d3.d_date_sk
    # ) part_ws
    d3 = date_dim_df.query(
        f"d_year >= {q21_year} and d_year <= {q21_year + 2}",
        meta=date_dim_df._meta)
    part_ws = web_sales_df.merge(d3,
                                 left_on="ws_sold_date_sk",
                                 right_on="d_date_sk",
                                 how="inner")
    cols_2_keep = ["ws_item_sk", "ws_bill_customer_sk", "ws_quantity"]
    part_ws = part_ws[cols_2_keep]
    part_ws = part_ws.persist()
    wait(part_ws)

    # part_ws ON (
    # part_sr.sr_item_sk = part_ws.ws_item_sk
    # AND part_sr.sr_customer_sk = part_ws.ws_bill_customer_sk
    part_ws_part_sr_m = hash_merge(
        lhs=part_sr,
        rhs=part_ws,
        left_on=["sr_item_sk", "sr_customer_sk"],
        right_on=["ws_item_sk", "ws_bill_customer_sk"],
        how="inner",
    )

    cols_2_keep = [
        "sr_item_sk",
        "sr_customer_sk",
        "sr_ticket_number",
        "sr_return_quantity",
        "ws_quantity",
    ]
    part_ws_part_sr_m = part_ws_part_sr_m[cols_2_keep]

    part_ws_part_sr_m = part_ws_part_sr_m.persist()
    wait(part_ws_part_sr_m)
    del part_sr, part_ws
    # SELECT ss_item_sk, ss_store_sk, ss_customer_sk, ss_ticket_number, ss_quantity
    # FROM
    # store_sales ss,
    # date_dim d1
    # WHERE d1.d_year = ${hiveconf:q21_year}
    # AND d1.d_moy = ${hiveconf:q21_month}
    # AND ss.ss_sold_date_sk = d1.d_date_sk
    # ) part_ss
    d1 = date_dim_df.query(f"d_year == {q21_year} and d_moy == {q21_month} ",
                           meta=date_dim_df._meta)

    part_ss = store_sales_df.merge(d1,
                                   left_on="ss_sold_date_sk",
                                   right_on="d_date_sk",
                                   how="inner")

    cols_2_keep = [
        "ss_item_sk",
        "ss_store_sk",
        "ss_customer_sk",
        "ss_ticket_number",
        "ss_quantity",
    ]
    part_ss = part_ss[cols_2_keep]

    # part_ss ON (
    # part_ss.ss_ticket_number = part_sr.sr_ticket_number
    # AND part_ss.ss_item_sk = part_sr.sr_item_sk
    # AND part_ss.ss_customer_sk = part_sr.sr_customer_sk

    part_ws_part_sr_m_part_ss_join_df = hash_merge(
        lhs=part_ss,
        rhs=part_ws_part_sr_m,
        left_on=["ss_ticket_number", "ss_item_sk", "ss_customer_sk"],
        right_on=["sr_ticket_number", "sr_item_sk", "sr_customer_sk"],
        how="inner",
    )
    cols_2_keep = [
        "ss_store_sk",
        "ss_quantity",
        "sr_return_quantity",
        "ws_quantity",
        "ss_item_sk",
    ]
    part_ws_part_sr_m_part_ss_join_df = part_ws_part_sr_m_part_ss_join_df[
        cols_2_keep]

    # INNER JOIN store part_s ON (
    #  part_s.s_store_sk = part_ss.ss_store_sk
    # )
    part_ws_part_sr_m_part_ss_part_s_join_df = store_table_df.merge(
        part_ws_part_sr_m_part_ss_join_df,
        left_on="s_store_sk",
        right_on="ss_store_sk",
        how="inner",
    )

    cols_2_keep = [
        "s_store_name",
        "sr_return_quantity",
        "ss_quantity",
        "ws_quantity",
        "s_store_id",
        "ss_item_sk",
    ]
    part_ws_part_sr_m_part_ss_part_s_join_df = part_ws_part_sr_m_part_ss_part_s_join_df[
        cols_2_keep]

    # INNER JOIN item part_i ON (
    # part_i.i_item_sk = part_ss.ss_item_sk
    # )
    final_df = item_table_df.merge(
        part_ws_part_sr_m_part_ss_part_s_join_df,
        left_on="i_item_sk",
        right_on="ss_item_sk",
        how="inner",
    )
    # GROUP BY
    #  part_i.i_item_id,
    #  part_i.i_item_desc,
    #  part_s.s_store_id,
    #  part_s.s_store_name
    # ORDER BY
    #  part_i.i_item_id,
    #  part_i.i_item_desc,
    #  part_s.s_store_id,
    #  part_s.s_store_name

    cols_2_keep = [
        "i_item_id",
        "i_item_desc",
        "s_store_name",
        "ss_quantity",
        "sr_return_quantity",
        "ws_quantity",
        "s_store_id",
    ]
    grouped_df = final_df[cols_2_keep]
    agg_df = grouped_df.groupby(
        by=["i_item_id", "i_item_desc", "s_store_id", "s_store_name"]).agg({
            "ss_quantity":
            "sum",
            "sr_return_quantity":
            "sum",
            "ws_quantity":
            "sum"
        })

    agg_df = agg_df.repartition(npartitions=1).persist()

    sorted_agg_df = agg_df.reset_index().map_partitions(
        lambda df: df.sort_values(
            by=["i_item_id", "i_item_desc", "s_store_id", "s_store_name"]))

    sorted_agg_df = sorted_agg_df.head(q21_limit)
    sorted_agg_df = sorted_agg_df.rename(
        columns={
            "ss_quantity": "store_sales_quantity",
            "sr_return_quantity": "store_returns_quantity",
            "ws_quantity": "web_sales_quantity",
        })
    sorted_agg_df["i_item_desc"] = sorted_agg_df["i_item_desc"].str.strip()

    return sorted_agg_df
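hash_merge above is taken to be the benchmark's helper for a hash-partitioned, multi-key inner join; that is an assumption about its semantics, not its implementation. The same join expressed directly with DataFrame.merge, on toy dask data using the column names from the query:

import pandas as pd
import dask.dataframe as dd

# Toy stand-ins for part_sr and part_ws.
part_sr = dd.from_pandas(pd.DataFrame({
    "sr_item_sk": [1, 2], "sr_customer_sk": [10, 20],
    "sr_ticket_number": [100, 200], "sr_return_quantity": [3, 4],
}), npartitions=1)
part_ws = dd.from_pandas(pd.DataFrame({
    "ws_item_sk": [1, 3], "ws_bill_customer_sk": [10, 30], "ws_quantity": [5, 6],
}), npartitions=1)

merged = part_sr.merge(
    part_ws,
    left_on=["sr_item_sk", "sr_customer_sk"],
    right_on=["ws_item_sk", "ws_bill_customer_sk"],
    how="inner",
)
print(merged.compute())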
Exemple #27
0
        d.insertDataValue("name", dims, np.arange(lx * ly * lz))

        volume = lx * ly * lz
        print("Dataset volume is {} ({} GByte)".format(volume,
                                                       (volume * 8) / 2**30))

        test = from_dataset(d, slice_dim=Dimension.Z)
        test = client.persist(test)
        test = test.concatenate(Dimension.Y, test)
        test = test.concatenate(Dimension.Y, test)
        test = test.concatenate(Dimension.Y, test)
        test = test.concatenate(Dimension.Y, test)
        test = test.concatenate(Dimension.Y, test)
        test = client.persist(test)
        #test = test.persist()
        wait(test)

        start_time = timeit.default_timer()
        sliced = test.slice(Dimension.Y, 6)
        sliced = sliced.compute()
        print(timeit.default_timer() - start_time)

        start_time = timeit.default_timer()
        sliced = test.slice(Dimension.X, 7)
        sliced = sliced.compute()
        print(timeit.default_timer() - start_time)

        start_time = timeit.default_timer()
        sliced = test.slice(Dimension.X, 8)
        sliced = sliced.compute()
        print(timeit.default_timer() - start_time)
Exemple #28
0
def dump_write(render_folder_name,
               full_volume_shape,
               dtype,
               color_channel_count,
               output_file_name,
               tile_hash,
               leaf_level_count,
               tile_level_count,
               compression_method,
               compression_options,
               output_file_type,
               do_use_simple_for_loop=False):
    # dumps volumetric data into h5/n5/zarr
    #self.inputLoc = inputloc

    tile_shape = (full_volume_shape / (2**tile_level_count)).astype(int)
    leaf_shape = (full_volume_shape / (2**leaf_level_count)).astype(int)

    # check if dataset name is provided
    splitted_name = output_file_name.split(':')
    if len(splitted_name) == 1:
        output_file_name = splitted_name[0]
        dataset_name = "volume"
    elif len(splitted_name) == 2:
        output_file_name = splitted_name[0]
        dataset_name = splitted_name[1]
    else:
        raise ValueError('output file name has more than one ":"', output_file_name)
    #self.setting = setting
    #self.tilelist = tilelist
    tile_id_list = list(tile_hash.keys())
    leaf_ids_per_tile_list = list(tile_hash.values())

    # # Unpack the settings
    # volSize = tuple(map(int,setting['volSize']))
    # tileSize = setting['tileSize']
    # #volReference = setting['volReference']
    # depthFull = setting['depthFull']
    # depthBase = setting['depthBase']
    # leafSize = setting['leaf_shape']
    # dtype = setting['dtype']
    # chunkSize = tuple(map(int,setting['chunkSize']))
    # compression_method = setting['compression']
    # comp_opts = setting['compression_opts']
    chunk_shape = tile_shape
    full_volume_shape_including_color_channel = np.append(full_volume_shape, color_channel_count)  # append color channel
    chunk_shape_including_color_channel = np.append(chunk_shape, color_channel_count)
    full_volume_shape_with_color_channels_as_tuple = tuple(map(int, full_volume_shape_including_color_channel))
    chunk_shape_with_color_as_tuple = tuple(map(int, chunk_shape_including_color_channel))

    if output_file_type=='h5':
        # write into h5
        with h5py.File(output_file_name, "w") as f:
            # dset_swc = f.create_dataset("reconstruction", (xyz_shifted.shape[0], 7), dtype='f')
            # for iter, xyz_ in enumerate(xyz_shifted):
            #     dset_swc[iter, :] = np.array(
            #         [edges[iter, 0].__int__(), 1, xyz_[0], xyz_[1], xyz_[2], 1.0, edges[iter, 1].__int__()])
            dataset = f.create_dataset(dataset_name,
                                       full_volume_shape_with_color_channels_as_tuple,
                                       dtype=dtype,
                                       chunks=chunk_shape_with_color_as_tuple,
                                       compression=compression_method,
                                       compression_opts=compression_options)


            # crop chunks from each tile in tile_id_list
            for iter, tile_id in enumerate(tile_id_list):
                print('{} : {} out of {}'.format(tile_id, iter+1, len(tile_id_list)))
                leaf_id_within_tile = tile_hash[tile_id]
                dump_single_tile_id(tile_id,
                                    leaf_id_within_tile,
                                    render_folder_name,
                                    tile_shape,
                                    leaf_shape,
                                    chunk_shape_with_color_as_tuple,
                                    dtype,
                                    dataset, 
                                    is_dataset_transposed=False)
    elif output_file_type=='n5' or output_file_type=='zarr':
        # write into z5 or n5
        if do_use_simple_for_loop:
            use_zarr_format = (output_file_type == 'zarr')
            with z5py.File(output_file_name, 'a', use_zarr_format=use_zarr_format) as f:
                # require_dataset seems to choke on the compression_options {level: 9}, so this is a workaround
                g = f.require_group('/')
                try:
                    dataset = g[dataset_name]
                except KeyError:
                    dataset = f.create_dataset(dataset_name,
                                               shape=tuple(reversed(full_volume_shape_with_color_channels_as_tuple)),
                                               dtype=dtype,
                                               chunks=tuple(reversed(chunk_shape_with_color_as_tuple)),
                                               compression=compression_method,
                                               **compression_options)                    
                for tile_id in tqdm.tqdm(tile_id_list):
                    leaf_ids_within_tile = tile_hash[tile_id]
                    dump_single_tile_id(tile_id,
                                        leaf_ids_within_tile,
                                        render_folder_name,
                                        tile_shape,
                                        leaf_shape,
                                        chunk_shape_with_color_as_tuple,
                                        dtype,
                                        dataset,
                                        is_dataset_transposed=True)
        else:
            username = getpass.getuser()
            scratch_folder_path = '/scratch/%s' % username
            with LSFCluster(cores=2, memory='30 GB', local_dir=scratch_folder_path, projectstr='mouselight', queue='normal', extralist='-o /dev/null -e /dev/null') as cluster:
                cluster.adapt(minimum=1, maximum=1000)
                #cluster = LocalCluster(n_workers=4, threads_per_worker=1)
                #cluster.scale(200)
                with Client(cluster) as client:
                    use_zarr_format = (output_file_type=='zarr')
                    with z5py.File(output_file_name, 'a', use_zarr_format=use_zarr_format) as f:
                        # require_dataset seems to choke on the compression_options {level: 9}, so this is a workaround
                        g = f.require_group('/')
                        try:
                            dataset = g[dataset_name]
                        except KeyError:                        
                            dataset = f.create_dataset(dataset_name,
                                                       shape=tuple(reversed(full_volume_shape_with_color_channels_as_tuple)),
                                                       dtype=dtype,
                                                       chunks=tuple(reversed(chunk_shape_with_color_as_tuple)),
                                                       compression=compression_method,
                                                       **compression_options)
                        two_arg_dump_single_tile_id = \
                            partial(dump_single_tile_id,
                                    rendered_folder_path=render_folder_name,
                                    tile_shape=tile_shape,
                                    leaf_shape=leaf_shape,
                                    chunk_shape_with_color_as_tuple=chunk_shape_with_color_as_tuple,
                                    dtype=dtype,
                                    dataset=dataset,
                                    is_dataset_transposed=True)
                        #with Pool(16) as pool :
                        #    foo = list(tqdm.tqdm(pool.imap(f, tile_id_list), total=len(tile_id_list)))
                        # for tile_id in tqdm.tqdm(tile_id_list):
                        #     leaf_id_within_tile = tile_hash[tile_id]
                        #     f(tile_id, leaf_id_within_tile)
                        print('About to process %d tiles' % len(tile_id_list))
                        futures = client.map(two_arg_dump_single_tile_id, tile_id_list, leaf_ids_per_tile_list, retries=2)
                        progress(futures, notebook=False)  # need notebook=False when running in Spyder
                        wait(futures)  # just to make sure...
                        print('')
                        print('All Dask jobs have exited')
                        print('')
                        print('futures:')
                        print(futures)
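The n5/zarr branch above fans the per-tile work out with functools.partial plus client.map, then monitors it with progress and wait. A self-contained sketch of that submission pattern, with a dummy task standing in for dump_single_tile_id and invented tile data:

from functools import partial

from dask.distributed import Client, LocalCluster, progress, wait

def dump_tile(tile_id, leaf_ids, scale=1):
    # Stand-in for dump_single_tile_id: pretend to write one tile.
    return tile_id, len(leaf_ids) * scale

cluster = LocalCluster(n_workers=2, threads_per_worker=1)
client = Client(cluster)

tile_id_list = ["t00", "t01", "t02"]
leaf_ids_per_tile_list = [[1, 2], [3], [4, 5, 6]]

# Bind the fixed arguments once, then map over the per-tile arguments.
two_arg_dump_tile = partial(dump_tile, scale=2)
futures = client.map(two_arg_dump_tile, tile_id_list,
                     leaf_ids_per_tile_list, retries=2)

progress(futures, notebook=False)
wait(futures)
print(client.gather(futures))

client.close()
cluster.close()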
Exemple #29
0
def polish(client, settings):
    worldSize = MPI.COMM_WORLD.Get_size() - 2

    # Setup
    with h5py.File(settings['databasePath'], 'r') as h5pyFile:
        database = SVDatabase(h5pyFile, settings['refStruct'], args.names)
        wait(database.load(h5pyFile))

        names = list(database.attrs['structNames'])
        random.shuffle(names)

        splits = np.array_split(names, worldSize)

        from svreg.database import worker_load

        futures = client.map(
            worker_load,
            [settings['databasePath']] * worldSize,
            splits,
            [database.attrs['svNames']] * worldSize,
            [database.attrs['elements']] * worldSize,
            [settings['allSums']] * worldSize,
        )

        client.gather(client.compute(futures))

        evaluator = SVEvaluator(database, settings)

    regressor = SVRegressor(settings, database)

    costFxn = buildCostFunction(settings, len(database.attrs['natoms']),
                                sum(database.attrs['natoms'].values()))

    if args.trees is not None:
        with open(args.trees, 'r') as f:
            treeNames = [s.strip() for s in f.readlines()]

        regressor.trees = [
            MCTree.from_str(t, database.attrs['elements'],
                            regressor.svNodePool) for t in treeNames
        ]

    else:
        from svreg.nodes import FunctionNode

        tree = MCTree(['Al'])

        from copy import deepcopy

        treeAl = SVTree()
        treeAl.nodes = [
            FunctionNode('add'),
            FunctionNode('global'),
            deepcopy(regressor.svNodePool[1]),
            FunctionNode('add'),
            deepcopy(regressor.svNodePool[0]),
            # FunctionNode('softplus'),
            # FunctionNode('add'),
            # deepcopy(regressor.svNodePool[0]),
            # FunctionNode('softplus'),
            FunctionNode('add'),
            deepcopy(regressor.svNodePool[0]),
            deepcopy(regressor.svNodePool[1]),
        ]

        tree.chemistryTrees['Al'] = treeAl
        tree.updateSVNodes()

        regressor.trees = [tree]

    # tree = MCTree.from_file(
    #     '/home/jvita/scripts/svreg/results/alznmg/al_lnames_conv/e799d5bc09fd37dba4e05f45a6c00e57/tree_file.pot',
    #     database.attrs['elements'],
    #     regressor.svNodePool
    # )

    # tree.updateSVNodes()

    # regressor.trees = [tree]

    regressor.initializeOptimizers()

    savePath = os.path.join(settings['outputPath'], 'polished')

    if not os.path.isdir(savePath):
        os.mkdir(savePath)

    for tree in regressor.trees:
        print(tree)

    N = settings['optimizerPopSize']

    from svreg.archive import Entry

    entries = {md5Hash(t): Entry(t, savePath) for t in regressor.trees}

    import pickle

    optStart = time.time()
    for optStep in range(1, settings['maxNumOptimizerSteps'] + 1):

        staleIndices, messages = regressor.checkStale()
        for staleIdx, staleMessage in zip(staleIndices, messages):
            print('Completed tree {}:'.format(staleIdx))
            print("\t", regressor.optimizers[staleIdx].result.fbest,
                  regressor.trees[staleIdx])
            print("Stopping criterion:", staleMessage)

            del regressor.trees[staleIdx]
            del regressor.optimizers[staleIdx]

        populationDict, rawPopulations = regressor.generatePopulationDict(N)

        graph, keys = evaluator.evaluate(regressor.trees,
                                         populationDict,
                                         N,
                                         worldSize,
                                         settings['allSums'],
                                         useGPU=settings['useGPU'])

        perWorkerResults = client.get(graph, keys,
                                      direct=True)  #, resources={'GPU': 1})

        perStructResults, perStructNames = zip(*perWorkerResults)

        perStructResults = list(
            itertools.chain.from_iterable(perStructResults))
        perStructNames = list(itertools.chain.from_iterable(perStructNames))

        perStructResults = [
            x for _, x in sorted(zip(perStructNames, perStructResults))
        ]

        energies = {struct: [] for struct in database.attrs['structNames']}
        forces = {struct: [] for struct in database.attrs['structNames']}

        counter = 0
        for struct in database.attrs['structNames']:
            res = perStructResults[counter]
            energies[struct] = [s[0] for s in res]
            forces[struct] = [s[1] for s in res]
            counter += 1

        # Save the (per-struct) errors and the single-value costs
        errors = computeErrors(settings['refStruct'], energies, forces,
                               database)

        costs = costFxn(errors)

        # Add ridge regression penalty
        penalties = np.array([
            np.linalg.norm(pop, axis=1) * settings['ridgePenalty']
            for pop in rawPopulations
        ])

        # Update optimizers
        regressor.updateOptimizers(rawPopulations, costs, penalties)

        printTreeCosts(optStep,
                       [opt.result.fbest for opt in regressor.optimizers],
                       penalties, optStart)

        for treeNum, tree in enumerate(regressor.trees):
            opt = regressor.optimizers[treeNum]
            treeName = md5Hash(tree)

            entry = entries[treeName]

            bestIdx = np.argmin(costs[0])
            entry.bestIdx = bestIdx
            entry.cost = costs[0][bestIdx]
            entry.bestParams = rawPopulations[0][bestIdx]
            entry.bestErrors = errors[0][bestIdx]

            bestEng = {}
            bestFcs = {}
            for s in energies:
                bestEng[s] = energies[s][0][bestIdx]
                bestFcs[s] = forces[s][0][bestIdx]

            pickle.dump(
                entry, open(os.path.join(savePath, treeName, 'entry.pkl'),
                            'wb'))

            pickle.dump(
                opt, open(os.path.join(savePath, treeName, 'opt.pkl'), 'wb'))

            pickle.dump(
                bestEng,
                open(os.path.join(savePath, treeName, 'energies.pkl'), 'wb'))

            pickle.dump(
                bestFcs,
                open(os.path.join(savePath, treeName, 'forces.pkl'), 'wb'))

            pickle.dump(
                tree, open(os.path.join(savePath, treeName, 'tree.pkl'), 'wb'))
Exemple #30
0
def test_ranker(output, client, listen_port, group):

    if output == 'dataframe-with-categorical':
        X, y, w, g, dX, dy, dw, dg = _create_ranking_data(
            output=output,
            group=group,
            n_features=1,
            n_informative=1
        )
    else:
        X, y, w, g, dX, dy, dw, dg = _create_ranking_data(
            output=output,
            group=group,
        )

    # rebalance small dask.Array dataset for better performance.
    if output == 'array':
        dX = dX.persist()
        dy = dy.persist()
        dw = dw.persist()
        dg = dg.persist()
        _ = wait([dX, dy, dw, dg])
        client.rebalance()

    # use many trees + leaves to overfit, to help ensure that the Dask
    # data-parallel strategy matches that of the serial learner.
    # See https://github.com/microsoft/LightGBM/issues/3292#issuecomment-671288210.
    params = {
        "random_state": 42,
        "n_estimators": 50,
        "num_leaves": 20,
        "min_child_samples": 1
    }

    dask_ranker = lgb.DaskLGBMRanker(
        client=client,
        time_out=5,
        local_listen_port=listen_port,
        tree_learner_type='data_parallel',
        **params
    )
    dask_ranker = dask_ranker.fit(dX, dy, sample_weight=dw, group=dg)
    rnkvec_dask = dask_ranker.predict(dX)
    rnkvec_dask = rnkvec_dask.compute()
    p1_pred_leaf = dask_ranker.predict(dX, pred_leaf=True)
    rnkvec_dask_local = dask_ranker.to_local().predict(X)

    local_ranker = lgb.LGBMRanker(**params)
    local_ranker.fit(X, y, sample_weight=w, group=g)
    rnkvec_local = local_ranker.predict(X)

    # distributed ranker should be able to rank decently well and should
    # have high rank correlation with scores from serial ranker.
    dcor = spearmanr(rnkvec_dask, y).correlation
    assert dcor > 0.6
    assert spearmanr(rnkvec_dask, rnkvec_local).correlation > 0.8
    assert_eq(rnkvec_dask, rnkvec_dask_local)

    # pred_leaf values should have the right shape
    # and values that look like valid tree nodes
    pred_leaf_vals = p1_pred_leaf.compute()
    assert pred_leaf_vals.shape == (
        X.shape[0],
        dask_ranker.booster_.num_trees()
    )
    assert np.max(pred_leaf_vals) <= params['num_leaves']
    assert np.min(pred_leaf_vals) >= 0
    assert len(np.unique(pred_leaf_vals)) <= params['num_leaves']

    # be sure LightGBM actually used at least one categorical column,
    # and that it was correctly treated as a categorical feature
    if output == 'dataframe-with-categorical':
        cat_cols = [
            col for col in dX.columns
            if dX.dtypes[col].name == 'category'
        ]
        tree_df = dask_ranker.booster_.trees_to_dataframe()
        node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
        assert node_uses_cat_col.sum() > 0
        assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '=='

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)