Example #1
    print("---" * 20)
    print("grouping complete ={}".format(len(grouped_df)))
    grouped_df = grouped_df.groupby(["i_item_sk"]).sum(split_every=2).reset_index()
    grouped_df.columns = ["i_item_sk", "cnt"]
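    # Sort by count (descending) within each partition; map_partitions does not
    # perform a global sort.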
    result_df = grouped_df.map_partitions(
        lambda df: df.sort_values(by=["cnt"], ascending=False))

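    # Label the output columns, add the purchased item id (q03_purchased_item_IN)
    # as a constant column, and put the columns in their final order.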
    result_df.columns = ["lastviewed_item", "cnt"]
    result_df["purchased_item"] = q03_purchased_item_IN
    cols_order = ["purchased_item", "lastviewed_item", "cnt"]
    result_df = result_df[cols_order]
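    # Persist the result on the workers and block until it is fully materialized.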
    result_df = result_df.persist()
    ### todo: remove this later after more testing
    wait(result_df)
    print(len(result_df))
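    # Pull the first q03_limit rows back as an in-memory cudf DataFrame.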
    result_df = result_df.head(q03_limit)
    print("result complete")
    print("---" * 20)
    return result_df


if __name__ == "__main__":
    from xbb_tools.cluster_startup import attach_to_cluster
    import cudf
    import dask_cudf

    client = attach_to_cluster(cli_args)

    run_dask_cudf_query(cli_args=cli_args, client=client, query_func=main)
Example #2
        (
            cust_and_clicks_ddf["clicks_in_category"]
            > cust_and_clicks_ddf["clicks_in_category"].mean()
        )
        .reset_index(drop=True)
        .astype(np.int64)
    )

    # Convert the dataframe to float64, since cuML's logistic regression requires float inputs
    ml_input_df = cust_and_clicks_ddf.astype("float64")

    ml_input_df = ml_input_df.persist()

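    # Create one delayed task per partition; each task runs build_and_predict_model
    # on that partition's cudf DataFrame.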
    ml_tasks = [delayed(build_and_predict_model)(df) for df in ml_input_df.to_delayed()]
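    # Note: unpacking ml_tasks into compute() assumes ml_input_df has a single
    # partition; Client.compute takes the collection(s) as its first argument.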
    results_dict = client.compute(*ml_tasks, sync=True)

    return results_dict


if __name__ == "__main__":
    from xbb_tools.cluster_startup import attach_to_cluster
    import cudf
    import dask_cudf
    import cuml

    client = attach_to_cluster(cli_args)

    run_dask_cudf_query(
        cli_args=cli_args, client=client, query_func=main, write_func=write_result,
    )
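
Note: build_and_predict_model is referenced above but its body is not part of this
snippet. The sketch below is only a hypothetical illustration of a per-partition
helper using cuML's LogisticRegression, not the repository's implementation; the
label column name "clicks_in_category" and the returned accuracy metric are
assumptions.

from cuml.linear_model import LogisticRegression


def build_and_predict_model_sketch(df):
    # Hypothetical sketch: assume "clicks_in_category" is the 0/1 label built
    # earlier and every remaining float64 column is a feature.
    y = df["clicks_in_category"]
    X = df.drop(columns=["clicks_in_category"])

    # Fit a GPU logistic regression and score it on the same partition.
    model = LogisticRegression(max_iter=1000)
    model.fit(X, y)
    preds = model.predict(X)

    # Return a small dict of results, mirroring how results_dict is used above.
    return {"accuracy": float((preds == y).mean())}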