Beispiel #1
0
def test_extract_partitions_futures(nrows, ncols, n_parts, X_delayed,
                                    y_delayed, colocated, cluster):
    """Check that the data handler exposes one GPU future per partition.

    Random ``(nrows, ncols)`` features and ``(nrows,)`` labels are drawn from
    ``cp.random`` and wrapped in dask arrays chunked row-wise by
    ``nrows / n_parts``. Each array is persisted on the cluster unless its
    ``*_delayed`` flag is set. The handler is created from the ``(X, y)``
    pair when ``colocated`` is true and from ``X`` alone otherwise.
    """
    client = Client(cluster)
    try:
        features = cp.random.standard_normal((nrows, ncols))
        labels = cp.random.standard_normal((nrows, ))

        rows_per_chunk = nrows / n_parts
        features = da.from_array(features, chunks=(rows_per_chunk, -1))
        labels = da.from_array(labels, chunks=(rows_per_chunk, ))

        # "Delayed" inputs stay lazy; everything else is materialised on the
        # workers before the handler sees it.
        if not X_delayed:
            features = client.persist(features)
        if not y_delayed:
            labels = client.persist(labels)

        data = (features, labels) if colocated else features
        ddh = DistributedDataHandler.create(data, client)

        # The second item of each gpu_futures entry is the partition future.
        parts = [entry[1] for entry in ddh.gpu_futures]
        assert len(parts) == n_parts
    finally:
        # Always release the client, even when the assertion fails.
        client.close()
def array_images():
    """Lazily load a stack of grayscale PNGs and process it block-wise with Dask.

    Twenty images (file indices 1376..1395) are read through a delayed
    ``skimage.io.imread``, stacked along a new leading axis and rechunked to
    ``(5, 2000, 2000)`` blocks. Summary statistics are printed before and
    after the stack is persisted on a ``LocalCluster``; ``otsu_thresholding``
    and ``blob_detection`` are then mapped over the blocks and the result is
    persisted again.

    Relies on a module-level ``sample`` array for the shape and dtype of a
    single image, and on module-level ``otsu_thresholding`` /
    ``blob_detection`` functions.

    Returns nothing; output is printed and task graphs are rendered with
    ``visualize()``.
    """
    custom_imread = dask.delayed(skimage.io.imread, pure=True)
    images = [
        custom_imread(
            '/Users/nivethamahalakshmibalasamy/Documents/ECI-PolarScience/dask_stuff/grayscale-xy-%d.png'
            % i) for i in range(1376, 1396)
    ]
    image_array = [
        da.from_delayed(i, sample.shape, sample.dtype) for i in images
    ]
    stack = da.stack(image_array, axis=0)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(stack)

    # Combining chunks - A chunk consists of 5 images
    stack = stack.rechunk((5, 2000, 2000))
    print("After rechunking: ")
    print("Before distributing to workers:")
    print(stack.mean().compute())
    print(stack[1, :].compute())
    print(stack[19, :].mean().compute())
    stack.visualize()

    # Distribute array components over workers and centralized scheduler
    cluster = LocalCluster()
    client = Client(cluster)
    print(client)

    # Load the entire distributed array on the cluster
    stack = client.persist(stack)
    print(stack.mean().compute())

    # Map the Otsu thresholding and blob detection functions over each block.
    stack = da.map_blocks(otsu_thresholding,
                          stack,
                          chunks=(5, 2000, 2000),
                          dtype=sample.dtype)
    stack = da.map_blocks(blob_detection,
                          stack,
                          chunks=(5, 2000, 2000),
                          dtype=sample.dtype)
    stack = client.persist(stack)
    print(stack.mean().compute())
    stack.visualize()
Beispiel #3
0
def run_variant_processing_pipeline(input: str,
                                    client: Client = None
                                    ) -> Dict[str, ddf.DataFrame]:
    """
    Process the variants of one GVF file and split the result into variant
    effects and variant metadata.

    arguments
        input:  input variant filepath in GVF format
        client: dask client object; when None the current client is looked up
                with get_client()

    returns
        a dict with two keys
            effects:  dataframe containing variant effects
            metadata: dataframe containing variant metadata
    """

    if client is None:
        client = get_client()

    ## Build the intermediate representation and keep it on the cluster so
    ## both isolation steps below start from the same persisted data
    intermediate = client.persist(_process_variants(input))

    return {
        'effects': isolate_variant_effects(intermediate),
        'metadata': isolate_variant_metadata(intermediate),
    }
Beispiel #4
0
def main():
    """Load the data onto the Dask cluster and run the switching calculation.

    Connects to the scheduler at 127.0.0.1:8786, reads the input frame,
    indexes it by ``customerKey``, repartitions it to one partition per
    worker core, attaches a ``cannGroupKey`` column and persists the result.
    ``calculate_switching`` is then timed on the rows of a Cann group.
    """
    args = get_args()
    client = Client('127.0.0.1:8786')
    total_cores = sum(client.ncores().values())

    pd.set_option('display.large_repr', 'truncate')
    pd.set_option('display.max_columns', 0)

    cann_group_df = make_cann_group_df(num_products=100)
    frame = read_df(args, cann_group_df['productKey'])

    logger.info('Setting index')
    frame = frame.set_index('customerKey', drop=True)

    logger.info('Repartitioning')
    frame = frame.repartition(npartitions=total_cores)

    logger.info('Mapping Cann Group')
    frame['cannGroupKey'] = frame['productKey'].map(
        cann_group_df['cannGroupKey'])

    logger.info('Persisting')
    frame = client.persist(frame)

    logger.info('Cann Groups')
    group_keys = cann_group_df['cannGroupKey'].unique().tolist()
    for cann_group_key in group_keys:
        print('Filtering Cann Group %s' % cann_group_key)
        group_frame = frame[frame['cannGroupKey'] == cann_group_key]
        print('This df: %s' % (len(group_frame), ))
        with Timer('%s' % (cann_group_key, )):
            calculate_switching(group_frame)
        # NOTE(review): this return sits inside the loop, so only the first
        # Cann group is ever processed - confirm that this is intentional.
        return
Beispiel #5
0
def process_tasks(tasks: Iterable[Task],
                  proc: TaskProc,
                  client: Client,
                  sink: S3COGSink,
                  check_exists: bool = True,
                  chunked_persist: int = 0,
                  verbose: bool = True) -> Iterator[str]:
    """Run ``proc`` on every task, dump the results through ``sink`` and yield as outputs finish.

    This is a generator: no work happens until it is iterated.

    :param tasks: tasks to process
    :param proc: callable that turns a task into an xarray Dataset/DataArray
    :param client: dask client used to persist the data and compute the dump
    :param sink: provides ``uri(task)``, ``exists(task)`` and ``dump(task, ds)``
    :param check_exists: when true, tasks for which ``sink.exists(task)`` is
        true are not processed; only their path is yielded
    :param chunked_persist: when > 0 the data (which must then be a DataArray)
        is persisted with ``chunked_persist_da`` using this value, otherwise
        with a plain ``client.persist``
    :param verbose: print a message for every skipped task
    :return: iterator over the output path of each skipped task and over the
        items returned by ``drain`` for completed dumps (presumably output
        paths as well, per the ``Iterator[str]`` annotation - TODO confirm)
    """

    def prep_stage(tasks: Iterable[Task],
                   proc: TaskProc) -> Iterator[Tuple[Union[xr.Dataset, xr.DataArray, None], Task, str]]:
        # Yields (data, task, path); data is None for tasks whose output
        # already exists, so the caller can skip them.
        for task in tasks:
            path = sink.uri(task)
            if check_exists:
                if sink.exists(task):
                    yield (None, task, path)
                    continue

            ds = proc(task)
            yield (ds, task, path)

    # Futures of dump computations that have been submitted but not yet
    # reported back to the caller.
    in_flight_cogs: Set[Future] = set()
    # NOTE(review): _with_lookahead1 is not visible here; by its name it
    # presumably prepares one item ahead of the one being consumed - confirm.
    for ds, task, path in _with_lookahead1(prep_stage(tasks, proc)):
        if ds is None:
            if verbose:
                print(f"..skipping: {path} (exists already)")
            yield path
            continue

        if chunked_persist > 0:
            assert isinstance(ds, xr.DataArray)
            ds = chunked_persist_da(ds, chunked_persist, client)
        else:
            ds = client.persist(ds, fifo_timeout='1ms')

        # Report whatever has finished so far before submitting more work.
        # NOTE(review): the second argument of drain (1.0) looks like a
        # timeout in seconds - confirm against drain's definition.
        if len(in_flight_cogs):
            done, in_flight_cogs = drain(in_flight_cogs, 1.0)
            for r in done:
                yield r

        # A DataArray is split into one data variable per 'band'; the
        # original attributes are copied onto every resulting variable.
        if isinstance(ds, xr.DataArray):
            attrs = ds.attrs.copy()
            ds = ds.to_dataset(dim='band')
            for dv in ds.data_vars.values():
                dv.attrs.update(attrs)

        # Submit the dump first, then block until the persisted input data
        # has finished computing.
        cog = client.compute(sink.dump(task, ds),
                             fifo_timeout='1ms')
        rr = dask_wait(ds)
        assert len(rr.not_done) == 0
        # Drop the local references to the persisted data; the submitted dump
        # is tracked through `cog` from here on.
        del ds, rr
        in_flight_cogs.add(cog)

    # All tasks submitted: drain the remaining dumps and report them.
    done, _ = drain(in_flight_cogs)
    for r in done:
        yield r
def parquet_to_dask(context: MLClientCtx,
                    parquet_url: Union[DataItem, str, Path, IO[AnyStr]],
                    inc_cols: Optional[List[str]] = None,
                    index_cols: Optional[List[str]] = None,
                    shards: int = 4,
                    threads_per: int = 4,
                    processes: bool = False,
                    memory_limit: str = '2GB',
                    persist: bool = True,
                    dask_key: str = 'my_dask_dataframe',
                    target_path: str = '') -> None:
    """Load parquet dataset into dask cluster

    If the context does not already carry a dask client, a new local cluster
    is started and the data is persisted to it. It should not be necessary to
    create a new cluster when the function is run as a 'dask' job.

    :param context:         the function context
    :param parquet_url:     url of the parquet file or partitioned dataset as either
                            artifact DataItem, string, or path object
    :param inc_cols:        include only these columns (very fast); None loads
                            every column
    :param index_cols:      list of index column names (can be a long-running process)
    :param shards:          number of workers to launch
    :param threads_per:     number of threads per worker
    :param processes:       passed to LocalCluster as ``processes``
    :param memory_limit:    passed to LocalCluster as ``memory_limit``
    :param persist:         persist the dataframe on the cluster and publish it
    :param dask_key:        name under which the persisted dataframe is published
    :param target_path:     folder in which the scheduler file is written
    """
    if hasattr(context, 'dask_client'):
        context.logger.info('found cluster...')
        dask_client = context.dask_client
    else:
        context.logger.info('starting new cluster...')
        cluster = LocalCluster(n_workers=shards,
                               threads_per_worker=threads_per,
                               processes=processes,
                               memory_limit=memory_limit)
        dask_client = Client(cluster)

    context.logger.info(dask_client)

    # Forward the column selection so that only the requested columns are
    # read; columns=None keeps the previous "read everything" behaviour.
    # NOTE(review): index_cols is accepted but still not applied - confirm
    # how the index should be set before wiring it in.
    df = dd.read_parquet(parquet_url, columns=inc_cols)

    if persist and context:
        df = dask_client.persist(df)
        # Publish under the *value* of dask_key. Writing
        # publish_dataset(dask_key=df) would register the dataset under the
        # literal name "dask_key" and ignore the argument.
        dask_client.publish_dataset(**{dask_key: df})
        context.dask_client = dask_client

        # share the scheduler
        filepath = os.path.join(target_path, 'scheduler.json')
        dask_client.write_scheduler_file(filepath)
        context.log_artifact('scheduler', target_path=filepath)

        print(df.head())
# - Load data from files
# - Filter data to a particular subset
# - Shuffle data to set an intelligent index
# - Several complex queries on top of this indexed data
#
# It is often ideal to load, filter, and shuffle data once and keep this result
# in memory. Afterwards, each of the several complex queries can be based off of
# this in-memory data rather than have to repeat the full load-filter-shuffle
# process each time. To do this, use the client.persist method

# Importing the data from S3 (or from pandas) takes X seconds. Persisting it
# with client.persist is a good choice here: it avoids paying that loading
# time again every time we want to process the data.

# Keep only the user id, song id and interaction flag, and hold that subset
# in cluster memory (a failed task is retried up to twice).
user_item = df_interactions[['msno', 'song_id', 'interacted']]
user_item = client.persist(user_item, retries=2)
# One row per (user, song) pair: the maximum of 'interacted' for that pair.
user_item_grouped = user_item.groupby(['msno', 'song_id'])['interacted'].max()

# Pull the grouped result back to the local process.
user_item_grouped_pd = client.compute(user_item_grouped).result()
# Bare expression: only shows the type when run interactively.
type(user_item_grouped_pd)

dtype_items = {'song_length': np.int32, 'language': np.float32}

df_items = pd.read_csv('items2.csv', dtype=dtype_items)
# NOTE(review): `df_inter` is not defined in the visible code; it may be meant
# to be `df_interactions` (or a pandas copy of it) - confirm.
df_inter['date'] = 0
#MAKE RECOMMENDATIONS
rec = robo.Recommender(df_items=df_items,
                       df_reviews=df_inter,
                       user_item_grouped=user_item_grouped_pd,
                       item_name_colname='name',
Beispiel #8
0
    def image_tikhonov(self,
                       vis_arr,
                       sphere,
                       alpha,
                       scale=True,
                       usedask=False):
        """Solve the L2-regularised imaging problem and return the sky vector.

        Without dask, the operator from ``self.make_gamma(sphere)`` is fitted
        against ``vis_to_real(vis_arr)`` with a scikit-learn ``Ridge`` model
        (``lsqr`` solver). With ``usedask=True`` the operator is built as a
        dask array from the baselines in ``self.u_arr`` / ``self.v_arr`` /
        ``self.w_arr`` and fitted with ``dask_ml``'s ``LinearRegression``
        using an L2 penalty on a local dask cluster.

        :param vis_arr: complex visibilities
        :param sphere: sky model; provides ``pixels``, ``l``, ``m``, ``n``,
            ``npix`` and ``set_visible_pixels``
        :param alpha: regularisation strength
        :param scale: currently unused; the pixels are always set with
            ``scale=False``
        :param usedask: solve with dask / dask-ml instead of scikit-learn
        :return: the fitted coefficients reshaped to a column, shape ``(-1, 1)``
        """
        n_s = sphere.pixels.shape[0]

        lambduh = alpha / np.sqrt(n_s)
        if not usedask:
            gamma = self.make_gamma(sphere)
            logger.info("augmented: {}".format(gamma.shape))

            vis_aux = vis_to_real(vis_arr)
            logger.info("vis mean: {} shape: {}".format(
                np.mean(vis_aux), vis_aux.shape))

            tol = min(alpha / 1e4, 1e-10)
            logger.info("Solving tol={} ...".format(tol))

            reg = linear_model.Ridge(alpha=alpha,
                                     tol=tol,
                                     solver="lsqr",
                                     max_iter=100000)

            reg.fit(gamma, vis_aux)
            logger.info("    Solve Complete, iter={}".format(reg.n_iter_))

            sky = da.from_array(reg.coef_)

            residual = vis_aux - gamma @ sky

            # Evaluate the solution and both norms in a single compute call;
            # `sky` comes back as a concrete array.
            sky, residual_norm, solution_norm = da.compute(
                sky,
                np.linalg.norm(residual)**2,
                np.linalg.norm(sky)**2)

            score = reg.score(gamma, vis_aux)
            logger.info("Alpha: {}: Loss: {}: rnorm: {}: snorm: {}".format(
                alpha, score, residual_norm, solution_norm))

        else:
            from dask_ml.linear_model import LinearRegression
            # Import the submodule explicitly instead of relying on another
            # import having loaded it as a side effect.
            import dask_glm.regularizers
            from dask.distributed import Client, LocalCluster
            import dask

            logger.info("Starting Dask Client")

            # NOTE(review): the cluster and client are never closed, so every
            # call leaves a local cluster bound to dashboard port 8231 behind.
            cluster = LocalCluster(dashboard_address=":8231",
                                   processes=False)
            client = Client(cluster)

            logger.info("Client = {}".format(client))

            harmonic_list = []
            p2j = 2 * np.pi * 1.0j

            dl = sphere.l
            dm = sphere.m
            n_arr_minus_1 = sphere.n - 1

            # One row of the operator per baseline (u, v, w).
            for u, v, w in zip(self.u_arr, self.v_arr, self.w_arr):
                harmonic = da.from_array(
                    np.exp(p2j * (u * dl + v * dm + w * n_arr_minus_1)) /
                    np.sqrt(sphere.npix),
                    chunks=(n_s, ),
                )
                # Keep the persisted handle. The original assigned it to a
                # misspelled name and appended the unpersisted array, so the
                # persisted data was dropped straight away.
                harmonic = client.persist(harmonic)
                harmonic_list.append(harmonic)

            gamma = da.stack(harmonic_list)
            logger.info("Gamma Shape: {}".format(gamma.shape))
            gamma = gamma.conj()
            gamma = client.persist(gamma)

            logger.info("Gamma Shape: {}".format(gamma.shape))

            logger.info("Building Augmented Operator...")
            # Stack the real part on top of the imaginary part so the complex
            # problem becomes a real-valued one.
            proj_operator_real = da.real(gamma)
            proj_operator_imag = da.imag(gamma)
            proj_operator = da.block([[proj_operator_real],
                                      [proj_operator_imag]])

            proj_operator = client.persist(proj_operator)

            logger.info("Proj Operator shape {}".format(proj_operator.shape))
            vis_aux = da.from_array(
                np.array(
                    np.concatenate((np.real(vis_arr), np.imag(vis_arr))),
                    dtype=np.float32,
                ))

            # L2 penalty. An ElasticNet regulariser used to be built here and
            # was immediately overwritten, so it has been removed.
            en = dask_glm.regularizers.L2()

            dask.config.set({"array.chunk-size": "1024MiB"})
            A = da.rechunk(proj_operator, chunks=("auto", n_s))
            A = client.persist(A)
            y = client.persist(vis_aux)

            logger.info("Rechunking completed.. A= {}.".format(A.shape))
            reg = LinearRegression(
                penalty=en,
                C=1.0 / lambduh,
                fit_intercept=False,
                solver="lbfgs",
                max_iter=1000,
                tol=1e-8,
            )
            reg.fit(A, y)
            sky = reg.coef_
            score = reg.score(proj_operator, vis_aux)
            logger.info("Loss function: {}".format(score.compute()))

        logger.info("Solving Complete: sky = {}".format(sky.shape))

        sphere.set_visible_pixels(sky, scale=False)
        return sky.reshape(-1, 1)
Beispiel #9
0
def run_complete_hg38_variant_processing_pipeline(
        indir: str = None,
        effect_dir: str = None,
        meta_dir: str = None,
        client: Client = None) -> List[Future]:
    """
    Run the variant processing pipeline step for all hg38 build chromosomes.
    Basically a wrapper for run_variant_processing_pipeline but also saves the results
    to disk.

    arguments
        indir:      directory filepath containing hg38 variant files
        effect_dir: output directory to save variant effects
        meta_dir:   output directory to save variant metadata
        client:     dask client object; when None the current client is looked
                    up with get_client()

    returns
        a list with one dict per GVF file found in indir. Dicts have the
        following format:
            dict:
                effects:  future containing filepath to processed effects
                metadata: future containing filepath to processed metadata
    """

    ## This shouldn't be necessary since this function should never be called when
    ## a mouse genome build is specified but w/e
    ## (named `settings` so the builtin globals() is not shadowed)
    settings = Globals().reinitialize(build='hg38')

    if indir is None:
        indir = settings.dir_variant_raw

    if effect_dir is None:
        effect_dir = settings.dir_variant_effects

    if meta_dir is None:
        meta_dir = settings.dir_variant_meta

    client = get_client() if client is None else client

    futures = []

    ## The input directory should have a bunch of GVF files if no custom output filenames
    ## were used
    for fp in glob(f'{indir}/*.gvf'):

        log._logger.info(f'Starting work on {fp}')

        ## Forward the resolved client so the per-file pipeline runs on the
        ## same cluster that persists its results below; without it the callee
        ## falls back to get_client() and may pick a different client
        results = run_variant_processing_pipeline(fp, client=client)

        ## Persist the processed dataframes onto the cluster workers
        effects = client.persist(results['effects'])
        metadata = client.persist(results['metadata'])

        ## Both outputs share the input's base name with a .tsv suffix
        out_name = Path(Path(fp).stem).with_suffix('.tsv')

        ## Save the processed dataframes in the background
        effect_future = dfio.save_dataframe_async(
            effects, Path(effect_dir, out_name).as_posix())

        meta_future = dfio.save_dataframe_async(
            metadata, Path(meta_dir, out_name).as_posix())

        futures.append({'effects': effect_future, 'metadata': meta_future})

    return futures
def main(
        endpoint: str = endpoint,
        port: int = 443,
        protocol: str = "https",
        collection_name: str = collection_name,
        in_path: Path = in_path,
        api_key: str = os.getenv("TYPESENSE_API_KEY"),
        drop: bool = False,
):
    """Load every CSV file found under IN_PATH into a Typesense collection.

    The TYPESENSE_HOST environment variable, when set, takes precedence over
    ENDPOINT. With DROP enabled (and after confirmation) the existing
    collection is deleted and recreated before loading.
    """
    tprint("Reguleque")
    # Resolve the host once so the client configuration and the message below
    # always agree; the message used to print `endpoint` even when
    # TYPESENSE_HOST overrode it.
    host = os.getenv("TYPESENSE_HOST") or endpoint
    tsClient = ts.Client({
        "api_key":
        api_key
        or typer.prompt(LOG_PROMPT + " Typesense Admin API Key", type=str),
        "nodes": [{
            "host": host,
            "port": port,
            "protocol": protocol,
        }],
    })
    typer.echo(LOG_INFO +
               f" Connected to Typesense at {protocol}://{host}:{port}")

    daskClient = Client()
    typer.echo(
        LOG_INFO +
        f" Started cluster, you can monitor at {daskClient.dashboard_link}")

    # List all the files that need loading
    filepaths = list(in_path.rglob("**/*.csv"))
    typer.secho(LOG_INFO + f" Found {len(filepaths)} files to load.")

    try:
        # Drop pre-existing collection if any
        if drop:
            confirm_drop = typer.confirm(
                LOG_WARN +
                " Are you sure you want to delete all documents in the cluster and recreate the schema?"
            )
            if not confirm_drop:
                typer.echo(LOG_ERR + " Canceling execution.", err=True)
                raise typer.Abort()
            typer.echo(
                LOG_WARN +
                " Drop mode has been enabled, dropping all documents and recreating schema...",
                err=True,
            )
            # Best effort: the collection may simply not exist yet, in which
            # case there is nothing to delete.
            try:
                tsClient.collections[collection_name].delete()
            except Exception:
                pass
            # Create collection with the manual schema
            tsClient.collections.create(REVENUE_SCHEMA)
            typer.secho(LOG_INFO + " Created new schema.")

        # Load all files
        typer.secho(LOG_INFO + " Processing and uploading documents...")
        responses: List[List[str]] = []
        for filepath in filepaths:
            entries: List[dict] = process_file(filepath)
            response: List[str] = import_entries(entries, filepath, tsClient)
            responses.append(response)

        # NOTE(review): persist/gather only make sense if process_file and
        # import_entries return lazy dask objects - confirm.
        responses = daskClient.persist(responses)
        progress(responses)
        responses = daskClient.gather(responses)
        sleep(2)
        typer.secho(
            "\n" + LOG_INFO +
            f" Finished processing and uploading {len(filepaths)} documents.")
    except ts.exceptions.RequestUnauthorized:
        typer.echo(LOG_ERR + " Invalid API key or insufficient permissions.",
                   err=True)
        raise typer.Abort()

    # NOTE(review): everything below uses names that are not defined in this
    # function (`t`, `c`, `db`, `wait`, `load_msdata`, ...) and is unrelated to
    # the Typesense upload above. It looks like the body of a different script
    # that ended up indented under main() - confirm whether it belongs here.
    start1 = t.time()

    print('cpu_count: %d' % cpu_count())

    path_fmt = '/mnt/storage-ssd/luokaida/data/rawdata/3C129_pband_target.ms'
    all_path = [path_fmt for i in range(1)]
    print('the number of ms_files: %d\n' % len(all_path))

    files = db.from_sequence(all_path)

    avg_ms = files.map(load_msdata).map(time_average,
                                        avg_time=2).map(channel_average,
                                                        avg_channel=4)
    combine_ms = avg_ms.flatten().foldby(key=lambda x: x.tag,
                                         binop=combine_msdata)
    # groups = avg_ms.flatten().groupby(grouper=lambda x: x.tag)  #
    # example_source = avg_ms.flatten().filter(lambda x: x.tag == '0_2').fold(combine_msdata)

    print('Start computation in the background....... '
          )  # construct task graphs above
    all_avg_ms = c.persist(combine_ms)  # Persist dask collections on cluster
    res = c.gather(all_avg_ms)
    wait(res)  # persist()--->gather()
    # res = avg_ms.compute()  # Don't use .compute()

    end1 = t.time()
    # combine_ms.visualize(filename='flow_chart_20.pdf')
    print('\nUsing dask-xarray:map-reduce costs %.2f seconds\n' %
          (end1 - start1))
    print()  # put a breakpoint here
Beispiel #12
0
    '3166-1-alpha-3', 'DateTime'
]

# Drop the columns listed in remove_cols and replace missing values with 0.
covid19_merge1 = covid19_merge1.drop(remove_cols, axis=1)
covid19_merge1 = covid19_merge1.fillna(0)

#prepared_data =  covid19_merge1.copy()
#### 5th change: when run with workers that accept 8GB at most, memory consumption becomes excessive
#### We had 146 partitions; doubling their number halves the size of the current partitions
print("Partitions avant: ", covid19_merge1.npartitions)
covid19_merge1 = covid19_merge1.repartition(
    npartitions=covid19_merge1.npartitions * 2)
print("Partitions après: ", covid19_merge1.npartitions)

#print("covid19_merge1 partition :",covid19_merge1.npartitions)
# Keep the repartitioned frame in cluster memory for the encoding steps below.
prepared_data = client.persist(covid19_merge1)

## Encode the country
from dask_ml.preprocessing import LabelEncoder

encoded_countries = LabelEncoder().fit_transform(prepared_data.country_name)
prepared_data['country_name'] = encoded_countries

## Encode the date
# Dates are first formatted as 'YYYYMMDD' strings, then label-encoded.
dates = prepared_data.date.apply(lambda x: x.strftime('%Y%m%d'))
encoded_dates = LabelEncoder().fit_transform(dates)
prepared_data['date'] = encoded_dates

print(f"Le dataset contient {len(prepared_data)} enregistrements")
print("Sample dataset final:")
print(prepared_data.head(5))
Beispiel #13
0
def run_complete_hg38_annotation_pipeline(variant_dir: str = None,
                                          gene_fp: str = None,
                                          intergenic_dir: str = None,
                                          intragenic_dir: str = None,
                                          client: Client = None):
    """
    Annotate every processed hg38 variant file against the processed gene
    set and save the intergenic and intragenic results to files.

    Any path argument left as None falls back to the matching hg38 default
    taken from Globals.

    :param variant_dir:    directory containing processed variant effect TSV files
    :param gene_fp:        filepath to the processed gene metadata
    :param intergenic_dir: output directory for intergenic annotations
    :param intragenic_dir: output directory for intragenic annotations
    :param client:         dask client object; the current client is used when None
    :return: a list with one dict per variant file, holding the results of the
             save calls under the keys 'intergenic' and 'intragenic'
    """

    def _hg38_settings():
        ## Fresh hg38 settings object; only built when a default is needed
        return Globals().reinitialize(build='hg38')

    if variant_dir is None:
        variant_dir = _hg38_settings().dir_variant_effects

    if gene_fp is None:
        gene_fp = _hg38_settings().fp_gene_meta

    if intergenic_dir is None:
        intergenic_dir = _hg38_settings().dir_annotated_inter

    if intragenic_dir is None:
        intragenic_dir = _hg38_settings().dir_annotated_intra

    if client is None:
        client = get_client()

    ## One entry per input file, collected in processing order
    futures = []

    ## Read/parse processed genes into a dask dataframe
    gene_df = _read_processed_genes(gene_fp)

    log._logger.info(f'Reading files in {variant_dir}')

    ## The input directory should have variant effect TSV files if no custom output
    ## filenames were used and the pipeline has used default settings to this point
    for fp in glob(f'{variant_dir}/*.tsv'):

        log._logger.info(f'Working on {fp}')

        ## Read/parse processed variants and annotate them
        annotations = run_annotation_pipeline(_read_processed_variants(fp),
                                              gene_df)

        ## Keep both result sets on the cluster workers
        intergenic = client.persist(annotations['intergenic'])
        intragenic = client.persist(annotations['intragenic'])

        ## Outputs keep the input file's name
        out_name = Path(fp).name

        ## Save to files
        inter_future = dfio.save_dataframe_async(
            intergenic, Path(intergenic_dir, out_name).as_posix())
        intra_future = dfio.save_dataframe_async(
            intragenic, Path(intragenic_dir, out_name).as_posix())

        futures.append({
            'intergenic': inter_future,
            'intragenic': intra_future
        })

    return futures
Beispiel #14
0
for r in rows:
    for w in world:
        procs = int(math.ceil(w / TOTAL_NODES))
        print("procs per worker", procs, flush=True)

        assert procs <= 16
        
        stop_dask()
        start_dask(procs, min(w, TOTAL_NODES))
        
        client = Client("v-001:8786") 
        
        df_l = dd.read_csv(f"~/temp/twx/{scale}/{r}/{w}/csv1_*.csv").repartition(npartitions=w)
        df_r = dd.read_csv(f"~/temp/twx/{scale}/{r}/{w}/csv2_*.csv").repartition(npartitions=w)

        client.persist([df_l, df_r])

        print("left rows", len(df_l), flush=True)
        print("right rows", len(df_r), flush=True)

        try:
            for i in range(it):
                t1 = time.time()
                out = df_l.merge(df_r, on='0', how='inner', suffixes=('_left','_right')).compute()
                t2 = time.time()

                print(f"###time {r} {w} {i} {(t2 - t1)*1000:.0f}, {len(out)}", flush=True)
        
            client.restart()

            client.close()
# Open the HDF5 file read-only and wrap its '/x' dataset in a lazy dask array
# split into chunks of one million elements.
f = h5py.File(os.path.join('data', 'random.hdf5'), mode='r')
dset = f['/x']
import dask.array as da
x = da.from_array(dset, chunks=(1000000, ))

# Time the same reduction twice; at this point x is still lazy.
get_ipython().magic('time x.sum().compute()')
get_ipython().magic('time x.sum().compute()')

# If, instead, we persist the data to RAM up front (this takes a few seconds to complete - we could `wait()` on this process), then further computations will be much faster.

# In[12]:

# changes x from a set of delayed prescriptions
# to a set of futures pointing to data in RAM
# See this on the UI dashboard.
x = c.persist(x)

# In[13]:

# Same reduction as above, now on the persisted array.
get_ipython().magic('time x.sum().compute()')
get_ipython().magic('time x.sum().compute()')

# Naturally, persisting every intermediate along the way is a bad idea, because this will tend to fill up all available RAM and make the whole system slow (or break!). The ideal persist point is often at the end of a set of data cleaning steps, when the data is in a form which will get queried often.

# **Exercise**: how is the memory associated with `x` released, once we know we are done with it?

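# The memory is not released by calling `.compute()`. The cluster keeps the persisted chunks in RAM for as long as some client holds a reference to their futures, so the memory is released once the last reference to `x` goes away (for example with `del x`).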

# ## Debugging

# When something goes wrong in a distributed job, it is hard to figure out what the problem was and what to do about it. When a task raises an exception, the exception will show up when that result, or any other result that depends upon it, is gathered.
Beispiel #16
0
class LargeELMRegressor(_BaseELM, RegressorMixin):
    """ELM Regressor for larger-than-memory problems.

    Uses `Dask <https://dask.org>`_ for batch analysis of data in Parquet files.

    .. attention:: Why do I need Parquet files?

        Parquet files provide necessary information about the data without loading whole file content from
        disk. It makes a tremendous runtime difference compared to simpler `.csv` or `.json` file formats.
        Reading from files saves memory by loading data in small chunks, supporting arbitrary large input files.
        It also solves current memory leaks with Numpy matrix inputs in Dask.

        Any data format can be easily converted to Parquet, see `Analytical methods <techniques.html>`_ section.

        HDF5 is almost as good as Parquet, but performs worse with Dask due to internal data layout.

    .. todo: Write converters.

    .. todo: Memo about number of workers: one is good, several cover disk read latency but need more memory.
        On one machine matrix operators always run in parallel, do not benefit from Dask.

    .. todo: Memory consumption with large number of neurons - 100,000 neurons require 200GB of swap space, with
        read+write reaching 1GB/s. Suggested a fast SSD, or HDD + extra workers to hide swap latency.
        Mention that Dask is not the perfect solution, kept here for future updates. And it actually solves
        stuff larger than memory, albeit at a very high time+swap cost.

    .. todo: Avoid large batch sizes as workers can fail, safe bet is 2000-5000 range.

    .. todo: Fast HtH and in-place Cholesky solver.

    .. todo: Pro tip in documentation: run ELM with dummy 1000 data samples and 1e+9 regularization,
        This will test possible memory issues for workers without wasting your time on computing full HH.

    .. todo: Option to keep full HH permanently somewhere at disk. Saves before the final step,
        avoids failures from memory issues during Cholesky solver.

    .. todo: GPU + batch Cholesky solver, for both ELM and LargeELM.

    Requirements
    ------------
        * Pandas
        * pyarrow
        * python-snappy

    Parameters
    ----------

    batch_size : int
        Batch size used for both data samples and hidden neurons. With batch Cholesky solver, allows for very large
        numbers of hidden neurons of over 100,000; limited only by the computation time and disk swap space.

        .. hint:: Include bias and original features for best performance.

        ELM will include a bias term (1 extra feature), and the original features with `include_original_features=True`.
        For optimal performance, choose `batch_size` to be equal or evenly divide the
        `n_neurons + 1 (bias) + n_inputs (if include_original_features=True)`.

        .. todo:: Exact batch_size vs. GPU performance
    """
    def __del__(self):
        """Close the Dask client and the local cluster, if they were created."""
        # The client is closed before the cluster it is connected to. Each
        # attribute is checked on its own so that a setup which failed after
        # creating the cluster still releases it.
        for attr in ('client_', 'cluster_'):
            resource = getattr(self, attr, None)
            if resource is not None:
                resource.close()

    def _setup_dask_client(self):
        """Start a local Dask cluster and persist the hidden layer weights on it.

        Reads `hidden_layers_` and `bsize_`; sets `cluster_`, `client_` and `W_`.
        """
        # NOTE(review): the worker scratch directory is a hard-coded,
        # developer-specific path - consider making it configurable.
        self.cluster_ = LocalCluster(
            n_workers=4,
            threads_per_worker=1,
            local_dir="/Users/akusok/wrkdir/dask-temp",
            memory_limit="8GB")
        self.client_ = Client(self.cluster_)

        # One weight matrix per hidden layer, chunked by batch size and kept on the cluster.
        W_list = [hl.projection_.components_ for hl in self.hidden_layers_]
        W_dask = [da.from_array(_dense(W), chunks=self.bsize_) for W in W_list]
        self.W_ = self.client_.persist(W_dask)

        def foo():
            # Executed on every worker: sets OMP_NUM_THREADS=1 in its environment.
            import os
            os.environ['OMP_NUM_THREADS'] = '1'

        self.client_.run(foo)

        print("Running on:", self.client_)

        # The dashboard link is informational only, so failing to build it is
        # deliberately ignored. `Exception` (rather than a bare `except`) keeps
        # KeyboardInterrupt and SystemExit propagating.
        try:
            info = self.client_.scheduler_info()
            dashboard = info['address'].split(":")
            dashboard[0] = "http"
            dashboard[-1] = str(info['services']['dashboard'])
            print("Dashboard at", ":".join(dashboard))
        except Exception:
            pass

    def _project(self, X_dask):
        """Compute hidden layer output with Dask functionality.

        Columns of the result are ordered as: outputs of every hidden layer,
        then the original features (if `include_original_features`), then a
        bias column of ones. The result is rechunked to `bsize_`.
        """
        H_list = []
        for hl, W in zip(self.hidden_layers_, self.W_):
            if hl.hidden_layer_ == HiddenLayerType.PAIRWISE:
                H0 = X_dask.map_blocks(pairwise_distances,
                                       W,
                                       dtype=X_dask.dtype,
                                       chunks=(X_dask.chunks[0],
                                               (W.shape[0], )),
                                       metric=hl.pairwise_metric)
            else:
                XW_dask = da.dot(X_dask, W.transpose())
                if hl.ufunc_ is dummy:
                    H0 = XW_dask
                elif hl.ufunc_ is np.tanh:
                    H0 = da.tanh(XW_dask)
                else:
                    H0 = XW_dask.map_blocks(hl.ufunc_)
            H_list.append(H0)

        if self.include_original_features:
            H_list.append(X_dask)
        H_list.append(da.ones((X_dask.shape[0], 1)))

        H_dask = da.concatenate(H_list, axis=1).rechunk(self.bsize_)
        return H_dask

    def _compute(self, X, y, sync_every, HH=None, HY=None):
        """Computing matrices HH and HY, the actually long part.

        Accumulates `H'H` and `H'Y` over all (X, y) file pairs. When `sync_every`
        is not None, the running sums are persisted on the cluster every
        `sync_every` files and waited for before returning.

        .. todo: actually distributed computations that scatter batches of data file names,
            and reduce-sum the HH,HY matrices.
        """

        # processing files
        for i, (X_file, y_file) in enumerate(zip(X, y)):
            X_dask = dd.read_parquet(X_file).to_dask_array(lengths=True)
            Y_dask = dd.read_parquet(y_file).to_dask_array(lengths=True)
            H_dask = self._project(X_dask)

            if HH is None:  # first iteration
                HH = da.dot(H_dask.transpose(), H_dask)
                HY = da.dot(H_dask.transpose(), Y_dask)
            else:
                HH += da.dot(H_dask.transpose(), H_dask)
                HY += da.dot(H_dask.transpose(), Y_dask)
                if sync_every is not None and i % sync_every == 0:
                    wait([HH, HY])

            # synchronization
            if sync_every is not None and i % sync_every == 0:
                HH, HY = self.client_.persist([HH, HY])

        # finishing solution
        if sync_every is not None:
            wait([HH, HY])
        return HH, HY

    def _solve(self, HH, HY):
        """Compute output weights from HH and HY using Dask functionality.

        HH and HY are zero-padded up to a multiple of the batch size when needed;
        the padding rows are cut from the solution before it is returned.
        """
        # make HH/HY divisible by chunk size
        n_features, _ = HH.shape
        padding = 0
        if n_features > self.bsize_ and n_features % self.bsize_ > 0:
            print("Adjusting batch size {} to n_features {}".format(
                self.bsize_, n_features))
            padding = self.bsize_ - (n_features % self.bsize_)
            P01 = da.zeros((n_features, padding))
            P10 = da.zeros((padding, n_features))
            P11 = da.zeros((padding, padding))
            HH = da.block([[HH, P01], [P10, P11]])

            P1 = da.zeros((padding, HY.shape[1]))
            HY = da.block([[HY], [P1]])

        # rechunk, add `alpha` on the diagonal, and solve
        HH = HH.rechunk(
            self.bsize_) + self.alpha * da.eye(HH.shape[1], chunks=self.bsize_)
        HY = HY.rechunk(self.bsize_)

        B = da.linalg.solve(HH, HY, sym_pos=True)
        if padding > 0:
            B = B[:n_features]

        return B

    def fit(self, X, y=None, sync_every=10):
        """Fits an ELM with data in a bunch of files.

        Model will use the set of features from the first file.
        Same features must have same names across the whole dataset.

        .. todo: Check what happens if features are in different order or missing.

        Does **not** support sparse data.

        .. todo: Check if some sparse data would work.

        .. todo: Check that sync_every does not affect results

        .. todo: Add single precision

        .. todo: Parquet file format examples in documentation

        Original features and bias are added to the end of data, for easier rechunk-merge. This way full chunks
        of hidden neuron outputs stay intact.


        Parameters
        ----------

        X : [str]
            List of input data files in Parquet format.

        y : [str]
            List of target data files in Parquet format.

        sync_every : int or None
            Synchronize computations after this many files are processed. None for running without synchronization.
            Less synchronization improves run speed with smaller data files, but may result in large swap space usage
            for large data problems. Use smaller number for more frequent synchronization if swap space
            becomes a problem.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If X or y is not a list of file names, if their lengths differ, or if the
            first file's shape disagrees with a previous `fit`.
        """

        if not _is_list_of_strings(X) or not _is_list_of_strings(y):
            raise ValueError("Expected X and y as lists of file names.")

        if len(X) != len(y):
            raise ValueError(
                "Expected X and y as lists of files with the same length. "
                "Got len(X)={} and len(y)={}".format(len(X), len(y)))

        # read first file and get parameters
        X_dask = dd.read_parquet(X[0]).to_dask_array(lengths=True)
        Y_dask = dd.read_parquet(y[0]).to_dask_array(lengths=True)

        n_samples, n_features = X_dask.shape
        if hasattr(self, 'n_features_') and self.n_features_ != n_features:
            raise ValueError(
                'Shape of input is different from what was seen in `fit`')

        _, n_outputs = Y_dask.shape
        if hasattr(self, 'n_outputs_') and self.n_outputs_ != n_outputs:
            raise ValueError(
                'Shape of outputs is different from what was seen in `fit`')

        # set batch size, default is bsize=2000 or all-at-once with less than 10_000 samples
        self.bsize_ = self.batch_size
        if self.bsize_ is None:
            self.bsize_ = n_samples if n_samples < 10 * 1000 else 2000

        # init model if not fit yet
        if not hasattr(self, 'hidden_layers_'):
            self.n_features_ = n_features
            self.n_outputs_ = n_outputs

            X_sample = X_dask[:10].compute()
            self._init_hidden_layers(X_sample)
            self._setup_dask_client()

        HH, HY = self._compute(X, y, sync_every=sync_every)
        self.B = self._solve(HH, HY)
        self.is_fitted_ = True
        return self

    def predict(self, X):
        """Prediction works with both lists of Parquet files and numeric arrays.

        Parameters
        ----------

        X : array-like, [str]
            Input data as list of Parquet files, or as a numeric array.

        Returns
        -------
        Yh : array, shape (n_samples, n_outputs)
            Predicted values for all input samples.

            .. attention:: Returns all outputs as a single in-memory array!

                Danger of running out of memory for high-dimensional outputs, if a large set of input
                files is provided. Feed data in smaller batches in such case.
        """
        check_is_fitted(self, 'is_fitted_')

        if _is_list_of_strings(X):
            Yh_list = []

            # processing files
            for X_file in X:
                X_dask = dd.read_parquet(X_file).to_dask_array(lengths=True)
                H_dask = self._project(X_dask)
                Yh_list.append(da.dot(H_dask, self.B))

            Yh_dask = da.concatenate(Yh_list, axis=0)
            return Yh_dask.compute()

        else:
            X = check_array(X, accept_sparse=True)
            # Column order must match `_project`, which produced the matrices that
            # `B` was solved from: hidden layer outputs, then original features,
            # then the bias column. Any other order pairs features with the wrong
            # output weights.
            H = [hl.transform(X) for hl in self.hidden_layers_]
            if self.include_original_features:
                H.append(_dense(X))
            H.append(np.ones((X.shape[0], 1)))

            return np.hstack(H) @ self.B.compute()
Beispiel #17
0
    lon=da.ravel(lon)

    lon=lon.astype(int)
    lat=lat.astype(int)
    cm=cm.astype(int)

    Lat=lat.to_dask_dataframe()
    Lon=lon.to_dask_dataframe()
    CM=cm.to_dask_dataframe()

    df=dd.concat([Lat,Lon,CM],axis=1,interleave_partitions=False)

    cols = {0:'Latitude',1:'Longitude',2:'CM'}
    df = df.rename(columns=cols)
    
    df=client.persist(df)
    df2=delayed(df.groupby(['Longitude','Latitude']).CM.apply(countzero).reset_index())
    df3=df2.compute()
    #print(gc.collect())
    #print(df2)
    #df3=client.compute(df2)
    #print(df3)
    #tt=client.gather(df3)
    #print(tt)
    client.close()

    combs=[]
    for x in range(-89,91):
        for y in range(-179,181):
            combs.append((x, y))
        
# Keep only the PMIDs of papers whose publication year is greater than 2015,
# then materialise them as an in-memory frame of string ids.
papers = dd.read_csv('table_a01_articles.csv')
papers = papers[papers['Journal_JournalIssue_PubDate_Year'] > 2015][['PMID']]
papers['PMID'] = papers['PMID'].astype(str)
papers = papers.compute()

# Load the reference list, replace missing values with -1, and make both id
# columns strings before keeping the frame on the cluster.  TODO: subset
edges = dd.read_csv(
    'table_a14_reference_list.csv',
    low_memory=False,
    blocksize=128000000,
    dtype={'RefArticleId': 'object'},
).fillna(-1)
edges['PMID'] = edges['PMID'].astype(str)
edges['RefArticleId'] = edges['RefArticleId'].astype(str)
edges = client.persist(edges)

# First keep edges whose citing paper is in the selection ...
edges_ij_pmid = dd.merge(edges, papers, on=["PMID"]).compute()

# ... then keep edges whose cited paper is in the selection as well. The
# selection is matched against 'RefArticleId' through a renamed copy, so
# `papers` itself keeps its 'PMID' column.
edges_final = dd.merge(edges_ij_pmid,
                       papers.rename(columns={'PMID': 'RefArticleId'}),
                       on=['RefArticleId'])

# Paper nodes: reload all article columns for the same year filter, sorted by PMID.
papers = dd.read_csv('table_a01_articles.csv')
papers = papers[papers['Journal_JournalIssue_PubDate_Year'] > 2015]
papers['PMID'] = papers['PMID'].astype(str)
papers = papers.compute().sort_values(axis=0, by='PMID', ascending=True)

#Reindexing dictionary
Beispiel #19
0
    def image_tikhonov(self, vis_arr, sphere, alpha, scale=True, usedask=False):
        """Estimate sky pixel values from visibilities by regularised regression.

        The complex linear system is turned into a real one by stacking the real
        part of the operator (and of the visibilities) on top of the imaginary part.

        Parameters
        ----------
        vis_arr : array-like
            Complex visibilities. Presumably one value per entry of
            `self.u_arr` - TODO confirm against the caller.
        sphere : object
            Provides `pixels`, `l`, `m`, `n`, `npix` and `set_visible_pixels`.
        alpha : float
            Regularisation strength; divided by sqrt(number of pixels) before use.
        scale : bool
            Forwarded to `sphere.set_visible_pixels`.
        usedask : bool
            If False, fit scikit-learn's ElasticNet (l1_ratio=0.05, positive
            coefficients) on the in-memory operator from `self.make_gamma`.
            If True, build the operator as a dask array on a temporary local
            cluster and fit dask-ml's LinearRegression with an L2 penalty.

        Returns
        -------
        The fitted coefficients reshaped to a single column.
        """
        n_s = sphere.pixels.shape[0]

        lambduh = alpha/np.sqrt(n_s)
        if not usedask:
            gamma = self.make_gamma(sphere)
            logger.info("Building Augmented Operator...")
            proj_operator_real = np.real(gamma).astype(np.float32)
            proj_operator_imag = np.imag(gamma).astype(np.float32)
            # Intermediates are dropped as soon as possible to lower peak memory.
            gamma = None
            proj_operator = np.block([[proj_operator_real], [proj_operator_imag]])
            proj_operator_real = None
            proj_operator_imag = None
            logger.info('augmented: {}'.format(proj_operator.shape))

            vis_aux = np.array(np.concatenate((np.real(vis_arr), np.imag(vis_arr))), dtype=np.float32)
            logger.info('vis mean: {} shape: {}'.format(np.mean(vis_aux), vis_aux.shape))

            logger.info("Solving...")
            reg = linear_model.ElasticNet(alpha=lambduh, l1_ratio=0.05, max_iter=10000, positive=True)
            reg.fit(proj_operator, vis_aux)
            sky = reg.coef_

            score = reg.score(proj_operator, vis_aux)
            logger.info('Loss function: {}'.format(score))

        else:
            from dask_ml.linear_model import LinearRegression
            import dask_glm.regularizers
            import dask.array as da
            from dask.distributed import Client, LocalCluster
            import dask

            logger.info('Starting Dask Client')

            # A threaded in-process cluster is created for this call only. To use
            # an external scheduler instead, connect a Client to its address.
            cluster = LocalCluster(dashboard_address=':8231', processes=False)
            client = Client(cluster)
            try:
                logger.info("Client = {}".format(client))

                harmonic_list = []
                p2j = 2*np.pi*1.0j

                dl = sphere.l
                dm = sphere.m
                n_arr_minus_1 = sphere.n - 1

                # Loop-invariant normalisation factor.
                norm = np.sqrt(sphere.npix)

                for u, v, w in zip(self.u_arr, self.v_arr, self.w_arr):
                    harmonic = da.from_array(np.exp(p2j*(u*dl + v*dm + w*n_arr_minus_1)) / norm, chunks=(n_s,))
                    # Keep the persisted array; the original assigned it to a
                    # misspelled name and stacked the unpersisted one.
                    harmonic_list.append(client.persist(harmonic))

                gamma = da.stack(harmonic_list)
                logger.info('Gamma Shape: {}'.format(gamma.shape))
                gamma = gamma.conj()
                gamma = client.persist(gamma)

                logger.info('Gamma Shape: {}'.format(gamma.shape))

                logger.info("Building Augmented Operator...")
                proj_operator_real = da.real(gamma)
                proj_operator_imag = da.imag(gamma)
                proj_operator = da.block([[proj_operator_real], [proj_operator_imag]])

                proj_operator = client.persist(proj_operator)

                logger.info("Proj Operator shape {}".format(proj_operator.shape))
                vis_aux = da.from_array(np.array(np.concatenate((np.real(vis_arr), np.imag(vis_arr))), dtype=np.float32))

                # Only the L2 regulariser is used for the fit.
                en = dask_glm.regularizers.L2()

                # Every row block spans all n_s pixel columns.
                dask.config.set({'array.chunk-size': '1024MiB'})
                A = da.rechunk(proj_operator, chunks=('auto', n_s))
                A = client.persist(A)
                y = client.persist(vis_aux)

                logger.info("Rechunking completed.. A= {}.".format(A.shape))
                reg =  LinearRegression(penalty=en, C=1.0/lambduh,
                                        fit_intercept=False,
                                        solver='lbfgs',
                                        max_iter=1000, tol=1e-8 )
                reg.fit(A, y)
                sky = reg.coef_
                score = reg.score(proj_operator, vis_aux)
                logger.info('Loss function: {}'.format(score.compute()))
            finally:
                # Release the temporary cluster even if the fit fails.
                client.close()
                cluster.close()

        logger.info("Solving Complete: sky = {}".format(sky.shape))

        # Honour the caller's `scale` argument (it was previously ignored).
        sphere.set_visible_pixels(sky, scale=scale)
        return sky.reshape(-1,1)
# Baseline regressor: histogram tree method with loss-guided growth. Trees are
# bounded by leaf count (max_leaves=4) while max_depth is set to 0.
base_model = dxgb.XGBRegressor(
    objective='reg:squarederror',
    tree_method='hist',
    grow_policy='lossguide',
    max_depth=0,
    max_leaves=4,
    n_estimators=1000,
    learning_rate=0.010,
    n_jobs=-1,
    verbosity=3,
)

# Fit with joblib's dask backend active; the targets are flattened first.
with joblib.parallel_backend('dask'):
    base_model.fit(X_train, y_train.flatten())

# Predict on the test set and keep the result on the cluster.
predictions = client.persist(base_model.predict(X_test))

# Write the predictions out as CSV, using the test entry of the chosen round
# both for the column names and for the output name.
p = predictions.to_dask_dataframe(columns=rounds[choice]["test"])
p.to_csv("my_result_for_%s_SubCh2" % (rounds[choice]["test"][0]))

#parameters_for_testing = {
#    'colsample_bytree':[0.4,0.6,0.8],
#    'gamma':[0,0.03,0.1,0.3],
#    'min_child_weight':[1.5,6,10],
#    'learning_rate':[0.1,0.07],
#    'max_depth':[3,5],
Beispiel #21
0
# Locate every gzipped listings file under the data directory.
datadir = getDirectory("/data")
filenames = getFiles("*listings.csv.gz", datadir)

for fname in filenames:

    # Any failure is printed and the loop moves on to the next file.
    try:
        # Lazily read the gzipped CSV.
        df = dd.read_csv(
            fname,
            dtype=dtypes,
            compression="gzip",
            engine="python",
            encoding='utf-8',
            assume_missing=True,
            sample=1024 * 1024,
            error_bad_lines=False,
        )
        # Keep only the selected fields and load them on the cluster.
        data = client.persist(df[selected])
        # Collapse to one partition and write it as CSV: missing values are
        # written as 'nan' and the index is omitted.
        merged = data.repartition(npartitions=1)
        merged.to_csv(str(fname) + '_*.csv', na_rep='nan', index=False)

    except Exception as e:
        print(e)

print("Done")
Beispiel #22
0
class LightGBMDaskLocal:
    # https://github.com/Nixtla/mlforecast/blob/main/nbs/distributed.forecast.ipynb
    """
    persist call: data = self.client.persist(data)
    (assignment replaces old lazy array, as persist does not change the
    input in-place)

    To reduce the risk of hitting memory limits,
    consider restarting each worker process before running any data loading or training code.
    self.client.restart()
        - This function will restart each of the worker processes, clearing out anything
        they’re holding in memory. This function does NOT restart the actual machines of
        your cluster, so it runs very quickly.
        - should the workers just be killed regardless of whether the whole process
        was successful or unsuccessful (sort of a clean up action)? can restarting
        be that cleanup action?

    loop over hyperparameter values (method that accepts hyperparameters as a dictionary -
        initializes self.model = DaskLGBMRegressor() with each set of parameters and
        calls the method that loops over )
    loop over train-valdation sets
    run model's fit method and compute predicted values and RMSE
    """
    def __init__(
        self,
        curr_dt_time,
        n_workers,
        s3_path,
        startmonth,
        n_months_in_first_train_set,
        n_months_in_val_set,
        frac=None,
    ):
        """Start a local Dask cluster and persist the prepared dataset on it.

        Parameters
        ----------
        curr_dt_time
            Stored as-is on the instance.
        n_workers : int
            Number of workers of the local cluster; the constructor waits
            until that many workers are available.
        s3_path : str
            Location of the Parquet dataset passed to `dd.read_parquet`.
        startmonth, n_months_in_first_train_set, n_months_in_val_set
            Stored as-is on the instance.
        frac : float, optional
            Fraction of rows to sample from the dataset; None means 1.0.

        Notes
        -----
        If loading or persisting the dataset fails, the exception is logged,
        the workers are restarted and the process exits with status 1.
        """
        self.curr_dt_time = curr_dt_time
        self.startmonth = startmonth
        self.n_months_in_first_train_set = n_months_in_first_train_set
        self.n_months_in_val_set = n_months_in_val_set
        self.frac = frac if frac is not None else 1.0

        cluster = LocalCluster(n_workers=n_workers)
        self.client = Client(cluster)
        self.client.wait_for_workers(n_workers)
        print(f"***VIEW THE DASHBOARD HERE***: {cluster.dashboard_link}")

        # The debug messages below need full passes over the data (row counts,
        # index min/max), so they are only evaluated when DEBUG logging is on.
        debug_enabled = logging.getLogger().isEnabledFor(logging.DEBUG)

        try:
            # NOTE: an earlier variant that did the type casts inside
            # map_partitions was abandoned because it did not work without a
            # meta= argument listing every column.

            # create Dask dataframe from partitioned Parquet dataset on S3
            self.full_dataset = dd.read_parquet(s3_path,
                                                index=False,
                                                engine="pyarrow").sample(
                                                    frac=self.frac,
                                                    random_state=42)
            self.full_dataset["sale_date"] = self.full_dataset[
                "sale_date"].astype("datetime64[ns]")
            self.full_dataset[
                "sid_shop_item_qty_sold_day"] = self.full_dataset[
                    "sid_shop_item_qty_sold_day"].astype("int16")
            # every column whose name starts with "cat" is stored as int16
            for col in self.full_dataset:
                if col.startswith("cat"):
                    self.full_dataset[col] = self.full_dataset[col].astype(
                        "int16")

            if debug_enabled:
                logging.debug(
                    f"# of rows in full dataframe before removal of negative target values: {len(self.full_dataset)}"
                )
            # drop rows with a negative target value
            self.full_dataset = self.full_dataset[
                self.full_dataset.sid_shop_item_qty_sold_day >= 0]

            # Index by sale date; partition_size is the desired size of each
            # partition in bytes (used together with npartitions='auto').
            # https://docs.dask.org/en/latest/generated/dask.dataframe.DataFrame.set_index.html
            self.full_dataset = self.full_dataset.set_index(
                "sale_date",
                sorted=False,
                npartitions="auto",
                partition_size=100_000_000,
            )

            self.full_dataset = self.cull_empty_partitions(self.full_dataset)

            # persist on the cluster and block until the data is loaded
            self.full_dataset = self.client.persist(self.full_dataset)
            _ = wait([self.full_dataset])
            if debug_enabled:
                logging.debug(
                    f"# of rows in full dataframe after removal of negative target values: {len(self.full_dataset)}"
                )
                logging.debug(
                    f"Earliest and latest dates in full dataframe are : {dd.compute(self.full_dataset.index.min(), self.full_dataset.index.max())}"
                )
                logging.debug(
                    f"Data types of full Dask dataframe are: {self.full_dataset.dtypes}"
                )

        except Exception:
            logging.exception(
                "Exception occurred while creating Dask dataframe and persisting it on the cluster."
            )
            # kill all active work, delete all data on the network, and restart the worker processes.
            self.client.restart()
            sys.exit(1)

        # finally:
        #     self.client.restart()
        #     sys.exit(1)

        # https://stackoverflow.com/questions/58437182/how-to-read-a-single-large-parquet-file-into-multiple-partitions-using-dask-dask
        # Parquet datasets can be saved into separate files.
        # Each file may contain separate row groups.
        # Dask Dataframe reads each Parquet row group into a separate partition.

        # I DON'T WANT TO KEEP THE NUMPY ARRAY IN MEMORY, SO IT NEEDS TO BE
        # DELETED AFTER DASK ARRAY IS CREATED
        # MIGHT BE BETTER TO CREATE DASK ARRAY FROM FILE ON S3, TO AVOID
        # HAVING BOTH NUMPY ARRAY AND PERSISTED DASK ARRAY IN MEMORY
        # I ALSO WANT TO SPLIT THAT NUMPY ARRAY INTO MULTIPLE TRAIN AND VALIDATION
        # SETS, SO WHAT'S THE BEST WAY TO DO THAT?
        # SEND THE ENTIRE ARRAY TO THE CLUSTER AT ONCE - PROBABLY NOT, OR
        # SEND TRAIN AND VALIDATION SETS ONE BY ONE AND DELETE?
        # BUT THAT WILL REQUIRE SENDING DATA TO THE CLUSTER MULTIPLE TIMES -
        # NOT IF THE DATA BEING SENT ARE DIFFERENT EACH TIME
        # THEY ARE NOT GOING TO BE COMPLETELY DIFFERENT BECAUSE TRAIN DATA WILL
        # JUST CONTINUE TO MERGE WITH VALIDATION SETS AND GROW
        # CREATE FIRST DASK ARRAY AND SEND TO CLUSTER, THEN APPEND TO IT?
        # IT DOES NOT LOOK LIKE DASK WOULD ALLOW THAT (SEE
        # https://github.com/dask/distributed/issues/1676 -
        # "You should also be aware that the task/data model underlying dask
        # arrays is immutable. You should never try to modify memory in-place.")
        # SO PROBABLY SEND ALL OF THE DATA TO THE CLUSTER AT THE BEGINNING,
        # THEN TAKE CHUNKS OF IT FOR WALK-FORWARD VALIDATION

        # PROBABLY SHOULD RELY ON LOADING DATA FROM FILE USING DELAYED /
        # FROM_DELAYED
        # SEE https://stackoverflow.com/questions/45941528/how-to-efficiently-send-a-large-numpy-array-to-the-cluster-with-dask-array)

        # can I use a function to read multiple files into one Dask array?

        # either figure out how to read multiple files (saved on S3) into one
        # Dask array, or
        # figure out how to save one array of PCA results to S3 (need disk space
        # to save it locally before transfer to S3 and need a method that can
        # handle transfer of more than 5GB - multipart transfer to S3)

        # try to write PCA-transformed data directly to zarr array (stored in memory)
        # then upload it to S3 (directly from memory)
        # then create dask array from that zarr array in S3

        # try to write PCA-transformed data to xarray then upload it to S3 as zarr

        # save numpy array to parquet file, upload that file to S3 (using upload_file),
        # then read that file into a Dask dataframe
        # write data to parquet on S3 from pandas dataframe and append to it using awswrangler library?
        # (https://github.com/awslabs/aws-data-wrangler/blob/main/tutorials/004%20-%20Parquet%20Datasets.ipynb)
        # df = dd.read_parquet('s3://bucket/my-parquet-data')
        # (https://docs.dask.org/en/latest/generated/dask.dataframe.read_parquet.html#dask.dataframe.read_parquet)
        # from above link:
        # engine argument: If ‘pyarrow’ or ‘pyarrow-dataset’ is specified, the ArrowDatasetEngine (which leverages the pyarrow.dataset API) will be used.
        # read partitioned parquet dataset with Dask:
        # https://stackoverflow.com/questions/67222212/read-partitioned-parquet-dataset-written-by-spark-using-dask-and-pyarrow-dataset

    # def cast_types(self, df):
    #     df = df.copy()
    #     df['sale_date'] = df["sale_date"].astype(
    #         "datetime64[ns]"
    #     )
    #     for col in df:
    #         if col.startswith("cat") or (col == "sid_shop_item_qty_sold_day"):
    #             df[col] = df[col].astype("int16")
    #     return df
    #
    # def drop_neg_qty_sold(self, df):
    #     return df[df.sid_shop_item_qty_sold_day >= 0].copy()

    # function from https://stackoverflow.com/questions/47812785/remove-empty-partitions-in-dask
    def cull_empty_partitions(self, ddf):
        """Return `ddf` without its zero-length partitions.

        Partition lengths are computed eagerly. If no partition is empty the
        input is returned unchanged; otherwise a new dataframe is assembled
        from the non-empty delayed partitions, with the last empty partition
        passed as `meta`.
        """
        sizes = list(ddf.map_partitions(len).compute())
        parts = ddf.to_delayed()

        empty_positions = [pos for pos, size in enumerate(sizes) if size == 0]
        if not empty_positions:
            return ddf

        kept = [parts[pos] for pos, size in enumerate(sizes) if size != 0]
        template = ddf.get_partition(empty_positions[-1])
        return dd.from_delayed(kept, meta=template)

    def gridsearch_wfv(self, params):
        """Grid-search LightGBM hyperparameters with walk-forward validation.

        For every combination in the Cartesian product of ``params`` a
        ``DaskLGBMRegressor`` is built and run through every split yielded by
        ``self.train_test_time_split()``. Per-combination results are kept in
        ``self.all_params_combs``; the combination with the lowest
        ``monthly_avg_rmse_`` is stored in ``self.best_params_`` and its score
        in ``self.best_score_``. The full result table is written to
        ``all_params_combs.csv`` and uploaded to S3.

        Args:
            params: Mapping of hyperparameter name to a list of candidate
                values.

        Side effects:
            Sets ``all_params_combs``, ``get_stats_``, ``params_comb_dict``,
            ``model``, ``best_score_`` and ``best_params_`` on ``self``. If
            the grid yields no combination, ``best_score_`` is ``None`` and
            ``best_params_`` is ``{}``. On a model-construction error the Dask
            client is restarted and the process exits with status 1.
        """
        self.all_params_combs = list()
        # Fold statistics are only requested when the grid holds exactly one
        # combination, i.e. every hyperparameter has a single candidate value.
        self.get_stats_ = all(len(values) == 1 for values in params.values())

        for combination in product(*params.values()):
            params_comb_dict = dict(zip(params.keys(), combination))
            # Working copy that also accumulates this combination's results;
            # the pristine dict is what gets passed to the model below.
            self.params_comb_dict = params_comb_dict.copy()
            self.params_comb_dict["rmse_list_"] = list()
            self.params_comb_dict["monthly_rmse_list_"] = list()
            self.params_comb_dict["fit_times_list_"] = list()
            try:
                self.model = lgb.DaskLGBMRegressor(
                    client=self.client,
                    random_state=42,
                    silent=False,
                    tree_learner="data",
                    force_row_wise=True,
                    **params_comb_dict,
                )
            except Exception:
                logging.exception(
                    "Exception occurred while initializing Dask model.")
                # kill all active work, delete all data on the network, and restart the worker processes.
                self.client.restart()
                sys.exit(1)

            # Loop over the train-validation sets for this combination.
            # NOTE(review): the report filename does not change between
            # combinations, so presumably each one replaces the previous
            # report - confirm this is intended.
            with performance_report(
                    filename=f"dask_report_{self.curr_dt_time}.html"):
                for train, test, get_stats in self.train_test_time_split():
                    self.fit(train).predict(test).rmse_all_folds(
                        test, get_stats)

            self.params_comb_dict["avg_rmse_"] = mean(
                self.params_comb_dict["rmse_list_"])
            self.params_comb_dict["monthly_avg_rmse_"] = mean(
                self.params_comb_dict["monthly_rmse_list_"])
            self.all_params_combs.append(self.params_comb_dict)

        if not self.all_params_combs:
            # Must be checked before min(): min() of an empty list raises.
            logging.debug(
                "List of parameter-result dictionaries is empty and was not converted to CSV!"
            )
            self.best_score_ = None
            self.best_params_ = {}
            return

        best_params = min(self.all_params_combs,
                          key=lambda x: x["monthly_avg_rmse_"])
        self.best_score_ = best_params["monthly_avg_rmse_"]
        # Keep only genuine hyperparameters (drop rmse_list_, avg_rmse_, etc.).
        self.best_params_ = {
            k: v
            for k, v in best_params.items() if k in params
        }

        # Save the list of parameter-result dictionaries to CSV, then to S3.
        all_params_combs_df = pd.DataFrame(self.all_params_combs)
        output_csv = "all_params_combs.csv"
        all_params_combs_df.to_csv(output_csv, index=False)

        try:
            key = f"lightgbm_all_params_combs_{self.curr_dt_time}.csv"
            s3_client = boto3.client("s3")
            s3_client.upload_file(output_csv, "sales-demand-data", key)
            logging.info(
                "Name of CSV uploaded to S3 and containing all parameter combinations "
                f"and results is: {key}")
        except ClientError:
            # Best effort: the local CSV still exists if the upload fails.
            logging.exception(
                "CSV file with LightGBM parameter combinations and results was not copied to S3."
            )

        # Design notes kept from the original author:
        # probably do the opposite:
        # loop over train-validation splits (persisting that data in memory)
        # and run different models on one
        # split, saving the results that can later be aggregated
        #
        # is it possible to read the full range of dates needed for time
        # series validation and then drop/delete rows from array or
        # move some rows to another array:
        # start with July-September (train) + October (validation),
        # then remove October and move September from train to validation

    # def time_split(self):
    #     return (
    #         self.full_dataset.loc[:self.end_date],
    #         self.full_dataset.loc[self.end_date + timedelta(days=1):self.end_date + relativedelta(months=self.n_months_in_val_set, day=31)]
    #         # self.full_dataset[date > self.end_date & date <= self.end_date + relativedelta(months=n_months_in_val_set, day=31)]
    #         # less than or equal to last day of month currently used for validation
    #     )

    def train_test_time_split(self):
        """Yield ``(train, validation, get_stats)`` for each walk-forward fold.

        The training window always starts at the beginning of
        ``self.full_dataset`` and grows by one month per fold; the validation
        window starts the day after the training window ends.

        Worked example from the original notes (assuming ``month_counter``
        returns the number of months between the start month and the last
        month of data - TODO confirm): first month July 2015, 1 month in the
        first train set, 2 months per validation set, 3 months between
        July and October 2015 -> (3 - 1 + 1) - (2 - 1) = 2 folds.

        Yields:
            Tuples of (train slice, validation slice, flag). The flag is true
            only on the last fold, and only when ``self.get_stats_`` is set.
        """
        months_available = month_counter(
            self.startmonth)  # self.startmonth is e.g. July 1, 2015
        n_val_sets = (months_available - self.n_months_in_first_train_set +
                      1) - (self.n_months_in_val_set - 1)
        last_fold = n_val_sets - 1

        for m in range(n_val_sets):
            # day=31 makes relativedelta land on the last day of the month.
            train_end = self.startmonth + relativedelta(
                months=m + self.n_months_in_first_train_set - 1, day=31)
            get_stats = (m == last_fold) if self.get_stats_ else False

            val_start = train_end + timedelta(days=1)
            val_end = train_end + relativedelta(
                months=self.n_months_in_val_set, day=31)

            train = self.full_dataset.loc[:train_end]
            validation = self.full_dataset.loc[val_start:val_end]
            yield (train, validation, get_stats)

    def get_sample_weights(self, train):
        """Build per-row sample weights from the target column of ``train``.

        Rows whose ``sid_shop_item_qty_sold_day`` equals 0 get the weight
        stored under ``self.params_comb_dict['weight_for_zeros']``; every
        other row gets weight 1.

        Returns:
            The array produced by ``da.where`` over the float32 target values.
        """
        qty_sold = train["sid_shop_item_qty_sold_day"].to_dask_array(
            lengths=True)
        qty_sold = qty_sold.astype('float32')
        is_zero = qty_sold == 0
        zero_weight = self.params_comb_dict['weight_for_zeros']
        return da.where(is_zero, zero_weight, 1.)

    def fit(self, train):
        """Fit ``self.model`` on one training fold and record the fit time.

        Feature columns are those whose name starts with ``"pc"`` or
        ``"cat"``; the ``"cat"`` ones are also passed as categorical
        features. The target is ``sid_shop_item_qty_sold_day`` and the sample
        weights come from ``self.get_sample_weights``.

        Args:
            train: Training fold; iterating it must yield column names.

        Returns:
            ``self``, so that calls can be chained.

        On any error (including the model not reporting itself as fitted) the
        exception is logged, the Dask client is restarted and the process
        exits with status 1.
        """
        try:
            start_time = time.perf_counter()
            # Compute the column selections once and reuse them everywhere.
            feature_cols = [
                col for col in train if col.startswith(("pc", "cat"))
            ]
            categorical_cols = [
                col for col in feature_cols if col.startswith("cat")
            ]
            target = train["sid_shop_item_qty_sold_day"]

            logging.debug(
                f"train X dtypes are {train[feature_cols].dtypes}"
            )
            logging.debug(f"train y type is {target.dtype}")
            self.model.fit(
                train[feature_cols].to_dask_array(lengths=True),
                target.to_dask_array(lengths=True),
                sample_weight=self.get_sample_weights(train),
                feature_name=feature_cols,
                categorical_feature=categorical_cols,
            )
            # Explicit check instead of ``assert`` so it still runs when
            # Python is started with -O (which strips assert statements).
            if not self.model.fitted_:
                raise RuntimeError(
                    "Model does not report itself as fitted after fit().")
            self.params_comb_dict["fit_times_list_"].append(
                time.perf_counter() - start_time)

            return self

        except Exception:
            logging.exception(
                "Exception occurred while fitting model on train data during walk-forward validation."
            )
            # kill all active work, delete all data on the network, and restart the worker processes.
            self.client.restart()
            sys.exit(1)

    def predict(self, test):
        """Predict on ``test`` with ``self.model`` and store the result.

        Only columns whose name starts with ``"pc"`` or ``"cat"`` are passed
        to the model. The prediction is saved in ``self.y_pred``.

        Returns:
            ``self``, so that calls can be chained.

        On any error the exception is logged, the Dask client is restarted
        and the process exits with status 1.
        """
        try:
            feature_cols = [
                name for name in test if name.startswith(("pc", "cat"))
            ]
            features = test[feature_cols]
            self.y_pred = self.model.predict(features)
        except Exception:
            logging.exception(
                "Exception occurred while computing predicted values on the test data."
            )
            # kill all active work, delete all data on the network, and restart the worker processes.
            self.client.restart()
            sys.exit(1)
        return self

    def rmse_all_folds(self, test, get_stats):
        """Score the current predictions on ``test`` and record both metrics.

        Appends the value of ``calc_rmse`` to
        ``self.params_comb_dict["rmse_list_"]`` and the value of
        ``calc_monthly_rmse`` to ``self.params_comb_dict["monthly_rmse_list_"]``.

        Args:
            test: Validation fold containing ``shop_id``, ``item_id`` and
                ``sid_shop_item_qty_sold_day``.
            get_stats: Flag forwarded unchanged to ``calc_rmse``.

        On any error the exception is logged, the Dask client is restarted
        and the process exits with status 1.
        """
        try:
            # Overall RMSE: the target as an array with known chunk lengths
            # against the predictions after their chunk sizes are computed.
            y_true = test["sid_shop_item_qty_sold_day"].to_dask_array(
                lengths=True)
            y_pred_sized = self.y_pred.compute_chunk_sizes()
            fold_rmse = calc_rmse(y_true, y_pred_sized, get_stats)
            self.params_comb_dict["rmse_list_"].append(fold_rmse)

            # Monthly RMSE works from the identifying columns plus the target.
            monthly_frame = test[[
                "shop_id", "item_id", "sid_shop_item_qty_sold_day"
            ]]
            monthly_rmse = calc_monthly_rmse(monthly_frame, self.y_pred)
            self.params_comb_dict["monthly_rmse_list_"].append(monthly_rmse)

        except Exception:
            logging.exception(
                "Exception occurred while computing RMSE on the test data.")
            # kill all active work, delete all data on the network, and restart the worker processes.
            self.client.restart()
            sys.exit(1)

    def refit_and_save(self, model_path):
        """Refit on the full dataset with the best parameters and upload it.

        A new ``DaskLGBMRegressor`` is built from ``self.best_params_``,
        fitted on ``self.full_dataset``, its booster is saved to a local file
        named after the last ``/``-separated component of ``model_path``, and
        that file is uploaded to the ``sales-demand-data`` S3 bucket under
        the same name.

        Saving approach follows:
        https://stackoverflow.com/questions/55208734/save-lgbmregressor-model-from-python-lightgbm-package-to-disc/55209076

        On any error the exception is logged, the Dask client is restarted
        and the process exits with status 1.
        """
        try:
            self.best_model = lgb.DaskLGBMRegressor(
                client=self.client,
                random_state=42,
                silent=False,
                tree_learner="data",
                force_row_wise=True,
                **self.best_params_,
            )

            data = self.full_dataset
            feature_cols = [
                col for col in data if col.startswith(("pc", "cat"))
            ]
            features = data[feature_cols].to_dask_array(lengths=True)
            target = data["sid_shop_item_qty_sold_day"].to_dask_array(
                lengths=True)
            weights = self.get_sample_weights(data)
            categorical_cols = [col for col in data if col.startswith("cat")]
            self.best_model.fit(
                features,
                target,
                sample_weight=weights,
                feature_name=feature_cols,
                categorical_feature=categorical_cols,
            )

            # The local file name doubles as the S3 object key.
            output_txt = str(model_path).split("/")[-1]
            self.best_model.booster_.save_model(output_txt)

            s3_client = boto3.client("s3")
            s3_client.upload_file(output_txt, "sales-demand-data", output_txt)
            logging.info(
                f"Name of saved model uploaded to S3 is: {output_txt}")

        except (Exception, ClientError):
            logging.exception(
                "Exception occurred while fitting model on the full dataset and saving the booster to file on S3."
            )
            # kill all active work, delete all data on the network, and restart the worker processes.
            self.client.restart()
            sys.exit(1)
Beispiel #23
0
    # Trim the input to a bit larger than the target period for the rolling
    # average, making sure we have full days
    ds = ds.sel(time=slice('19791201', '20100131T2300'))

    print("Analysing %.2f GB" % (ds.mx2t.nbytes / (1024**3)))

    # Pre-process the input timeseries, then trim to the target date range
    rolled = rolling_maximum(ds.mx2t).sel(time=slice('19800101', '20100101'))

    # Run a percentile on each day of the year
    doy_p90 = (rolled.groupby('time.dayofyear').reduce(dask_percentile,
                                                       dim='time',
                                                       q=90,
                                                       allow_lazy=True))

    # Convert to a Dataset and save the output
    doy_p90 = doy_p90.to_dataset(name='mx2t_doy_p90')
    future = client.persist(doy_p90.to_netcdf('mx2t_doy_p90.nc',
                                              compute=False))

    # Uncomment for a progress bar:
    # progress(future)
    future.compute()

    end = time.perf_counter()
    print()
    print("time", end - start)

    client.close()
from dask.datasets import timeseries
import time
from dask.dataframe.shuffle import shuffle
from dask.distributed import Client, wait

if __name__ == "__main__":
    # Connect to an already-running scheduler on the local machine.
    client = Client("127.0.0.1:8786")

    # One day of synthetic timeseries data, one partition per minute.
    minute_frame = timeseries(start='2000-01-01',
                              end='2000-01-02',
                              partition_freq='1min')

    # Task-based shuffle on the "id" column, persisted on the cluster;
    # block until it has finished.
    shuffled = client.persist(shuffle(minute_frame, "id", shuffle="tasks"))
    wait(shuffled)

    client.shutdown()
    # Short pause after asking the cluster to shut down.
    time.sleep(0.5)
def main(args):
    """Run the DPrepB/DPrepC demonstration pipeline on a Dask cluster.

    Loads two measurement sets channel by channel, images every channel on
    the cluster with ``dprepb_imaging``, optionally publishes QA records to a
    Kafka topic, then writes mean and standard-deviation moment images for
    the planes labelled I, Q, U and P = sqrt(Q**2 + U**2) under
    ``<args.outputs>/MOMENTS`` and packs that directory into a gzipped
    tarball. The elapsed wall-clock time is printed at the end.

    ``args`` is read for: ``outputs``, ``inputs``, ``inst``, ``queues``,
    ``plots``, ``uvcut``, ``pixels``, ``angres``, ``twod``, ``daskaddress``,
    ``channels``, ``ms1`` and ``ms2``.
    """
    # Banner, just to show that the code is running.
    print("")
    os.system(
        "printf 'A demonstration of a \033[5mDPrepB/DPrepC\033[m SDP pipeline\n'"
    )
    print("")

    # Output directory for the moment images; create both levels if needed.
    MOMENTS_DIR = args.outputs + '/MOMENTS'
    for directory in (args.outputs, MOMENTS_DIR):
        os.makedirs(directory, exist_ok=True)

    # Polarisation definition of the instrument.
    POLDEF = init_inst(args.inst)

    # Optional Confluent Kafka queue used to publish QA records.
    if args.queues:
        queue_settings = {
            'bootstrap.servers': 'scheduler:9092',
            'message.max.bytes': 100000000
        }  #10.60.253.31:9092
        from confluent_kafka import Producer
        import pickle
        sip_queue = Producer(queue_settings)

    def publish_qa(image):
        # Only ever called when args.queues is set, so the queue exists.
        sip_queue.produce('qa', pickle.dumps(qa_image(image), protocol=2))

    def gen_data(channel):
        # Per-channel argument array handed to dprepb_imaging. vis1, vis2,
        # npixel_advice and cell_advice are bound further down, before the
        # first call.
        return np.array([
            vis1[channel], vis2[channel], channel, None, None, False, False,
            args.plots,
            float(args.uvcut),
            float(args.pixels), POLDEF, args.outputs,
            float(args.angres), None, None, None, None, None, None, args.twod,
            npixel_advice, cell_advice
        ])

    def load_channels(ms_name):
        # One single-channel load per channel of the given measurement set.
        return [
            load('%s/%s' % (args.inputs, ms_name), range(ch, ch + 1), POLDEF)
            for ch in range(int(args.channels))
        ]

    # Set up the Dask cluster connection.
    starttime = t.time()

    dask.config.set(get=dask.distributed.Client.get)
    # scheduler for Docker container, localhost for P3.
    client = Client(args.daskaddress)

    print("Dask Client details:")
    print(client)
    print("")

    # Channel range for 1 subband, each containing 40 channels.
    channel_range = np.array(range(int(args.channels)))

    # Load the data into memory.
    # (The input data should be interfaced with Buffer Management.)
    print("Loading data:")
    print("")
    vis1 = load_channels(args.ms1)
    vis2 = load_channels(args.ms2)

    # Combine the first-channel snapshots, apply the uv-distance cut and
    # derive the imaging advice from the result.
    vis_advice = uv_cut(append_visibility(vis1[0], vis2[0]),
                        float(args.uvcut))
    npixel_advice, cell_advice = uv_advice(vis_advice, float(args.uvcut),
                                           float(args.pixels))

    # Scatter all the data in advance to the workers.
    # (The data here could be passed via Data Queues. Queues may not be
    # ideal. Data throughput challenges. Need to think more about the
    # optimum approach.)
    print("Scatter data to workers:")
    print("")
    big_job = [client.scatter(gen_data(channel)) for channel in channel_range]

    # One imaging task per channel.
    # (The dprepb_imaging function could generate QA, logging, and pass this
    # information via Data Queues. Queues work well for this. Python logging
    # calls are preferable. Send them to a text file on the node. Run
    # another service that watches that file. Or just read from standard
    # out. The Dockerisation will assist with logs.)
    futures = [
        client.submit(dprepb_imaging, job, pure=False, retries=3)
        for job in big_job
    ]

    print("Imaging on workers:")
    progress(futures)
    wait(futures)

    # Resubmit any task that ended in error.
    for index, future in enumerate(futures):
        if future.status != 'error':
            continue
        print("ERROR: Future", future, "has 'error' status, as:")
        print(client.recreate_error_locally(future))
        print("Rerunning...")
        print("")
        future.cancel()
        futures[index] = client.submit(dprepb_imaging,
                                       big_job[index],
                                       pure=False,
                                       retries=3)

    wait(futures)

    # Gather results from the futures.
    results = client.gather(futures, errors='raise')

    # Run QA on the imaging results and produce to queue.
    if args.queues:
        print("Adding QA to queue:")
        for result in results:
            publish_qa(result)

        sip_queue.flush()

    # Extract the data element of each result, as a Dask future.
    futures = [
        client.submit(arl_data_future, result, pure=False, retries=3)
        for result in results
    ]
    progress(futures)
    wait(futures)

    # Moment images: wrap every future in a small Dask array, stack them and
    # let Dask parallelise the reductions.
    print("")
    print("Calculating Moment images:")
    print("")
    arrays = [
        da.from_delayed(future,
                        dtype=np.dtype('float64'),
                        shape=(1, 4, 512, 512)) for future in futures
    ]
    stack = da.stack(arrays, axis=0)

    # Combine chunks to reduce overhead - is initially (40, 1, 4, 512, 512).
    stack = stack.rechunk((1, 1, 4, 64, 64))

    # Spread the data around on the cluster; from here on it is coordinated
    # by the single logical Dask array, 'stack'.
    stack = client.persist(stack)

    # Save the Moment images.
    # (The output moment images should be interfaced with Buffer Management.
    # Need to know more about the Buffer specification. Related to initial
    # data distribution also/staging.)
    print("Saving Moment images to disk:")
    print("")
    # The first channel's FITS image serves as the template for every output.
    image_template = import_image_from_fits('%s/imaging_dirty_WStack-%s.fits' %
                                            (args.outputs, 0))

    # Lazy selections of each output plane; nothing is computed here.
    plane_q = stack[:, :, 1, :, :]
    plane_u = stack[:, :, 2, :, :]
    planes = (
        ('I', stack[:, :, 0, :, :]),
        ('Q', plane_q),
        ('U', plane_u),
        ('P', da.sqrt(da.square(plane_q) + da.square(plane_u))),
    )

    # Mean images first, then standard-deviation images, each in the order
    # I, Q, U, P. Every image is sent to QA (when enabled) before export.
    for path_format, reduction in (('%s/Mean-%s.fits', 'mean'),
                                   ('%s/Std-%s.fits', 'std')):
        for label, plane in planes:
            image_template.data = getattr(plane, reduction)(axis=0).compute()
            if args.queues:
                publish_qa(image_template)
            export_image_to_fits(image_template,
                                 path_format % (MOMENTS_DIR, label))

    # Flush queue.
    if args.queues:
        sip_queue.flush()

    # Make a gzipped tarball of the moment images.
    tar_path = '%s/moment.tar' % (MOMENTS_DIR)
    subprocess.call(['tar', '-cvf', tar_path, '%s/' % (MOMENTS_DIR)])
    subprocess.call(['gzip', '-9f', tar_path])

    endtime = t.time()
    print(endtime - starttime)
# Remove the now redundant `day_of_year` and `time_of_day` columns
# and preview the first rows of the result:

uoreg_df = uoreg_df.drop(['day_of_year', 'time_of_day'], axis=1)
uoreg_df.head()

# Make the `ts` timestamp column the index, so that rows are ordered by time
# (presumably relying on dask's `set_index` sorting the data - TODO confirm),
# then preview both ends of the dataframe:

uoreg_df = uoreg_df.set_index('ts')
uoreg_df.head()

uoreg_df.tail()

# Task graph accumulated so far, before persisting:
uoreg_df.visualize()

# Save current dataframe in memory to avoid accumulating several operations on the dask graph
uoreg_df = client.persist(uoreg_df)

# Task graph again, after persisting, for comparison with the one above:
uoreg_df.visualize()

# ## Exploring ambient temperature data
#
# Plot the ambient temperature data to get an overview of it, to visually
# identify possible outliers, and to check the points highlighted by each
# quality control flag value.

# NOTE(review): the x values are computed eagerly but the y values are passed
# as a lazy dask series - confirm plotly handles that as intended.
data = [
    go.Scatter(x=uoreg_df.index.compute(), y=uoreg_df.ambient_temperature_1)
]
layout = go.Layout(title='Ambient temperature 1')
fig = go.FigureWidget(data, layout)
fig

# Distinct values taken by the first quality control flag:
uoreg_df.qlt_ctrl_flag_1.unique().compute()
Beispiel #27
0
def main():
    """Filter the NWATL21 subset and split it by vertical reference.

    Reads ``data/interim/NWATL21_subset`` (parquet), drops rows with
    ``Med_depth <= -10000`` and rows belonging to a fixed list of
    ``Group_Id`` values, then writes two HDF5 files holding the ``Lon``,
    ``Lat`` and ``Med_depth`` columns:

    * ``NWATL21_subset_msl.h5``: rows referenced to MSL/MWL, plus
      LLWLT-referenced rows with ``Med_depth < -200`` (no correction needed);
    * ``NWATL21_subset_llwlt.h5``: LLWLT-referenced rows with
      ``Med_depth >= -200`` (correction needed).
    """
    client = Client(processes=False)

    # read dask.DataFrame with extracted data
    df = dd.read_parquet('data/interim/NWATL21_subset')

    # remove data with Med_depth < -10000
    df = df.loc[df['Med_depth'] > -1e4]

    # remove data with certain Group_Id which have been identified to have
    # inconsistent depth values
    inconsistent_groups = [
        'ect18-38',
        'ch036l01',
        'c2207',
        'p885ns',
        'a2091l01',
        'kn151l4',
        'KJACK2006',
        'BROWNSBANK1996',
    ]
    df = df.loc[~df['Group_Id'].isin(inconsistent_groups)]

    # make sure all depths are negative
    # df['Med_depth'] = -1 * df['Med_depth'].abs()

    # compute up to here, keep results in memory
    df = client.persist(df)

    # Some observations are referenced to LLWLT and need to be corrected.
    # Therefore, the dataset is split up into two subsets for further
    # processing. It is further assumed that the tidal correction does not
    # affect the accuracy for observations with Med_depth < -200.

    # only select coordinates and Med_depth
    outcols = ['Lon', 'Lat', 'Med_depth']

    vertical_ref = df['Vertical_ref']
    is_msl = vertical_ref.isin(['MSL:2005', 'MSL:2006', 'MWL:2006'])
    is_llwlt = vertical_ref.isin(
        ['LLWLT:2005', 'LLWLT:2006', 'VER_DAT:LLWLT'])

    # observations relative to MSL or MWL - no correction needed.
    msldf = df.loc[is_msl | (is_llwlt & (df['Med_depth'] < -200)),
                   outcols].compute()

    # write output files; the context manager closes the store even if
    # put() raises.
    with pd.HDFStore('data/interim/NWATL21_subset_msl.h5') as store:
        store.put('df', msldf, data_columns=msldf.columns)

    # free the in-memory copy before computing the next subset
    del msldf

    # observations relative to LLWLT - correction needed.
    llwltdf = df.loc[is_llwlt & (df['Med_depth'] >= -200), outcols].compute()

    # write output files
    with pd.HDFStore('data/interim/NWATL21_subset_llwlt.h5') as store:
        store.put('df', llwltdf, data_columns=llwltdf.columns)

    del llwltdf