Example #1
import logging

from docopt import docopt
from dask.distributed import Client

# NB: run_checks_imported_issues is defined elsewhere in the surrounding module.


def main():
    arguments = docopt(__doc__)
    s3_canonical_bucket = arguments["--canonical-bucket"]
    local_dirs = arguments["--local-dirs"].split()
    thres = float(arguments["--thres"])
    output_dir = arguments["--output-dir"]
    workers = int(arguments["--workers"]) if arguments["--workers"] else 8

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
    )

    try:
        # create a dask client
        dask_client = Client(n_workers=workers)

        # NB here we check that scheduler and workers do have the same
        # versions of the various libraries, as mismatches may cause
        # exceptions and weird behaviours.
        libraries_versions = dask_client.get_versions(check=True)
        logging.info(dask_client)

        run_checks_imported_issues(s3_bucket=s3_canonical_bucket,
                                   local_dirs=local_dirs,
                                   output_dir=output_dir,
                                   thres=thres)

    except Exception as e:
        logging.error(e)
        raise
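Example #1 parses its options with docopt, so it relies on a usage string stored in the module docstring, which the snippet does not show. A hypothetical sketch of what that __doc__ could look like, with the option names taken from the code above (the script name and descriptions are made up):

"""Check imported issues against a canonical S3 bucket.

Usage:
    check_issues.py --canonical-bucket=<bucket> --local-dirs=<dirs> --thres=<t> --output-dir=<dir> [--workers=<n>]

Options:
    --canonical-bucket=<bucket>   S3 bucket holding the canonical data.
    --local-dirs=<dirs>           Space-separated local directories to check.
    --thres=<t>                   Numeric threshold used by the checks.
    --output-dir=<dir>            Directory where results are written.
    --workers=<n>                 Number of dask workers (the code falls back to 8 if omitted).
"""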
Example #2
def __engine_init(self):
    if self.local:
        print("Starting local cluster with {} workers".format(
            self.n_workers))
        cluster = LocalCluster(
            n_workers=self.n_workers)  # TODO: add more arguments
        print("Cluster settings:")
        print(cluster)
        client = Client(cluster)
        print("Client settings:")
        print(client)
    else:
        print("Connecting to remote scheduler at {}".format(
            self.schedueler_endpoint))
        scheduler_address = '{}:{}'.format(self.schedueler_endpoint,
                                           self.schedueler_port)
        client = Client(address=scheduler_address)
        print("Client settings:")
        print(client)
        client.get_versions(check=self.enforce_check)
    return client
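The method above reads several attributes from self; a minimal sketch of an enclosing class that would supply them (the class name, default values and example endpoint are hypothetical, and the attribute names keep the original spelling on purpose):

from dask.distributed import Client, LocalCluster  # used by __engine_init


class DaskEngine:
    """Hypothetical host class: it only sets the attributes that __engine_init reads."""

    def __init__(self, local=True, n_workers=4,
                 schedueler_endpoint="scheduler.example.org",
                 schedueler_port=8786, enforce_check=False):
        self.local = local
        self.n_workers = n_workers
        self.schedueler_endpoint = schedueler_endpoint
        self.schedueler_port = schedueler_port
        self.enforce_check = enforce_check

    # The __engine_init method from Example #2 would be defined here and called
    # from inside the class (the leading double underscore makes the name mangled),
    # e.g. self.client = self.__engine_init() at the end of __init__.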
Example #3
# Standard-library modules used by this class; CLUSTER_LIST and
# split_box2sub_boxes are assumed to be defined in the surrounding module.
import glob
import os
import shutil
import time


class DaskCluster:
    """
    Generic dask cluster wrapper for parallel processing in blocks.

    This object takes in a computing function for one block in space.
    For the computing function:
        1. the output is always several matrices (or None) and one box.
        2. the number of matrices may vary for different applications/functions.
        3. all matrices are either 2D with shape (len, wid) or 3D with shape (n, len, wid),
           so the last two dimensions (in space) are always the same.
    This characteristic allows automatic result collection without prior knowledge
        of the computing function, making this a generic wrapper.

    Check ifgram_inversion.py as an example.

    """
    def __init__(self, cluster_type, num_worker, config_name=None, **kwargs):
        """Initiate object
        :param cluster_type: str, cluster to use (local, slurm, lsf, pbs)
        :param num_worker: str, number of workers to use
        :param config_name: str, the name of the configuration section
        :param kwargs: dict, other dask configuration parameters,
                 e.g. config_name: str, the user-specified config name to use
        """

        self.cluster_type = cluster_type.lower()
        self.num_worker = num_worker
        self.config_name = config_name
        self.cluster_kwargs = kwargs

        ## format input arguments
        # num_worker
        self.num_worker = self.format_num_worker(self.cluster_type,
                                                 self.num_worker)

        # config_name
        self.format_config_name()
        self.cluster_kwargs['config_name'] = self.config_name

        ## printout message
        print("input Dask cluster type: {}".format(self.cluster_type))
        if self.config_name is not None:
            print("input Dask config name: {}".format(self.config_name))

        ## initial values
        self.cluster = None
        self.client = None

    def open(self):
        """Initiate the cluster"""

        # initiate the cluster object
        # Look at the ~/.config/dask/mintpy.yaml file for changing the Dask configuration defaults
        print('initiate Dask cluster')
        if self.cluster_type == 'local':
            from dask.distributed import LocalCluster

            # initiate cluster object
            self.cluster = LocalCluster()

        else:
            # for a non-local cluster, import the related dask module only when it is needed,
            # because dask_jobqueue is not available on macports, which makes sense
            import dask_jobqueue

            # initiate cluster object
            if self.cluster_type == 'lsf':
                self.cluster = dask_jobqueue.LSFCluster(**self.cluster_kwargs)

            elif self.cluster_type == 'pbs':
                self.cluster = dask_jobqueue.PBSCluster(**self.cluster_kwargs)

            elif self.cluster_type == 'slurm':
                self.cluster = dask_jobqueue.SLURMCluster(
                    **self.cluster_kwargs)

            else:
                msg = 'unrecognized input cluster: {}'.format(
                    self.cluster_type)
                msg += '\nsupported clusters: {}'.format(CLUSTER_LIST)
                raise ValueError(msg)

            # show dask cluster job script for reference
            print("\n", self.cluster.job_script())
            # for debug
            debug_mode = False
            if debug_mode:
                with open('dask_command_run_from_python.txt', 'w') as f:
                    f.write(self.cluster.job_script() + '\n')

    def run(self, func, func_data, results):
        """Wrapper function encapsulating submit_job and collect_result.

        For generic result collection without prior knowledge of the computing function,
        we assume that the output of "func" is: several 2D or 3D matrices + a box.

        :param func: function, a python function to run in parallel
        :param func_data: dict, a dictionary of the arguments to pass to the function
        :param results: list[numpy.ndarray], arrays of the appropriate structure representing
               the final output of the processed box (must be in the same order as the outputs
               returned by the function submitted in submit_job)
        :return: results: tuple(numpy.ndarray), the processed results of the box
        """
        from dask.distributed import Client

        # split the primary box into sub boxes for workers AND
        # update the number of workers based on split result
        box = func_data["box"]
        sub_boxes = split_box2sub_boxes(box,
                                        num_split=self.num_worker,
                                        dimension='x',
                                        print_msg=False)
        self.num_worker = len(sub_boxes)
        print(
            'split patch into {} sub boxes in x direction for workers to process'
            .format(self.num_worker))

        # start a bunch of workers from the cluster
        print('scale Dask cluster to {} workers'.format(self.num_worker))
        self.cluster.scale(self.num_worker)

        print('initiate Dask client')
        self.client = Client(self.cluster)
        self.client.get_versions(check=True)

        # submit job for each worker
        futures, submission_time = self.submit_job(func, func_data, sub_boxes)

        # assemble results from all workers
        results = self.collect_result(futures, results, box, submission_time)

        return results

    def submit_job(self, func, func_data, sub_boxes):
        """Submit jobs to the dask client that run the specified function (func)
        on the specified data (func_data). Each dask worker is in charge of a small sub-box of the main box.

        :param func: function, a python function to run in parallel
        :param func_data: dict, a dictionary of the arguments to pass to the function
        :param sub_boxes: list(np.ndarray), list of boxes to be computed in parallel

        :return futures: list(dask.Future), list of futures representing future dask worker calculations
        :return submission_time: float, the time of submission of the dask workers (used to determine worker
                runtimes as a performance diagnostic)
        """

        submission_time = time.time()
        futures = []
        for i, sub_box in enumerate(sub_boxes):
            print('submit a job to the worker for sub box {}: {}'.format(
                i, sub_box))
            func_data['box'] = sub_box

            # David: I haven't experimented much with `retries`; however, sometimes a future fails
            # on a worker for an unknown reason, and retrying saves the whole process from failing.
            # TODO: I don't know what to do if a future fails > 3 times. I don't think an error is
            # thrown in that case, so I don't know how to recognize when this happens.
            future = self.client.submit(func, **func_data, retries=3)
            futures.append(future)

        return futures, submission_time

    def collect_result(self, futures, results, box, submission_time):
        """Compile results from completed workers and recombine their sub-outputs into the output
        for the complete box being worked on.
        :param futures: list(dask.Future), list of futures representing future dask worker calculations
        :param results: list[numpy.ndarray], arrays of the appropriate structure representing
               the final output of the processed box (must be in the same order as the outputs
               returned by the function submitted in submit_job)
        :param box: numpy.ndarray, the initial complete box being processed
        :param submission_time: float, the time of submission of the dask workers (used to determine worker
               runtimes as a performance diagnostic)
        :return: results: tuple(numpy.ndarray), the processed results of the box
        """
        from dask.distributed import as_completed

        num_future = 0
        for future, sub_results in as_completed(futures, with_results=True):

            # message
            num_future += 1
            sub_t = time.time() - submission_time
            print("FUTURE #{} complete. Time used: {:.0f} seconds".format(
                num_future, sub_t))

            # catch result - sub_box
            # and convert the absolute sub_box into local col/row start/end relative to the primary box
            # to assemble the result from each worker
            sub_box = sub_results[-1]
            x0, y0, x1, y1 = sub_box
            x0 -= box[0]
            x1 -= box[0]
            y0 -= box[1]
            y1 -= box[1]

            # catch result - matrices
            # and loop across all of the returned data to rebuild complete box
            for i, sub_result in enumerate(sub_results[:-1]):
                if sub_result is not None:
                    num_dim = sub_result.ndim
                    if num_dim == 4:
                        results[i][:, :, y0:y1, x0:x1] = sub_result
                    elif num_dim == 3:
                        results[i][:, y0:y1, x0:x1] = sub_result
                    elif num_dim == 2:
                        results[i][y0:y1, x0:x1] = sub_result
                    else:
                        msg = "worker result has unexpected dimension: {}".format(
                            num_dim)
                        msg += '\nit should be either 2 or 3 or 4!'
                        raise Exception(msg)

        return results

    def close(self):
        """Close connections to dask client and cluster and moves dask output/error files. """

        self.cluster.close()
        print('close dask cluster')

        self.client.close()
        print('close dask client')

        # move *.o/.e files produced by dask in stdout/stderr
        self.move_dask_stdout_stderr_files()

    ##### Utility functions

    @staticmethod
    def format_num_worker(cluster_type, num_worker):
        """Format dask num_worker.
        :param cluster_type: str
        :param num_worker: str, number of workers to use
        :return: num_worker: int, number of workers to use
        """

        if cluster_type == 'local':
            num_core = os.cpu_count()

            # all / percentage --> num_core
            msg = f'numWorker = {num_worker}'
            if num_worker == 'all':
                ## divide by the number of threads per core [for Linux only]
                #import subprocess
                #from mintpy.utils import utils0 as ut0
                #if ut0.which('lscpu') is not None:
                #    # get the number of threads per core
                #    # link: https://stackoverflow.com/questions/62652951
                #    ps = subprocess.run(['lscpu'], capture_output=True, text=True).stdout.split('\n')
                #    ns = [p.split(':')[1].strip() for p in ps if p.startswith('Thread(s) per core:')]
                #    if len(ns) > 0:
                #        num_thread = int(ns[0])
                #        num_core = int(num_core / num_thread)

                # set num_worker to the number of cores
                num_worker = str(num_core)
                print('translate {} to {}'.format(msg, num_worker))

            elif num_worker.endswith('%'):
                num_worker = int(num_core * float(num_worker[:-1]) / 100)
                print('translate {} to {}'.format(msg, num_worker))
                if num_worker < 1 or num_worker >= num_core:
                    raise ValueError('Invalid numWorker percentage!')

            # str --> int
            num_worker = int(num_worker)

            # if num_worker > num_core,
            # then we assume that the user is not aware of the available resources
            # and use max(num_core/2, 1) instead to be conservative.
            if num_worker > num_core:
                print('\nWARNING: input number of workers: {} > available cores: {}'.format(
                    num_worker, num_core))
                num_worker = max(int(num_core / 2), 1)
                print('change the number of workers to {} and continue\n'.format(
                    num_worker))

        else:
            if num_worker == 'all':
                msg = 'numWorker = all is NOT supported for cluster type: {}'.format(
                    cluster_type)
                raise ValueError(msg)
            num_worker = int(num_worker)

        return num_worker

    def format_config_name(self):
        """Format dask config_name property based on presence or absence of user specified config name.

        :return: config_name: str, the config_name formatted as follows:
                 - the user specified config name if it exists in $DASK_CONFIG/dask.yaml
                 - the default cluster_type config in $DASK_CONFIG/dask.yaml
        """
        import dask

        # config_name is not needed for local cluster
        if self.cluster_type == 'local':
            self.config_name = None
            return self.config_name

        # translate config_name = None to config_name = cluster_type
        if self.config_name is None:
            print(
                'input config name is None, thus use the default (same as cluster type)'
            )
            self.config_name = self.cluster_type

        # if config_name not found, use cluster_type as defined in mintpy.dask
        config_names = list(dask.config.get('jobqueue').keys())
        if self.config_name not in config_names:
            config_location = dask.config.get('config')
            msg = 'Dask configuration "{}" was not found in {}'.format(
                self.config_name, config_location)
            msg += '\nFalling back to default config name: "{}"'.format(
                self.cluster_type)
            print(msg)
            self.config_name = self.cluster_type

        return self.config_name

    def move_dask_stdout_stderr_files(self):
        """Move *o and *e files produced by dask into stdout and sderr directory"""

        stdout_files = glob.glob('*.o')
        stderr_files = glob.glob('*.e')
        job_files = glob.glob('dask_command_run_from_python.txt*')

        if len(stdout_files + stderr_files + job_files) == 0:
            return

        stdout_folder = 'stdout_dask'
        stderr_folder = 'stderr_dask'
        for std_dir in [stdout_folder, stderr_folder]:
            if os.path.isdir(std_dir):
                shutil.rmtree(std_dir)
            os.mkdir(std_dir)

        for item in stdout_files + job_files:
            shutil.move(item, stdout_folder)

        for item in stderr_files:
            shutil.move(item, stderr_folder)
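A minimal usage sketch of the open/run/close cycle implemented above, assuming DaskCluster and its module-level helper split_box2sub_boxes are importable from the surrounding module; process_block and its arguments are hypothetical stand-ins for a real block-processing function such as the one in ifgram_inversion.py:

import numpy as np


def process_block(box=None, scale=2.0):
    """Hypothetical worker function: returns one 2D matrix plus the box it processed."""
    x0, y0, x1, y1 = box
    data = np.full((y1 - y0, x1 - x0), scale, dtype=np.float32)
    return data, box


# whole area to process and a pre-allocated output array of the same spatial size
box = (0, 0, 400, 300)  # (x0, y0, x1, y1)
results = [np.zeros((box[3] - box[1], box[2] - box[0]), dtype=np.float32)]

cluster = DaskCluster('local', num_worker='4')
cluster.open()
results = cluster.run(func=process_block,
                      func_data={'box': box, 'scale': 2.0},
                      results=results)
cluster.close()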
Example #4
import os

from dask.distributed import Client
from dask_kubernetes import KubeCluster

# _generate_cluster_spec, _import_function, _upload_to_s3 and _df_to_csv_str are
# private helpers defined elsewhere in the surrounding module.


def run_dask_function(config):
    """Start a Dask cluster using dask-kubernetes and run a function.

    Talks to Kubernetes to create `n` new `pods`, each running a dask worker, which
    together form a `dask` cluster. Then, the function specified in `config` is imported
    and run with the given arguments. The tasks created by this `function` are run on the
    `dask` cluster for distributed computation.

    The config dict must contain the following sections:
        * run
        * dask_cluster
        * output

    Args:
        config (dict):
            Config dictionary.
    """
    output_conf = config.get('output')
    if output_conf:
        path = output_conf.get('path')
        if not path:
            raise ValueError(
                'An output path must be provided when providing `output`.')

    cluster_spec = _generate_cluster_spec(config, kubernetes=False)
    cluster = KubeCluster.from_dict(cluster_spec)

    workers = config['dask_cluster'].get('workers')

    if not workers:
        cluster.adapt()
    elif isinstance(workers, int):
        cluster.scale(workers)
    else:
        cluster.adapt(**workers)

    client = Client(cluster)
    client.get_versions(check=True)

    try:
        run = _import_function(config['run'])
        kwargs = config['run']['args']
        results = run(**kwargs)

    finally:
        client.close()
        cluster.close()

    if output_conf:
        bucket = output_conf.get('bucket')

        try:
            if bucket:
                aws_key = output_conf.get('key')
                aws_secret = output_conf.get('secret_key')
                _upload_to_s3(bucket, path, results, aws_key, aws_secret)
            else:
                os.makedirs(os.path.dirname(path), exist_ok=True)
                results.to_csv(path)

        except Exception:
            print('Error storing results. Falling back to console dump.')
            print(_df_to_csv_str(results))

    else:
        return results
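The docstring above only names the required sections; a hypothetical sketch of a config dict with that shape (module paths, worker counts and bucket names are made up for illustration):

config = {
    'run': {
        'function': 'my_package.experiments.run_benchmark',  # resolved by _import_function
        'args': {'iterations': 10},
    },
    'dask_cluster': {
        # int -> cluster.scale(workers); dict -> cluster.adapt(**workers); missing -> cluster.adapt()
        'workers': 4,
        # plus whatever _generate_cluster_spec expects for building the pod spec,
        # e.g. an image name and resource limits
    },
    'output': {
        'path': 'results/output.csv',  # required whenever `output` is given
        'bucket': None,                # set an S3 bucket name to upload instead of writing locally
        'key': None,
        'secret_key': None,
    },
}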
Example #5
def main():

    #ssh = SmartMetHandler(options.smartmet_config_filename, options.smartmet_config_name, sleep_time=options.requests_throttle_time, param_section='forest_params')

    if hasattr(options, 'dask'):
        client = Client('{}:8786'.format(options.dask))
    else:
        client = Client()

    #print(get_forest_params('cnf/smartmet.yaml', False))
    #print(get_forest_params('cnf/smartmet.yaml', True))
    #print(client.run(get_version))
    #sys.exit()

    client.get_versions(check=True)
    logging.info(client)

    db_params = db_config(options.db_config_filename, options.db_config_name)

    s3 = boto3.resource('s3')

    # Load params
    content_object = s3.Object(conf_bucket, conf_file)
    file_content = content_object.get()['Body'].read().decode('utf-8')
    config_dict = yaml.load(file_content, Loader=yaml.FullLoader)

    params = config_dict['params']
    met_params = set()
    shortnames = True
    for param, info in params.items():
        for f in info['aggregation']:
            if shortnames:
                met_params.add(f[1:] + ' ' + info['name'])
            else:
                met_params.add(f + '{' + param + '}')
    met_params = list(met_params)

    polygon_params = ['speed_self', 'angle_self', 'area_m2', 'area_diff']
    meta_params = [
        'id', 'storm_id', 'point_in_time', 'weather_parameter', 'low_limit',
        'high_limit'
    ]
    geom_params = ['geom']
    outage_params = ['outages', 'customers']
    transformers_params = ['transformers', 'all_customers']

    storm_params = polygon_params + met_params

    all_params = meta_params + geom_params + storm_params + outage_params

    # Read data from database

    starttime = datetime.datetime.strptime(options.starttime, "%Y-%m-%d")
    endtime = datetime.datetime.strptime(options.endtime, "%Y-%m-%d")

    config, forest_params = _config(options.smartmet_config_filename,
                                    options.smartmet_config_name,
                                    'forest_params')
    metas = {}
    for param in params_to_list(forest_params, True):
        metas[param] = 'float'

    dfs = []
    start = starttime
    while start <= endtime:
        end = start + timedelta(days=1)
        #start, end, dataset, db_params, meta_params, geom_params, storm_params, outage_params, transformers_params, all_paramm, config, forest_params, dataset_table
        dfs.append(
            client.submit(process_time_range, start.strftime('%Y-%m-%d'),
                          end.strftime('%Y-%m-%d'), options.dataset, db_params,
                          met_params, meta_params, geom_params, storm_params,
                          outage_params, transformers_params, all_params,
                          config, forest_params, options.dataset_table))

        start = end
        if end > endtime: end = endtime

    for i, d in enumerate(dfs):
        logging.info(client.gather(d))
Example #6
    logger.info('[{}] : [INFO] Exiting EDE framework'.format(
        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))


if __name__ == "__main__":
    def handler(signal_received, frame):
        logger.info('[{}] : [INFO] User break detected. Exiting EDE framework'.format(
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')))
        sys.exit(0)
    signal(SIGINT, handler)
    SchedulerEndpoint, Scale, SchedulerPort, EnforceCheck = check_dask_settings()  # Todo Better solution
    if SchedulerEndpoint:
        if SchedulerEndpoint == "local":
            cluster = LocalCluster(n_workers=int(Scale))
            logger.info('[{}] : [INFO] Starting Dask local Cluster Backend with: {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), cluster))
            client = Client(cluster)
            logger.info('[{}] : [INFO] Dask Client started with: {}'.format(
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), client))
        else:
            scheduler_address = "{}:{}".format(SchedulerEndpoint, SchedulerPort)
            client = Client(address=scheduler_address)
            client.get_versions(check=EnforceCheck)
    else:
        cluster = 0
        client = 0
    main(sys.argv[1:],
         cluster,
         client)
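check_dask_settings is not shown here (the `# Todo Better solution` comment suggests it is a stopgap); a purely hypothetical sketch of such a helper reading the four values from environment variables, just to show the shapes the __main__ block expects:

import os


def check_dask_settings():
    """Hypothetical helper returning (SchedulerEndpoint, Scale, SchedulerPort, EnforceCheck)."""
    endpoint = os.environ.get('EDE_DASK_ENDPOINT', '')      # '' -> no Dask backend, 'local' -> LocalCluster
    scale = os.environ.get('EDE_DASK_SCALE', '4')           # number of local workers, kept as a string
    port = os.environ.get('EDE_DASK_PORT', '8786')          # scheduler port for the remote case
    enforce_check = os.environ.get('EDE_DASK_ENFORCE_CHECK', 'false').lower() == 'true'
    return endpoint, scale, port, enforce_check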

Example #7
def main():

    if hasattr(options, 'dask'):
        client = Client('{}:8786'.format(options.dask))
    else:
        client = Client()

    #print(client.run(get_version))
    #sys.exit()

    client.get_versions(check=True)
    logging.info(client)

    db_params = db_config(options.db_config_filename, options.db_config_name)

    s3 = boto3.resource('s3')

    # Load params
    content_object = s3.Object(conf_bucket, conf_file)
    file_content = content_object.get()['Body'].read().decode('utf-8')
    config_dict = yaml.load(file_content, Loader=yaml.FullLoader)

    params = config_dict['params']
    met_params = set()
    shortnames = True
    for param, info in params.items():
        for f in info['aggregation']:
            if shortnames:
                met_params.add(f[1:]+' '+info['name'])
            else:
                met_params.add(f+'{'+param+'}')
    met_params = list(met_params)

    polygon_params = ['speed_self', 'angle_self', 'area_m2', 'area_diff']
    meta_params = ['id', 'storm_id', 'point_in_time', 'weather_parameter', 'low_limit', 'high_limit']
    geom_params = ['geom']
    outage_params = ['outages', 'customers']
    transformers_params = ['transformers', 'all_customers']

    storm_params = polygon_params + met_params

    all_params = meta_params + geom_params + storm_params + outage_params

    # Read data from database

    starttime = datetime.datetime.strptime(options.starttime, "%Y-%m-%d")
    endtime = datetime.datetime.strptime(options.endtime, "%Y-%m-%d")

    logging.info('Reading data for {}-{}'.format(starttime, endtime))

    dfs, df = [], []
    start = starttime
    while start <= endtime:

        end = start + timedelta(days=1)

        if options.dataset == 'loiste_jse':
            #dfs.append(delayed(create_dataset_loiste_jse)(db_params, start.strftime('%Y-%m-%d'), end.strftime('%Y-%m-%d'), meta_params, geom_params, storm_params, outage_params, transformers_params, all_params))
            dfs.append(client.submit(
                create_dataset_loiste_jse, db_params,
                start.strftime('%Y-%m-%d'), end.strftime('%Y-%m-%d'),
                meta_params, geom_params, storm_params, outage_params,
                transformers_params, all_params))
        else:
            dfs.append(client.submit(
                create_dataset_energiateollisuus, db_params,
                start.strftime('%Y-%m-%d'), end.strftime('%Y-%m-%d'),
                meta_params, geom_params, storm_params, outage_params,
                transformers_params, all_params))

        start = end

    for i, d in enumerate(dfs):
        try:
            dfs[i] = client.gather(d)
        except psycopg2.OperationalError as e:
            logging.error(e)
            dfs[i] = client.gather(d)

    with ProgressBar():
        df = dask.compute(*dfs)

    dataset = pd.concat(df)

    logging.info('Reading data from DB done. Found {} records'.format(len(dataset)))

    paths = [
        #('Forest FRA', 's3://fmi-asi-data-puusto/luke/2017/fra_luokka/puusto_fra_luokka_suomi_4326.tif'),
        ('Forest age', 's3://fmi-asi-data-puusto/luke/2017/ika/ika_suomi_4326_lowres.tif'),
        ('Forest site fertility', 's3://fmi-asi-data-puusto/luke/2017/kasvupaikka/kasvupaikka_suomi_4326_lowres.tif'),
        ('Forest stand mean diameter', 's3://fmi-asi-data-puusto/luke/2017/keskilapimitta/keskilapimitta_suomi_4326_lowres.tif'),
        ('Forest stand mean height', 's3://fmi-asi-data-puusto/luke/2017/keskipituus/keskipituus_suomi_4326_lowres.tif'),
        ('Forest canopy cover', 's3://fmi-asi-data-puusto/luke/2017/latvusto/latvusto_suomi_4326_lowres.tif'),
        ('Forest site main class', 's3://fmi-asi-data-puusto/luke/2017/paatyyppi/paatyyppi_suomi_4326_lowres.tif')
        ]

    #paths = [('Forest canopy cover', 's3://fmi-asi-data-puusto/luke/2017/latvusto/puusto_latvusto_suomi_4326.tif')]
    chunks = {'y': 5000, 'x': 5000}

    ars = []
    for name, path in paths:
        #filename = get_file(path)
        ars.append((name, xr.open_rasterio(path, chunks=chunks)))

    # Initiate forest data columns
    operations = ['mean', 'max', 'std']
    metas = {}
    for name, path in paths:
        meta = {}
        for op in operations:
            opname = '{} {}'.format(op, name)
            #dataset[opname] = np.nan
            meta[opname] = 'float'
        metas[name] = meta

    df = dd.from_pandas(dataset, npartitions=50)

    client.scatter(ars)
    client.scatter(df)

    with ProgressBar():
        #dataset = df.apply(lambda row: delayed(stats)(row, ars), axis=1)
        for name, ar in ars:
            forest_data = df.geom.map_partitions(
                stats, metas[name], ar,
                meta=pd.DataFrame(metas[name], index=df.index)
            ).compute().reset_index(drop=True)
            dataset = dataset.reset_index(drop=True).join(forest_data)

    logging.info('\nDone. Found {} records'.format(dataset.shape[0]))

    dataset.loc[:,['outages','customers']] = dataset.loc[:,['outages','customers']].replace('None', np.nan)
    dataset.loc[:,['outages','customers']] = dataset.loc[:,['outages','customers']].fillna(0)
    dataset.loc[:,['outages','customers']] = dataset.loc[:,['outages','customers']].astype(float)
    dataset.loc[:,['outages','customers']] = dataset.loc[:,['outages','customers']].astype(int)
    #print(dataset.loc[:, ['outages', 'customers']])
    #print('--')

    # Drop storm objects without customers or transformers, they are outside the range
    if options.dataset == 'loiste_jse':
        dataset.dropna(axis=0, subset=['all_customers', 'transformers'], inplace=True)

    # Drop rows with missing meteorological params
    for p in met_params:
        dataset = dataset[dataset[p] != -999]

    dataset.sort_values(by=['outages'], inplace=True)

    # Cast classes

    # outages
    limits = [(0,0), (1,2), (3,10), (11, 9999999)]
    i = 0
    for low, high in limits:
        dataset.loc[(dataset.loc[:, 'outages'] >= low) & (dataset.loc[:, 'outages'] <= high), 'class'] = i
        i += 1

    # customers
    limits = [(0,0), (1,250), (251,500), (501, 9999999)]
    i = 0
    for low, high in limits:
        dataset.loc[(dataset.loc[:, 'customers'] >= low) & (dataset.loc[:, 'customers'] <= high), 'class_customers'] = i
        i += 1

    #print(dataset.loc[:, ['class', 'class_customers']])
    #dataset.loc[:, ['class', 'class_customers']] = dataset.loc[:, ['class', 'class_customers']].fillna(0)
    dataset.fillna(0, inplace=True)
    dataset.loc[:, ['class', 'class_customers']] = dataset.loc[:, ['class', 'class_customers']].astype(float)
    dataset.loc[:, ['class', 'class_customers']] = dataset.loc[:, ['class', 'class_customers']].astype(int)

    dataset.drop(columns=['geom'], inplace=True)

    logging.info("dataset:\n{}".format(dataset.head(1)))
    logging.info("\n{}".format(dataset.dtypes))
    logging.info("\n{}".format(dataset.shape))

    # Save
    try:
        save_dataset(dataset, db_params, table_name=options.dataset_table)
    except BrokenPipeError as e:
        logging.warning(e)
        save_dataset(dataset, db_params, table_name=options.dataset_table)
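The forest-statistics step in this example relies on map_partitions with an explicit meta, which tells dask the column names and dtypes of the output before anything is computed. A small self-contained sketch of that mechanism (the column name and the squaring function are made up):

import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({'x': range(10)})
ddf = dd.from_pandas(pdf, npartitions=2)


def add_square(part):
    # runs on one pandas partition and returns a new DataFrame with the same index
    return pd.DataFrame({'x_squared': part['x'] ** 2}, index=part.index)


# meta: an empty frame describing the output schema, so dask can build the graph lazily
meta = pd.DataFrame({'x_squared': pd.Series(dtype='int64')})
result = ddf.map_partitions(add_square, meta=meta).compute()
print(result.head())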
Example #8
                           nanny=True,
                           death_timeout='600s',
                           local_directory=args.local_directory,
                           shebang='#!/usr/bin/env bash',
                           env_extra=["export TBB_CXX_TYPE=gcc"],
                           job_extra=args.job_extra.split(','),
                           queue=args.queue)
    print(args)
    print(cluster.job_script())
    cluster.scale(jobs=args.nodes)
    client = Client(cluster)
    print(client)
    client.wait_for_workers(args.nodes)
    time.sleep(60)

    print(client.get_versions(check=True))

    table = load_table(args.biom_table)
    counts = pd.DataFrame(np.array(table.matrix_data.todense()).T,
                          index=table.ids(),
                          columns=table.ids(axis='observation'))
    metadata = pd.read_table(args.metadata_file, index_col=0)
    replicates = metadata[args.replicates]
    batches = metadata[args.batches]
    # match everything up
    idx = list(set(counts.index) & set(replicates.index) & set(batches.index))
    counts, replicates, batches = [x.loc[idx] for x in
                                   (counts, replicates, batches)]
    replicates, batches = replicates.values, batches.values
    depth = counts.sum(axis=1)
Example #9
items = [1, 2, 3]

computation_graph = sum_list([square(i) for i in items])  # not executed yet (lazy); sum_list and square are assumed to be dask.delayed-wrapped functions

computation_graph.visualize()
print("Result", computation_graph.compute())  # 실행 됨. session하고 같은 역할

#

# dask-worker 1192.168: run this command in three Anaconda (TensorFlow) prompts and they will attach to the scheduler as workers.

#
from dask.distributed import Client
client = Client('192.168.0.17:8786')
client.get_versions(check=True)


def square(x):
    return x**2


def neg(x):
    return -x


A = client.map(square, range(10))
B = client.map(neg, A)
total = client.submit(sum, B)
print(total.result())        # block until the distributed sum completes
print(client.gather(total))  # equivalent way to fetch the result; a Future has no .gather() method
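Every example above calls client.get_versions(check=True) so that mismatched package versions between client, scheduler and workers are caught up front instead of surfacing later as obscure serialization or scheduling errors. A minimal sketch of checking versions defensively; the handling policy (close and re-raise) is an assumption, not taken from any of the examples:

from dask.distributed import Client, LocalCluster

cluster = LocalCluster(n_workers=2)
client = Client(cluster)

try:
    # raises (or warns, depending on the distributed version) when the
    # client/scheduler/worker package versions disagree
    client.get_versions(check=True)
    print("version check passed")
except Exception as err:
    print("version mismatch detected:", err)
    client.close()
    cluster.close()
    raise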