Example No. 1
    def handle(self, *args, **options):
        # Unpack variables
        name = options['name']
        model = options['model']
        segmentation = options['segmentation']
        spatial_aggregation = options['spatial_aggregation']
        categorical_variables = options['categorical_variables']
        scheduler_file = options['scheduler']

        # datacube query
        gwf_kwargs = { k: options[k] for k in ['product', 'lat', 'long', 'region']}
        iterable = gwf_query(**gwf_kwargs)

        # Start cluster and run 
        client = Client(scheduler_file=scheduler_file)
        client.restart()
        C = client.map(predict_object,
                       iterable,
                       pure=False,
                       **{'model_name': model,
                          'segmentation_name': segmentation,
                          'categorical_variables': categorical_variables,
                          'aggregation': spatial_aggregation,
                          'name': name,
                          })
        result = client.gather(C)

        print('Successfully ran prediction on %d tiles' % sum(result))
        print('%d tiles failed' % result.count(False))
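
The handler above relies on project-specific helpers (gwf_query, predict_object) that are not shown here. As a rough, self-contained sketch of the same restart/map/gather pattern, assuming only dask.distributed and a toy worker function in place of predict_object:

from dask.distributed import Client


def process_tile(tile, threshold=0.5):
    # Stand-in for predict_object: report success (True) or failure (False) per tile.
    return tile >= threshold


if __name__ == '__main__':
    client = Client()  # or Client(scheduler_file=...) when a shared scheduler is used
    client.restart()   # start from a clean worker state
    futures = client.map(process_tile, [0.1, 0.7, 0.9], pure=False, threshold=0.5)
    result = client.gather(futures)
    print('Successfully ran prediction on %d tiles' % sum(result))
    print('%d tiles failed' % result.count(False))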
Example No. 2
def setup_cluster(config: cpb.ConstructConfig, faiss_index_path: Path) -> None:
    # Connect
    if config.cluster.run_locally:
        print("Running on local machine!")
        dask_client = None
    else:
        cluster_address = f"{config.cluster.address}:{config.cluster.port}"
        print("Configuring Dask, attaching to cluster")
        print(f"\t- {cluster_address}")
        dask_client = Client(address=cluster_address)
        if config.cluster.restart:
            print("\t- Restarting cluster...")
            dask_client.restart()
        print(f"\t- Running on {len(dask_client.nthreads())} machines.")

    # Initialize Helper Objects on each worker
    preloader = dpg.WorkerPreloader()
    preloader.register(*text_util.get_scispacy_initalizer(
        scispacy_version=config.parser.scispacy_version, ))
    preloader.register(*text_util.get_stopwordlist_initializer(
        stopword_path=config.parser.stopword_list))
    preloader.register(*embedding_util.get_pytorch_device_initalizer(
        disable_gpu=config.sys.disable_gpu, ))
    preloader.register(*embedding_util.get_bert_initializer(
        bert_model=config.parser.bert_model, ))
    # This actual file path will need to be created during the pipeline before use
    preloader.register(*knn_util.get_faiss_index_initializer(
        faiss_index_path=faiss_index_path, ))
    # If semrep is installed and configured with agatha
    if (config.semrep.HasField("semrep_install_dir")
            and config.semrep.HasField("metamap_install_dir")):
        preloader.register(*semrep_util.get_metamap_server_initializer(
            metamap_install_dir=config.semrep.metamap_install_dir))
    dpg.add_global_preloader(client=dask_client, preloader=preloader)
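
The WorkerPreloader above comes from the surrounding project (dpg); it is not part of dask.distributed. With plain dask.distributed, comparable once-per-worker initialization can be sketched with a WorkerPlugin, where load_model is a hypothetical stand-in for the scispaCy/BERT initializers:

from dask.distributed import Client, WorkerPlugin


class Preloader(WorkerPlugin):
    def setup(self, worker):
        # Runs once on every worker, including workers that join later.
        worker.model = load_model()  # hypothetical heavy, per-worker initialization


client = Client(address="scheduler-host:8786")  # placeholder address
client.register_worker_plugin(Preloader())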
Example No. 3
    def handle(self, *args, **options):
        # Unpack variables
        model_id = options['model_id']
        out_dir = options['out_dir']

        # Create output dir if does not exist
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        # datacube query
        gwf_kwargs = {
            k: options[k]
            for k in ['product', 'lat', 'long', 'region']
        }
        iterable = gwf_query(**gwf_kwargs)

        # Start cluster and run
        client = Client()
        client.restart()
        C = client.map(predict_pixel_tile, iterable, **{
            'model_id': model_id,
            'outdir': out_dir
        })
        filename_list = client.gather(C)
        print(filename_list)
Example No. 4
def connect_to_dask_cluster(config: cpb.AbstractGeneratorConfig) -> None:
    # Potential cluster
    if config.cluster.run_locally or config.cluster.address == "localhost":
        print("Running dask on local machine!")
    else:
        cluster_address = f"{config.cluster.address}:{config.cluster.port}"
        print("Configuring Dask, attaching to cluster")
        print(f"\t- {cluster_address}")
        dask_client = Client(address=cluster_address)
        if config.cluster.restart:
            print("\t- Restarting cluster...")
            dask_client.restart()
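
Note that connect_to_dask_cluster never returns the client it creates: dask.distributed registers a newly created Client as the default for the process, so later computations pick it up implicitly. A minimal sketch of that behaviour, assuming a local cluster:

import dask.bag as db
from dask.distributed import Client, get_client

client = Client()              # becomes the process-wide default client
assert get_client() is client  # later code can recover it without passing a handle
# Collections computed afterwards automatically use the distributed scheduler.
print(db.from_sequence(range(4)).sum().compute())  # 6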
Example No. 5
def connnect_glue():
    # os.system('dask-ssh 128.104.222.{103,104,105,107}')
    # subprocess.call('dask-ssh', '128.104.222.{103,104,105,107}')
    # time.sleep(10)
    import numpy as np

    client = Client('128.104.222.103:8786')
    client.restart()
    x = da.from_zarr('/mnt/cephfs/smltar_numpyarr/zarr_data_full')
    print(x)
    # y = x[0:1]
    # z = x[100:101]
    # m = x[1000:1001]
    # n = x[1500:1501]
    # p = x[1400:1401]

    y = x[0:30]
    z = x[100:130]
    m = x[1000:1030]
    n = x[1500:1530]
    p = x[1400:1430]

    # zc = x[108:208]
    # mc = x[1008:1108]
    # nc = x[1508:1608]
    # pc = x[1601:1701]
    #
    sum = (y + z - m + p) * n
    #
    # sum2 = (zc + mc + nc +pc)*sum
    #
    # sum3 = sum2 + (zc + mc + nc +pc)*sum
    #
    #
    #
    # print(sum2)
    # sum.visualize('sum3')

    # frm = sum[15]
    fu = client.compute(sum)
    # p = r.result()
    # print(type(p))
    # return p
    re = fu.result()

    re = np.array(re)
    np.save("/mnt/cephfs/result/test", re[15])

    # print(p)
    return re[15]
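
client.compute above returns a Future for the lazy array rather than blocking; the data only materialises when .result() is called. A small self-contained sketch of that future-based pattern, using an in-memory array in place of the zarr store:

import numpy as np
import dask.array as da
from dask.distributed import Client

client = Client()  # local stand-in for Client('128.104.222.103:8786')
x = da.from_array(np.arange(100).reshape(10, 10), chunks=(5, 5))
total = (x + 1).sum()
future = client.compute(total)  # returns a Future immediately
print(future.result())          # blocks until the cluster finishes (5050)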
Example No. 6
    def handle(self, *args, **options):
        # Unpack variables
        algorithm = options['algorithm']
        bands = options['bands']
        name = options['name']
        product_pre = options['product_pre']
        product_post = options['product_post']
        lc_pre = options['lc_pre']
        lc_post = options['lc_post']
        year_pre = options['year_pre']
        year_post = options['year_post']
        filter_labels = options['filter_labels']
        mmu = options['mmu']
        extra_args = parser_extra_args(options['extra_kwargs'])
        scheduler_file = options['scheduler']

        # Build segmentation meta object
        meta, _ = ChangeInformation.objects.get_or_create(year_pre=year_pre,
                                                          year_post=year_post,
                                                          algorithm=algorithm,
                                                          name=name)

        # Build gwf_kwargs, send a query for both products, combine the dict and generate iterable
        gwf_kwargs = {k: options[k] for k in ['lat', 'long', 'region']}
        pre_dict = gwf_query(product_pre, view=False, **gwf_kwargs)
        post_dict = gwf_query(product_post, view=False, **gwf_kwargs)
        iterable = join_dicts(pre_dict, post_dict, join='inner').items()

        # Start cluster and run
        client = Client(scheduler_file=scheduler_file)
        client.restart()
        C = client.map(detect_and_classify_change,
                       iterable,
                       pure=False,
                       **{
                           'algorithm': algorithm,
                           'change_meta': meta,
                           'band_list': bands,
                           'mmu': mmu,
                           'lc_pre': lc_pre,
                           'lc_post': lc_post,
                           'extra_args': extra_args,
                           'filter_labels': filter_labels
                       })
        result = client.gather(C)

        print('Successfully ran change detection on %d tiles' % sum(result))
        print('%d tiles failed' % result.count(False))
Example No. 7
    def handle(self, *args, **options):
        # Unpack variables
        product = options['product']
        algorithm = options['algorithm']
        extra_args = parser_extra_args(options['extra_kwargs'])
        bands = options['bands']
        datasource = options['datasource']
        year = options['year']
        name = options['name']
        scheduler_file = options['scheduler']

        # Build segmentation meta object
        meta, _ = SegmentationInformation.objects.get_or_create(
            algorithm=algorithm,
            datasource=datasource,
            parameters=json.dumps(extra_args),
            datasource_year=year,
            name=name,
        )

        # datacube query
        gwf_kwargs = {
            k: options[k]
            for k in ['product', 'lat', 'long', 'region']
        }
        iterable = gwf_query(**gwf_kwargs)

        # Start cluster and run
        client = Client(scheduler_file=scheduler_file)
        client.restart()
        C = client.map(segment,
                       iterable,
                       pure=False,
                       **{
                           'algorithm': algorithm,
                           'segmentation_meta': meta,
                           'band_list': bands,
                           'extra_args': extra_args
                       })
        result = client.gather(C)

        print('Successfully ran segmentation on %d tiles' % sum(result))
        print('%d tiles failed' % result.count(False))
Example No. 8
class RunDirectory:
    """Open data in experiment folder."""

    weightfile = None
    griddes = None

    def __enter__(self):
        """
        Enter the context manager.

        The enter method simply returns the object itself. It is used
        together with the __exit__ method, which closes the distributed
        client.
        """
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the distributed client befor exiting."""
        self.close_client()

    def __init__(self,
                 run_dir,
                 *,
                 prefix=None,
                 model_type=None,
                 overwrite=False,
                 f90name_list=None,
                 filetype='nc',
                 client=None):
        """
        Create a RunDirectory object from a given input directory.

        ::

            run = RunDirectory('/work/mh0066/precip-project/3-hourly/CMORPH')

        The RunDirectory object gathers all necessary information on the
        data that is stored in the run directory. Once loaded, the most
        important metadata will be stored in the run directory for faster
        access the second time.

        Parameters
        ----------
        run_dir: str
            Name of the directory where the data that should be read is stored.

        prefix: str, optional (default: None)
            Filename prefix
        model_type: str, optional (default: None)
            Model name / observation product that created the data. This will
            be used to generate a variable lookup table, which can be useful
            for loading various model datasets and comparing them while only
            accessing the data with one set of variable names. By default
            no lookup table will be generated.
        overwrite: bool, optional (default: False)
            If True, the metadata will be generated again even if it has
            already been stored to disk.
        f90name_list: str, optional (default: None)
            Filename of an optional f90 namelist with additional information
            about the data
        filetype: str, optional (default: nc)
            Input data file format
        client: dask.distributed client, optional (default: None)
            Configuration used to create a dask client which receives
            tasks for multiprocessing. By default (None) a local client will
            be started.

        """
        if isinstance(client, Client):
            self.dask_client = client
        else:
            self.dask_client = Client(client)
        self.prefix = prefix or ''
        self.variables = lookup(model_type)
        run_dir = op.abspath(str(run_dir))
        nml_file = f90name_list or 'NAMELIST_{}*'.format(prefix)
        info_file = self._hash_file(run_dir)
        if overwrite or not info_file.is_file():
            self.name_list = {}
            for nml_file in Path(run_dir).rglob(nml_file):
                self.name_list = {
                    **self.name_list,
                    **f90nml.read(str(run_dir / nml_file))
                }
            self.name_list['output'] = self._get_files(run_dir, filetype)
            self.name_list['weightfile'] = None
            self.name_list['gridfile'] = self.griddes
            self.name_list['run_dir'] = op.abspath(str(run_dir))
            self._dump_json(run_dir)
        else:
            with open(str(info_file), 'r') as f:
                self.name_list = json.load(f)

    @staticmethod
    def _hash_file(run_dir):
        run_dir = op.expanduser(str(run_dir))
        hash_obj = hashlib.md5(op.abspath(run_dir).encode())
        hash_str = str(hash_obj.hexdigest())
        return _cache_dir / Path('run_info_{}.json'.format(hash_str))

    @staticmethod
    def _get_files(run_dir, extensions):
        """Get all netcdf filenames."""
        ext_str = ''.join(
            ['[{}{}]'.format(l.lower(), l.upper()) for l in extensions])
        pat = re.compile('^(?!.*restart|.*remap).*{}'.format(ext_str))
        glob_pad = '*.{}'.format(ext_str)
        result = sorted([
            f.as_posix() for f in Path(run_dir).rglob(glob_pad)
            if re.match(pat, f.as_posix())
        ])
        return result

    @staticmethod
    def _remap(infile,
               out_dir=None,
               griddes=None,
               weightfile=None,
               method=None,
               gridfile=None,
               options=None):
        options = options or '-f nc4'
        if isinstance(infile, (str, Path)):
            infile = Path(infile)
            out_file = str(Path(out_dir) / infile.with_suffix('.nc').name)
        else:
            out_file = None
        with NamedTemporaryFile(dir=out_dir, suffix='.nc') as tf_in:

            if method == 'weighted':
                cdo_str = str(griddes) + ',' + str(weightfile)
                remap_func = getattr(cdo, 'remap')
            else:
                cdo_str = str(griddes)
                remap_func = getattr(cdo, method)
            if gridfile is not None:
                cdo_str += ' -setgrid,' + str(gridfile)

            if isinstance(infile, xr.DataArray):
                _ = xr.Dataset(data_vars={
                    infile.name: infile
                }).to_netcdf(tf_in.name)
                kwargs = dict(returnXArray=infile.name)
                infile = Path(tf_in.name)
            elif isinstance(infile, xr.Dataset):
                _ = infile.to_netcdf(tf_in.name)
                infile = Path(tf_in.name)
                kwargs = dict(returnXDataset=True)
            else:
                kwargs = dict(output=str(out_file), options=options)

            out = remap_func('{} {}'.format(str(cdo_str), str(infile)),
                             **kwargs)
            try:
                return out.compute()
            except AttributeError:
                return out

    @property
    def run_dir(self):
        """Get the name of the experiment path."""
        return Path(self.name_list['run_dir'])

    @property
    def files(self):
        """Return all files that have been opened."""
        return pd.Series(self.name_list['output'])

    @staticmethod
    def apply_function(mappable,
                       collection,
                       *,
                       args=None,
                       client=None,
                       **kwargs):
        """
        Apply function to given collection.

        ::

            result = run.apply_function(lambda d, v: d[v].sum(dim='time'),
                                        run.dataset, args=('temp',))

        Parameters
        ----------

        mappable: method
            method that is applied

        collection: collection
            collection that is distributed in a thread pool

        args:
            additional arguments passed into the method

        client: dask distributed client (default: None)
            worker scheduler client that submits the jobs. If None is given
            a new client is started

        progress: bool (default: True)
            display tqdm progress bar

        **kwargs: optional
            additional keyword arguments controlling the progress bar parameter

        Returns
        -------

            combined output of the thread-pool processes: collection
        """
        client = client or Client()
        args = args or ()
        if isinstance(collection, (xr.DataArray, xr.Dataset)):
            tasks = [(client.scatter(collection), *args)]
        else:
            tasks = [(client.scatter(entry), *args) for entry in collection]
        futures = [client.submit(mappable, *task) for task in tasks]
        progress = kwargs.pop('progress', True)
        if progress is True:
            progress_bar(futures, **kwargs)
        output = client.gather(futures)
        if len(output) == 1:  # Possibly only one job was submitted
            return output[0]
        return output

    def close_client(self):
        """Close the opened dask client."""
        self.dask_client.close()

    def restart_client(self):
        """Restart the opened dask client."""
        self.dask_client.restart()

    @property
    def status(self):
        """Query the status of the dask client."""
        return self.dask_client.status

    def remap(self,
              grid_description,
              inp=None,
              out_dir=None,
              *,
              method='weighted',
              weightfile=None,
              options='-f nc4',
              grid_file=None):
        """
        Regrid to a different input grid.

        ::

            run.remap('echam_griddes.txt', method='remapbil')

        Parameters
        ----------

        grid_description: str
                          Path to file containing the output grid description
        inp: (collection of) str, xarray.Dataset, xarray.DataArray
                Filenames that are to be remapped.
        out_dir: str (default: None)
                  Directory name for the output
        method: str (default: weighted)
                 Remap method that is applied to the data; can be either
                 weighted (default), bil, con, laf, nn. If weighted is chosen
                 this class should have been instantiated either with a given
                 weightfile or using the gen_weights method.
        weightfile: str (default: None)
                     File containing the weights for the distance-weighted
                     remapping.
        grid_file: str (default: None)
                  File containing the source grid description
        options: str (default: -f nc4)
                 additional file options that are passed to cdo

        Returns
        -------

            Collection of output: (str, xarray.DataArray, xarray.Dataset)

        """
        out_dir = out_dir or TemporaryDirectory().name
        Path(out_dir).absolute().mkdir(exist_ok=True, parents=True)
        impl_methods = ('weighted', 'remapbil', 'remapcon', 'remaplaf',
                        'remapnn')
        weightfile = weightfile or self.weightfile
        if method not in impl_methods:
            raise NotImplementedError('Method not available.'
                                      ' Currently implemented'
                                      ' methods are: '
                                      'weighted, remapbil, '
                                      'remapcon, remaplaf, remapnn')
        if weightfile is None and method == 'weighted':
            raise ValueError('No weightfile was given; either choose a different'
                             ' remapping method, instantiate the Reader'
                             ' object by providing a weightfile, or generate '
                             'a weightfile by calling the gen_weights method')

        args = (Path(out_dir), grid_description, weightfile, method, grid_file,
                options)
        run_dir = self.name_list['run_dir']
        if inp is None:
            inp = self.files
        elif isinstance(inp, (str, Path)):
            if not Path(inp).is_file():
                inp = sorted([f for f in Path(run_dir).rglob(inp)])
            else:
                inp = (inp, )
        if len(inp) == 0:
            raise FileNotFoundError('No files for remapping found')
        return self.apply_function(self._remap,
                                   inp,
                                   args=args,
                                   client=self.dask_client,
                                   label='Remapping')

    def _dump_json(self, run_dir):
        run_dir = op.abspath(str(run_dir))
        info_file = self._hash_file(run_dir)
        name_list = self.name_list
        name_list['run_dir'] = run_dir
        name_list['json_file'] = str(info_file.absolute())
        with open(str(info_file), 'w') as f:
            json.dump(name_list, f, sort_keys=True, indent=4)

    @classmethod
    def gen_weights(cls,
                    griddes,
                    run_dir,
                    *,
                    prefix=None,
                    model_type='ECHAM',
                    infile=None,
                    overwrite=False,
                    client=None):
        """
        Create grid weights from a grid description and instantiate the class.

        ::

            run = RunDirectory.gen_weights('echam_grid.txt',
                            '/work/mh0066/precip-project/3-hourly/CMORPH/',
                            infile='griddes.nc')

        Parameters
        ----------

        griddes: str
            filename containing the desired output grid information
        run_dir: str
            path to the experiment directory
        prefix: str
            filename prefix
        model_type: str
            Model/Product name of the dataset to be read
        infile: str
            Path to input file. By default the method looks for appropriate
            input files
        overwrite: bool, optional (default: False)
            should an existing weight file be overwritten

        Returns
        -------

            RunDirectory: RunDirectory object

        """
        try:
            out_file = [f
                        for f in Path(run_dir).absolute().rglob('*2d*.nc')][0]
        except IndexError:
            try:
                out_file = [f
                            for f in Path(run_dir).absolute().rglob('*.nc')][0]
            except IndexError:
                raise FileNotFoundError('Run Directory is empty')

        def get_input(rundir, inp_file):
            for file in (inp_file, op.join(rundir, 'o3_icon_DOM01.nc'),
                         op.join(rundir, 'bc_ozone.nc')):
                if op.isfile(str(file)):
                    return file

        input_file = get_input(run_dir, infile)
        weight_file = op.abspath(op.join(run_dir, 'remapweights.nc'))
        if overwrite or not os.path.isfile(weight_file):
            cmd = '{} -setgrid,{} {}'.format(op.abspath(griddes), input_file,
                                             out_file)
            weight_file = cdo.gendis(cmd, output=weight_file)
        cls.gridfile = griddes
        cls.weightfile = op.abspath(weight_file)
        return cls(run_dir,
                   prefix=prefix,
                   model_type=model_type,
                   overwrite=overwrite,
                   client=client)

    def load_data(self, filenames=None, **kwargs):
        """
        Open a multi-file dataset using xarray's open_mfdataset.

        ::

           dset = run.load_data('*2008*.nc')

        Parameters
        ----------

        filenames: collection/str
            collection of filenames, filename or glob pattern for filenames
            that should be read. Default behavior is reading all dataset files

        **kwargs: optional
            Additional keyword arguments passed to xarray's open_mfdataset

        Returns
        -------

            Xarray (multi-file) dataset: xarray.Dataset

        """
        filenames = self._get_files_from_glob_pattern(filenames) or self.files
        kwargs.setdefault('parallel', True)
        kwargs.setdefault('combine', 'by_coords')
        return xr.open_mfdataset(filenames, **kwargs)

    def _get_files_from_glob_pattern(self, filenames):
        """Construct filename to read."""
        if isinstance(filenames, (str, Path)):
            ncfiles = [
                filenames,
            ]
        elif filenames is None:
            return None
        else:
            ncfiles = list(filenames)
        read_files = []
        for in_file in ncfiles:
            if op.isfile(in_file):
                read_files.append(str(in_file))
            else:
                read_files += [
                    str(f) for f in self.run_dir.rglob(str(in_file))
                ]
        return sorted(read_files)
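
The scatter/submit/gather pattern inside apply_function above reduces to a few lines. A minimal sketch, assuming a local client and a plain reduction in place of the xarray-specific work:

import numpy as np
from dask.distributed import Client

client = Client()
chunks = [np.arange(10), np.arange(20), np.arange(30)]
tasks = [client.scatter(chunk) for chunk in chunks]        # ship each chunk to the workers
futures = [client.submit(np.sum, task) for task in tasks]  # apply the mappable remotely
print(client.gather(futures))                              # [45, 190, 435]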
Example No. 9
def Assignment1B(user_reviews_csv, products_csv):
    client = Client('127.0.0.1:8786')
    client = client.restart()

    # defining data types
    reviews_dtypes = {
        'reviewerID': np.str,
        'asin': np.str,
        'reviewerName': np.str,
        'helpful': np.object,
        'reviewText': np.str,
        'overall': np.float64,
        'summary': np.str,
        'unixReviewTime': np.float64,
        'reviewTime': np.str
    }

    products_dtypes = {
        'asin': np.str,
        'salesRank': np.object,
        'imUrl': np.str,
        'categories': np.object,
        'title': np.str,
        'description': np.str,
        'price': np.float64,
        'related': np.object,
        'brand': np.str
    }

    # instantiating dataframes as variables
    products = dd.read_csv(products_csv, dtype=products_dtypes)
    reviews = dd.read_csv(user_reviews_csv, dtype=reviews_dtypes)

    ### Question 1 ###

    # percentage of missing values for all columns in the reviews table and the products table
    products_missing_perc = np.mean(products.isnull()) * 100
    reviews_missing_perc = np.mean(reviews.isnull()) * 100

    ### Question 2 ###

    # using only the columns we need to join on
    reviews_sub = reviews[['asin', 'overall']]
    products_sub = products[['asin', 'price']]

    # declaring types for no typeerrors
    reviews_sub['asin'] = reviews_sub['asin'].astype(str)
    products_sub['asin'] = products_sub['asin'].astype(str)

    # joining the dataframes and calculating the pearson correlation
    merged_df = dd.merge(products_sub, reviews_sub, on='asin')
    pearson_correlation = merged_df[['price', 'overall']].corr()
    pearson_correlation = pearson_correlation['price']

    ### Question 3 ###

    # calculating the descriptive statistics
    descriptive_stats = products['price'].describe()

    ### Question 4 ###

    # aggregating over the categories column
    super_category = products['categories'].apply(get_super_category,
                                                  meta='str').value_counts()

    # parallelizing the individual questions
    q1a, q1b, q2, q3, q4, product_asin = dd.compute(
        products_missing_perc, reviews_missing_perc, pearson_correlation,
        descriptive_stats, super_category, products.asin)

    # converting each question to the correct format for writing into json
    q1a = q1a.round(2).to_dict()
    q1b = q1b.round(2).to_dict()
    q2 = q2['overall'].round(2)
    q3 = q3.round(2)[['mean', 'std', '50%', 'min', 'max']].to_dict()
    q4 = q4.to_dict()

    ### Question 5 ###

    # check if the review ids are in the computed product ids
    product_is_not_dangling = reviews.asin.isin(product_asin)
    if all(product_is_not_dangling) == True:
        q5 = 0
    else:
        q5 = 1

    ### Question 6 ###

    # extract just the related column as a dataframe
    products_related = products[['related']]

    # aggregate over just the related column as a series
    products_related['related'] = products_related.related.apply(get_related,
                                                                 meta='array')

    # get the list of product ids separated into individual values using .explode()
    asins = products_related.explode('related')

    # check if the list of product ids are in the computed product ids
    asin_is_not_dangling = asins.related.isin(product_asin)
    if all(asin_is_not_dangling) == True:
        q6 = 0
    else:
        q6 = 1

    # correct format according to PA1 writeup
    submit = {
        'q1': {
            'products': q1a,
            'reviews': q1b
        },
        'q2': q2,
        'q3': q3,
        'q4': q4,
        'q5': q5,
        'q6': q6
    }

    with open('results_PA1.json', 'w') as outfile:
        json.dump(submit, outfile)
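
Everything up to the dd.compute call above is lazy; handing several collections to one dd.compute (or dask.compute) call lets Dask share intermediate work and fetch all results in a single pass. A small sketch of that idea with a toy dataframe:

import pandas as pd
import dask.dataframe as dd

df = dd.from_pandas(
    pd.DataFrame({'price': [1.0, 2.0, None], 'overall': [5, 3, 4]}),
    npartitions=2)
missing_perc = df.isnull().mean() * 100  # lazy
price_stats = df['price'].describe()     # lazy
missing_perc, price_stats = dd.compute(missing_perc, price_stats)  # one pass
print(missing_perc.round(2).to_dict())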
Example No. 10
        print(
            "Running pymoliere sentence_classifier with the following parameters:"
        )
        print(config)

        # Potential cluster
        if config.cluster.run_locally or config.cluster.address == "localhost":
            print("Running on local machine!")
        else:
            cluster_address = f"{config.cluster.address}:{config.cluster.port}"
            print("Configuring Dask, attaching to cluster")
            print(f"\t- {cluster_address}")
            dask_client = Client(address=cluster_address)
            if config.cluster.restart:
                print("\t- Restarting cluster...")
                dask_client.restart()

        # Need to make sure model_path is writable
        model_path.parent.mkdir(parents=True, exist_ok=True)
        # We're going to store model-specific checkpoints separately
        data_ckpt_dir.mkdir(parents=True, exist_ok=True)

        # All data, this is the checkpoint we depend on
        sentences_with_embedding = file_util.load(
            default_ckpt_dir.joinpath("sentences_with_embedding"))
        # Get only results with labels, store at TrainingData tuples
        all_data = sentences_with_embedding.map_partitions(
            filter_sentences_with_embedding)
        print("Checkpoint: all_data")
        checkpoint(
            all_data,
Example No. 11
class dask_controller:  #adapted from Charles' code
    def __init__(self,n_workers=6,local=True,queue="short",\
                 walltime='01:30:00',cores=1,processes=1,memory='6GB',job_extra=[]):
        self.local = local
        self.n_workers = n_workers
        self.walltime = walltime
        self.queue = queue
        self.processes = processes
        self.memory = memory
        self.cores = cores
        self.job_extra = job_extra

    def writedir(self, directory):
        if not os.path.exists(directory):
            os.makedirs(directory)

    def startdask(self):
        if self.local:
            self.daskclient = Client()
            self.daskclient.cluster.scale(self.n_workers)
        else:
            self.daskcluster = SLURMCluster(queue=self.queue,walltime=self.walltime,\
                                   processes=self.processes,memory=self.memory,
                                  cores=self.cores,job_extra=self.job_extra)
            self.workers = self.daskcluster.start_workers(self.n_workers)
            self.daskclient = Client(self.daskcluster)

    def shutdown(self):
        self.daskcluster.stop_all_jobs()
        for item in os.listdir("./"):
            if "worker-" in item or "slurm-" in item or ".lock" in item:
                path = "./" + item
                if os.path.isfile(path):
                    os.remove(path)
                elif os.path.isdir(path):
                    shutil.rmtree(path)

    def printprogress(self):
        complete = len(
            [item for item in self.futures if item.status == "finished"])
        print(str(complete) + "/" + str(len(self.futures)))

    def mapfovs(self, function, fov_list, retries=0):
        self.function = function
        self.retries = retries

        def mapallfovs(fov_number, function=function):
            function(fov_number)

        self.futures = {}
        for fov in fov_list:
            future = self.daskclient.submit(mapallfovs, fov, retries=retries)
            self.futures[fov] = future

    def retry_failed(self):
        self.failed_fovs = [
            fov for fov, future in self.futures.items()
            if future.status != 'finished'
        ]
        self.daskclient.restart()
        time.sleep(5)
        self.mapfovs(self.function, self.failed_fovs, retries=self.retries)

    def retry_processing(self):
        self.proc_fovs = [
            fov for fov, future in self.futures.items()
            if future.status == 'pending'
        ]
        self.daskclient.restart()
        time.sleep(5)
        self.mapfovs(self.function, self.proc_fovs, retries=self.retries)
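
Recent dask_jobqueue releases have removed SLURMCluster.start_workers in favour of scale. A rough sketch of the non-local branch of startdask() against that newer API, with the queue and resource values as placeholders:

from dask.distributed import Client
from dask_jobqueue import SLURMCluster

cluster = SLURMCluster(queue='short', walltime='01:30:00',
                       cores=1, processes=1, memory='6GB')
cluster.scale(6)          # request 6 workers (replaces start_workers(6))
client = Client(cluster)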
Example No. 12
class dask_controller:  #adapted from Charles' code
    def __init__(self,n_workers=6,local=True,queue="short",death_timeout=3.,\
                 walltime='01:30:00',cores=1,processes=1,memory='6GB',\
                 working_directory="./",job_extra=[]):
        self.local = local
        self.n_workers = n_workers
        self.walltime = walltime
        self.queue = queue
        self.death_timeout = death_timeout
        self.processes = processes
        self.memory = memory
        self.cores = cores
        self.working_directory = working_directory
        self.job_extra = job_extra

        writedir(working_directory, overwrite=False)

    def startdask(self):
        if self.local:
            self.daskclient = Client()
            self.daskclient.cluster.scale(self.n_workers)
        else:
            self.daskcluster = SLURMCluster(queue=self.queue,death_timeout=self.death_timeout,walltime=self.walltime,\
                                   processes=self.processes,memory=self.memory,\
                                  cores=self.cores,local_directory=self.working_directory,\
                                log_directory=self.working_directory,job_extra=self.job_extra)
            self.workers = self.daskcluster.start_workers(self.n_workers)
            self.daskclient = Client(self.daskcluster)

    def shutdown(self):
        self.daskclient.restart()
        if not self.local:
            self.daskcluster.stop_all_jobs()
            self.daskcluster.close()
        for item in os.listdir(self.working_directory):
            if "worker-" in item or "slurm-" in item or ".lock" in item:
                path = "./" + item
                if os.path.isfile(path):
                    os.remove(path)
                elif os.path.isdir(path):
                    shutil.rmtree(path)

    def printprogress(self):
        complete = len(
            [item for item in self.futures if item.status == "finished"])
        print(str(complete) + "/" + str(len(self.futures)))

    def displaydashboard(self):
        link = self.daskcluster.dashboard_link
        display(HTML('<a href="' + link + '">Dashboard</a>'))

    def mapfovs(self, function, fov_list, retries=0):
        self.function = function
        self.retries = retries

        def mapallfovs(fov_number, function=function):
            function(fov_number)

        self.futures = {}
        for fov in fov_list:
            future = self.daskclient.submit(mapallfovs, fov, retries=retries)
            self.futures[fov] = future

    def retry_failed(self):
        self.failed_fovs = [
            fov for fov, future in self.futures.items()
            if future.status != 'finished'
        ]
        out = self.daskclient.restart()
        self.mapfovs(self.function, self.failed_fovs, retries=self.retries)

    def retry_processing(self):
        self.proc_fovs = [
            fov for fov, future in self.futures.items()
            if future.status == 'pending'
        ]
        out = self.daskclient.restart()
        self.mapfovs(self.function, self.proc_fovs, retries=self.retries)
Example No. 13
class LightGBMDaskLocal:
    # https://github.com/Nixtla/mlforecast/blob/main/nbs/distributed.forecast.ipynb
    """
    persist call: data = self.client.persist(data)
    (assignment replaces old lazy array, as persist does not change the
    input in-place)

    To reduce the risk of hitting memory limits,
    consider restarting each worker process before running any data loading or training code.
    self.client.restart()
        - This function will restart each of the worker processes, clearing out anything
        they’re holding in memory. This function does NOT restart the actual machines of
        your cluster, so it runs very quickly.
        - should the workers just be killed regardless of whether the whole process
        was successful or unsuccessful (sort of a clean up action)? can restarting
        be that cleanup action?

    loop over hyperparameter values (method that accepts hyperparameters as a dictionary -
        initializes self.model = DaskLGBMRegressor() with each set of parameters and
        calls the method that loops over train-validation sets)
    loop over train-validation sets
    run model's fit method and compute predicted values and RMSE
    """
    def __init__(
        self,
        curr_dt_time,
        n_workers,
        s3_path,
        startmonth,
        n_months_in_first_train_set,
        n_months_in_val_set,
        frac=None,
    ):
        self.curr_dt_time = curr_dt_time
        self.startmonth = startmonth
        self.n_months_in_first_train_set = n_months_in_first_train_set
        self.n_months_in_val_set = n_months_in_val_set
        self.frac = frac if frac is not None else 1.0

        cluster = LocalCluster(n_workers=n_workers)
        self.client = Client(cluster)
        self.client.wait_for_workers(n_workers)
        print(f"***VIEW THE DASHBOARD HERE***: {cluster.dashboard_link}")
        # self.pca_transformed = ___ # call PCA code that returns numpy array here
        # (rename self.pca_transformed to self.full_dataset)
        # numpy array can also be created from the saved (pickle) file

        # for data:
        # instead of first looping over hyperparameter values and then over different
        # train-validation sets, is it better to do it in the opposite order
        # to allow for one set of train-validation data to be created only once?

        try:
            # this commented out code did not work without the meta= argument,
            # meta= was not tried as it needs all other columns listed, in
            # addition to the ones being recast
            # self.full_dataset = self.client.persist(
            #     dd.read_parquet(
            #         s3_path, index=False, engine="pyarrow"
            #     )
            #     .sample(frac=self.frac, random_state=42)
            #     .map_partitions(
            #         self.cast_types,
            #         meta={
            #             'sid_shop_item_qty_sold_day': 'i2',
            #             **{f'cat{n}': 'i2' for n in range(1,23)}
            #         }
            #     )
            #     .map_partitions(self.drop_neg_qty_sold)
            #     .set_index(
            #         "sale_date", sorted=False, npartitions="auto"
            #     )
            #     .repartition(partition_size="100MB")
            # )

            # create Dask dataframe from partitioned Parquet dataset on S3 and persist it to cluster
            self.full_dataset = dd.read_parquet(s3_path,
                                                index=False,
                                                engine="pyarrow").sample(
                                                    frac=self.frac,
                                                    random_state=42)
            self.full_dataset["sale_date"] = self.full_dataset[
                "sale_date"].astype("datetime64[ns]")
            self.full_dataset[
                "sid_shop_item_qty_sold_day"] = self.full_dataset[
                    "sid_shop_item_qty_sold_day"].astype("int16")
            for col in self.full_dataset:
                if col.startswith("cat"):
                    self.full_dataset[col] = self.full_dataset[col].astype(
                        "int16")

            logging.debug(
                f"# of rows in full dataframe before removal of negative target values: {len(self.full_dataset)}"
            )
            self.full_dataset = self.full_dataset[
                self.full_dataset.sid_shop_item_qty_sold_day >= 0]
            # call dataframe.set_index(), then repartition, then persist
            # https://docs.dask.org/en/latest/generated/dask.dataframe.DataFrame.set_index.html
            # set_index(sorted=False, npartitions='auto')
            # df = df.repartition(npartitions=df.npartitions // 100)

            # self.full_dataset = self.client.persist(self.full_dataset)
            # _ = wait([self.full_dataset])

            # https://docs.dask.org/en/latest/generated/dask.dataframe.DataFrame.repartition.html
            # self.full_dataset = self.full_dataset.repartition(partition_size="100MB")
            self.full_dataset = self.full_dataset.set_index(
                "sale_date",
                sorted=False,
                npartitions="auto",
                partition_size=100_000_000,
            )
            # partition_size for set_index: int, optional, desired size of
            # each partition in bytes (to be used with npartitions='auto')

            self.full_dataset = self.cull_empty_partitions(self.full_dataset)

            self.full_dataset = self.client.persist(self.full_dataset)
            _ = wait([self.full_dataset])
            logging.debug(
                f"# of rows in full dataframe after removal of negative target values: {len(self.full_dataset)}"
            )
            logging.debug(
                f"Earliest and latest dates in full dataframe are : {dd.compute(self.full_dataset.index.min(), self.full_dataset.index.max())}"
            )
            logging.debug(
                f"Data types of full Dask dataframe are: {self.full_dataset.dtypes}"
            )

        except Exception:
            logging.exception(
                "Exception occurred while creating Dask dataframe and persisting it on the cluster."
            )
            # kill all active work, delete all data on the network, and restart the worker processes.
            self.client.restart()
            sys.exit(1)

        # finally:
        #     self.client.restart()
        #     sys.exit(1)

        # https://stackoverflow.com/questions/58437182/how-to-read-a-single-large-parquet-file-into-multiple-partitions-using-dask-dask
        # Parquet datasets can be saved into separate files.
        # Each file may contain separate row groups.
        # Dask Dataframe reads each Parquet row group into a separate partition.

        # I DON'T WANT TO KEEP THE NUMPY ARRAY IN MEMORY, SO IT NEEDS TO BE
        # DELETED AFTER DASK ARRAY IS CREATED
        # MIGHT BE BETTER TO CREATE DASK ARRAY FROM FILE ON S3, TO AVOID
        # HAVING BOTH NUMPY ARRAY AND PERSISTED DASK ARRAY IN MEMORY
        # I ALSO WANT TO SPLIT THAT NUMPY ARRAY INTO MULTIPLE TRAIN AND VALIDATION
        # SETS, SO WHAT'S THE BEST WAY TO DO THAT?
        # SEND THE ENTIRE ARRAY TO THE CLUSTER AT ONCE - PROBABLY NOT, OR
        # SEND TRAIN AND VALIDATION SETS ONE BY ONE AND DELETE?
        # BUT THAT WILL REQUIRE SENDING DATA TO THE CLUSTER MULTIPLE TIMES -
        # NOT IF THE DATA BEING SENT ARE DIFFERENT EACH TIME
        # THEY ARE NOT GOING TO BE COMPLETELY DIFFERENT BECAUSE TRAIN DATA WILL
        # JUST CONTINUE TO MERGE WITH VALIDATION SETS AND GROW
        # CREATE FIRST DASK ARRAY AND SEND TO CLUSTER, THEN APPEND TO IT?
        # IT DOES NOT LOOK LIKE DASK WOULD ALLOW THAT (SEE
        # https://github.com/dask/distributed/issues/1676 -
        # "You should also be aware that the task/data model underlying dask
        # arrays is immutable. You should never try to modify memory in-place.")
        # SO PROBABLY SEND ALL OF THE DATA TO THE CLUSTER AT THE BEGINNING,
        # THEN TAKE CHUNKS OF IT FOR WALK-FORWARD VALIDATION

        # PROBABLY SHOULD RELY ON LOADING DATA FROM FILE USING DELAYED /
        # FROM_DELAYED
        # SEE https://stackoverflow.com/questions/45941528/how-to-efficiently-send-a-large-numpy-array-to-the-cluster-with-dask-array)

        # can I use a function to read multiple files into one Dask array?

        # either figure out how to read multiple files (saved on S3) into one
        # Dask array, or
        # figure out how to save one array of PCA results to S3 (need disk space
        # to save it locally before transfer to S3 and need a method that can
        # handle transfer of more than 5GB - multipart transfer to S3)

        # try to write PCA-transformed data directly to zarr array (stored in memory)
        # then upload it to S3 (directly from memory)
        # then create dask array from that zarr array in S3

        # try to write PCA-transformed data to xarray then upload it to S3 as zarr

        # save numpy array to parquet file, upload that file to S3 (using upload_file),
        # then read that file into a Dask dataframe
        # write data to parquet on S3 from pandas dataframe and append to it using awswrangler library?
        # (https://github.com/awslabs/aws-data-wrangler/blob/main/tutorials/004%20-%20Parquet%20Datasets.ipynb)
        # df = dd.read_parquet('s3://bucket/my-parquet-data')
        # (https://docs.dask.org/en/latest/generated/dask.dataframe.read_parquet.html#dask.dataframe.read_parquet)
        # from above link:
        # engine argument: If ‘pyarrow’ or ‘pyarrow-dataset’ is specified, the ArrowDatasetEngine (which leverages the pyarrow.dataset API) will be used.
        # read partitioned parquet dataset with Dask:
        # https://stackoverflow.com/questions/67222212/read-partitioned-parquet-dataset-written-by-spark-using-dask-and-pyarrow-dataset

    # def cast_types(self, df):
    #     df = df.copy()
    #     df['sale_date'] = df["sale_date"].astype(
    #         "datetime64[ns]"
    #     )
    #     for col in df:
    #         if col.startswith("cat") or (col == "sid_shop_item_qty_sold_day"):
    #             df[col] = df[col].astype("int16")
    #     return df
    #
    # def drop_neg_qty_sold(self, df):
    #     return df[df.sid_shop_item_qty_sold_day >= 0].copy()

    # function from https://stackoverflow.com/questions/47812785/remove-empty-partitions-in-dask
    def cull_empty_partitions(self, ddf):
        ll = list(ddf.map_partitions(len).compute())
        ddf_delayed = ddf.to_delayed()
        ddf_delayed_new = list()
        pempty = None
        for ix, n in enumerate(ll):
            if 0 == n:
                pempty = ddf.get_partition(ix)
            else:
                ddf_delayed_new.append(ddf_delayed[ix])
        if pempty is not None:
            ddf = dd.from_delayed(ddf_delayed_new, meta=pempty)
        return ddf

    def gridsearch_wfv(self, params):
        # self.hyperparameters = hyperparameters
        # self.rmse_results = defaultdict(list) # replace this variable by creating a key-value in
        # the self.hyper_dict dictionary with value containing list of RMSE values
        self.all_params_combs = list()
        # determine if there is more than one combination of hyperparameters
        # if only one combination, set get_stats_ flag to True
        self.get_stats_ = (len(params[max(params,
                                          key=lambda x: len(params[x]))]) == 1)
        for params_comb_dict in (dict(
                zip(params.keys(),
                    v)) for v in list(product(*list(params.values())))):
            # for self.hyper_dict in hyperparameters:
            # self.params_combs_list.append(params_comb_dict)
            self.params_comb_dict = params_comb_dict.copy()
            self.params_comb_dict["rmse_list_"] = list()
            self.params_comb_dict["monthly_rmse_list_"] = list()
            self.params_comb_dict["fit_times_list_"] = list()
            try:
                self.model = lgb.DaskLGBMRegressor(
                    client=self.client,
                    random_state=42,
                    silent=False,
                    tree_learner="data",
                    force_row_wise=True,
                    **params_comb_dict,
                )
            except Exception:
                logging.exception(
                    "Exception occurred while initializing Dask model.")
                # kill all active work, delete all data on the network, and restart the worker processes.
                self.client.restart()
                sys.exit(1)

            # call method that loops over train-validation sets
            with performance_report(
                    filename=f"dask_report_{self.curr_dt_time}.html"):
                for train, test, get_stats in self.train_test_time_split():
                    self.fit(train).predict(test).rmse_all_folds(
                        test, get_stats)

            self.params_comb_dict["avg_rmse_"] = mean(
                self.params_comb_dict["rmse_list_"])
            self.params_comb_dict["monthly_avg_rmse_"] = mean(
                self.params_comb_dict["monthly_rmse_list_"])
            self.all_params_combs.append(self.params_comb_dict)

        best_params = min(self.all_params_combs,
                          key=lambda x: x["monthly_avg_rmse_"])
        self.best_score_ = best_params["monthly_avg_rmse_"]
        # remove non-parameter key-values from self.best_params (i.e., rmse_list_ and avg_rmse_, etc.)
        self.best_params_ = {
            k: v
            for k, v in best_params.items() if k in params
        }

        # save list of parameter-result dictionaries to dataframe and then to CSV
        if self.all_params_combs:
            all_params_combs_df = pd.DataFrame(self.all_params_combs)
            output_csv = "all_params_combs.csv"
            all_params_combs_df.to_csv(output_csv, index=False)

            try:
                key = f"lightgbm_all_params_combs_{self.curr_dt_time}.csv"
                # global s3_client
                s3_client = boto3.client("s3")
                response = s3_client.upload_file(output_csv,
                                                 "sales-demand-data", key)
                logging.info(
                    "Name of CSV uploaded to S3 and containing all parameter combinations "
                    f"and results is: {key}")
            except ClientError as e:
                logging.exception(
                    "CSV file with LightGBM parameter combinations and results was not copied to S3."
                )

        else:
            logging.debug(
                "List of parameter-result dictionaries is empty and was not converted to CSV!"
            )

            # probably do the opposite:
            # loop over train-validation splits (persisting that data in memory)
            # and run different models on one
            # split, saving the results that can later be aggregated

            # is it possible to read the full range of dates needed for time
            # series validation and then drop/delete rows from array or
            # move some rows to another array:
            # start with July-September (train) + October (validation),
            # then remove October and move September from train to validation

    # def time_split(self):
    #     return (
    #         self.full_dataset.loc[:self.end_date],
    #         self.full_dataset.loc[self.end_date + timedelta(days=1):self.end_date + relativedelta(months=self.n_months_in_val_set, day=31)]
    #         # self.full_dataset[date > self.end_date & date <= self.end_date + relativedelta(months=n_months_in_val_set, day=31)]
    #         # less than or equal to last day of month currently used for validation
    #     )

    def train_test_time_split(self):
        # first (earliest) month: July 2015
        # number of months in first train set: 1
        # number of months in validation set: 2
        #
        # number of months between Oct 2015 and July 2015: 3
        # 3 - (2 - 1) = 2 (two 2-month intervals inside a 3-month interval)
        # (where 2 is the number of months in validation set)

        # (3 - n_months_in_first_train_set + 1) - (2 - 1)
        n_val_sets = (
            month_counter(
                self.startmonth)  # self.startmonth is e.g. July 1, 2015
            - self.n_months_in_first_train_set +
            1) - (self.n_months_in_val_set - 1)

        for m in range(n_val_sets):
            end_date = self.startmonth + relativedelta(
                months=m + self.n_months_in_first_train_set - 1, day=31)
            if self.get_stats_:
                get_stats = m == n_val_sets - 1
            else:
                get_stats = False
            yield (self.full_dataset.loc[:end_date], self.full_dataset.
                   loc[end_date + timedelta(days=1):end_date +
                       relativedelta(months=self.n_months_in_val_set, day=31)],
                   get_stats)
            # self.train, self.test = self.time_split(self.full_dataset, self.end_date)

    def get_sample_weights(self, train):
        weights_arr = train["sid_shop_item_qty_sold_day"].to_dask_array(
            lengths=True).astype('float32')
        weights_arr = da.where(weights_arr == 0,
                               self.params_comb_dict['weight_for_zeros'], 1.)
        return weights_arr

    def fit(self, train):
        try:
            start_time = time.perf_counter()
            logging.debug(
                f"train X dtypes are {train[[col for col in train if col.startswith(('pc','cat'))]].dtypes}"
            )
            logging.debug(
                f"train y type is {train['sid_shop_item_qty_sold_day'].dtype}")
            self.model.fit(
                train[[col for col in train if col.startswith(("pc", "cat"))
                       ]].to_dask_array(lengths=True),
                train["sid_shop_item_qty_sold_day"].to_dask_array(
                    lengths=True),
                sample_weight=self.get_sample_weights(train),
                feature_name=[
                    col for col in train if col.startswith(("pc", "cat"))
                ],
                categorical_feature=[
                    col for col in train if col.startswith("cat")
                ],
            )
            assert self.model.fitted_
            self.params_comb_dict["fit_times_list_"].append(
                time.perf_counter() - start_time)

            return self

        except Exception:
            logging.exception(
                "Exception occurred while fitting model on train data during walk-forward validation."
            )
            # kill all active work, delete all data on the network, and restart the worker processes.
            self.client.restart()
            sys.exit(1)

    def predict(self, test):
        try:
            self.y_pred = self.model.predict(
                test[[col for col in test if col.startswith(("pc", "cat"))]])
            return self
        except Exception:
            logging.exception(
                "Exception occurred while computing predicted values on the test data."
            )
            # kill all active work, delete all data on the network, and restart the worker processes.
            self.client.restart()
            sys.exit(1)

    def rmse_all_folds(self, test, get_stats):
        try:
            # logging.debug(f"Data type of test['sid_shop_item_qty_sold_day'] is: {type(test['sid_shop_item_qty_sold_day'])}")
            # logging.debug(f"Data type of self.y_pred is: {type(self.y_pred)}")
            # logging.debug(f"Shape of test['sid_shop_item_qty_sold_day'] is: {test['sid_shop_item_qty_sold_day'].compute().shape}")
            # logging.debug(f"Shape of self.y_pred is: {self.y_pred.compute().shape}")
            self.params_comb_dict["rmse_list_"].append(
                calc_rmse(
                    test["sid_shop_item_qty_sold_day"].to_dask_array(
                        lengths=True),
                    self.y_pred.compute_chunk_sizes(),
                    get_stats,
                ))
            # self.rmse_results[json.dumps(self.hyper_dict)].append(calc_rmse(test[["sid_shop_item_qty_sold_day"]], self.y_pred))

            self.params_comb_dict["monthly_rmse_list_"].append(
                calc_monthly_rmse(
                    test[["shop_id", "item_id", "sid_shop_item_qty_sold_day"]],
                    self.y_pred,
                ))

        except Exception:
            logging.exception(
                "Exception occurred while computing RMSE on the test data.")
            # kill all active work, delete all data on the network, and restart the worker processes.
            self.client.restart()
            sys.exit(1)

    def refit_and_save(self, model_path):
        """
        https://stackoverflow.com/questions/55208734/save-lgbmregressor-model-from-python-lightgbm-package-to-disc/55209076
        """
        try:
            self.best_model = lgb.DaskLGBMRegressor(
                client=self.client,
                random_state=42,
                silent=False,
                tree_learner="data",
                force_row_wise=True,
                **self.best_params_,
            )
            self.best_model.fit(
                self.full_dataset[[
                    col for col in self.full_dataset
                    if col.startswith(("pc", "cat"))
                ]].to_dask_array(lengths=True),
                self.full_dataset["sid_shop_item_qty_sold_day"].to_dask_array(
                    lengths=True, ),
                sample_weight=self.get_sample_weights(self.full_dataset),
                feature_name=[
                    col for col in self.full_dataset
                    if col.startswith(("pc", "cat"))
                ],
                categorical_feature=[
                    col for col in self.full_dataset if col.startswith("cat")
                ],
            )
            output_txt = str(model_path).split("/")[-1]
            booster = self.best_model.booster_.save_model(output_txt)

            # output_txt = str(model_path).split('/')[-1]
            # global s3_client
            s3_client = boto3.client("s3")
            response = s3_client.upload_file(output_txt, "sales-demand-data",
                                             output_txt)
            logging.info(
                f"Name of saved model uploaded to S3 is: {output_txt}")

        except (Exception, ClientError):
            logging.exception(
                "Exception occurred while fitting model on the full dataset and saving the booster to file on S3."
            )
            # kill all active work, delete all data on the network, and restart the worker processes.
            self.client.restart()
            sys.exit(1)
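# --- Usage sketch (not part of the original example) ---
# A minimal, self-contained illustration of the lgb.DaskLGBMRegressor pattern used
# above, with toy data and the same column conventions ("pc*"/"cat*" features,
# target "sid_shop_item_qty_sold_day"). The local cluster settings are arbitrary.
import pandas as pd
import dask.dataframe as dd
import lightgbm as lgb
from dask.distributed import Client

if __name__ == "__main__":
    client = Client(n_workers=2, threads_per_worker=1)
    pdf = pd.DataFrame({
        "pc_0": range(200),
        "cat_shop": [i % 3 for i in range(200)],
        "sid_shop_item_qty_sold_day": [i % 5 for i in range(200)],
    })
    ddf = dd.from_pandas(pdf, npartitions=4)
    feats = [c for c in ddf.columns if c.startswith(("pc", "cat"))]
    reg = lgb.DaskLGBMRegressor(client=client, random_state=42, n_estimators=10)
    reg.fit(ddf[feats].to_dask_array(lengths=True),
            ddf["sid_shop_item_qty_sold_day"].to_dask_array(lengths=True))
    print(reg.predict(ddf[feats].to_dask_array(lengths=True)).compute()[:5])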
Ejemplo n.º 14
0
def beta_parallel_disk_detection(dataset, 
                            probe,
                            #rxmin=None, # these would allow selecting a sub section 
                            #rxmax=None,
                            #rymin=None,
                            #rymax=None,
                            #qxmin=None,
                            #qxmax=None,
                            #qymin=None,
                            #qymax=None,
                            probe_type="FT",
                            dask_client= None,
                            dask_client_params:dict=None,
                            restart_dask_client=True,
                            close_dask_client=False,
                            return_dask_client=True,
                            *args, **kwargs):
    """
    This is not fully validated yet and may not work; please report bugs on the py4DSTEM GitHub page.

    This parallelises disk detection over all probe positions. It can operate on either in-memory or out-of-memory datasets.

    Unless specified otherwise, it is assumed that you are parallelising on a single local machine.
    If that is not the case, it is probably best to pass the dask_client into the function, although you can also pass the required arguments via dask_client_params.
    If no dask_client arguments are passed, a dask client for the local machine will be created.

    Note:
        Do not pass the "peaks" argument as a kwarg, as you might in "_find_Bragg_disks_single_DP_FK"; the results will be unreliable and may cause the calculation to crash.
    Args:
        dataset (py4dSTEM datacube): 4DSTEM dataset
        probe (ndarray): can be a regular probe kernel or Fourier transformed
        probe_type (str): "FT" or None 
        dask_client (distributed.client.Client): dask client
        dask_client_params (dict): parameters to pass to dask client or dask cluster
        restart_dask_client (bool): if True, function will attempt to restart the dask_client.
        close_dask_client (bool): if True, function will attempt to close the dask_client.
        return_dask_client (bool): if True, function will return the dask_client.
        *args,kwargs will be passed to "_find_Bragg_disks_single_DP_FK" e.g. corrPower, sigma, edgeboundary...

    Returns:
        peaks (PointListArray): the Bragg peak positions and the correlation intensities
        dask_client(optional) (distributed.client.Client): dask_client for use later.
    """
    #TODO add asserts about peaks not being passed
    # Dask Client stuff
    #TODO how to guess at default params for client, sqrt no. cores.  Something to do with the size of the diffraction pattern
    # write a function which can do this.
    #TODO replace dask part with a with statement for easier clean up e.g.
    # with LocalCluster(params) as cluster, Client(cluster) as client:
    #   ... dask stuff.
    #TODO add assert statements and other checks. Think about reordering operations
    
    if dask_client is None:
        if dask_client_params is not None:

            dask.config.set({'distributed.worker.memory.spill': False,
                'distributed.worker.memory.target': False})
            cluster = LocalCluster(**dask_client_params)
            # dask_client_params are consumed by LocalCluster; pass only the cluster here
            dask_client = Client(cluster)
        else:
            # AUTO MAGICALLY SET?
            # LET DASK SET?
            # HAVE A FUNCTION WHICH RUNS ON A SUBSET OF THE DATA TO PICK OPTIMAL VALUE?
            # psutil could be used to count cores. 
            dask.config.set({'distributed.worker.memory.spill': False, # stops spilling to disk
                'distributed.worker.memory.target': False}) # stops spilling to disk and erroring out
            cluster = LocalCluster()
            dask_client = Client(cluster)

    else:
        assert isinstance(dask_client, distributed.client.Client)
        if restart_dask_client:
            try:
                dask_client.restart()
            except Exception as e:
                print('Could not restart dask client. Try manually restarting outside or passing "restart_dask_client=False"') # WARNING STATEMENT
                return e 
        else:
            pass


    # Probe stuff
    assert (probe.shape == dataset.data.shape[2:]), "Probe and Diffraction Pattern Shapes are Mismatched"
    if probe_type != "FT":
    #TODO clean up and pull out redundant parts
    #if probe.dtype != (np.complex128 or np.complex64 or np.complex256):
        #DO FFT SHIFT THING
        probe_kernel_FT = np.conj(np.fft.fft2(probe))
        dask_probe_array = da.from_array(probe_kernel_FT, chunks=(dataset.Q_Nx, dataset.Q_Ny))
        dask_probe_delayed = dask_probe_array.to_delayed()
        # delayed_probe_kernel_FT = delayed(probe_kernel_FT)
    else:
        probe_kernel_FT = probe
        dask_probe_array = da.from_array(probe_kernel_FT, chunks=(dataset.Q_Nx, dataset.Q_Ny))
        dask_probe_delayed = dask_probe_array.to_delayed()

    # GET DATA 
    #TODO add another elif if it is a dask array then pass
    if isinstance(dataset.data, np.ndarray):
        dask_data = da.from_array(dataset.data, chunks=(1, 1,dataset.Q_Nx, dataset.Q_Ny))
    elif dataset.stack_pointer is not None:
        dask_data = da.from_array(dataset.stack_pointer, chunks=(1, 1,dataset.Q_Nx, dataset.Q_Ny))
    else: 
        print("Couldn't access the data")
        return None

    # Convert the data to delayed 
    dataset_delayed = dask_data.to_delayed()
    # TODO Trim data e.g. rx,ry,qx,qy
    # I can pass the index values in here I should trim the probe and diffraction pattern first


    # Into the meat of the function 
    
    # create an empty list to which we will append the delayed calls
    res = []
    # loop over dataset_delayed and create a delayed call for each diffraction pattern
    for x in np.ndindex(dataset_delayed.shape):
        temp = delayed(_find_Bragg_disks_single_DP_FK_dask_wrapper)(dataset_delayed[x],
                                probe_kernel_FT=dask_probe_delayed[0,0],
                                #probe_kernel_FT=delayed_probe_kernel_FT,
                                *args, **kwargs) #passing through args from earlier or should I use 
                                #corrPower=corrPower,
                                #sigma=sigma_gaussianFilter,
                                #edgeBoundary=edgeBoundary,
                                #minRelativeIntensity=minRelativeIntensity,
                                #minPeakSpacing=minPeakSpacing,        
                                #maxNumPeaks=maxNumPeaks,
                                #subpixel='poly')
        res.append(temp)
    _temp_peaks = dask_client.compute(res, optimize_graph=True) # creates futures and starts computing 

    output = dask_client.gather(_temp_peaks) # gather the future objects 

    coords = [('qx',float),('qy',float),('intensity',float)]
    peaks = PointListArray(coordinates=coords, shape=dataset.data.shape[:-2])

    #temp_peaks[0][0]

    # operating over a list so we need the index into it (0->count) and the corresponding probe position (rx, ry)
    for count, (rx, ry) in enumerate(np.ndindex(dataset.data.shape[:-2])):
        #peaks.get_pointlist(rx, ry).add_pointlist(temp_peaks[0][count])
        #peaks.get_pointlist(rx, ry).add_pointlist(output[count][0])
        peaks.get_pointlist(rx, ry).add_pointlist(output[count])

    # Clean up
    dask_client.cancel(_temp_peaks) # removes from the dask workers
    del _temp_peaks # deletes the object 
    if close_dask_client:
        dask_client.close()
        return peaks
    elif close_dask_client == False and return_dask_client == True:
        return peaks, dask_client
    elif close_dask_client and return_dask_client == False:
        return peaks
    else:
        print('Dask Client in unknown state, this may result in unpredictable behaviour later')
        return peaks
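# --- Usage sketch (not part of the original function) ---
# How this helper might be called with an externally managed client. "datacube" and
# "probe_kernel" are hypothetical py4DSTEM objects loaded elsewhere.
from dask.distributed import Client, LocalCluster

if __name__ == "__main__":
    cluster = LocalCluster(n_workers=2, threads_per_worker=1)
    client = Client(cluster)
    peaks, client = beta_parallel_disk_detection(
        datacube,
        probe_kernel,
        probe_type="FT",
        dask_client=client,
        restart_dask_client=False,
        close_dask_client=False,
        return_dask_client=True,
    )
    client.close()
    cluster.close()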
Ejemplo n.º 15
0
def main(args):
    """Main function of cellanneal."""
    if (args.start_temp is not None
            or args.end_temp is not None) and args.auto_temp == 1:
        raise Exception(
            "when auto_temp is set to 1(default value), starting temperature or ending temperature should not be set manually"
        )

    if not args.no_parallel:
        import dask
        from dask.distributed import Client, LocalCluster
        if not args.cluster:
            cluster = LocalCluster(
                n_workers=args.workers,
                threads_per_worker=1,
            )
            client = Client(cluster)
        else:
            cluster = args.cluster
            client = Client(cluster)
            client.restart()
            cwd = Path(__file__).parent.absolute()
            client.upload_file(cwd / 'drawing.py')
            client.upload_file(cwd / 'mathhelper.py')
            client.upload_file(cwd / 'cell.py')
            client.upload_file(cwd / 'colony.py')
            client.upload_file(cwd / 'optimization.py')
            client.upload_file(cwd / 'global_optimization.py')
            client.upload_file(cwd / 'main.py')
    else:
        client = None

    lineagefile = None
    start = time.time()

    try:
        config = load_config(args.config)

        simulation_config = config["simulation"]
        #Maybe better to store the image type in the config file in the first place, instead of using cmd?
        if args.graySynthetic:
            simulation_config["image.type"] = "graySynthetic"
        elif args.phaseContrast:
            simulation_config["image.type"] = "phaseContrastImage"
        elif args.binary:
            simulation_config["image.type"] = "binary"
        else:
            raise ValueError(
                "Invalid Command: Synthetic image type must be specified")

        if not args.output.is_dir():
            args.output.mkdir()
        if not args.bestfit.is_dir():
            args.bestfit.mkdir()
        if args.residual and not args.residual.is_dir():
            args.residual.mkdir()

        seed = int(start * 1000) % (2**32)
        if args.seed is not None:
            seed = args.seed
        np.random.seed(seed)
        print("Seed: {}".format(seed))

        celltype = config['global.cellType'].lower()

        # setup the colony from a file with the initial properties
        lineageframes = LineageFrames()
        colony = lineageframes.forward()
        imagefiles = get_inputfiles(args)
        if args.lineage_file:
            load_colony(colony,
                        args.lineage_file,
                        config,
                        initial_frame=imagefiles[0].name)
        else:
            load_colony(colony, args.initial, config)
        cost_diff = (-1, -1)

        # open the lineage file for writing
        lineagefile = open(args.output / 'lineage.csv', 'w')
        header = ['file', 'name']
        if celltype == 'bacilli':
            header.extend([
                'x', 'y', 'width', 'length', 'rotation', "split_alpha",
                "opacity"
            ])
        print(','.join(header), file=lineagefile)

        if args.debug:
            with open(args.debug / 'debug.csv', 'w') as debugfile:
                print(','.join([
                    'window_start', 'window_end', 'pbad_total', 'bad_count',
                    'temperature', 'total_cost_diff', 'current_iteration',
                    'total_iterations'
                ]),
                      file=debugfile)

        if args.global_optimization:
            global useDistanceObjective

            useDistanceObjective = args.dist
            realimages = [
                optimization.load_image(imagefile) for imagefile in imagefiles
            ]
            window = config["global_optimizer.window_size"]
            if args.lineage_file:
                lineage = global_optimization.build_initial_lineage(
                    imagefiles, args.lineage_file, args.continue_from,
                    config["simulation"])
            else:
                lineage = global_optimization.build_initial_lineage(
                    imagefiles, args.initial, args.continue_from,
                    config["simulation"])
            lineage = global_optimization.find_optimal_simulation_confs(
                imagefiles, lineage, realimages, args.continue_from)
            sim_start = args.continue_from - args.frame_first
            print(sim_start)
            shape = realimages[0].shape
            synthimages = []
            cellmaps = []
            distmaps = []
            iteration_per_cell = config["iteration_per_cell"]
            if not useDistanceObjective:
                distmaps = [None] * len(realimages)
            for window_start in range(1 - window, len(realimages)):
                window_end = window_start + window
                print(window_start, window_end)
                if window_end <= len(realimages):
                    # get initial estimate
                    if window_start >= sim_start:
                        if window_end > 1:
                            lineage.copy_forward()
                    realimage = realimages[window_end - 1]
                    synthimage, cellmap = optimization.generate_synthetic_image(
                        lineage.frames[window_end - 1].nodes, shape,
                        lineage.frames[window_end - 1].simulation_config)
                    synthimages.append(synthimage)
                    cellmaps.append(cellmap)
                    if useDistanceObjective:
                        distmap = distance_transform_edt(realimage < .5)
                        distmap /= config[
                            f'{config["global.cellType"].lower()}.distanceCostDivisor'] * config[
                                'global.pixelsPerMicron']
                        distmap += 1
                        distmaps.append(distmap)
                    if args.auto_temp == 1 and window_end == 1:
                        print("auto temperature schedule started")
                        args.start_temp, args.end_temp = \
                            global_optimization.auto_temp_schedule(imagefiles, lineage, realimages, synthimages, cellmaps, distmaps, 0, 1, lineagefile, args, config)
                        print("auto temperature schedule finished")
                        print("starting temperature is ", args.start_temp,
                              "ending temperature is ", args.end_temp)
                    if args.auto_meth == "frame" and optimization.auto_temp_schedule_frame(
                            window_end, 3):
                        print("auto temperature schedule restarted")
                        args.start_temp, args.end_temp = \
                            global_optimization.auto_temp_schedule(imagefiles, lineage, realimages, synthimages, cellmaps, distmaps, window_start, window_end, lineagefile, args, config)
                        print("auto temperature schedule finished")
                        print("starting temperature is ", args.start_temp,
                              "ending temperature is ", args.end_temp)
                if window_start >= sim_start:
                    if useDistanceObjective:
                        global_optimization.totalCostDiff = optimization.dist_objective(
                            realimage, synthimage, distmap, cellmap,
                            config["overlap.cost"])
                    else:
                        global_optimization.totalCostDiff = optimization.objective(
                            realimage, synthimage, cellmap,
                            config["overlap.cost"], config["cell.importance"])
                    lineage, synthimages, distmaps, cellmaps = global_optimization.optimize(
                        imagefiles,
                        lineage,
                        realimages,
                        synthimages,
                        cellmaps,
                        distmaps,
                        window_start,
                        window_end,
                        lineagefile,
                        args,
                        config,
                        iteration_per_cell,
                        client=client)
                if window_start >= 0:
                    global_optimization.save_lineage(
                        imagefiles[window_start].name,
                        lineage.frames[window_start].nodes, lineagefile)
                    global_optimization.save_output(
                        imagefiles[window_start].name,
                        synthimages[window_start], realimages[window_start],
                        lineage.frames[window_start].nodes, args, config)
            return 0

        config["simulation"] = optimization.find_optimal_simulation_conf(
            config["simulation"], optimization.load_image(imagefiles[0]),
            list(colony))
        if args.auto_temp == 1:
            print("auto temperature schedule started")
            args.start_temp, args.end_temp = optimization.auto_temp_schedule(
                imagefiles[0], lineageframes.forward(), args, config)
            print("auto temperature schedule finished")
            print("starting temperature is ", args.start_temp,
                  "ending temperature is ", args.end_temp)

        frame_num = 0
        prev_cell_num = len(colony)
        for imagefile in imagefiles:  # Recomputing temperature when needed

            frame_num += 1

            if args.auto_meth == "frame":
                if optimization.auto_temp_schedule_frame(frame_num, 8):
                    print("auto temperature schedule started (recomputed)")
                    args.start_temp, args.end_temp = optimization.auto_temp_schedule(
                        imagefile, colony, args, config)
                    print("auto temperature schedule finished")
                    print("starting temperature is ", args.start_temp,
                          "ending temperature is ", args.end_temp)

            elif args.auto_meth == "factor":
                if optimization.auto_temp_schedule_factor(
                        len(colony), prev_cell_num, 1.1):
                    print("auto temperature schedule started (recomputed)")
                    args.start_temp, args.end_temp = optimization.auto_temp_schedule(
                        imagefile, colony, args, config)
                    print("auto temperature schedule finished")
                    print("starting temperature is ", args.start_temp,
                          "ending temperature is ", args.end_temp)
                    prev_cell_num = len(colony)

            elif args.auto_meth == "const":
                if optimization.auto_temp_schedule_const(
                        len(colony), prev_cell_num, 10):
                    print("auto temperature schedule started (recomputed)")
                    args.start_temp, args.end_temp = optimization.auto_temp_schedule(
                        imagefile, colony, args, config)
                    print("auto temperature schedule finished")
                    print("starting temperature is ", args.start_temp,
                          "ending temperature is ", args.end_temp)
                    prev_cell_num = len(colony)

            elif args.auto_meth == "cost":
                print(cost_diff, frame_num,
                      optimization.auto_temp_shcedule_cost(cost_diff))
                if frame_num >= 2 and optimization.auto_temp_shcedule_cost(
                        cost_diff):
                    print(
                        "auto temperature schedule started cost_diff (recomputed)"
                    )
                    args.start_temp, args.end_temp = optimization.auto_temp_schedule(
                        imagefile, colony, args, config)
                    print("auto temperature schedule finished")
                    print("starting temperature is ", args.start_temp,
                          "ending temperature is ", args.end_temp)

            colony = optimize(imagefile, lineageframes, args, config, client)

            cost_diff = optimization.update_cost_diff(colony, cost_diff)

            # flatten modifications and save cell properties

            colony.flatten()
            for cellnode in colony:
                properties = [imagefile.name, cellnode.cell.name]
                if celltype == 'bacilli':
                    properties.extend([
                        str(cellnode.cell.x),
                        str(cellnode.cell.y),
                        str(cellnode.cell.width),
                        str(cellnode.cell.length),
                        str(cellnode.cell.rotation)
                    ])
                print(','.join(properties), file=lineagefile)

    except KeyboardInterrupt:
        raise
    finally:
        if lineagefile:
            lineagefile.close()

    print(f'{time.time() - start} seconds')
    if client and not cluster:
        client.shutdown()

    return 0
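# --- Sketch (not from the original source) ---
# The Client.upload_file pattern used above, shown in isolation: ship local modules to
# every worker of an existing cluster so the optimization tasks can import them. The
# scheduler address and module names are placeholders.
from pathlib import Path
from dask.distributed import Client

client = Client("tcp://scheduler:8786")   # assumed external scheduler
client.restart()                          # drop any stale worker state first
for module in ("cell.py", "colony.py", "optimization.py"):
    client.upload_file(str(Path(__file__).parent / module))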
Ejemplo n.º 16
0
    def handle(self, *args, **options):
        # Unpack variables
        product = options['product']
        model = options['model']
        name = options['name']
        training = options['training']
        sp = options['spatial_aggregation']
        kwargs = parser_extra_args(options['extra_kwargs'])
        categorical_variables = options['categorical_variables']
        sample = options['sample']
        filename = options['filename']
        scheduler_file = options['scheduler']
        remove_outliers = options['remove_outliers']

        # Prepare encoding of categorical variables if any specified
        if categorical_variables is not None:
            kwargs.update(categorical_features=var_to_ind(categorical_variables))

        # Load model class
        if filename is None:
            try:
                module = import_module('madmex.modeling.supervised.%s' % model)
                Model = module.Model
            except ImportError as e:
                raise ValueError('Invalid model argument')

        # datacube query
        gwf_kwargs = { k: options[k] for k in ['product', 'lat', 'long', 'region']}
        iterable = gwf_query(**gwf_kwargs)

        # Start cluster and run 
        client = Client(scheduler_file=scheduler_file)
        client.restart()
        C = client.map(extract_tile_db,
                       iterable,
                       pure=False,
                       **{'sp': sp,
                          'training_set': training,
                          'sample': sample})
        arr_list = client.gather(C)

        logger.info('Completed extraction of training data from %d tiles' , len(arr_list))

        # Zip list of predictors, target into two lists
        X_list, y_list = zip(*arr_list)

        # Filter Nones
        X_list = [x for x in X_list if x is not None]
        y_list = [x for x in y_list if x is not None]

        # Concatenate the lists
        X = np.concatenate(X_list)
        y = np.concatenate(y_list)

        # Optionally run outliers removal
        if remove_outliers:
            X, y = Model.remove_outliers(X, y)

        # Optionally write the arrays to pickle file
        if filename is not None:
            logger.info('Writing X and y arrays to pickle file, no model will be fitted')
            with open(filename, 'wb') as dst:
                pickle.dump((X, y), dst)

        else:
            print("Fitting %s model for %d observations" % (model, y.shape[0]))

            # Fit model
            mod = Model(**kwargs)
            mod.fit(X, y)
            # Write the fitted model to the database
            mod.to_db(name=name, recipe=product, training_set=training)
Ejemplo n.º 17
0
    def handle(self, *args, **options):
        path = os.path.join(INGESTION_PATH, 'recipes', options['name'])
        if not os.path.exists(path):
            os.makedirs(path)
        # Prepare a few variables
        try:
            recipe_meta = RECIPES[options['recipe']]
        except KeyError:
            raise ValueError('Selected recipe does not exist')
        product = recipe_meta['product']
        fun = recipe_meta['fun']
        yaml_file = recipe_meta['config_file']
        begin = datetime.strptime(options['begin'], '%Y-%m-%d')
        end = datetime.strptime(options['end'], '%Y-%m-%d')
        time = (begin, end)
        center_dt = mid_date(begin, end)
        scheduler_file = options['scheduler']

        # database query
        gwf_kwargs = {
            k: options[k]
            for k in ['lat', 'long', 'region', 'begin', 'end']
        }
        gwf_kwargs.update(product=product)
        iterable = gwf_query(**gwf_kwargs)

        # Start cluster and run
        client = Client(scheduler_file=scheduler_file)
        client.restart()
        C = client.map(fun,
                       iterable,
                       pure=False,
                       **{
                           'center_dt': center_dt,
                           'path': path
                       })
        nc_list = client.gather(C)
        n_tiles = len([x for x in nc_list if x is not None])
        logger.info('Processing done, %d tiles written to disk' % n_tiles)

        # Add product
        product_description = yaml_to_dict(yaml_file)
        pr, dt = add_product_from_yaml(yaml_file, options['name'])

        # Function to run on the list of filenames returned by Client.map()
        def index_nc_file(nc):
            """Helper function with tons of variables taken from the local environment
            """
            try:
                print("Adding %s to datacube database" % nc)
                metadict = metadict_from_netcdf(
                    file=nc,
                    description=product_description,
                    center_dt=center_dt,
                    from_dt=begin,
                    to_dt=end,
                    algorithm=options['recipe'])
                add_dataset(pr=pr, dt=dt, metadict=metadict, file=nc)
            except Exception as e:
                pass

        [index_nc_file(x) for x in nc_list]
Ejemplo n.º 18
0
def compute_stats(user_reviews_csv, products_csv):
    client = Client('127.0.0.1:8786')
    client.restart()

    rev_ids = user_reviews_csv['asin'].persist()
    id_list = products_csv['asin'].persist()

    # Q1
    prod_prop = (products_csv.isnull().mean() * 100).compute().round(2)
    rev_prop = (user_reviews_csv.isnull().mean() * 100).compute().round(2)

    # Q2
    corr_df = user_reviews_csv[['asin', 'overall']].merge(products_csv[['asin', 'price']], on='asin')
    corrSeries = corr_df[['overall', 'price']].corr().compute()

    # Q3
    priceStats = products_csv.price.describe(percentiles=[0.5]).compute()

    # Q4
    def category_eval(df):
        return df['categories'].apply(clean_category)

    def clean_category(val):
        try:
            return ast.literal_eval(val)[0][0]
        except Exception:
            return val

    clean_cats = products_csv.map_partitions(category_eval)
    category_counts = clean_cats.value_counts().compute()

    # Q5

    def q5check_ids_exist(df, id_list):

        for i in df.to_frame().iterrows():
            id_to_check = list(i[1])[0]
            if not (id_to_check in id_list):
                return 1
        return 0

    q5_ans = q5check_ids_exist(rev_ids, id_list)

    # Q6

    def related_eval(df):
        return df['related'].apply(related_to_prod_list)

    def related_to_prod_list(related_dict):
        try:
            related_dict = ast.literal_eval(related_dict)
            return list(chain(*related_dict.values()))
        except Exception:
            return related_dict

    def q6check_ids_exist(df, id_list):
        for i in df.iterrows():
            idList = list(i[1])[0]
            if type(idList) == list:
                for related_id in idList:
                    if not (related_id in id_list):
                        return 1
        return 0

    flatRelatedIds = products_csv.map_partitions(related_eval).to_frame()
    q6_ans = q6check_ids_exist(flatRelatedIds, id_list)

    out_dict = {"q1": {"products": dict(prod_prop), "reviews": dict(rev_prop)},
                "q2": corrSeries['overall']['price'].round(2),
                "q3": dict(priceStats.drop('count').round(2)),
                "q4": dict(category_counts),
                "q5": q5_ans,
                "q6": q6_ans}

    def convert(o):
        if isinstance(o, np.int64): return int(o)
        raise TypeError

    with open('results_1B.json', 'w') as outfile:
        json.dump(out_dict, outfile, default=convert)
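# --- Usage sketch (not part of the original function) ---
# Loading the two CSVs as dask DataFrames and running the stats job. File paths are
# placeholders, and the function itself assumes a scheduler is already listening on
# 127.0.0.1:8786 (see the Client call at the top of compute_stats).
import dask.dataframe as dd

if __name__ == "__main__":
    user_reviews = dd.read_csv("user_reviews_*.csv")
    products = dd.read_csv("products_*.csv")
    compute_stats(user_reviews, products)   # writes results_1B.json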
Ejemplo n.º 19
0
def restart_workers(address=u'tcp://172.31.54.193:8786'):
    client = Client(address)
    client.restart()
    sleep(2)
    print(u'workers restarted.')
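# --- Usage sketch (not part of the original function) ---
# Restart the workers and confirm they reconnected. The address is the same
# placeholder used in the function's default argument.
from dask.distributed import Client

if __name__ == "__main__":
    restart_workers()
    client = Client(u'tcp://172.31.54.193:8786')
    print(len(client.scheduler_info()['workers']), 'workers online')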
Ejemplo n.º 20
0
class GlideDock:
    """
    Score structures using Glide docking score, including ligand preparation with LigPrep
    """
    return_metrics = [
        'r_i_docking_score', 'r_i_glide_ligand_efficiency',
        'r_i_glide_ligand_efficiency_sa', 'r_i_glide_ligand_efficiency_ln',
        'r_i_glide_gscore', 'r_i_glide_lipo', 'r_i_glide_hbond',
        'r_i_glide_metal', 'r_i_glide_rewards', 'r_i_glide_evdw',
        'r_i_glide_ecoul', 'r_i_glide_erotb', 'r_i_glide_esite',
        'r_i_glide_emodel', 'r_i_glide_energy', 'r_i_glide_rmsd_to_input'
    ]

    def __init__(self,
                 prefix: str,
                 glide_template: os.PathLike,
                 cluster: str = None,
                 timeout: float = 120.0,
                 **kwargs):
        """
        :param prefix: Prefix to identify scoring function instance (e.g., DRD2)
        :param glide_template: Path to a template docking file (.in)
        :param cluster: Address to Dask scheduler for parallel processing via dask
        :param timeout: Timeout (seconds) before killing an individual docking simulation
        :param kwargs:
        """
        # Read in glide template (.in)
        with open(glide_template, 'r') as gfile:
            self.glide_options = gfile.readlines()
        # Make sure output file type is sdf
        self.glide_options = self.modify_glide_in(self.glide_options,
                                                  'POSE_OUTTYPE',
                                                  'ligandlib_sd')

        # Specify class attributes
        self.prefix = prefix.replace(" ", "_")
        self.glide_metrics = GlideDock.return_metrics
        self.glide_env = os.path.join(os.environ['SCHRODINGER'], 'glide')
        self.ligprep_env = os.path.join(os.environ['SCHRODINGER'], 'ligprep')
        self.timeout = float(timeout)
        self.cluster = cluster
        if self.cluster is not None:
            self.client = Client(self.cluster)
        self.variants = None
        self.docking_results = None

    @staticmethod
    def modify_glide_in(glide_in: list, glide_property: str, glide_value: str):
        """
        Convenience function to insert / overwrite certain .in file properties and values
        :param glide_in: List of lines read from the .in file
        :param glide_property: Property to be changed (e.g. POSE_OUTTYPE)
        :param glide_value: Value to change to (e.g. ligandlib_sd)
        :return: Modified glide_in
        """
        # If property is already present, replace value
        if any(glide_property in line for line in glide_in):
            for i in range(len(glide_in)):
                if glide_property in glide_in[i]:
                    glide_in[i] = f'{glide_property}   {glide_value}\n'
                    break
        # Otherwise insert it before newline (separates properties from features)
        elif any(line == '\n' for line in glide_in):
            for i in range(len(glide_in)):
                if glide_in[i] == '\n':
                    glide_in.insert(i, f'{glide_property}   {glide_value}\n')
                    break
        # Otherwise just stick it on the end of the file
        else:
            glide_in.append(f'{glide_property}  {glide_value}\n')

        return glide_in

    def run_ligprep(self, smiles: list):
        """
        Call ligprep to prepare molecules.
        :param smiles: List of SMILES strings
        """
        ligprep_commands = []
        # Write out smiles to sdf files and prepare ligprep commands
        for smi, name in zip(smiles, self.file_names):
            smi_in = os.path.join(self.directory, f'{name}.smi')
            sdf_out = os.path.join(self.directory, f'{name}_ligprep.sdf')

            with open(smi_in, 'w') as f:
                f.write(smi)

            command = " ".join(
                (self.ligprep_env, f"-ismi {smi_in}", f"-osd {sdf_out}",
                 "-ph 7.0", "-pht 1.0", "-bff 16", "-s 8", "-epik", "-WAIT",
                 "-NOJOBID"))
            ligprep_commands.append(command)

        # Initialize subprocess
        logger.debug('LigPrep called')
        p = timedSubprocess(timeout=self.timeout).run

        # Run commands either using Dask or sequentially
        if self.cluster is not None:
            futures = self.client.map(p, ligprep_commands)
            _ = self.client.gather(futures)
        else:
            _ = [p(command) for command in ligprep_commands]
        logger.debug('LigPrep finished')
        return self

    @property
    def split_sdf(self):
        """
        Split ligprep output sdf so that each variant can be independently run.
        """
        # Read in ligprep output files and split to individual variants
        self.variants = {name: [] for name in self.file_names}
        for name in self.file_names:
            out_file = os.path.join(self.directory, f'{name}_ligprep.sdf')

            if os.path.exists(out_file):
                supp = Chem.rdmolfiles.ForwardSDMolSupplier(os.path.join(
                    self.directory, f'{name}_ligprep.sdf'),
                                                            sanitize=False,
                                                            removeHs=False)

                for mol in supp:
                    if mol:
                        variant = mol.GetPropsAsDict()['s_lp_Variant']
                        if ':' in variant:
                            variant = variant.split(':')[1]
                        variant = variant.split('-')[1]
                        self.variants[name].append(variant)
                        w = Chem.rdmolfiles.SDWriter(
                            os.path.join(self.directory,
                                         f'{name}-{variant}_ligprep.sdf'))
                        w.write(mol)
                        w.flush()
                        w.close()
                        logger.debug(f'Split {name} -> {name}-{variant}')
                    else:
                        continue

        return self

    def run_glide(self):
        """
        Write GLIDE new input files and submit each to Glide
        """
        glide_commands = []
        for name in self.file_names:
            for variant in self.variants[name]:
                # Set some file paths
                glide_in = self.glide_options.copy()
                # Change glide_in file
                glide_in = self.modify_glide_in(
                    glide_in, 'LIGANDFILE',
                    os.path.join(self.directory,
                                 f'{name}-{variant}_ligprep.sdf'))
                glide_in = self.modify_glide_in(glide_in, 'OUTPUTDIR',
                                                os.path.join(self.directory))

                # Write new input file (.in)
                with open(os.path.join(self.directory, f'{name}-{variant}.in'),
                          'wt') as f:
                    [f.write(line) for line in glide_in]

                # Prepare command line command
                command = self.glide_env + ' -WAIT -NOJOBID -NOLOCAL ' + \
                          os.path.join(self.directory, f'{name}-{variant}.in')
                glide_commands.append(command)

        # Initialize subprocess
        logger.debug('Glide called')
        p = timedSubprocess(timeout=self.timeout).run

        if self.cluster is not None:
            futures = self.client.map(p, glide_commands)
            _ = self.client.gather(futures)
        else:
            _ = [p(command) for command in glide_commands]
        logger.debug('Glide finished')
        return self

    def get_docking_scores(self,
                           smiles: list,
                           return_best_variant: bool = False):
        """
        Read output sdfs, get output properties
        :param smiles: List of SMILES strings
        :param return_best_variant:
        :return optional, list of filenames with best variant
        """
        # Read in docked file
        best_variants = self.file_names.copy()
        best_score = {name: None for name in self.file_names}

        # For each molecule
        for i, (smi, name) in enumerate(zip(smiles, self.file_names)):
            docking_result = {'smiles': smi}

            # For each variant
            for variant in self.variants[name]:
                out_file = os.path.join(self.directory,
                                        f'{name}-{variant}_lib.sdfgz')

                if os.path.exists(out_file):

                    # Try to load it in, and grab the score
                    try:
                        with gzip.open(out_file) as f:
                            glide_out = Chem.ForwardSDMolSupplier(f)

                            for mol in glide_out:  # should just be one
                                dscore = mol.GetPropsAsDict(
                                )['r_i_docking_score']

                                # If molecule doesn't have a score yet append it and the variant
                                if best_score[name] is None:
                                    best_score[name] = dscore
                                    best_variants[i] = f'{name}-{variant}'
                                    docking_result.update({
                                        f'{self.prefix}_' + k: v
                                        for k, v in
                                        mol.GetPropsAsDict().items()
                                        if k in self.glide_metrics
                                    })
                                    logger.debug(
                                        f'Docking score for {name}-{variant}: {dscore}'
                                    )

                                # If docking score is better change it...
                                elif dscore < best_score[name]:
                                    best_score[name] = dscore
                                    best_variants[i] = f'{name}-{variant}'
                                    docking_result.update({
                                        f'{self.prefix}_' + k: v
                                        for k, v in
                                        mol.GetPropsAsDict().items()
                                        if k in self.glide_metrics
                                    })
                                    logger.debug(
                                        f'Found better {name}-{variant}: {dscore}'
                                    )

                                # Otherwise ignore
                                else:
                                    pass

                    # If parsing the molecule threw an error and nothing stored, append 0
                    except Exception:
                        logger.debug(
                            f'Error processing {name}-{variant}_lib.sdfgz file'
                        )
                        if best_score[
                                name] is None:  # Only if no other score for prefix
                            best_variants[i] = f'{name}-{variant}'
                            docking_result.update({
                                f'{self.prefix}_' + k: 0.0
                                for k in self.glide_metrics
                            })
                            logger.debug(
                                f'Returning 0.0 unless a successful variant is found'
                            )

                # If path doesn't exist and nothing store, append 0
                else:
                    logger.debug(f'{name}-{variant}_lib.sdfgz does not exist')
                    if best_score[
                            name] is None:  # Only if no other score for prefix
                        best_variants[i] = f'{name}-{variant}'
                        docking_result.update({
                            f'{self.prefix}_' + k: 0.0
                            for k in self.glide_metrics
                        })
                        logger.debug(
                            f'Returning 0.0 unless a successful variant is found'
                        )

            # Add best variant information to docking result
            docking_result.update(
                {f'{self.prefix}_best_variant': best_variants[i]})
            self.docking_results.append(docking_result)

        logger.debug(f'Best scores: {best_score}')
        if return_best_variant:
            logger.debug(f'Returning best variants: {best_variants}')
            return best_variants

        return self

    def remove_files(self, keep: list = [], parallel: bool = True):
        """
        Remove some of the log files and molecule files.
        :param keep: List of filenames to keep pose files for.
        :param parallel: Whether to run using Dask (requires scheduler address during initialisation).
        """
        # If no cluster is provided ensure parallel is False
        if (parallel is True) and (self.cluster is None):
            parallel = False

        keep_poses = [f'{k}_lib.sdfgz' for k in keep]
        logger.debug(f'Keeping pose files: {keep_poses}')
        del_files = []
        for name in self.file_names:
            # Grab files
            files = glob.glob(os.path.join(self.directory, f'{name}*'))
            logger.debug(f'Glob found {len(files)} files')

            if len(files) > 0:
                try:
                    files = [
                        file for file in files if not ("log.txt" in file)
                        and not any([p in file for p in keep_poses])
                    ]

                    if parallel:
                        del_files.extend(files)
                    else:
                        for file in files:
                            os.remove(file)
                # No need to stop if files can't be found and deleted
                except FileNotFoundError:
                    logger.debug('File not found.')
                    pass

        if parallel:
            futures = self.client.map(os.remove, del_files)
            _ = self.client.gather(futures)
        return self

    def __call__(self, smiles: list, directory: str, file_names: list,
                 **kwargs):
        """
        Calculate scores for GlideDock
        :param smiles: List of SMILES strings
        :param directory: Directory to save files and logs into
        :param file_names: List of corresponding file names for SMILES to match files to index
        :param kwargs: Ignored
        :return: List of dicts i.e. [{'smiles': smi, 'metric': 'value', ...}, ...]
        """
        # Assign some attributes
        step = file_names[0].split("_")[0]  # Assume first Prefix is step

        # Create log directory
        self.directory = os.path.join(os.path.abspath(directory), 'GlideDock',
                                      step)
        if not os.path.exists(self.directory):
            os.makedirs(self.directory)
        self.file_names = file_names
        self.docking_results = []  # make sure no carry over

        # Add logging file handler
        fh = logging.FileHandler(
            os.path.join(self.directory, f'{step}_log.txt'))
        fh.setLevel(logging.DEBUG)
        formatter = logging.Formatter(
            '%(asctime)s - %(levelname)s - %(message)s')
        fh.setFormatter(formatter)
        logger.addHandler(fh)

        # Refresh Dask every few hundred iterations
        if self.cluster is not None:
            if int(step) % 250 == 0:
                self.client.restart()

        # Run protocol
        self.run_ligprep(smiles=smiles)
        self.split_sdf  # Catch any erroneous smiles with no output ligprep file
        self.run_glide()
        best_variants = self.get_docking_scores(smiles=smiles,
                                                return_best_variant=True)

        # Cleanup
        self.remove_files(keep=best_variants, parallel=True)
        fh.close()
        logger.removeHandler(fh)
        self.directory = None
        self.file_names = None
        self.variants = None

        # Check
        assert len(smiles) == len(self.docking_results)

        return self.docking_results
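# --- Usage sketch (not part of the original class) ---
# Scoring a couple of SMILES with GlideDock. It assumes a Schrodinger installation
# (SCHRODINGER env var), a prepared Glide template and, optionally, a Dask scheduler;
# the paths, prefix and address below are hypothetical placeholders.
scorer = GlideDock(prefix="DRD2",
                   glide_template="docking_template.in",
                   cluster="tcp://scheduler:8786",   # or None to run sequentially
                   timeout=120.0)
results = scorer(smiles=["CCO", "c1ccccc1"],
                 directory="./glide_runs",
                 file_names=["0_mol1", "0_mol2"])
print(results[0].get("DRD2_r_i_docking_score"))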
Ejemplo n.º 21
0
class Cluster(object):
    """
    Wrapper for ``dask`` clients

    Best practices:

       "By "node" people typically mean a physical or virtual machine. That node can run several programs or processes
        at once (much like how my computer can run a web browser and text editor at once). Each process can parallelize
        within itself with many threads. Processes have isolated memory environments, meaning that sharing data within
        a process is free, while sharing data between processes is expensive.

        Typically things work best on larger nodes (like 36 cores) if you cut them up into a few processes, each of
        which have several threads. You want the number of processes times the number of threads to equal the number
        of cores. So for example you might do something like the following for a 36 core machine:

            Four processes with nine threads each
            Twelve processes with three threads each
            One process with thirty-six threads

        Typically one decides between these choices based on the workload. The difference here is due to Python's
        Global Interpreter Lock, which limits parallelism for some kinds of data. If you are working mostly with
        Numpy, Pandas, Scikit-Learn, or other numerical programming libraries in Python then you don't need to worry
        about the GIL, and you probably want to prefer few processes with many threads each. This helps because it
        allows data to move freely between your cores because it all lives in the same process. However, if you're
        doing mostly Pure Python programming, like dealing with text data, dictionaries/lists/sets, and doing most of
        your computation in tight Python for loops then you'll want to prefer having many processes with few threads
        each. This incurs extra communication costs, but lets you bypass the GIL.

        In short, if you're using mostly numpy/pandas-style data, try to get at least eight threads or so in a process.
        Otherwise, maybe go for only two threads in a process."

        --MRocklin (https://stackoverflow.com/questions/51099685/best-practices-in-setting-number-of-dask-workers)

    Examples:
        >>> # I/O-heavy task with 8 nodes
        >>> cluster = Cluster(n_workers=4,
        >>>                   threads_per_worker=2,
        >>>                   scheduler_port=0,
        >>>                   processes=False)
        >>>
        >>> # Task with little need of the GIL with 16 nodes
        >>> cluster = Cluster(n_workers=2,
        >>>                   threads_per_worker=8,
        >>>                   scheduler_port=0,
        >>>                   processes=False)
    """
    def __init__(self, **kwargs):

        self.kwargs = kwargs
        self.cluster = None
        self.client = None

    def start(self):

        self.cluster = LocalCluster(**self.kwargs)
        self.client = Client(self.cluster)

    def restart(self):

        self.client.restart()
        print(self.client)

    def stop(self):

        self.client.close()
        self.cluster.close()

        self.client = None
        self.cluster = None
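# --- Usage sketch (not part of the original class) ---
# The intended life cycle of the wrapper above, using the I/O-heavy settings from its
# docstring; the work submitted between start() and stop() is up to the caller.
cluster = Cluster(n_workers=4,
                  threads_per_worker=2,
                  scheduler_port=0,
                  processes=False)
cluster.start()
# ... submit dask work through cluster.client here ...
cluster.restart()   # clears worker state between large jobs
cluster.stop()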
Ejemplo n.º 22
0
import time
import pprint
from blazingsql import BlazingContext
from dask.distributed import Client
client = Client('127.0.0.1:8786')
client.restart()
bc = BlazingContext(dask_client=client, network_interface="lo")

# bc = BlazingContext()

dir_data_fs = '/home/aocsa/tpch/100MB2Part/'
nfiles = 4

# bc.create_table('customer', [dir_data_fs + '/customer_0_0.parquet', dir_data_fs + '/customer_1_0.parquet', dir_data_fs + '/customer_2_0.parquet'])

bc.create_table('customer', dir_data_fs + '/customer_*.parquet')

# "BindableTableScan(table=[[main, customer]],
# filters=[[OR(AND(<($0, 15000), =($1, 5)), =($0, *($1, $1)), >=($1, 10), <=($2, 500))]],
# projects=[[0, 3, 5]], aliases=[[c_custkey, c_nationkey, c_acctbal]])"
# query = """select c_custkey, c_nationkey, c_acctbal
#             from
#               customer
#             where
#               c_custkey > 2990 and c_custkey < 3010
#             """

query = "select sum(c_custkey)/count(c_custkey), min(c_custkey) from customer limit 5"

# [b'c_custkey', b'c_name', b'c_address', b'c_nationkey', b'c_phone', b'c_acctbal', b'c_mktsegment', b'c_comment']
lp = bc.explain(query)
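# Sketch (assumption: the standard BlazingContext.sql entry point is available):
# inspect the plan, then actually run the query on the Dask workers and collect the
# distributed result.
print(lp)
result = bc.sql(query)
print(result.compute() if hasattr(result, "compute") else result)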
Ejemplo n.º 23
0
def connect(
    args: argparse.Namespace
) -> typing.Tuple[dask.distributed.Client,
                  typing.Optional[dask.distributed.LocalCluster]]:
    """ Connect to the dask cluster specifed by the arguments in `args`

    Specifically, this function uses args.cluster_location to determine whether
    to start a dask.distributed.LocalCluster (in case args.cluster_location is
    "LOCAL") or to (attempt to) connect to an existing cluster (any other
    value).

    If a local cluster is started, it will use a number of worker processes
    equal to args.num_procs. Each process will use args.num_threads_per_proc
    threads. The scheduler for the local cluster will listen to a random port.

    Parameters
    ----------
    args: argparse.Namespace
        A namespace containing the following fields:
        
        * cluster_location
        * client_restart
        * num_procs
        * num_threads_per_proc

    Returns
    -------
    client: dask.distributed.Client
        The client for the dask connection

    cluster: dask.distributed.LocalCluster or None
        If a local cluster is started, the reference to the local cluster
        object is returned. Otherwise, None is returned.
    """

    from dask.distributed import Client as DaskClient
    from dask.distributed import LocalCluster as DaskCluster

    client = None
    cluster = None

    if args.cluster_location == "LOCAL":

        msg = "[dask_utils]: starting local dask cluster"
        logger.info(msg)

        cluster = DaskCluster(n_workers=args.num_procs,
                              processes=True,
                              threads_per_worker=args.num_threads_per_proc)

        client = DaskClient(cluster)

    else:
        msg = "[dask_utils]: attempting to connect to dask cluster: {}"
        msg = msg.format(args.cluster_location)
        logger.info(msg)

        client = DaskClient(address=args.cluster_location)

        if args.client_restart:
            msg = "[dask_utils]: restarting client"
            logger.info(msg)
            client.restart()

    return client, cluster
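# --- Usage sketch (not part of the original function) ---
# Connecting through an argparse-style namespace; the field values below are
# hypothetical and simply exercise the LOCAL branch.
import argparse

if __name__ == "__main__":
    args = argparse.Namespace(cluster_location="LOCAL",
                              client_restart=False,
                              num_procs=2,
                              num_threads_per_proc=1)
    client, cluster = connect(args)
    print(client)
    client.close()
    if cluster is not None:
        cluster.close()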