Example #1
def segmentTesting(thisModel,
                   Ysample,
                   Lnum,
                   verbose,
                   label,
                   serialMode=False,
                   optimise=100,
                   calibrate=False):
    """
    Method to test multiple samples at a time.

    Args:
        thisModel : SAMObject model to recall from.
        Ysample : Novel feature vector to test.
        Lnum : Ground truth labels to compare with.
        verbose : Enable or disable logging to stdout.
        label : Label for the current segments being tested.
        serialMode : Boolean to test serially or in parallel.
        optimise : Number of optimisation iterations to perform during recall.
        calibrate : Indicate calibration mode when True which requires a different return.

    Returns:
        labelList, confMatrix, ret, variancesKnown, variancesUnknown if calibrate is `True`.
        labelList, confMatrix, labelComparisonDict if calibrate is `False`.

        labelList : List of classification labels
        confMatrix : Numpy array with the confusion matrix
        ret : Classification object
        variancesKnown : Variances returned during calibration for known training instances
        variancesUnknown : Variances returned during calibration for unknown training instances
        labelComparisonDict : Dictionary with two items `'original'` and `'results'`.

    """
    def testFunc(data, lab):
        d = testSegment(thisModel,
                        data,
                        verbose,
                        visualiseInfo=None,
                        optimise=optimise)
        if verbose:
            if lab == d[0]:
                res = True
            else:
                res = False
            logging.info('Actual  ' + str(lab).ljust(11) + '  Classification:  ' + str(d[0]).ljust(11) + '  with ' + \
                  str(d[1])[:6] + ' confidence: ' + str(res) + '\n')
        return d

    logging.info('')

    if type(Lnum).__module__ == np.__name__:
        useModelLabels = True
    else:
        useModelLabels = False

    labelList = copy.deepcopy(thisModel[0].textLabels)
    labelList.append('unknown')

    confMatrix = np.zeros((len(labelList), len(labelList)))

    numItems = len(Ysample)

    off1 = 11
    off2 = 8
    off3 = len(str(numItems))
    if useModelLabels:
        Lsample = [
            thisModel[0].textLabels[int(Lnum[i])] for i in range(len(Lnum))
        ]
    else:
        Lsample = Lnum

    if numItems < 1500:
        serialMode = True
    c = None
    logging.info('serialMode: ' + str(serialMode))
    if not serialMode and thisModel[0].parallelOperation:
        try:
            logging.info('Trying engines ...')
            c = ipp.Client()
            numWorkers = len(c._engines)
            logging.info('Number of engines: ' + str(numWorkers))
        except:
            logging.error("Parallel workers not found")
            thisModel[0].parallelOperation = False
            numWorkers = 1
    else:
        logging.info(str(serialMode) + '= True')
        thisModel[0].parallelOperation = False
        numWorkers = 1
        logging.info('Number of engines: ' + str(numWorkers))

    # time a few classifications to estimate the average classification time
    vTemp = copy.deepcopy(verbose)
    verbose = False
    if len(Lsample) < 400:
        numTrials = len(Lsample) * 0.1
        numTrials = max(1, int(numTrials))
    else:
        numTrials = 20
    t0 = time.time()
    for j in range(numTrials):
        testFunc(Ysample[j], Lsample[j])
    t1 = time.time()
    verbose = vTemp
    thisModel[0].avgClassTime = (t1 - t0) / numTrials
    logging.info('classification rate: ' +
                 str(1.0 / thisModel[0].avgClassTime) + 'fps')
    logging.info('estimated time: ' +
                 str(thisModel[0].avgClassTime * numItems /
                     (60 * numWorkers)) + 'mins for ' + str(numItems) +
                 ' items with ' + str(numWorkers) + ' workers')
    t0 = time.time()
    logging.info(t0)
    # check size of model
    # modelSize is size in megabytes
    modelSize = deep_getsizeof(thisModel, set()) / 1024.0 / 1024.0
    logging.info("modelSize: " + str(modelSize))
    logging.warning("required testing size: " +
                    str((modelSize * numWorkers * 2) + 400) + " MB")
    # check available system memory in megabytes
    freeSystemMem = float(psutil.virtual_memory()[4]) / 1024.0 / 1024.0
    logging.info("free memory: " + str(freeSystemMem) + " MB")

    if modelSize > 100 or not thisModel[0].parallelOperation or serialMode:
        # serial testing
        logging.warning('Testing serially')
        ret = []
        for j in range(len(Lsample)):
            logging.info(str(j) + '/' + str(len(Lsample)))
            ret.append(testFunc(Ysample[j], Lsample[j]))
    else:
        # parallel testing
        logging.info('Testing in parallel')
        dview = c[:]  # not load balanced
        lb = c.load_balanced_view()  # load balanced

        # with dview.sync_imports():
        #     from SAM.SAM_Core import utils
        # if not thisModel[0].modelLoaded :
        dview.push({'thisModel': thisModel})
        dview.push({'verbose': verbose})
        dview.push({'optimise': optimise})
        # thisModel[0].modelLoaded = True
        syn = lb.map_async(testFunc, Ysample, Lsample)
        wait_watching_stdout(syn, dt=1, truncate=1000)
        ret = syn.get()
        # maybe these are upsetting the ipcluster
        # dview.clear()
        # dview.purge_results('all')
    t1 = time.time()
    logging.info(t1)
    logging.info('Actual time taken = ' + str(t1 - t0))
    if calibrate:
        variancesKnown = []
        variancesUnknown = []
        for i in range(len(ret)):
            currLabel = Lsample[i]

            if verbose:
                if currLabel == ret[i][0]:
                    result = True
                else:
                    result = False
                logging.info(
                    str(i).rjust(off3) + '/' + str(numItems) + ' Truth: ' +
                    currLabel.ljust(off1) + ' Model: ' +
                    ret[i][0].ljust(off1) + ' with ' +
                    str(ret[i][1])[:6].ljust(off2) + ' confidence: ' +
                    str(result))

            if currLabel in thisModel[0].textLabels and currLabel != "unknown":
                knownLabel = True
            else:
                knownLabel = False
                currLabel = 'unknown'

            if knownLabel:
                variancesKnown.append(ret[i][1])
            else:
                variancesUnknown.append(ret[i][1])

            confMatrix[labelList.index(currLabel),
                       labelList.index(ret[i][0])] += 1

        return labelList, confMatrix, ret, variancesKnown, variancesUnknown
    else:
        labelComparisonDict = dict()
        labelComparisonDict['original'] = []
        labelComparisonDict['results'] = []
        for i in range(len(ret)):
            currLabel = Lsample[i]
            retLabel = ret[i][0]

            if currLabel not in thisModel[0].textLabels:
                currLabel = 'unknown'

            if verbose:
                if currLabel == retLabel:
                    result = True
                else:
                    result = False
                logging.info(
                    str(i).rjust(off3) + '/' + str(numItems) + ' Truth: ' +
                    currLabel.ljust(off1) + ' Model: ' + retLabel.ljust(off1) +
                    ' with ' + str(ret[i][1])[:6].ljust(off2) +
                    ' confidence: ' + str(result))

            labelComparisonDict['original'].append(Lsample[i])
            labelComparisonDict['results'].append(retLabel)
            confMatrix[labelList.index(currLabel),
                       labelList.index(retLabel)] += 1
        return labelList, confMatrix, labelComparisonDict
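
The parallel branch above pushes shared state to the engines through a direct view and dispatches the per-sample calls through a load-balanced view. A minimal, hedged sketch of that pattern in isolation (it assumes an ipcluster is already running; `scale`, `work` and the sample data are illustrative placeholders, not part of the original module):

import ipyparallel as ipp

rc = ipp.Client()                      # connect to the running ipcluster
dview = rc[:]                          # direct view: push shared state to every engine
lview = rc.load_balanced_view()        # load-balanced view: dispatch the work

dview.push({'scale': 10}, block=True)  # analogous to pushing thisModel/verbose/optimise

@ipp.interactive                       # resolve globals in the engines' user namespace
def work(x, label):
    return label, x * scale            # 'scale' comes from the pushed namespace

amr = lview.map_async(work, range(8), ['a'] * 8)
amr.wait_interactive()
print(amr.get())
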
Example #2
import ipyparallel as ipp

client = ipp.Client()


# this will only run on machines that can import numpy:
@ipp.require('numpy')
def norm(A):
    from numpy.linalg import norm
    return norm(A, 2)


def checkpid(pid):
    """return the pid of the engine"""
    import os
    return os.getpid() == pid


def checkhostname(host):
    import socket
    return socket.gethostname() == host


def getpid():
    import os
    return os.getpid()


pid0 = client[0].apply_sync(getpid)
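
A possible continuation of the snippet above (the hostname is a placeholder): the same helpers can be applied across all engines through a direct view.

import numpy as np

# run the @ipp.require-decorated norm() on every engine with the same matrix
A = np.random.rand(4, 4)
norms = client[:].apply_sync(norm, A)

# which engine still has the pid recorded in pid0, and which ones run on 'localhost'?
pid_matches = client[:].apply_sync(checkpid, pid0)
host_matches = client[:].apply_sync(checkhostname, 'localhost')
print(norms, pid_matches, host_matches)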

Example #3
#!/usr/bin/env python3
try:
    directview  # check whether a direct view has already been created
except NameError:
    import ipyparallel
    c = ipyparallel.Client(profile="mpi_slurm", cluster_id="Azure_cluster_0")
    directview = c[:]
    directview.block = True
    with directview.sync_imports():
        import numpy
        import mpi4py
        from mpi4py import MPI


class rankinfo(object):
    '''This holds a few "global" values of our problem,
    like stencil width and problem size.

    Parameters
    ----------

    sizes : tuple of numbers of lattice points without ghost
            points along the coordinate axes (in x,y,z order)

    Attributes
    ----------

    rank : the rank of this class in MPI.COMM_WORLD
    size : the number of ranks in  MPI.COMM_WORLD
    ndim : how many physical dimensions does our lattice have
    '''
Example #4
if arguments.timeseries_name is None:
    directory_full_path = arguments.directory
else:
    directory_full_path = os.path.join(arguments.directory, arguments.timeseries_name)

if not os.path.exists(directory_full_path):
    raise Exception("Directory %s does not exists." % directory_full_path)

if arguments.cluster_sample_count < 1:
    raise Exception("Cluster sample count must be greater than zero.")

_numSamples = arguments.cluster_sample_count

try:
    pool = ipyparallel.Client(profile=arguments.profile)[:]
except:
    raise Exception("A running IPython parallel cluster is required to run this script.")
def tardir(path):
    # tarh is the tarfile handle
    with tarfile.open(os.path.join(path, 'slycat-timeseries.tar.gz'), 'w:gz') as tarh:
        for root, dirs, files in os.walk(path):
            for file in files:
                if file != 'slycat-timeseries.tar.gz':
                    tarh.add(os.path.join(root, file), arcname=file)
# Compute the model.
try:
    print("Examining and verifying data.")
    """
    Find number of timeseries and accurate cluster sample count before starting model
    """
Example #5
    def __init__(self,
                 wrapper,
                 backend='multiprocessing',
                 n_cpus=-1,
                 verbosity=10,
                 dask_args=None):

        # -1 cpus means all available cpus - 1 for the scheduler
        if n_cpus == -1:
            import multiprocessing
            n_cpus = multiprocessing.cpu_count() - 1

        self.n_cpus = n_cpus
        self.wrapper = wrapper
        self.verbosity = verbosity
        self.dask_args = dask_args
        # This configures how to run single point simulations on the model:
        self._exec = self.wrapper

        ot.OpenTURNSPythonFunction.__init__(self,
                                            self.wrapper.getInputDimension(),
                                            self.wrapper.getOutputDimension())

        self.setInputDescription(self.wrapper.getInputDescription())
        self.setOutputDescription(self.wrapper.getOutputDescription())

        assert backend in [
            'ipython', 'ipyparallel', 'multiprocessing', 'pathos', 'joblib',
            'dask'
        ], "Unknown backend"

        # This configures how to run samples on the model :
        if self.n_cpus == 1:
            self._exec_sample = self.wrapper

        elif (backend == 'ipython') or (backend == 'ipyparallel'):
            # Check that ipyparallel is installed
            try:
                import ipyparallel as ipp
                # If it is, see if there is a cluster running
                try:
                    rc = ipp.Client()
                    ipy_backend = True
                except (ipp.error.TimeoutError, IOError) as e:
                    ipy_backend = False
                    import logging
                    logging.warning('Unable to connect to an ipython cluster.')
            except ImportError:
                ipy_backend = False
                import logging
                logging.warning('ipyparallel package missing.')

            if ipy_backend:
                self._exec_sample = _exec_sample_ipyparallel(
                    self.wrapper, self.getInputDimension(),
                    self.getOutputDimension())
            else:
                logging.warning('Using multiprocessing backend instead')
                self._exec_sample = _exec_sample_multiprocessing(
                    self.wrapper, self.n_cpus)

        elif backend == 'joblib':
            # Check that joblib is installed
            try:
                import joblib
                joblib_backend = True
            except ImportError:
                try:
                    from sklearn.externals import joblib
                    joblib_backend = True
                except ImportError:
                    joblib_backend = False
                    import logging
                    logging.warning('joblib package missing.')

            if joblib_backend:
                self._exec_sample = _exec_sample_joblib(
                    self.wrapper, self.n_cpus, self.verbosity)
            else:
                logging.warning('Using multiprocessing backend instead')
                self._exec_sample = _exec_sample_multiprocessing(
                    self.wrapper, self.n_cpus)

        elif backend == 'multiprocessing':
            self._exec_sample = _exec_sample_multiprocessing(
                self.wrapper, self.n_cpus)

        elif backend == 'pathos':
            self._exec_sample = _exec_sample_pathos(self.wrapper, self.n_cpus)

        elif backend == 'dask':

            assert 'scheduler' in self.dask_args, 'dask_args must have "scheduler" as key'
            assert 'workers' in self.dask_args, 'dask_args must have "workers" as key'

            self._exec_sample, self.dask_cluster, self.dask_client = _exec_sample_dask(
                self.wrapper, self.dask_args, self.verbosity)

            def close_dask():
                from time import sleep
                self.dask_client.close()
                sleep(1)
                self.dask_cluster.close()

            self.close_dask = close_dask
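
`_exec_sample_ipyparallel` is referenced above but not shown in the snippet. A plausible sketch of such a factory, assuming it only has to spread the input points of a sample over a running ipcluster and preserve their order:

import ipyparallel as ipp

def _exec_sample_ipyparallel(func, n_input, n_output):
    # n_input/n_output are kept only to match the call site above; this sketch
    # does not need them because it returns the raw list of outputs
    rc = ipp.Client()
    lview = rc.load_balanced_view()

    def _exec_sample(X):
        # X is an iterable of input points; map_sync keeps the output order aligned with X
        return lview.map_sync(func, list(X))

    return _exec_sample
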
Example #6
import ipyparallel as ipp
import pandas as pd
import tables as tb
import os
import argparse

parser = argparse.ArgumentParser(description='Model and year to estimate.')
parser.add_argument('model', type=str, nargs='?', default='gpin')
parser.add_argument('year', type=int, nargs='?', default=2014)
args = parser.parse_args()
print(vars(args))

rc = ipp.Client(cluster_id="{0}-{1}".format(args.model, args.year))
print(len(rc))
dv = rc[:]
dv.push(vars(args))
lv = rc.load_balanced_view()

h5 = tb.open_file('/scratch/nyu/hue/taqdf_1319.h5', mode='r')
df = h5.get_node('/data/table')
idx = list(
    set(
        filter(lambda x: x[1] == args.year,
               zip(df.col('permno'), df.col('yyyy')))))


@ipp.interactive
def est(x):
    import os
    import pandas as pd
    import tables as tb
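
The body of `est` is cut off above. A hedged sketch of how such an `@ipp.interactive` task is typically dispatched over the `idx` pairs with the load-balanced view (`describe` is a stand-in, not the original estimator):

@ipp.interactive
def describe(pair):
    # 'model' and 'year' resolve from the engine namespace filled by dv.push(vars(args))
    permno, yyyy = pair
    return model, year, permno, yyyy

amr = lv.map_async(describe, idx)
amr.wait_interactive()
results = amr.get()
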
Example #7
def get_client(cluster_id, profile, engines, timeout, cores, quiet, **kwargs):
    """ 
    Creates a client to view ipcluster engines for a given profile and 
    returns it with at least one engine spun up and ready to go. If no 
    engines are found after nwait amount of time then an error is raised.
    If engines==MPI it waits a bit longer to find engines. If the number
    of engines is set then it waits even longer to try to find that number
    of engines.
    """

    ## save stds for later, we're gonna hide them to prevent external printing 
    devnull = open(os.devnull, 'w')
    save_stdout = sys.stdout 
    save_stderr = sys.stderr
    sys.stdout = devnull
    sys.stderr = devnull

    ## get cluster_info print string
    connection_string = "  establishing parallel connection:"

    ## wrapped search for ipcluster
    try: 
        ## are we looking for a running ipcluster instance?
        if profile not in [None, "default"]:
            args = {'profile': profile, "timeout": timeout}
        else:
            clusterargs = [cluster_id, profile, timeout]
            argnames = ["cluster_id", "profile", "timeout"]
            args = {key:value for key, value in zip(argnames, clusterargs)}

        ## get connection within timeout window of wait time and hide messages
        ipyclient = ipp.Client(**args)
        sys.stdout = save_stdout
        sys.stderr = save_stderr

        ## check that all engines have connected            
        if (engines == "MPI") or ("pta-cli-" in cluster_id):
            if not quiet:
                print(connection_string)

        for _ in range(6000):
            initid = len(ipyclient)
            time.sleep(0.01)
            ## If MPI then wait for all engines to start so we can report
            ## how many cores are on each host. If Local then only wait for
            ## one engine to be ready and then just go.
            if (engines == "MPI") or ("pta-cli-" in cluster_id):
                ## wait for cores to be connected
                if cores:
                    time.sleep(0.1)
                    if initid == cores:
                        break
                if initid:
                    time.sleep(3)
                    if len(ipyclient) == initid:
                        break
            else:
                if cores:
                    if initid == cores:
                        break
                else:
                    if initid:
                        break

    except KeyboardInterrupt as inst:
        raise inst

    ## This is raised if ipcluster is not running ------------
    except IOError as inst:
        if "pta-cli-" in cluster_id:
            raise PTAError(NO_IPCLUSTER_CLI)
        else:
            raise PTAError(NO_IPCLUSTER_API)

    except (ipp.TimeoutError, ipp.NoEnginesRegistered) as inst:
        raise inst

    except Exception as inst:
        raise inst

    finally:
        ## ensure that no matter what we reset the stds
        sys.stdout = save_stdout
        sys.stderr = save_stderr

    return ipyclient
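
A minimal sketch of how `get_client` might be called (all argument values are illustrative; the surrounding CLI normally supplies them):

ipyclient = get_client(
    cluster_id="",      # default ipcluster id
    profile="default",
    engines="Local",    # use "MPI" to wait for all MPI engines to register
    timeout=30,
    cores=0,            # falsy: accept however many engines are found
    quiet=False,
)
print(len(ipyclient), "engines connected")
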
Example #8
    def setup(self, number_of_engines, is_coalescing, depth):
        self.client = ipp.Client(profile='asv', cluster_id=f'depth_{depth}')
        self.view = self.client.broadcast_view(is_coalescing=is_coalescing)
        self.view.targets = list(range(number_of_engines))

        wait_for(lambda: len(self.client) >= number_of_engines)
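
`wait_for` is used here and in the next example but is not shown; a plausible sketch of such a polling helper, assuming it simply blocks until a predicate becomes true or a timeout expires:

import time

def wait_for(condition, timeout=60.0, interval=0.1):
    # hypothetical helper: poll condition() until it returns True or the timeout elapses
    deadline = time.time() + timeout
    while not condition():
        if time.time() > deadline:
            raise TimeoutError("condition not met within %.0fs" % timeout)
        time.sleep(interval)
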
Example #9
    def setup(self, number_of_engines, number_of_bytes):
        self.client = ipp.Client(profile='asv')
        self.view = get_view(self)
        self.view.targets = list(range(number_of_engines))
        wait_for(lambda: len(self.client) >= number_of_engines)
Example #10
# import IPython.parallel as ipp   # Python 2
import ipyparallel as ipp     # Python 3
rc = ipp.Client(profile='default', cluster_id='')
rc.ids

dview = rc[:]
dview.block = True
dview.apply(lambda : "Hello, World")

lview = rc.load_balanced_view()
lview.block = True

import pandas 
dat = pandas.read_csv('/global/scratch/paciorek/bayArea.csv', header = None, encoding = 'latin1')
dat.columns = ('Year','Month','DayofMonth','DayOfWeek','DepTime','CRSDepTime',
'ArrTime','CRSArrTime','UniqueCarrier','FlightNum','TailNum',
'ActualElapsedTime','CRSElapsedTime','AirTime','ArrDelay','DepDelay',
'Origin','Dest','Distance','TaxiIn','TaxiOut','Cancelled','CancellationCode',
'Diverted','CarrierDelay','WeatherDelay','NASDelay','SecurityDelay','LateAircraftDelay')

dview.execute('import statsmodels.api as sm')

dat2 = dat.loc[:, ('DepDelay','Year','Dest','Origin')]
dests = dat2.Dest.unique()

mydict = dict(dat2 = dat2, dests = dests)
dview.push(mydict)

def f(id):
    sub = dat2.loc[dat2.Dest == dests[id],:]
    sub = sm.add_constant(sub)
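
The body of `f` is cut off above. A hedged sketch of the dispatch step that usually follows, using a stand-in per-destination task (the statsmodels fit of the original is not reproduced):

# stand-in task; dat2 and dests are already on the engines via dview.push(mydict)
def count_flights(id):
    sub = dat2.loc[dat2.Dest == dests[id], :]
    return dests[id], len(sub)

# lview.block is True, so map() returns the results directly, one per destination
counts = lview.map(count_flights, range(len(dests)))
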
Example #11
def run(func_parallel_loop,
        func_gen_args,
        func_init=None,
        base_dir=None,
        results_dir=None,
        description=None):
    '''
    Runs the simulation

    Parameters
    ----------
    func_parallel_loop: function
        The function that should be parallelized
    func_gen_args: function
        The function that will generate all the different inputs
        for func_parallel_loop
    func_init: function, optional
        A function that will be run before the simulation starts. This might
        generate some data or import some files for example
    base_dir: str, optional
        The location of the base directory for the simulation
    results_dir: str, optional
        The name of the directory where to save results
    description: str, optional
        A short description of the simulation for the help function
    '''
    import os, json

    if description is None:
        description = 'Generic simulation script'

    if base_dir is None:
        base_dir = './'
    base_dir = os.path.abspath(base_dir)

    if results_dir is None:
        results_dir = os.path.join(base_dir, 'data/')
    elif not os.path.isabs(results_dir):
        results_dir = os.path.join(base_dir, results_dir)

    # create results directory if it doesn't exist
    if not os.path.exists(results_dir):
        os.mkdir(results_dir)

    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('-d',
                        '--dir',
                        type=str,
                        help='directory to store sim results')
    parser.add_argument('-p',
                        '--profile',
                        type=str,
                        help='ipython profile of cluster')
    parser.add_argument('-t',
                        '--test',
                        action='store_true',
                        help='test mode, runs a single loop of the simulation')
    parser.add_argument('-s',
                        '--serial',
                        action='store_true',
                        help='run in a serial loop, ipyparallel not called')
    parser.add_argument(
        '--dummy',
        action='store_true',
        help=
        'tags the directory as dummy, can be used for running small batches')
    parser.add_argument('parameters',
                        type=str,
                        help='JSON file containing simulation parameters')

    cli_args = parser.parse_args()
    ipcluster_profile = cli_args.profile
    test_flag = cli_args.test
    serial_flag = cli_args.serial
    dummy_flag = cli_args.dummy
    data_dir_name = None
    parameter_file = cli_args.parameters

    # Check the state of the github repository
    if dummy_flag:
        tag = 'dummy'

    else:
        # Not a dummy run, try to get the git hash
        try:
            tag = get_git_hash(base_dir, length=10)

        except DirtyGitRepositoryError:
            if test_flag:
                import warnings
                warnings.warn(
                    'The git repo has uncommitted modifications. Going ahead for test.'
                )
                tag = 'test'
            else:
                raise ValueError(
                    'The git repo has uncommitted modifications. Aborting simulation.'
                )

        except InvalidGitRepositoryError:
            tag = ''

    # get all the parameters
    with open(parameter_file, 'r') as f:
        parameters = json.load(f)

    # if no name is given, use the parameters file name
    if 'name' not in parameters:
        name = os.path.splitext(os.path.basename(parameter_file))[0]
        parameters['name'] = name
    else:
        name = parameters['name']

    # record date and time
    date = time.strftime("%Y%m%d-%H%M%S")

    # for convenient access to parameters:
    p = collections.namedtuple('Struct',
                               parameters.keys())(*parameters.values())

    # Save the result to a directory
    if data_dir_name is None:
        ttag = '_' + tag if tag != '' else tag
        data_dir = os.path.join(
            results_dir, data_dir_format.format(date=date, name=name,
                                                tag=ttag))
    else:
        data_dir = data_dir_name
    data_file_name = os.path.join(data_dir, data_file)

    # create directory if it doesn't exist
    if not os.path.exists(data_dir):
        os.mkdir(data_dir)

    # add a few practical things to the parameters
    parameters['_git_sha'] = tag
    parameters['_date'] = date
    parameters['_base_dir'] = base_dir
    parameters['_results_dir'] = data_dir
    parameters['_parallel'] = not serial_flag

    # Save the arguments in a json file
    param_file_name = os.path.join(data_dir, param_file)
    with open(param_file_name, "w") as f:
        json.dump(parameters, f, indent=2)
        f.close()

    # run the user provided init method
    if func_init is not None:
        func_init(parameters)

    # generate all the arguments to simulate
    arguments = func_gen_args(parameters)

    # Save the arguments in a json file
    args_file_name = os.path.join(data_dir, args_file)
    with open(args_file_name, "w") as f:
        json.dump(arguments, f, indent=0)
        f.close()

    # There is the option to only run one loop for test
    if test_flag:
        print('Running one test loop only.')
        arguments = arguments[:2]

    # Prepare a few things for the status line
    n_tasks = len(arguments)
    digits = int(math.log10(n_tasks) + 1)
    dformat = '{:' + str(digits) + 'd}'
    status_line = ('   ' + dformat + '/' + dformat +
                   (' tasks done. '
                    'Forecast end {:>20s}. '
                    'Elapsed: {:>8s} Remaining: {:>8s}'))

    print('/!\\ the time estimate will only be correct '
          'when all tasks take about the same time to finish /!\\')

    forecast = 'NA'
    time_remaining = 'NA'

    # Main processing loop
    if serial_flag:
        # add parameters to builtins so that it is accessible in the namespace
        # of the calling script
        import builtins
        builtins.parameters = parameters

        print('Running everything in a serial loop.')

        # record start timestamp
        then = time.time()
        start_time = datetime.datetime.now()

        # Serial processing
        for i, ag in enumerate(arguments):
            result = func_parallel_loop(ag)

            # save the new result!
            json_append(data_file_name, result)

            # Now format some timing estimation
            n_remaining = n_tasks - (i + 1)

            ellapsed = int(time.time() - then)
            ellapsed_fmt = '{:02}:{:02}:{:02}'.format(ellapsed // 3600,
                                                      ellapsed % 3600 // 60,
                                                      ellapsed % 60)

            # estimate remaining time
            if ellapsed > 0:
                rate = (i + 1) / ellapsed  # tasks per second
                delta_finish_min = int(rate * n_remaining / 60) + 1

                tdelta = datetime.timedelta(minutes=delta_finish_min)
                end_date = datetime.datetime.now() + tdelta

                # convert to strings
                forecast = end_date.strftime('%Y-%m-%d %H:%M:%S')
                s = int(tdelta.total_seconds())
                time_remaining = '{:02}:{:02}:{:02}'.format(
                    s // 3600, s % 3600 // 60, s % 60)

            formatted_status_line = status_line.format(i + 1, n_tasks,
                                                       forecast, ellapsed_fmt,
                                                       time_remaining)
            print(formatted_status_line, end='\r')

        # clean the output
        print(' ' * len(formatted_status_line))

        all_loops = int(time.time() - then)
        all_loops_format = '{:02}:{:02}:{:02}'.format(all_loops // 3600,
                                                      all_loops % 3600 // 60,
                                                      all_loops % 60)

        print('Total actual processing time: {} ({} s)'.format(
            all_loops_format, all_loops))

    else:
        # Parallel processing code
        import ipyparallel as ip

        print('Using ipyparallel processing.')

        # Start the parallel processing
        c = ip.Client(profile=ipcluster_profile)
        NC = len(c.ids)
        print(NC, 'workers on the job')

        # Clear the engines namespace
        c.clear(block=True)

        # Push the global config to the workers
        var_space = dict(parameters=parameters, )
        c[:].push(var_space, block=True)

        # record start timestamp
        then = time.time()
        start_time = datetime.datetime.now()

        # use a load balanced view
        lbv = c.load_balanced_view()

        # dispatch to workers
        ar = lbv.map_async(func_parallel_loop, arguments)

        # We use a try here so that if something happens,
        # we can catch it and abort the jobs on all engines
        try:
            for i, result in enumerate(ar):

                # save the new result!
                json_append(data_file_name, result)

                # Now format some timing estimation
                n_remaining = n_tasks - ar.progress

                ellapsed = int(time.time() - then)
                ellapsed_fmt = '{:02}:{:02}:{:02}'.format(
                    ellapsed // 3600, ellapsed % 3600 // 60,
                    round(ellapsed % 60))

                if ar.progress > NC and n_remaining > NC:

                    # estimate remaining time
                    rate = ellapsed / ar.progress  # tasks per second
                    delta_finish_min = int(rate * n_remaining / 60) + 1

                    tdelta = datetime.timedelta(minutes=delta_finish_min)
                    end_date = datetime.datetime.now() + tdelta

                    # convert to strings
                    forecast = end_date.strftime('%Y-%m-%d %H:%M:%S')
                    s = int(tdelta.total_seconds())
                    time_remaining = '{:02}:{:02}:{:02}'.format(
                        s // 3600, s % 3600 // 60, s % 60)

                formatted_status_line = status_line.format(
                    ar.progress, n_tasks, forecast, ellapsed_fmt,
                    time_remaining)
                print(formatted_status_line, end='\r')

            # clean the output
            print(' ' * len(formatted_status_line))

            print('Show all output from nodes, if any:')
            ar.display_outputs()

        except:
            # so here, things went south. Show the traceback
            # and abort all the jobs scheduled

            import traceback
            traceback.print_exc()

            print('Aborting all remaining jobs...')
            c.abort(block=True)

        all_loops = int(time.time() - then)
        all_loops_format = '{:02}:{:02}:{:02}'.format(all_loops // 3600,
                                                      all_loops % 3600 // 60,
                                                      all_loops % 60)

        print('Total actual processing time: {} ({} s)'.format(
            all_loops_format, all_loops))

    print('Saved data to folder: ' + data_dir)
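
A minimal sketch of a script driving `run` (the two callbacks are illustrative placeholders, and 'n_repeat' is an assumed key of the JSON parameters file passed on the command line, e.g. `python my_sim.py params.json`):

def generate_arguments(parameters):
    # one task per seed
    return [{'seed': i} for i in range(parameters['n_repeat'])]

def one_loop(arg):
    # runs once per generated argument, serially or on an ipyparallel engine
    import random
    random.seed(arg['seed'])
    return {'seed': arg['seed'], 'value': random.random()}

if __name__ == '__main__':
    run(one_loop, generate_arguments, description='Toy simulation')
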
Example #12
import ipyparallel

ipp_client = ipyparallel.Client(
    url_file="/groups/turaga/home/grisaitisw/.ipython/profile_greentea/security/ipcontroller-client.json",
    timeout=60 * 60  # 1 hour
)
executor = ipp_client.load_balanced_view()
executor.set_flags(retries=100000)
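
A short usage sketch for the `executor` above (the task is a placeholder): with `retries` set, tasks that fail on one engine are resubmitted automatically.

def simulate(seed):
    # placeholder task; anything picklable can be returned
    return seed * seed

amr = executor.map_async(simulate, range(100))
amr.wait_interactive()
squares = amr.get()
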
Example #13
def main():
    """ main function """
    ## turn off traceback for the CLI
    ip.__interactive__ = 0

    ## Check for a new version on anaconda
    _check_version()

    ## parse params file input (returns to stdout if --help or --version)
    args = parse_command_line()

    ## Turn the debug output written to ipyrad_log.txt up to 11!
    ## Clean up the old one first, it's cleaner to do this here than
    ## at the end (exceptions, etc)
    if os.path.exists(ip.__debugflag__):
        os.remove(ip.__debugflag__)

    if args.debug:
        print("\n  ** Enabling debug mode ** ")
        ip._debug_on()
        atexit.register(ip._debug_off)        

    ## create new paramsfile if -n
    if args.new:
        ## Create a tmp assembly, call write_params to make default params.txt
        try:
            tmpassembly = ip.Assembly(args.new, quiet=True, cli=True)
            tmpassembly.write_params("params-{}.txt".format(args.new), 
                                     force=args.force)
        except Exception as inst:
            print(inst)
            sys.exit(2)

        print("\n  New file 'params-{}.txt' created in {}\n".\
               format(args.new, os.path.realpath(os.path.curdir)))
        sys.exit(2)


    ## if params then must provide action argument with it
    if args.params:
        if not any([args.branch, args.results, args.steps]):
            print("""
    Must provide action argument along with -p argument for params file. 
    e.g., ipyrad -p params-test.txt -r              ## shows results
    e.g., ipyrad -p params-test.txt -s 12           ## runs steps 1 & 2
    e.g., ipyrad -p params-test.txt -b newbranch    ## branch this assembly
    """)
            sys.exit(2)

    if not args.params:
        if any([args.branch, args.results, args.steps]):
            print("""
    Must provide params file for branching, doing steps, or getting results.
    e.g., ipyrad -p params-test.txt -r              ## shows results
    e.g., ipyrad -p params-test.txt -s 12           ## runs steps 1 & 2
    e.g., ipyrad -p params-test.txt -b newbranch    ## branch this assembly
    """)

    ## if branching, or merging do not allow steps in same command
    ## print spacer
    if any([args.branch, args.merge]):        
        args.steps = ""    
        print("")    

    ## always print the header when doing steps
    header = \
    "\n -------------------------------------------------------------"+\
    "\n  ipyrad [v.{}]".format(ip.__version__)+\
    "\n  Interactive assembly and analysis of RAD-seq data"+\
    "\n -------------------------------------------------------------"

    ## Log the current version. End run around the LOGGER
    ## so it'll always print regardless of log level.
    with open(ip.__debugfile__, 'a') as logfile:
        logfile.write(header)
        logfile.write("\n  Begin run: {}".format(time.strftime("%Y-%m-%d %H:%M")))
        logfile.write("\n  Using args {}".format(vars(args)))
        logfile.write("\n  Platform info: {}".format(os.uname()))

    ## if merging just do the merge and exit
    if args.merge:
        print(header)
        merge_assemblies(args)
        sys.exit(1)

    ## create new Assembly or load existing Assembly, quit if args.results
    elif args.params:
        parsedict = parse_params(args)

        if args.branch:
            branch_assembly(args, parsedict)

        elif args.steps:
            ## print header
            print(header)

            ## Only blank the log file if we're actually going to run a new
            ## assembly. This used to be in __init__, but had the side effect
            ## of occasionally blanking the log file in an undesirable fashion
            ## for instance if you run a long assembly and it crashes and
            ## then you run `-r` and it blanks the log, it's crazymaking.
            if os.path.exists(ip.__debugfile__):
                if os.path.getsize(ip.__debugfile__) > 50000000:
                    with open(ip.__debugfile__, 'w') as clear:
                        clear.write("file reset")

            ## run Assembly steps
            ## launch or load assembly with custom profile/pid
            data = getassembly(args, parsedict)

            ## set CLI ipcluster terms
            data._ipcluster["threads"] = args.threads

            ## if ipyclient is running (and matched profile) then use that one
            if args.ipcluster:
                ipyclient = ipp.Client(profile=args.ipcluster)
                data._ipcluster["cores"] = len(ipyclient)

            ## if not then we need to register and launch an ipcluster instance
            else:
                ## set CLI ipcluster terms
                ipyclient = None
                data._ipcluster["cores"] = args.cores if args.cores else detect_cpus()
                data._ipcluster["engines"] = "Local"
                if args.MPI:
                    data._ipcluster["engines"] = "MPI"
                    if not args.cores:
                        raise IPyradWarningExit("must provide -c argument with --MPI")
                ## register to have a cluster-id with "ip- name"
                data = register_ipcluster(data)

            ## set to print headers
            data._headers = 1

            ## run assembly steps
            steps = list(args.steps)
            data.run(
                steps=steps, 
                force=args.force, 
                preview=args.preview, 
                show_cluster=1, 
                ipyclient=ipyclient)
                     
        if args.results:
            showstats(parsedict)
Example #14
    def __init__(self, profile=None, cluster_id=None):
        self.client = ipp.Client(profile=profile, cluster_id=cluster_id)
        self.statusDict = {}
        self.sleepSeconds = SLEEP_SECONDS
        self.keyField = 'key'
Example #15
def calculate_background(
    filename,
    output_dir,
    *,
    check_validity_channel=False,
    th_factor=3.0,
    above_threshold_pixel_ratio_max=0.05,
    below_threshold_pixel_ratio_max=0.05,
    valid_ratio_threshold=0.4,
    intensity_bin_size=25,
    thumbnail_size=20,
    quantile=0.001,
    ipcluster_nproc=1,
):
    params_dict = locals()
    cli = ipp.Client(profile="default")
    dview = cli[:]
    dview.clear()
    bview = cli.load_balanced_view()
    dview.execute("""
    import javabridge
    import bioformats as bf
    import pycziutils
    javabridge.start_vm(class_path=bf.JARS)
    """)

    os.makedirs(output_dir, exist_ok=True)
    log_dir = path.join(output_dir, "calcluate_background_log")
    os.makedirs(log_dir, exist_ok=True)

    def savefig(fig, name):
        fig.savefig(path.join(log_dir, name), bbox_inches="tight")

    ############## Load files ################
    meta = pycziutils.get_tiled_omexml_metadata(filename)
    with open(path.join(output_dir, "metadata.xml"), "w") as f:
        f.write(meta)

    reader = pycziutils.get_tiled_reader(filename)
    sizeS, sizeT, sizeC, sizeX, sizeY, sizeZ = pycziutils.summarize_image_size(
        reader)

    pixel_sizes = pycziutils.parse_pixel_size(meta)
    assert pixel_sizes[1] == "µm"
    channels = pycziutils.parse_channels(meta)
    channel_names = [c["@Fluor"] for c in channels]
    print(channel_names)
    params_dict.update({
        "channel_names": channel_names,
    })
    if check_validity_channel:
        check_validity_channel_index = [
            j for j, c in enumerate(channels)
            if check_validity_channel in c["@Fluor"]
        ][0]

    planes_df = pycziutils.parse_planes(meta)
    null_indices = planes_df.isnull().any(axis=1)
    params_dict["null_indices"] = list(planes_df[null_indices].index)
    planes_df = planes_df.loc[~null_indices, :]
    planes_df["S_index"] = planes_df["image"]

    if check_validity_channel:
        ############## Summarize image intensities ################
        send_variable(dview, "filename", path.abspath(filename))
        send_variable(dview, "read_image", read_image)
        send_variable(dview, "summarize_image", summarize_image)
        dview.execute("_reader = pycziutils.get_tiled_reader(filename)")
        check_ipcluster_variable_defined(dview, "_reader", timeout=120)
        sleep(10)
        check_ipcluster_variable_defined(dview, "read_image", timeout=120)
        check_ipcluster_variable_defined(dview, "summarize_image", timeout=120)

        @ipp.require(summarize_image)
        def _summarize_image(row):
            return summarize_image(row, _reader, thumbnail_size, quantile)  # pylint: disable=undefined-variable

        res = bview.map_async(_summarize_image,
                              [row for _, row in list(planes_df.iterrows())])
        res.wait_interactive()
        keys = ["thumbnail", "max", "min", "mean", "median", "stdev"]
        for i, k in enumerate(keys):
            planes_df[k] = [r[i] for r in res.get()]
        display(planes_df)

        ############## Calculate most frequent "standard" mean and stdev for a image ##############
        mean_mode = {}
        stdev_mode = {}
        for iC, grp in planes_df.groupby("C_index"):
            fig, ax = plt.subplots(1, 1, figsize=(10, 10))
            c_name = channel_names[iC]
            h, *edges, im = ax.hist2d(grp["mean"],
                                      grp["stdev"],
                                      bins=intensity_bin_size)
            mean_mode[iC], stdev_mode[iC] = [
                float((edge[x[0]] + edge[x[0] + 1]) / 2.0)
                for edge, x in zip(edges, np.where(h == np.max(h)))
            ]
            ax.plot(mean_mode[iC], stdev_mode[iC], "ro")
            ax.set_xlabel("mean intensity")
            ax.set_ylabel("stdev intensity")
            ax.set_title(c_name)
            savefig(fig, f"1_mean_and_stdev_instensities_{iC}_{c_name}.pdf")

        m, s = (
            mean_mode[check_validity_channel_index],
            stdev_mode[check_validity_channel_index],
        )
        th_low = m - th_factor * s
        th_high = m + th_factor * s
        params_dict.update({
            "mean_mode": mean_mode,
            "stdev_mode": stdev_mode,
            "ph_th_low": float(th_low),
            "ph_th_high": float(th_high),
        })

        ph_planes_df = planes_df[planes_df["C_index"] ==
                                 check_validity_channel_index].copy()

        thumbail_output_name = "2_thresholded_thumbnail"
        thumbail_output_path = path.join(log_dir, thumbail_output_name)
        os.makedirs(thumbail_output_path, exist_ok=True)
        for iS, grp in ph_planes_df.groupby("S_index"):
            fig, axes = plt.subplots(1, 2, figsize=(10, 5))
            img_mean = grp["thumbnail"].iloc[0]
            axes[0].imshow(img_mean, vmin=th_low, vmax=th_high)
            axes[1].hist(img_mean.flatten(), bins=20, range=(0, 8000))
            axes[1].set_xlabel("intensity")
            axes[1].set_ylabel("freq")
            fig.suptitle("series " + str(iS) + " below th count: " +
                         str(np.sum(img_mean < m - th_factor * s)) +
                         " above th count: " +
                         str(np.sum(img_mean > m + th_factor * s)))
            savefig(
                fig,
                path.join(thumbail_output_name,
                          f"2_thresholded_thumbnails_{iS}.pdf"),
            )
            plt.close("all")

        sigma = 20 / float(pixel_sizes[0])
        params_dict.update({"sigma": sigma})
        send_variable(dview, "threshold_image", threshold_image)
        res = bview.map_async(
            lambda row: threshold_image(row, _reader, sigma, th_low, th_high),  # pylint: disable=undefined-variable
            [row for _, row in list(ph_planes_df.iterrows())],
        )
        res.wait_interactive()
        print("ok")
        ph_planes_df["below_th_count"] = [r[0] for r in res.get()]
        ph_planes_df["above_th_count"] = [r[1] for r in res.get()]
        ph_planes_df[
            "below_th_ratio"] = ph_planes_df["below_th_count"] / sizeX / sizeY
        ph_planes_df[
            "above_th_ratio"] = ph_planes_df["above_th_count"] / sizeX / sizeY
        print("ok")
        ph_planes_df.drop("thumbnail",
                          axis=1).to_csv(path.join(log_dir,
                                                   "ph_planes_df.csv"))

        ############## judge if the position is valid to calculate background ##############
        fig, ax = plt.subplots(1, 1, figsize=(5, 5))
        ph_planes_df["is_valid"] = (
            ph_planes_df["below_th_ratio"] < below_threshold_pixel_ratio_max
        ) & (ph_planes_df["above_th_ratio"] < above_threshold_pixel_ratio_max)
        ax.scatter(
            ph_planes_df["below_th_ratio"],
            ph_planes_df["above_th_ratio"],
            c=ph_planes_df["is_valid"],
            s=1,
            marker="o",
            cmap=plt.get_cmap("viridis"),
            alpha=0.3,
        )
        ax.set_xlabel("below threshold ratios")
        ax.set_ylabel("above threshold ratios")
        fig.tight_layout()
        savefig(fig, f"4_threshold_results.pdf")

        series_df = pd.DataFrame()
        for Si, grp in ph_planes_df.groupby("S_index"):
            X = grp["X"].iloc[0]
            assert np.all(X == grp["X"])
            Y = grp["Y"].iloc[0]
            assert np.all(Y == grp["Y"])
            series_df = series_df.append(
                pd.DataFrame(
                    {
                        "thumbnail": [np.mean(grp["thumbnail"], axis=0)],
                        "is_valid_ratio": grp["is_valid"].sum() / len(grp),
                        "X": X,
                        "Y": Y,
                    },
                    index=[Si],
                ))

        fig, axes = plt.subplots(1, 2, figsize=(10, 5))
        im = axes[0].scatter(series_df["X"],
                             series_df["Y"],
                             c=series_df["is_valid_ratio"])
        axes[0].set_title("valid_ratio")
        fig.colorbar(im, ax=axes[0])
        axes[1].scatter(
            series_df["X"],
            series_df["Y"],
            c=series_df["is_valid_ratio"] > valid_ratio_threshold,
        )
        axes[1].set_title("thresholded")
        fig.tight_layout()
        savefig(fig, f"5_valid_positions.pdf")

        series_df[
            "is_valid"] = series_df["is_valid_ratio"] > valid_ratio_threshold
        series_df.drop("thumbnail",
                       axis=1).to_csv(path.join(log_dir, "series_df.csv"))
        valid_series = series_df[series_df["is_valid"]].index
        planes_df["is_valid"] = planes_df["S_index"].isin(valid_series)
    else:
        planes_df["is_valid"] = True

    valid_planes_df = planes_df[planes_df["is_valid"]]
    print("valid_positions:", len(valid_planes_df))

    planes_df.drop("thumbnail", axis=1, errors="ignore").to_csv(
        path.join(output_dir, "planes_df.csv"))

    ############## calclulate backgrounds ##############
    # t.c.z.y.x
    median_images = np.empty((sizeT, sizeC, sizeZ, sizeY, sizeX))
    mean_images = np.empty((sizeT, sizeC, sizeZ, sizeY, sizeX))
    median_images[...] = np.nan
    mean_images[...] = np.nan
    print(sizeT)
    #    assert np.array_equal(valid_planes_df["T_index"].unique(),np.arange(sizeT))
    #    assert np.array_equal(valid_planes_df["C_index"].unique(),np.arange(sizeC))
    #    assert np.array_equal(valid_planes_df["Z_index"].unique(),np.arange(sizeZ))

    for (iC, iT, iZ), grp in tqdm(
            valid_planes_df.groupby(["C_index", "T_index", "Z_index"])):
        imgs = []
        for i, row in grp.iterrows():
            imgs.append(read_image(row, reader))
        imgs = np.array(imgs)
        lq = np.quantile(imgs, quantile, axis=0)
        hq = np.quantile(imgs, 1.0 - quantile, axis=0)
        mask = np.logical_or(imgs < lq, imgs > hq)
        imgs_trunc = ma.array(imgs, mask=mask)
        median_images[iT, iC, iZ, ...] = np.median(imgs, axis=0)
        mean_images[iT, iC, iZ, ...] = imgs_trunc.mean(axis=0)

    print("saving background...")
    with h5py.File(path.join(output_dir, "background_per_tile.hdf5"),
                   "w") as h5f:
        h5f.create_dataset("median_images", data=median_images)
        h5f.create_dataset("mean_images", data=mean_images)
        #        h5f.attrs["channels"]=channels
        h5f.attrs["dimension_order"] = "tczyx"
    print("saved background")

    ############## check correlation of backgrounds ##############
    for iC, iZ in itertools.product(range(sizeC), range(sizeZ)):
        c_name = channel_names[iC]
        for img_key, img in zip(["median", "mean"],
                                [median_images, mean_images]):
            fig, axes = plt.subplots(1, 6, figsize=(18, 3))
            ps = []
            j = sizeT // 2
            ims = [img[i, iC, iZ] for i in (0, j, -1)]
            ps.append(axes[0].imshow(ims[0]))
            ps.append(axes[1].imshow(ims[1]))
            ps.append(axes[2].imshow(ims[2]))
            ps.append(axes[3].imshow(ims[1] - ims[0]))
            ps.append(axes[4].imshow(ims[1] - ims[2]))
            for p, ax in zip(ps, axes):
                fig.colorbar(p, ax=ax)
            axes[5].plot(ims[0].flatten(), ims[-1].flatten(), ".")
            axes[0].set_title("at time 0")
            axes[1].set_title(f"at time {j}")
            axes[2].set_title(f"at time {iT-1}")
            axes[3].set_title(f"diff at time {j} and 0")
            axes[4].set_title(f"diff at time {j} and {iT-1}")
            fig.tight_layout()
            savefig(
                fig,
                f"6_background_correlation_C{iC}_{c_name}_Z{iZ}_{img_key}.png")
            plt.close("all")

    ############## summarize and save backgrounds ##############
    background_directory = path.join(output_dir, "averaged_background")
    os.makedirs(background_directory, exist_ok=True)
    for iC, iZ in itertools.product(range(sizeC), range(sizeZ)):
        c_name = channel_names[iC]
        for img_key, img in zip(["median", "mean"],
                                [median_images, mean_images]):
            filename = f"{img_key}_C{iC}_{c_name}_Z{iZ}"
            averaged_img = np.nanmean(img[:, iC, iZ], axis=0)

            fig, ax = plt.subplots(1, 1, figsize=(5, 5))
            p = ax.imshow(averaged_img)
            fig.colorbar(p, ax=ax)
            savefig(fig, f"7_time_averaged_background_{filename}.pdf")
            plt.close("all")

            io.imsave(
                path.join(background_directory, filename + ".tiff"),
                averaged_img,
                check_contrast=False,
            )

    params_path = path.join(background_directory,
                            "calculate_background_params.yaml")
    with open(params_path, "w") as f:
        yaml.dump(params_dict, f)

    image_props = {
        "channel_names": channel_names,
        "pixel_sizes": pixel_sizes,
        "sizeS": sizeS,
        "sizeT": sizeT,
        "sizeC": sizeC,
        "sizeZ": sizeZ,
        "sizeY": sizeY,
        "sizeX": sizeX,
    }
    image_props_path = path.join(output_dir, "image_props.yaml")
    with open(image_props_path, "w") as f:
        yaml.dump(image_props, f)
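
`send_variable` and `check_ipcluster_variable_defined` are used throughout the function above but are not shown. Plausible sketches, assuming the first simply pushes a name into every engine's namespace and the second polls until the name is defined there:

import time
import ipyparallel as ipp

def send_variable(view, name, value):
    # push a single name/value pair into the namespace of every engine in the view
    view.push({name: value}, block=True)

def check_ipcluster_variable_defined(view, name, timeout=60):
    # poll the engines until evaluating `name` no longer raises on any of them
    deadline = time.time() + timeout
    while True:
        try:
            view.execute(name, block=True)
            return
        except ipp.error.CompositeError:
            if time.time() > deadline:
                raise TimeoutError("variable %r not defined on all engines" % name)
            time.sleep(1)
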
Example #16
#########################################################################
#           Example:  Initializing IPyParallel
#
#                     This example demonstrates how to access the individual
#                     ipython engines running within the cluster.
#

import ipyparallel
import os
import socket

#Create a client instance, used to connect the controller to the remote engines
rc = ipyparallel.Client(profile='crestone-cpu')
nengines = len(rc)

#create direct views into each engine
all_proc = rc[:]  # all_proc is a list of ipython DirectView objects

#Only the controller prints this
print('\n ', nengines, " Python engines are active.\n")

# Each Python engine calls the gethostname and getpid functions
hostnames = all_proc.apply_sync(socket.gethostname)
pids = all_proc.apply_sync(os.getpid)

for i in range(nengines):
    istr = '{:02d}'.format(i)  # returns a 2-digit string whose value is i
    pstr = str(pids[i])
    hstr = str(hostnames[i])
    msg = 'Engine ' + istr + ':   pid = ' + pstr + ';  hostname =' + hstr
    print(msg)
Example #17
    grid = ns.grid
    partition = ns.partition
    Lx = ns.Lx
    Ly = ns.Ly
    c = ns.c
    tstop = ns.tstop
    if ns.save:
        user_action = wave_saver
    else:
        user_action = None

    num_cells = 1.0 * (grid[0] - 1) * (grid[1] - 1)
    final_test = True

    # create the Client
    rc = ipp.Client(profile=ns.profile)
    num_procs = len(rc.ids)

    if partition is None:
        partition = [1, num_procs]

    assert (partition[0] * partition[1] == num_procs
            ), "can't map partition %s to %i engines" % (partition, num_procs)

    view = rc[:]
    print("Running %s system on %s processes until %f" %
          (grid, partition, tstop))

    # functions defining initial/boundary/source conditions
    def I(x, y):
        from numpy import exp
Example #18
def svd4tet(data, nboots=100, method="all", nquarts=None, force=False):
    """ 
    API wrapper for svd4tet analysis

    data 
        ipyrad Assembly object
    nboots
        number of non-parametric bootstrap replicates to run
    method
        all, random, or equal. Default is all, which samples all possible
        quartets. For very large trees (>50 tips) this may take too long, 
        in which case you should use random or equal. The arg nquarts 
        determines how many quartets will be sampled. In random, nquarts
        are sampled and used. In equal, a starting tree is inferred and
        the random quartets are drawn so that they are spread ~equally
        across splits of the tree. 
    nquarts 
        The number of random quartets sampled in random or equal method. 
        Default is 10000, or all if all < 10000. 
    force
        Overwrite existing
    """

    ## check that method was entered correctly
    assert method in ["all", "random", "equal"], \
        "method type not recognized, must be one of ['all', 'random', 'equal']"

    if method != "all":
        ## require nquarts if method not all
        assert nquarts, "if method != all, must enter a value for nquarts"
        ## don't allow nquarts to be greater than all
        totalquarts = n_choose_k(len(data.samples), 4)
        if nquarts > totalquarts:
            print("  nquarts > total quartets, switching to method='all'")
            method = "all"
        if nquarts < 500:
            print("  few possible quartets, only method='all' available")
            method = "all"

    ## launch ipclient, assumes ipyparallel is running
    ipyclient = ipp.Client(timeout=10)

    ## protects it from KBD
    try:
        run(data, nboots, method, nquarts, force, ipyclient)

    except (KeyboardInterrupt, SystemExit):

        ## protect from KBD while saving
        try:
            ## cancel submitted jobs
            #ipyclient.abort()
            ## kill running jobs
            #ipyclient.close()

            ## remove any abandoned tmp arrays 
            abandon = glob.glob(os.path.join(data.dirs.svd, "*_tmp_*.h5"))
            for afile in abandon:
                os.remove(afile)

        except KeyboardInterrupt:
            pass

    finally:
        ## checkpoint the state and save
        LOGGER.info("\n  saving checkpoints to [Assembly].svd")
        LOGGER.info("  array checkpoint: %s", data.svd.checkpoint_arr)
        LOGGER.info("  boot checkpoint: %s", data.svd.checkpoint_boot)
        data.save()        
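
A minimal sketch of calling `svd4tet` from the API. An ipcluster must already be running, since the function connects with `ipp.Client(timeout=10)`; the file name is a placeholder and `ip.load_json` is assumed to be the usual loader for a saved ipyrad Assembly:

import ipyrad as ip

data = ip.load_json("my_assembly.json")   # assumed loader for a previously saved Assembly
svd4tet(data, nboots=100, method="random", nquarts=20000, force=True)
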
Example #19
    def wait_for_connection(self):
        """ 
        Creates a client to view ipcluster engines for a given profile and 
        returns it with at least one engine spun up and ready to go. If no 
        engines are found after nwait amount of time then an error is raised.
        If engines==MPI it waits a bit longer to find engines. If the number
        of engines is set then it waits even longer to try to find that number
        of engines.
        """
        # save stds for later, hide here to prevent ipp enforced print()
        save_stdout = sys.stdout 
        save_stderr = sys.stderr
        sys.stdout = StringIO()
        sys.stderr = StringIO()

        # wrapped search for ipcluster
        try: 
            args = {
                "profile": self.tool.ipcluster["profile"],
                "timeout": self.tool.ipcluster["timeout"],
                "cluster_id": self.tool.ipcluster["cluster_id"],            
            }
            ipyclient = ipp.Client(**args)

            # restore std printing now that Client print statement has passed
            # sys.stdout = save_stdout
            # sys.stderr = save_stderr

            # allow time to find the connection; count cores to break
            for _ in range(6000):

                # how many cores can we find right now?
                ncores = len(ipyclient)
                self.update_message(
                    "Establishing parallel connection: {} cores"
                    .format(ncores))
                time.sleep(0.01)

                # If the expected number of cores is known, wait for all of them
                if self.tool.ipcluster["cores"]:
                    time.sleep(0.1)
                    if ncores == self.tool.ipcluster["cores"]:
                        break

                # Looking for all available cores, auto stop 
                else:

                    # If MPI and not all found break if no more in 3 secs
                    if self.tool.ipcluster["engines"] == "MPI":
                        # are any cores found yet? do long wait.
                        if ncores:
                            time.sleep(3)
                            if len(ipyclient) == ncores:
                                break

                    # if Local then wait 1 second between checks
                    else:
                        if ncores:
                            time.sleep(1.)
                            if len(ipyclient) == ncores:                            
                                break

        except KeyboardInterrupt as inst:
            raise inst

        except (IOError, OSError, ipp.TimeoutError, ipp.NoEnginesRegistered):
            raise IPyradError(
                "\nipcluster not found, use 'auto=True' or see docs.")

        finally:
            # no matter what we reset the stds
            sys.stdout = save_stdout
            sys.stderr = save_stderr

        # self.update_message(
            # "Parallel connection: {}".format(len(ipyclient)))

        return ipyclient
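A standalone, hedged sketch of the same wait-for-engines idea, assuming an ipcluster is already running; the profile name and expected engine count below are illustrative:

import time
import ipyparallel as ipp

client = ipp.Client(profile="default", timeout=10)
expected = 4                      # hypothetical number of engines started with ipcluster
for _ in range(600):              # poll for up to ~60 seconds
    if len(client) >= expected:
        break
    time.sleep(0.1)
print("engines ready:", len(client))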
Exemple #20
0
            agent.salience = random.random()
        new_model.step()
    return new_model


# Imports assumed by this snippet; BDMActor, NegotiationModel_, Model_Output and
# run_model are defined earlier in the surrounding module.
import pickle
import pandas as pd
import ipyparallel

# Load data
book_data = pd.read_csv("BDM_ColdWar.csv")
book_data.Position = (book_data.Position + 100) / 200

agents = []
for i, row in book_data.iterrows():
    new_agent = BDMActor(row.Country, row.Capability, row.Position, 1)
    new_agent.decision_model.Q = 0.5
    new_agent.decision_model.T = 0.5
    agents.append(new_agent)
model = NegotiationModel_(agents)

clients = ipyparallel.Client()
print(clients.ids)
dview = clients[:]

with dview.sync_imports():
    import copy
    import random

all_models = dview.map_sync(run_model, [model] * 10, [25] * 10)
all_model_out = [Model_Output(m) for m in all_models]
with open("ColdWar_Experiment2_1.pickle", "wb") as f:
    pickle.dump(all_model_out, f)
print("Done!")
Exemple #21
0
def parallel_serv_start():
    # start a local ipcluster in the background and connect a client to it
    import subprocess
    subprocess.Popen(['ipcluster', 'start'])
    import ipyparallel as ipp
    rc = ipp.Client()
    return rc
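The Client call above can race the controller startup; a hedged variant that waits before connecting (the engine count and sleep are illustrative):

def parallel_serv_start_waiting(n_engines=4, wait=15):
    # start a local ipcluster in the background, give the controller time to come up,
    # then connect with a generous timeout
    import subprocess
    import time
    import ipyparallel as ipp
    subprocess.Popen(['ipcluster', 'start', '-n', str(n_engines)])
    time.sleep(wait)
    return ipp.Client(timeout=30)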
'''
TODO!!! Could we use doxygen or similar to convert docstrings to .org and thus avoid the
need to manually synchronise the code and the slides?
'''
'''This simply fills the non-ghosted part of the lattice with squares of consecutive
integers, starting from the rank number, does the ghost exchange, computes gradients (2nd
order central differences) in the non-ghosted area, calculates maxima of local gradients,
and Allreduces the global maximum.

Along the way, it prints some diagnostics about the lattice and eventually of the
gradients.
'''

try:
    import ipyparallel
    c = ipyparallel.Client(profile="mpi")
    directview = c[:]
    directview.block = True
except IOError:
    try:
        del ipyparallel
    except NameError:
        pass
    import mpi4py
    from mpi4py import MPI
except ImportError:
    try:
        del ipyparallel
    except NameError:
        pass
    import mpi4py
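As a minimal illustration of the final step the docstring describes, each rank reduces its local gradient maximum into a global one with an Allreduce (a hedged sketch; the random lattice is a stand-in for the real gradients):

from mpi4py import MPI
import numpy as np

comm = MPI.COMM_WORLD
local_grad = np.random.random((8, 8))                    # stand-in for locally computed gradients
global_max = comm.allreduce(local_grad.max(), op=MPI.MAX)
if comm.rank == 0:
    print("global maximum gradient:", global_max)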
Exemple #23
0
# ======== Header for ipyparallel kernels  ========
import os, sys, types
import ipyparallel as ipp

# -------- Parallel kernels --------
print("Initializing cluster ...")

# variables
global kernels, cluster, nKernels
kernels = ipp.Client()
print("   Client variable \'kernels\'")
cluster = kernels[:]
print("   Cluster Direct View variable \'cluster\'")
nKernels = len(kernels.ids)
print("   Variable \'nKernels\' =", nKernels)


# change cluster current working directory
def f(cwd):
    os.chdir(cwd)
    print(os.getcwd())
    return


cwd = os.getcwd()
cwdList = [cwd] * nKernels
with cluster.sync_imports():
    import os
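The snippet presumably goes on to push the working directory to every engine; a hedged sketch of that step, reusing the names defined above:

# run f on each engine so every kernel switches to this session's working directory
cluster.map_sync(f, cwdList)
print("engines now share cwd:", cwd)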
Exemple #24
0
    dataHDDM["stim"] = dataHDDM.apply(lambda row: 1 if row['stim'] == 'Right' else 0, axis=1)
    dataHDDM["response"] = dataHDDM.apply(lambda row: 1 if row['givenResp'] == 'Right' else 0, axis=1)

    def v_link_func(x, data=dataHDDM):
        stim = (np.asarray(dmatrix('0 + C(s, [[1], [-1]])',
                               {'s': data.stim.ix[x.index]})))
        return x*stim
    if id < 4:
        ############## M1
        LM = [{'model': 't ~ SAT + FC + contrast + SAT:FC + SAT:contrast + FC:contrast + SAT:FC:contrast', 'link_func': lambda x: x},
              {'model': 'v ~ contrast', 'link_func': v_link_func},
              {'model': 'a ~ FC + SAT + SAT:FC', 'link_func': lambda x: x}]
        deps = {'sz': 'SAT'}
        inc = ['sv', 'sz', 'st', 'z']
        model_name = "Joint_t0"
    else:
        return np.nan
    name = 'light_reg_PMT_%s' %str(id)
    m = hddm.HDDMRegressor(dataHDDM, LM , depends_on = deps,
            include=inc, group_only_nodes=['sv', 'sz','st', "sz_SAT"], group_only_regressors=False, keep_regressor_trace=True)
    m.find_starting_values()
    m.sample(iter=10000, burn=8500, thin=1, dbname='DDM/traces/db_%s'%name, db='pickle')
    m.save('DDM/Fits/%s'%name)
    return m
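
# wait_watching_stdout is not defined in this snippet; a hedged stand-in that polls the
# AsyncMapResult and echoes captured engine stdout until the jobs finish might look like:
def wait_watching_stdout(ar, dt=30):
    import sys
    import time
    while not ar.ready():
        reports = [s for s in ar.stdout if s]    # per-task stdout captured so far
        if reports:
            sys.stdout.write('\n'.join(r.strip() for r in reports) + '\n')
            sys.stdout.flush()
        time.sleep(dt)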

v = ipyparallel.Client(profile="reg_PMT")[:]#sept
jobs = v.map(run_model, range(4 * 1))#4 chains for each model
wait_watching_stdout(jobs)
models = jobs.get()

    iter_list = create_list()
    scores = list(dview.map(parallel_method, iter_list).get())
    #score_parameter_pairs = zip(scores,iter_list)

    #print(iter_list)


from neuronunit import tests
#from deap import hypervolume

#test_0_run_exhaust()

os.system('ipcluster start -n 8 --profile=default & sleep 5;')
import ipyparallel as ipp

rc = ipp.Client(profile='default')
rc[:].use_cloudpickle()
dview = rc[:]


class ReducedModelTestCase(unittest.TestCase):
    """Test instantiation of the reduced model"""
    """Testing model optimization"""
    def setUp(self):
        #import sys
        #sys.path.append('../')
        #import neuronunit

        from neuronunit.models.reduced import ReducedModel
        #self.ReducedModel = ReducedModel
        #path = ReducedModelTestCase().path
Exemple #26
0
def run_cnmfe(tiff_files, param_file, output_file):
    """ Run the CNMFe algorithm through CaImAn.

    :param tiff_files: A list of .tiff files corresponding to a calcium imaging movie.
    :param param_file: A .yaml parameter file, containing values for the following parameters:
        num_processes : int
            The number of processes to run in parallel. The more parallel processes, the more memory that is used.
        rf : array-like
            An array [half-width, half-height] that specifies the size of a patch.
        stride : int
            The amount of overlap in pixels between patches.
        K : int
            The maximum number of cells per patch.
        gSiz : int
            The expected diameter of a neuron in pixels.
        gSig : int
            The standard deviation of a high-pass Gaussian filter applied to the movie prior to seed pixel search, roughly
            equal to the half-size of the neuron in pixels.
        min_pnr : float
            The minimum peak-to-noise ratio that is taken into account when searching for seed pixels.
        min_corr : float
            The minimum pixel correlation that is taken into account when searching for seed pixels.
        min_SNR : float
            Cells with a signal-to-noise ratio (SNR) less than this are rejected.
        rval_thr : float
            Cells with a spatial correlation of greater than this are accepted.
        decay_time : float
            The expected decay time of a calcium event in seconds.
        ssub_B : int
            The spatial downsampling factor used on the background term.
        merge_threshold : float
            Cells that are spatially close with a temporal correlation of greater than merge_threshold are automatically merged.
    :param output_file: The path to a .hdf5 file that will be written to contain the traces, footprints, and deconvolved
        events identified by CNMFe.
    """

    for tiff_file in tiff_files:
        if not os.path.exists(tiff_file):
            raise FileNotFoundError(tiff_file)

    if not os.path.exists(param_file):
        raise FileNotFoundError(param_file)

    with open(param_file, 'r') as f:
        params = yaml.safe_load(f)

    expected_params = [
        'gSiz', 'gSig', 'K', 'min_corr', 'min_pnr', 'rf', 'stride',
        'decay_time', 'min_SNR', 'rval_thr', 'merge_threshold', 'ssub_B',
        'frame_rate', 'num_rows', 'num_cols', 'num_frames', 'num_processes'
    ]

    for pname in expected_params:
        if pname not in params:
            raise ValueError('Missing parameter {} in file {}'.format(
                pname, param_file))

    gSiz = params['gSiz']
    gSig = params['gSig']
    K = params['K']
    min_corr = params['min_corr']
    min_pnr = params['min_pnr']
    rf = params['rf']
    stride = params['stride']
    decay_time = params['decay_time']
    min_SNR = params['min_SNR']
    rval_thr = params['rval_thr']
    merge_threshold = params['merge_threshold']
    ssub_B = params['ssub_B']
    frame_rate = params['frame_rate']
    num_rows = params['num_rows']
    num_cols = params['num_cols']
    num_frames = params['num_frames']
    num_processes = params['num_processes']

    # write memmapped file
    print('Exporting .isxd to memmap file...')
    mmap_file = _export_movie_to_memmap(tiff_files,
                                        num_frames,
                                        num_rows,
                                        num_cols,
                                        overwrite=False)
    print('Wrote .mmap file to: {}'.format(mmap_file))

    # open memmapped file
    Yr, dims, T = load_memmap(mmap_file)
    Y = Yr.T.reshape((T, ) + dims, order='F')

    # grab parallel IPython handle
    dview = None
    if num_processes > 1:
        import ipyparallel as ipp
        c = ipp.Client()
        dview = c[:]
        print('Running using parallel IPython, # clusters = {}'.format(
            len(c.ids)))
        num_processes = len(c.ids)

    # initialize CNMFE parameter object and set user params
    cnmfe_params = CNMFParams()

    if gSiz is None:
        raise ValueError(
            'You must set gSiz to an integer, ideally roughly equal to the expected half-cell width.'
        )
    gSiz = _turn_into_array(gSiz)

    if gSig is None:
        raise ValueError(
            'You must set gSig to a non-zero integer. The default value is 5.')
    gSig = _turn_into_array(gSig)

    cnmfe_params.set('preprocess', {'p': 1})

    cnmfe_params.set(
        'init', {
            'K': K,
            'min_corr': min_corr,
            'min_pnr': min_pnr,
            'gSiz': gSiz,
            'gSig': gSig
        })

    if rf is None:
        cnmfe_params.set('patch', {'rf': None, 'stride': 1})
    else:
        cnmfe_params.set('patch', {'rf': np.array(rf), 'stride': stride})

    cnmfe_params.set('data', {'decay_time': decay_time})

    cnmfe_params.set('quality', {'min_SNR': min_SNR, 'rval_thr': rval_thr})

    cnmfe_params.set('merging', {'merge_thr': merge_threshold})

    # set parameters that force CNMF into one-photon mode with no temporal or spatial downsampling,
    # except for the background term
    cnmfe_params.set(
        'init', {
            'center_psf': True,
            'method_init': 'corr_pnr',
            'normalize_init': False,
            'nb': -1,
            'ssub_B': ssub_B,
            'tsub': 1,
            'ssub': 1
        })
    cnmfe_params.set(
        'patch', {
            'only_init': True,
            'low_rank_background': None,
            'nb_patch': -1,
            'p_tsub': 1,
            'p_ssub': 1
        })
    cnmfe_params.set('spatial', {
        'nb': -1,
        'update_background_components': False
    })
    cnmfe_params.set('temporal', {'nb': -1, 'p': 1})

    # construct and run CNMFE
    print('Running CNMFe...')
    cnmfe = CNMF(num_processes, dview=dview, params=cnmfe_params)
    cnmfe.fit(Y)

    # run auto accept/reject
    print('Estimating component quality...')
    idx_components, idx_components_bad, comp_SNR, r_values, pred_CNN = estimate_components_quality_auto(
        Y,
        cnmfe.estimates.A,
        cnmfe.estimates.C,
        cnmfe.estimates.b,
        cnmfe.estimates.f,
        cnmfe.estimates.YrA,
        frame_rate,
        cnmfe_params.get('data', 'decay_time'),
        cnmfe_params.get('init', 'gSiz'),
        cnmfe.dims,
        dview=None,
        min_SNR=cnmfe_params.get('quality', 'min_SNR'),
        use_cnn=False)

    save_cnmfe(cnmfe, output_file, good_idx=idx_components)
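A hedged usage sketch for the function above; the paths and parameter values are placeholders rather than recommendations:

import yaml

params = {                                   # hypothetical parameter file contents
    'num_processes': 4, 'rf': [40, 40], 'stride': 20, 'K': 10,
    'gSiz': 13, 'gSig': 5, 'min_pnr': 10.0, 'min_corr': 0.8,
    'min_SNR': 3.0, 'rval_thr': 0.85, 'decay_time': 0.4, 'ssub_B': 2,
    'merge_threshold': 0.8, 'frame_rate': 20.0,
    'num_rows': 512, 'num_cols': 512, 'num_frames': 1000,
}
with open('cnmfe_params.yaml', 'w') as fh:
    yaml.dump(params, fh)

run_cnmfe(['movie_0.tiff'], 'cnmfe_params.yaml', 'cnmfe_output.hdf5')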
Exemple #27
0
    def optimize(
        self,
        method,
        quantiles=(0.1, 0.3, 0.5, 0.7, 0.9),
        n_runs=3,
        n_bootstraps=0,
        parallel_profile=None,
    ):
        """
        Optimize model using ML, chi^2 or G^2.

        :Input:
            method : str
                Optimization method ('ML', 'chisquare' or 'gsquare').

            quantiles : tuple
                A sequence of quantiles to be used for chi^2 and G^2.
                Default values are the ones used by Ratcliff (.1, .3, .5, .7, .9).

            n_runs : int <default=3>
                Number of attempts to optimize.

            n_bootstraps : int <default=0>
                Number of bootstrap iterations.

            parallel_profile : str <default=None>
                IPython profile for parallelization.

        :Output:
            results <dict> - a results dictionary of the parameters values.

        :Note:
            The values of the nodes in a single-subject model are updated according to the results.
            The nodes of group models are not updated.
        """

        results = self._run_optimization(method=method,
                                         quantiles=quantiles,
                                         n_runs=n_runs)

        # bootstrap if requested
        if n_bootstraps == 0:
            return results

        # init DataFrame to save results
        res = pd.DataFrame(np.zeros((n_bootstraps, len(self.values))),
                           columns=list(self.values.keys()))

        # prepare view for parallelization
        if parallel_profile is not None:  # create view
            client = ipyparallel.Client(profile=parallel_profile)
            view = client.load_balanced_view()
            runs_list = [None] * n_bootstraps
        else:
            view = None

        # define single iteration bootstrap function
        def single_bootstrap(
            data,
            accumulator_class=self.__class__,
            class_kwargs=self._kwargs,
            method=method,
            quantiles=quantiles,
            n_runs=n_runs,
        ):

            # resample data
            new_data = data.iloc[np.random.randint(0, len(data), len(data))]
            new_data = new_data.set_index(pd.Index(list(range(len(data)))))
            h = accumulator_class(new_data, **class_kwargs)

            # run optimization
            h._run_optimization(method=method,
                                quantiles=quantiles,
                                n_runs=n_runs)

            return pd.Series(h.values, dtype=float)

        # bootstrap iterations
        for i_strap in range(n_bootstraps):
            if view is None:
                res.iloc[i_strap] = single_bootstrap(self.data)
            else:
                # append to job queue
                runs_list[i_strap] = view.apply_async(single_bootstrap,
                                                      self.data)

        # get parallel results
        if view is not None:
            view.wait(runs_list)
            for i_strap in range(n_bootstraps):
                res.iloc[i_strap] = runs_list[i_strap].get()

        # get statistics
        stats = res.describe()
        for q in [2.5, 97.5]:
            stats = pd.concat([
                stats,
                pd.DataFrame(res.quantile(q / 100.0), columns=[repr(q) + "%"]).T
            ])

        self.bootstrap_stats = stats.sort_index()
        return results
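A hedged call sketch, assuming model is an already-constructed single-subject model and an ipcluster is running under the named profile:

results = model.optimize('chisquare', n_runs=3, n_bootstraps=200,
                         parallel_profile='default')
print(results)                  # fitted parameter values
print(model.bootstrap_stats)    # bootstrap summary incl. the 2.5% / 97.5% quantiles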
Exemple #28
0
        G['industry_imputed']
    except:
        G.vs['industry_imputed'] = [x == 'nan' for x in G.vs['industry']]

    industry_dist = np.array(
        [x['industry'] for x in G.vs if not x['industry_imputed']])
    imputed_industry = np.random.choice(industry_dist,
                                        len(G.vs(industry_imputed_eq=True)),
                                        replace=True)
    for v, s in zip(G.vs(industry_imputed_eq=True), imputed_industry):
        v['industry'] = s


has_ipyparallel = True
try:
    # The client should be global (or a singleton) to avoid a "too many open files"
    # error, see https://github.com/ipython/ipython/issues/6039
    dv = ipyparallel.Client()[:]
    dv.block = False
    dv.use_dill()
except:
    has_ipyparallel = False
    print("Loading without ipyparallel support")

callbacks = [
    some_terminal_suppliers_reachable,
    percent_terminal_suppliers_reachable,
]


def failure_reachability_single(r,
                                G,
                                med_suppliers=False,
import bubbles

# Dynesty imports
import pickle
import dynesty
from dynesty import plotting as dyplot
from dynesty import DynamicNestedSampler
from dynesty import utils as dyfunc

from multiprocessing import Pool
import ipyparallel as ipp
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

# To use multiprocessing run following command:
# > ipcluster start -n 7 &
rc = ipp.Client()
nprocs = len(rc.ids)
print(rc.ids)
dview = rc[:]
dview.use_dill()
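
# A hedged sketch of handing this view to dynesty as its mapping pool; loglike,
# prior_transform, and ndim are assumed to be defined elsewhere in the script:
#
#   dview.block = True            # so dview.map returns a plain list, as dynesty expects
#   sampler = DynamicNestedSampler(loglike, prior_transform, ndim,
#                                  pool=dview, queue_size=nprocs)
#   sampler.run_nested()
#   results = sampler.results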

# =====================================================================
import argparse  # argument managing

# ==============================================================================
# Managing arguments with argparse (see http://docs.python.org/howto/argparse.html)
parser = argparse.ArgumentParser()
# ---- required arguments ---- :
parser.add_argument("file_name",
                    type=str,
                    help="File name, saved in ../chains/")
Exemple #30
0
    # Main processing loop
    if serial_flag:
        print('Running everything in a serial loop.')
        # Serial processing
        out = []
        for ag in args:
            out.append(parallel_loop(algo_names, parameters, ag))

    else:
        import ipyparallel as ip

        print('Using ipyparallel processing.')

        # Start the parallel processing
        c = ip.Client()
        NC = len(c.ids)
        print(NC, 'workers on the job')

        # replicate some parameters
        algo_names_ls = [algo_names] * len(args)
        params_ls = [parameters] * len(args)

        # evaluate the runtime
        then = time.time()
        out1 = c[:].map_sync(parallel_loop, algo_names_ls[:NC], params_ls[:NC],
                             args[:NC])
        now = time.time()
        one_loop = now - then
        print('Total estimated processing time:', len(args) * one_loop / len(c[:]))