Example #1
import logging
import os
from typing import List, Union

from pathos.multiprocessing import ProcessPool

# set_new_config and _make_pairs are package-internal helpers not shown here.

def pairs_construction(seqs: List[List[Union[str, int]]], window_size: int = 2,
                       drop_duplicates: bool = True,
                       n_jobs: int = 4, **kwargs):
    """
    Helper function to make pairs from sequences in parallel
    Parameters
    ----------
    seqs : input sequences of nodes
    window_size : int, default is 2
    drop_duplicates : bool, default is True
        Delete pairs where both elements are the same
    n_jobs : int, default is 4
        Number of workers to be created in parallel pool

    Returns
    -------
    List of pairs of nodes as <cur_vertex, context_vertex>
    """
    set_new_config(window_size=window_size, **kwargs)
    local_logger = logging.getLogger(__name__)
    max_processes = min(n_jobs, os.cpu_count())  # cap the pool at the available CPUs
    pairs_pool = ProcessPool(nodes=max_processes)
    # terminate + restart gives fresh workers that pick up the updated CONFIG
    pairs_pool.terminate()
    pairs_pool.restart()
    local_logger.info("Started making pairs from the sequences.")
    pairs = pairs_pool.map(_make_pairs, seqs)
    local_logger.info(f"Total number of raw sampled pairs is {len(pairs)}")
    if drop_duplicates:
        pairs = [item for sublist in pairs for item in sublist if item[0] != item[1]]
    else:
        pairs = [item for sublist in pairs for item in sublist]
    pairs = [item for item in pairs if item[0] != -3 and item[1] != -3]  # -3 appears to be a sentinel id
    pairs_pool.terminate()  # reset the pool so a later call starts clean
    pairs_pool.restart()
    return pairs
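
A minimal usage sketch for the function above (the toy sequences are invented, and _make_pairs/set_new_config must exist in the same module for it to run):

if __name__ == "__main__":
    toy_seqs = [["a", "b", "c"], [1, 2, 3, 2]]
    # Each resulting pair is <cur_vertex, context_vertex>; self-pairs are dropped.
    pairs = pairs_construction(toy_seqs, window_size=2, n_jobs=2)
    print(pairs)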
Example #2
    def update_hash_dict(self):
        if self.num_proc is None:
            self.num_proc = cpu_count() - 1

        # check current hash_dict
        current_files = set(self.image_filenames)
        cache_files = set(self.hash_dict.keys())  # set() so the difference ops below also work on Python 2
        lost_set = cache_files - current_files
        target_files = list(current_files - cache_files)

        if len(lost_set) + len(target_files) > 0:
            try:
                verb = "Calculating" if len(self.hash_dict) == 0 else "Updating"
                spinner = Spinner(
                    prefix="{} image hashes (hash-bits={} num-proc={})...".format(
                        verb, self.hash_bits, self.num_proc))
                spinner.start()

                # del lost_set from hash_dict
                for f in lost_set:
                    del self.hash_dict[f]

                if six.PY2:
                    from pathos.multiprocessing import ProcessPool as Pool
                elif six.PY3:
                    from multiprocessing import Pool
                pool = Pool(self.num_proc)
                hashes = pool.map(self.gen_hash, target_files)
                pool.close()  # reap the workers once the map has finished
                pool.join()
                for filename, hash_value in zip(target_files, hashes):
                    self.hash_dict[filename] = hash_value
                spinner.stop()
            except KeyboardInterrupt:
                pool.terminate()
                pool.join()
                spinner.stop()
                sys.exit(1)
            return True
        else:
            return False
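
The incremental update above rests on two set differences between the files currently on disk and the cached keys; a self-contained sketch of just that bookkeeping (file names and hash values invented):

current_files = {"a.jpg", "b.jpg"}                      # files present now
hash_dict = {"a.jpg": 101, "c.jpg": 103}                # previously cached hashes
lost_set = hash_dict.keys() - current_files             # {"c.jpg"}: purge from the cache
target_files = list(current_files - hash_dict.keys())   # ["b.jpg"]: only these get hashed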
Example #3
    def make_hash_list(self):
        if self.num_proc is None:
            self.num_proc = cpu_count() - 1

        try:
            spinner = Spinner(
                prefix="Calculating image hashes (hash-bits={} num-proc={})..."
                .format(self.hash_bits, self.num_proc))
            spinner.start()
            if six.PY2:
                from pathos.multiprocessing import ProcessPool as Pool
            elif six.PY3:
                from multiprocessing import Pool
            pool = Pool(self.num_proc)
            self.cache = pool.map(self.gen_hash, self.image_filenames)
            pool.close()  # reap the workers once the map has finished
            pool.join()
            spinner.stop()
        except KeyboardInterrupt:
            pool.terminate()
            pool.join()
            spinner.stop()
            sys.exit(1)
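
The PY2/PY3 branch exists because pickle in Python 2 cannot serialise bound methods such as self.gen_hash, so the dill-based pathos pool is needed there, while Python 3 can use the stdlib Pool directly. A minimal Python 3 sketch of the same map step (gen_hash here is a deterministic placeholder, not the original image hash):

import hashlib
from multiprocessing import Pool

def gen_hash(path):
    # Placeholder standing in for the instance method used above.
    return hashlib.md5(path.encode()).hexdigest()

if __name__ == "__main__":
    files = ["a.jpg", "b.jpg"]
    with Pool(2) as pool:   # the context manager tears the pool down on exit
        cache = pool.map(gen_hash, files)
    print(cache)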
Example #4
import os

from pathos.multiprocessing import ProcessPool as Pool

# do_reconstruct, do_mask and delete_unmasked are package-internal helpers
# that are not shown in this snippet.

def main(args):
    if len(args.input) < 2:
        print("Please name at least one STAR file and an output directory")
        return 1

    if args.apix is None:
        print("Using pixel size computed from STAR files")

    def do_job(star):
        try:
            mrc = os.path.join(args.output,
                               os.path.basename(star).replace(".star", ".mrc"))
            print("Starting reconstruction of %s" % star)
            do_reconstruct(star, mrc, args.apix, args.sym, args.ctf)
            print("Wrote %s reconstruction to %s" % (star, mrc))
            if args.mask is not None:
                masked_mrc = mrc.replace(".mrc", "_masked.mrc")
                do_mask(mrc, masked_mrc, args.mask)
                print("Wrote masked map %s" % masked_mrc)
            if args.mask is not None and args.delete_unmasked:
                delete_unmasked(mrc, masked_mrc)
                print("Overwrote %s with %s" % (mrc, masked_mrc))
        except Exception as e:
            print("Failed on %s: %s" % (star, e))
        return 0

    pool = Pool(nodes=args.nproc)

    #pool.apipe(do_job, args.input)
    results = pool.imap(do_job, args.input)
    codes = list(results)

    pool.close()  # no more work; let the workers finish
    pool.join()

    return 0
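
A hypothetical invocation of main, building the namespace by hand; the attribute names are inferred from the function body rather than taken from the original argument parser, and do_reconstruct/do_mask/delete_unmasked must be importable:

import argparse

args = argparse.Namespace(input=["a.star", "b.star"], output="maps",
                          apix=None, sym="C1", ctf=False,
                          mask=None, delete_unmasked=False, nproc=2)
main(args)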
Example #5
        catalogs = pool.map(catfunc, np.arange(Niter))
        os.system("rm " + filename)
        print("Generated catalogs")
        #catalogs = model.abundance_match(alpha, scatter, Niter)
        cov_matrix, mean = model.stoch_covmat_mean(catalogs, nthreads=ncore)

        means[i, j, :] = mean
        covmats[i, j, :, :] = cov_matrix

        t = time() - start
        extime.append(t)
        remtime = sum(extime) / len(extime) * (Ntot - k) / 60**2
        print(
            "Done with step {}/{} in time {:.1f}. Estimated remaining time is {:.2f} hours"
            .format(k, Ntot, t, remtime))
        sys.stdout.flush()

        k += 1

pool.close()
pool.join()
res = {'alpha': XX, 'scatter': YY, 'covmat': covmats, 'wp': means}

p.dump_pickle(
    res, "../../Data/NSAmatching/Train_stoch_covmats_{}_.p".format(logSMlim))

print("Finished")
sys.stdout.flush()
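
For context, the arrays this fragment fills would have been preallocated over the (alpha, scatter) grid before the loop; a hedged reconstruction of that setup (grid values and the number of wp bins are invented):

import numpy as np

alphas = np.linspace(0.5, 1.5, 5)    # invented grid values
scatters = np.linspace(0.1, 0.3, 4)
XX, YY = np.meshgrid(alphas, scatters, indexing="ij")
Nbins = 20                           # assumed length of each mean wp vector
means = np.zeros((XX.shape[0], XX.shape[1], Nbins))
covmats = np.zeros((XX.shape[0], XX.shape[1], Nbins, Nbins))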
Example #6
import itertools
import logging
import os
import pickle
from typing import List, Optional, Union

from pathos.multiprocessing import ProcessPool
from tqdm import tqdm

# FSN, CONFIG, set_new_config, strategy_to_class and wrappedWalk are
# package-internal names that are not shown in this snippet.

def graph_sampling(graph: FSN, strategy: Optional[str] = "MetaDiff",
                   n_jobs: Optional[int] = 4,
                   use_cache: Optional[bool] = True, **kwargs) \
        -> List[List[Union[str, int]]]:
    """
    Sampling the sequences of nodes from FSN w.r.t. chosen strategy
    Parameters
    ----------
    graph : FSN object
        Graph to be processed
    strategy : str, default is 'MetaDiff'
        Walking strategy to be used
    n_jobs : int, default is 4
        Number of workers to be created in parallel pool
    use_cache : bool, default is True
        To use the previously cached files

    Returns
    -------
    Sampled sequences of BP nodes
    """
    set_new_config(**kwargs)
    local_logger = logging.getLogger(__name__)
    if use_cache and os.path.isfile(CONFIG.WORK_FOLDER[0] + "sampled_sequences_cached.pkl"):
        local_logger.info("Loading sequences from cache... wait...")
        try:
            with open(CONFIG.WORK_FOLDER[0] + "sampled_sequences_cached.pkl", "rb") as file:
                res = pickle.load(file)
            local_logger.info(f"Total number of raw sampled sequences is {len(res)}")
            local_logger.info(f"Average length of sequences is {sum(map(len, res)) / float(len(res))}")
            return res
        except FileNotFoundError:
            local_logger.info("File not found... Recalculate \n")
            pass
        except Exception as e:
            local_logger.error(f"Unexpected error: {e}")
    local_logger.info("Sampling sequences... wait...")
    max_processes = min(n_jobs, os.cpu_count())  # cap the pool at the available CPUs
    global walk
    if strategy in strategy_to_class:
        walk = strategy_to_class[strategy](G=graph, walk_length=CONFIG.WALKS_LENGTH,
                                           direction=CONFIG.DIRECTION,
                                           pressure=CONFIG.PRESSURE, allow_back=CONFIG.ALLOW_BACK)
    else:
        raise KeyError(
            f"The given strategy {strategy} is unknown. The following ones are implemented: {strategy_to_class.keys()}")
    sampling_pool = ProcessPool(nodes=max_processes)
    local_logger.info("Created a Pool with " + str(max_processes) + " processes ")
    # required to restart pool to update CONFIG inside the parallel part
    sampling_pool.terminate()
    sampling_pool.restart()
    BPs = graph.get_BPs()
    n_BPs = len(BPs)
    sampled = list()
    try:
        with tqdm(total=n_BPs) as pbar:
            for i, res in enumerate(sampling_pool.uimap(wrappedWalk, BPs)):
                sampled.append(res)
                pbar.update()
    except KeyboardInterrupt:
        print('Got ^C while pool mapping, terminating the pool')
        sampling_pool.terminate()
    res = list(itertools.chain(*sampled))
    sampling_pool.terminate()
    sampling_pool.restart()
    local_logger.info("Cashing sampled sequences!")
    if use_cache:
        with open(CONFIG.WORK_FOLDER[0] + "sampled_sequences_cached.pkl", "wb") as file:
            pickle.dump(res, file)
    local_logger.info(f"Total number of raw sampled sequences is {len(res)}")
    local_logger.info(f"Average length of sequences is {sum(map(len, res)) / float(len(res))}")
    return res
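
Examples #1 and #6 come from the same pipeline: graph_sampling produces the node sequences that pairs_construction then turns into <cur_vertex, context_vertex> pairs. A hedged sketch of wiring them together, assuming the package that defines FSN is importable:

# fsn = FSN(...)   # building the graph is outside these snippets
seqs = graph_sampling(fsn, strategy="MetaDiff", n_jobs=4, use_cache=False)
pairs = pairs_construction(seqs, window_size=2, n_jobs=4)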
Example #7
import os
import time

import numpy as np
from pathos.multiprocessing import ProcessPool

# sampling, build_samplers, read_waypoint, initialise_walker and
# ParallelTempering are package-internal names not shown in this snippet.

def build_pt(sampler_class, pe_method, force_method, numdim=5, masses=1.0,
             nT=10, nproc=1, Tmin=1.0, Tmax=100.0, max_iteration=500, iters_to_swap=1,
             iters_to_waypoint=5, iters_to_setdt=10, iters_to_writestate=1, run_token=1,
             dt=1.0e-4, traj_len=100, num_traj=10, absxmax=1.0e2, initial_rand_bounds=1.0e2,
             dt_max=None, min_rate=0.6, max_rate=0.7, gaussianprior_std=None):
    """Builds an instance of ParallelTempering. Reads restart file if it exists, or initialises a 
    fresh run.
    
    Args:
        sampler_class : Sampler class from module sampling. Eg. sampling.Hmc .
        pe_method : A method for evaluating the potential energy.
        force_method : A method for evaluating the forces.
        numdim (int) : The number of dimensions of the configuration space ('parameter space').
            (Default: 5)
        masses (single float or numpy array of floats, with length 1 or length numdim): specifies the
            masses associated with each dimension of the configuration space ('parameter space'). 
            (Default: 1.0)
        nT (int) : Number of temperatures to use. (Default: 10)
        nproc (int) : Number of processors to use. (Default: 1)
        Tmin (float) : Lowest temperature in ladder of temperatures. (Default 1.0)
        Tmax (float) : Maximum temperature in ladder of temperatures. (Default 100.0).
        max_iteration (int) : Max number of iterations to run. (Default 500).
        iters_to_swap (int) : Configuration swaps between neighbouring temperatures are attempted
            every iters_to_swap iterations. (Default 1).
        iters_to_waypoint (int) : Restart information is written after every iters_to_waypoint 
            iterations. (Default 5). 
        iters_to_setdt (int) : The step sizes (or equivalently time steps) are updated after every
            iters_to_setdt iterations. (Default 10).
        iters_to_writestate (int) : The latest potential energy values and coordinates are written
            out after every iters_to_writestate iterations. (Default 1).
        run_token (int) : An integer for labelling the restart and output files for this calculation.
            (Default 1).
        dt (float) : Initial time step (or step size). This will be updated algorithmically, but a 
            good starting point saves time. (Default 1.0e-4).
        traj_len (int) : The number of time steps in a single trajectory. (Default 100).
        num_traj (int) : The number of trajectories run per iteration, per sampler. (Default 10).
        absxmax (single float or numpy array of floats, with length 1 or length numdim) : During the 
            main calculation, the sampler is restricted to a region x in [-absxmax,absxmax]. 
            (Default: 1.0e2).
        initial_rand_bounds : The same as absxmax, but applied only during random initialisation of the
            sampler's coordinate (parameters). This enables initialisation into a particular region,
            which might, for example, be most likely to contain the global minimum. (Default: 1.0e2).
        dt_max (float) : maximum step size (time step). (Default: median(absxmax), which is set in 
            module sampling.)
        min_rate (float) : minimum acceptance rate of trajectories. Used for setting step size (time 
            step). (Default: 0.6. The optimal acceptance rate for HMC on a multivariate Gaussian is 0.65
            http://www.mcmchandbook.net/HandbookChapter5.pdf, section 5.4.4.3).
        max_rate (float) : maximum acceptance rate of trajectories. Used for setting step size (time 
            step). (Default 0.7. The optimal acceptance rate for HMC on a multivariate Gaussian is 0.65
            http://www.mcmchandbook.net/HandbookChapter5.pdf, section 5.4.4.3).
        gaussianprior_std (single float or numpy array of floats, with length 1 or length numdim) : If 
            this is set to a real value then an additional term is applied to (H)MC acceptance/rejection 
            such that the target distribution is proportional to a multivariate Gaussian with this 
            standard deviation for each dimension. (Default: None.)

    Return:
        ParallelTempering class object

    """

    # CHECK FOR RESTART FILE AND DO RESTART IF PRESENT
    restrtfl = "restart_pt_" + str(run_token) + ".txt"
    if os.path.isfile("./" + restrtfl):  # read restart data from restart file
        didrestart = True

        print "Restarting from file ", restrtfl, time.ctime()
        nT, Tmin, Tmax, iteration, num_traj, samplers, walkers = \
            read_waypoint(restrtfl, sampler_class, pe_method, force_method)

    else:
        didrestart = False
        iteration = 0
        # a list of new walkers (which are class objects)
        samplers = build_samplers( sampler_class, pe_method, force_method, nT, Tmin, Tmax, dt, \
            traj_len, absxmax, dt_max, min_rate, max_rate, gaussianprior_std )

        print "Start initialise walkers ", time.ctime()
        walkers = np.asarray([])

        sampling.NewWalker.masses = masses
        sampling.NewWalker.numdim = numdim
        temp_pool = ProcessPool(nodes=nproc)

        # temporarily pass initial_random_bounds through samplers, since pathos multiprocessing is
        # restrictive with arguments
        for sampler in samplers:
            sampler.random_init_bounds = initial_rand_bounds

        outs = sampling.apply_pool(temp_pool, initialise_walker, samplers)
        for i in range(len(outs)):
            walkers = np.append(walkers, outs[i][0])
            samplers[i] = outs[i][1]

        temp_pool.terminate()  # drop the initialisation workers
        temp_pool.restart()    # leave the pool in a reusable state
        print("Done initialise walkers", time.ctime())

    coutfl = "ptconfsout_" + str(run_token) + ".txt"
    ptoutfl = "ptout_" + str(run_token) + ".txt"

    thispt = ParallelTempering(samplers, walkers, num_traj, nT, nproc, Tmin, Tmax, iteration, \
        max_iteration, iters_to_swap, iters_to_waypoint, iters_to_setdt, iters_to_writestate, run_token, coutfl,\
        ptoutfl, restrtfl )

    if (not didrestart):
        thispt.set_dt_all(thispt.pt_pool, step_fac=0.1)

    return thispt
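
A hedged sketch of calling build_pt; the sampler class follows the docstring's example sampling.Hmc, the potential and force callables are toy stand-ins, and the method that starts the run is not shown in this snippet:

# pe = lambda x: 0.5 * np.sum(x**2)    # toy quadratic potential
# force = lambda x: -x                 # corresponding force
# pt = build_pt(sampling.Hmc, pe, force, numdim=5, nT=10, nproc=4,
#               Tmin=1.0, Tmax=100.0, max_iteration=500)
# ...then start the run via whatever driver method ParallelTempering exposes.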
Example #8
    def set(self, walkers, message_prefix, adjust_step_factor=0.9):
        """Updates the stepsize to achieve a trajectory acceptance rate in or as close as possible
        to the range [self.sampler.min_rate, self.sampler.max_rate], with stepsize in the range
        [10^-50, self.sampler.dt_max]. 
        
        Args:
            walkers : This MUST be an array or list of NewWalker class objects. These are NOT updated
                by this method.
            message_prefix (str/None) : if message_prefix is not None, then a message is printed
                describing the change in dt. If message_prefix is None, then no message is printed.
            adjust_step_factor (float) : self.sampler.dt is updated by * or / by this value.

        Return:
            duration (float) : duration of call to set in seconds. Can be useful for checking the 
                fraction of time spent updating step lengths.

        """
        start_time = time.time()
        if (self.nproc > 1):
            set_pool = ProcessPool(nodes=self.nproc)
        else:
            set_pool = None

        steplength_store = self.sampler.dt
        steplength_in = self.sampler.dt
        # protects against possible future bugs that would be hard to detect

        walk_n_walkers = int(self.nproc * np.ceil(float(self.min_num_data_point)/self.nproc))
        # rounds up to next multiple of self.nproc for maximum usage of compute

        walkers_clone = copy.deepcopy(walkers)  # expensive, but prevents this routine overwriting 
                                                # walkers

        first_time = True # we will make at least two tries. Logical flag ensures this.

        # Step size calibration loop:
        while True:

            # collect statistics on trajectory acceptance rate
            run_outputs = apply_pool(set_pool, self.run, np.random.choice(walkers_clone, \
                size=walk_n_walkers))
            results = list(map(itemgetter(1), run_outputs))  # list() so np.sum works on Python 3
            del run_outputs

            # The total number of accepted/rejected moves for this step size
            rate = float(np.sum(results))/walk_n_walkers

            if self.sampler.min_rate <= rate <= self.sampler.max_rate:
                # If the total acceptance rate is within the desired range, return this stepsize
                self.print_dt_change(steplength_in, self.sampler.dt, message_prefix)
                break
            else: # update the stepsize to get closer to the desired range
                if( not first_time ): # dodge this the first time round - no rate_store saved yet
                    # Check whether rate and rate_store are on different sides 
                    # of interval
                    if ((min(rate,rate_store) < self.sampler.min_rate) and (max(rate,rate_store) > self.sampler.max_rate)):
                        # We previously obtained an acceptance rate on one side of the desired range 
                        # and now find an acceptance rate on the other side. We return the step size 
                        # that gave an acceptance rate closest to the middle of the desired range.

                        target = 0.5*(self.sampler.min_rate+self.sampler.max_rate) # middle of range
                        if (abs(rate-target)<abs(rate_store-target)):
                            # take current step length
                            self.print_dt_change(steplength_in, self.sampler.dt, \
                                message_prefix)
                            break
                        else:
                            # take saved step length
                            self.sampler.dt = steplength_store
                            rate = rate_store
                            self.print_dt_change(steplength_in, self.sampler.dt, \
                                message_prefix)
                            break

                else: # this is the first time - no rate_store saved yet
                    first_time = False

                # save current step length and acceptance rate
                steplength_store = self.sampler.dt
                rate_store = rate

                # update step length
                if rate < self.sampler.min_rate:
                    exp = 1.0
                elif rate >= self.sampler.max_rate:
                    exp = -1.0

                # try to adjust
                self.sampler.dt *= adjust_step_factor**exp

                # Check that step size is neither larger than max allowed value nor smaller than 
                # 10^-50 (useful for detecting errors).
                # Error check:
                if (self.sampler.dt < 1.0e-50):
                    prfx = (message_prefix or "") + \
                        " stepsizes got stepsize= '%e': too small. Is everything correct?\n" % \
                        (self.sampler.dt)
                    exit_error(prfx, 25)

                # sampling demands a step size larger than dt_max. Set to dt_max then break
                if (self.sampler.dt>self.sampler.dt_max):
                    self.sampler.dt = self.sampler.dt_max
                    self.print_dt_change(steplength_in, self.sampler.dt, \
                        message_prefix)
                    break

        # close pool
        if (set_pool is not None):
            set_pool.terminate()
            set_pool.restart()

        end_time = time.time()
        duration = end_time - start_time
        return duration
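
The calibration logic is independent of the sampler; a self-contained toy version of the same rate-bracketing loop (all names invented; the 10^-50 underflow guard is omitted for brevity):

def tune_step(measure_rate, dt, min_rate=0.6, max_rate=0.7,
              factor=0.9, dt_max=1.0):
    """Shrink or grow dt until measure_rate(dt) lands in [min_rate, max_rate]."""
    dt_store, rate_store, first_time = dt, None, True
    while True:
        rate = measure_rate(dt)
        if min_rate <= rate <= max_rate:
            return dt
        # If two successive rates bracket the window, keep the closer step size.
        if not first_time and min(rate, rate_store) < min_rate \
                and max(rate, rate_store) > max_rate:
            target = 0.5 * (min_rate + max_rate)
            return dt if abs(rate - target) < abs(rate_store - target) else dt_store
        first_time = False
        dt_store, rate_store = dt, rate
        dt *= factor ** (1.0 if rate < min_rate else -1.0)
        if dt > dt_max:
            return dt_max

# Acceptance falls as the step grows, so the tuner shrinks dt until the rate fits.
print(tune_step(lambda dt: max(0.0, 1.0 - dt), dt=0.5))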