Example #1
import time

import dask.datasets
import dask.array as dsa
import numpy as np
import xarray as xr
from dask.distributed import Client, performance_report, wait

# `today` is assumed to be defined elsewhere in the enclosing script
# (e.g. a date string used to tag the report filenames).


def main():
    client = Client(n_workers=10, threads_per_worker=1)
    print(client)

    df = dask.datasets.timeseries(
        start="2000-01-01",
        end="2000-01-31",
        # end="2000-12-31",
        partition_freq="1h",
        freq="60s",
    )
    df = df.persist()
    wait(df)
    iterations = 10

    with performance_report(filename=f"{today}-simple-scheduler.html"):
        simple = []
        # print('start simple: ', flush=True)
        for i in range(iterations):
            start = time.time()
            z = df.x + 1 + 2 - df.y
            z.sum().compute()
            stop = time.time()
            simple.append(stop - start)
        simple = np.array(simple)

    df2 = None
    with performance_report(filename=f"{today}-shuffle-scheduler.html"):
        shuffle_t = []
        # print('start shuffle: ', flush=True)
        for i in range(iterations):
            client.cancel(df2)
            start = time.time()
            # shuffle(df, "id", shuffle="tasks")
            df2 = df.set_index("id").persist()
            wait(df2)
            stop = time.time()
            shuffle_t.append(stop - start)
        shuffle_t = np.array(shuffle_t)

    with performance_report(filename=f"{today}-rand-access-scheduler.html"):
        rand_access = []
        for i in range(iterations):
            start = time.time()
            df2.head()
            stop = time.time()
            rand_access.append(stop - start)
        rand_access = np.array(rand_access)
    data = dsa.random.random((10000, 1000000), chunks=(1, 1000000))
    da = xr.DataArray(data,
                      dims=['time', 'x'],
                      coords={'day': ('time', np.arange(10000) % 100)})
    clim = da.groupby('day').mean(dim='time')
    anom = da.groupby('day') - clim
    anom_mean = anom.mean(dim='time')
    with performance_report(filename=f"{today}-anom-mean-scheduler.html"):
        anom_mean_t = []
        for i in range(iterations):
            start = time.time()
            anom_mean.compute()
            stop = time.time()
            anom_mean_t.append(stop - start)

        anom_mean_t = np.array(anom_mean_t)

    return dict(simple=simple,
                shuffle=shuffle_t,
                rand_access=rand_access,
                anom_mean=anom_mean_t)
Example #2
import logging
import os
import signal
import sys

from dask.distributed import Client, LocalCluster, as_completed


class Fask:
    def __init__(self, **kwa):
        cfg = kwa.get('cfg')

        loglevel = dict(
            debug=logging.DEBUG,
            info=logging.INFO,
            warn=logging.WARN,
            error=logging.ERROR,
        ).get(
            cfg.get('loglevel'),
            logging.INFO,
        )

        self.reset()

        handler = logging.FileHandler(
            '%s/../log/fask.log' % os.path.dirname(os.path.realpath(__file__)))
        handler.setFormatter(
            logging.Formatter(
                fmt='%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
        self.logger = logging.getLogger('fask')
        self.logger.addHandler(handler)
        self.logger.setLevel(loglevel)

        self.cluster = LocalCluster(
            n_workers=cfg.get('processes'),
            processes=True,
            threads_per_worker=cfg.get('threads'),

            # make bokeh available outside of a docker container too
            # see: https://github.com/dask/distributed/issues/1875
            # well, that was a fun two hours (for moderate values of "fun" ;-)
            ip='',
        )

        self.client = Client(self.cluster)
        self.setup_signals(timeout=cfg.get('timeout'))

        self.logger.info('ready')

        # signalling fask to stop (SIGINT, SIGALRM) will raise SystemExit
        #
        try:
            self.run()
        except SystemExit:
            self.logger.info('done')

    ###
    ### sysprog

    def log_status(self, caller='xxx'):
        self.logger.info(
            '[status at {caller}] F {futures} | dF {done_futures} | cF {cancelled_future} | !cdF {not_cancelled_and_done} | CS {cs} | RC {rc}'
            .format(
                caller=caller,
                futures=len(self.futures),
                done_futures=len([f for f in self.futures if f.done()]),
                cancelled_future=len([f for f in self.futures if f.cancelled()]),
                not_cancelled_and_done=len(
                    [f for f in self.futures if f.done() and not f.cancelled()]),
                cs=self.calculations_submitted,
                rc=self.results_collected,
            ))

    def cleanup(self):
        self.logger.info('cleaning up')
        self.log_status('cleanup')

        self.collect_results(all=False)

        # Client.cancel()
        # This stops future tasks from being scheduled if they have not yet
        # run and deletes them if they have already run. After calling, this
        # result and all dependent results will no longer be accessible

        self.logger.info('cancel all futures')

        # XXX
        # cancelling a future also marks it as done
        self.client.cancel(self.futures)

    def setup_signals(self, **kwa):
        signal.signal(signal.SIGINT, self.handler_sigint)

        if kwa.get('timeout'):
            signal.signal(signal.SIGALRM, self.handler_sigalrm)
            signal.alarm(kwa.get('timeout'))

    def handler_sigint(self, signum, frame):
        self.logger.warning('exit because of SIGINT')
        self.cleanup()
        self.bailout()

    def handler_sigalrm(self, signum, frame):
        self.logger.warning('exit because of SIGALRM')
        self.cleanup()
        self.bailout()

    def bailout(self):
        self.logger.warning('bailing out')
        self.log_status('bailout')
        sys.exit(0)

    def reset(self):
        self.calculations_submitted = 0
        self.results_collected = 0
        self.results = []
        self.futures = []

    ###
    ### worker

    def run(self):
        """ submit all given calculations
        """
        self.reset()

        if not self.calculations():
            raise LookupError('no calculations available')

        for ci, c in enumerate(self.calculations()):
            future = self.client.submit(c, pure=False)

            self.logger.debug('[{ci}] future {key} submitted'.format(
                ci=ci, key=future.key))
            self.futures.append(future)
            self.calculations_submitted += 1

        self.log_status('run')
        self.collect_results(all=True)

    def collect_results(self, **kwa):
        """ collect (and log) results as they become available
            (this will block)
        """

        if kwa.get('all'):
            self.logger.info('collect all results')
            futures = as_completed(self.futures)
        else:
            self.logger.info('collect already done results only')
            futures = [f for f in self.futures if f.done()]

        # for xi, future in enumerate (as_completed (self.futures)):
        for xi, future in enumerate(futures):
            self.results_collected += 1
            result = future.result()
            key = future.key
            # future.cancel()

            self.logger.debug('[{xi}] future {key} yielded {result}'.format(
                xi=xi, key=key, result=result))

            self.results.append(dict(
                index=xi,
                result=result,
            ))

        self.log_status('collect_results')

    def calculations(self):
        """ overwrite this virtual method
            this is where your actual code goes

            OUT: a list of functions to run
        """

        raise NotImplementedError('virtual method calculations() not implemented')
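A minimal usage sketch (not part of the original example): Fask is meant to be subclassed, with calculations() overridden to return a list of callables to submit. The subclass, the cfg values and the work() function below are hypothetical.
import random
import time


def work():
    # stand-in for a real calculation
    time.sleep(random.random())
    return random.randint(0, 100)


class MyFask(Fask):
    def calculations(self):
        # return a list of zero-argument callables to run on the cluster
        return [work for _ in range(20)]


if __name__ == '__main__':
    # cfg keys match what Fask.__init__ reads via cfg.get(...)
    # NOTE: Fask logs to ../log/fask.log relative to its module, so that directory must exist
    MyFask(cfg=dict(
        loglevel='info',
        processes=4,
        threads=1,
        timeout=60,  # seconds until SIGALRM triggers cleanup
    ))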
import numpy as np
import dask
import dask.array as da
import distributed
from dask import delayed
from dask.distributed import Client, LocalCluster

# PointListArray and _find_Bragg_disks_single_DP_FK_dask_wrapper are assumed
# to be imported from the surrounding py4DSTEM module.


def beta_parallel_disk_detection(dataset,
                            probe,
                            #rxmin=None, # these would allow selecting a sub section
                            #rxmax=None,
                            #rymin=None,
                            #rymax=None,
                            #qxmin=None,
                            #qxmax=None,
                            #qymin=None,
                            #qymax=None,
                            probe_type="FT",
                            dask_client=None,
                            dask_client_params: dict = None,
                            restart_dask_client=True,
                            close_dask_client=False,
                            return_dask_client=True,
                            *args, **kwargs):
    """
    This is not fully validated currently so may not work, please report bugs on the py4DSTEM github page. 

    This parallellises the disk detetection for all probe posistions. This can operate on either in memory or out of memory datasets 
    
    There is an asumption that unless specifying otherwise you are parallelising on a single Local Machine. 
    If this is not the case its probably best to pass the dask_client into the function, although you can just pass the required arguments to dask_client_params.
    If no dask_client arguments are passed it will create a dask_client for a local machine 
    
    Note:
        Do not pass "peaks" argument as a kwarg, like you might in "_find_Bragg_disks_single_DP_FK", as the results will be unreliable and may cause the calculation to crash.
    Args:
        dataset (py4dSTEM datacube): 4DSTEM dataset
        probe (ndarray): can be regular probe kernel or fourier transormed
        probe_type (str): "FT" or None 
        dask_client (distributed.client.Client): dask client
        dask_client_params (dict): parameters to pass to dask client or dask cluster
        restart_dask_client (bool): if True, function will attempt to restart the dask_client.
        close_dask_client (bool): if True, function will attempt to close the dask_client.
        return_dask_client (bool): if True, function will return the dask_client.
        *args,kwargs will be passed to "_find_Bragg_disks_single_DP_FK" e.g. corrPower, sigma, edgeboundary...

    Returns:
        peaks (PointListArray): the Bragg peak positions and the correlenation intensities
        dask_client(optional) (distributed.client.Client): dask_client for use later.
    """
    #TODO add asserts about peaks not being passed
    # Dask Client stuff
    #TODO how to guess at default params for the client, sqrt of no. of cores?  Something to do with the size of the diffraction pattern;
    # write a function which can do this.
    #TODO replace the dask part with a with statement for easier clean up e.g.
    # with LocalCluster(params) as cluster, Client(cluster) as client:
    #   ... dask stuff.
    #TODO add assert statements and other checks. Think about reordering operations.
    
    if dask_client is None:
        if dask_client_params is not None:

            dask.config.set({'distributed.worker.memory.spill': False,
                'distributed.worker.memory.target': False})
            cluster = LocalCluster(**dask_client_params)
            dask_client = Client(cluster, **dask_client_params)
        else:
            # AUTO MAGICALLY SET?
            # LET DASK SET?
            # HAVE A FUNCTION WHICH RUNS ON A SUBSET OF THE DATA TO PICK AN OPTIMAL VALUE?
            # psutil could be used to count cores.
            dask.config.set({'distributed.worker.memory.spill': False, # stops spilling to disk
                'distributed.worker.memory.target': False}) # stops spilling to disk and erroring out
            cluster = LocalCluster()
            dask_client = Client(cluster)

    else:
        assert isinstance(dask_client, distributed.client.Client)
        if restart_dask_client:
            try:
                dask_client.restart()
            except Exception as e:
                print('Could not restart dask client. Try restarting it manually outside, or pass "restart_dask_client=False"') # WARNING STATEMENT
                return e
        else:
            pass


    # Probe stuff
    assert (probe.shape == dataset.data.shape[2:]), "Probe and Diffraction Pattern Shapes are Mismatched"
    if probe_type != "FT":
    #TODO clean up and pull out redundant parts
    #if probe.dtype != (np.complex128 or np.complex64 or np.complex256):
        #DO FFT SHIFT THING
        probe_kernel_FT = np.conj(np.fft.fft2(probe))
        dask_probe_array = da.from_array(probe_kernel_FT, chunks=(dataset.Q_Nx, dataset.Q_Ny))
        dask_probe_delayed = dask_probe_array.to_delayed()
        # delayed_probe_kernel_FT = delayed(probe_kernel_FT)
    else:
        probe_kernel_FT = probe
        dask_probe_array = da.from_array(probe_kernel_FT, chunks=(dataset.Q_Nx, dataset.Q_Ny))
        dask_probe_delayed = dask_probe_array.to_delayed()

    # GET DATA
    #TODO add another elif: if it is already a dask array then pass
    if isinstance(dataset.data, np.ndarray):
        dask_data = da.from_array(dataset.data, chunks=(1, 1, dataset.Q_Nx, dataset.Q_Ny))
    elif dataset.stack_pointer is not None:
        dask_data = da.from_array(dataset.stack_pointer, chunks=(1, 1, dataset.Q_Nx, dataset.Q_Ny))
    else:
        print("Couldn't access the data")
        return None

    # Convert the data to delayed 
    dataset_delayed = dask_data.to_delayed()
    # TODO Trim data e.g. rx,ry,qx,qy
    # I can pass the index values in here I should trim the probe and diffraction pattern first


    # Into the meat of the function 
    
    # create an empty list to which we will append the delayed functions
    res = []
    # loop over dataset_delayed and create a delayed function for each block
    for x in np.ndindex(dataset_delayed.shape):
        temp = delayed(_find_Bragg_disks_single_DP_FK_dask_wrapper)(dataset_delayed[x],
                                probe_kernel_FT=dask_probe_delayed[0,0],
                                #probe_kernel_FT=delayed_probe_kernel_FT,
                                *args, **kwargs) # passing through args from earlier
                                #corrPower=corrPower,
                                #sigma=sigma_gaussianFilter,
                                #edgeBoundary=edgeBoundary,
                                #minRelativeIntensity=minRelativeIntensity,
                                #minPeakSpacing=minPeakSpacing,        
                                #maxNumPeaks=maxNumPeaks,
                                #subpixel='poly')
        res.append(temp)
    _temp_peaks = dask_client.compute(res, optimize_graph=True) # creates futures and starts computing 

    output = dask_client.gather(_temp_peaks) # gather the future objects 

    coords = [('qx',float),('qy',float),('intensity',float)]
    peaks = PointListArray(coordinates=coords, shape=dataset.data.shape[:-2])

    #temp_peaks[0][0]

    # operating over a list, so we need a running count (0->count) and the re-created probe positions (0->rx, 0->ry)
    for count, (rx, ry) in enumerate(np.ndindex(dataset.data.shape[:-2])):
        #peaks.get_pointlist(rx, ry).add_pointlist(temp_peaks[0][count])
        #peaks.get_pointlist(rx, ry).add_pointlist(output[count][0])
        peaks.get_pointlist(rx, ry).add_pointlist(output[count])

    # Clean up
    dask_client.cancel(_temp_peaks) # removes from the dask workers
    del _temp_peaks # deletes the object 
    if close_dask_client:
        dask_client.close()
        return peaks
    elif not close_dask_client and return_dask_client:
        return peaks, dask_client
    elif close_dask_client and not return_dask_client:
        return peaks
    else:
        print('Dask Client in unknown state, this may result in unpredictable behaviour later')
        return peaks
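A minimal call sketch (not part of the original example), assuming an existing py4DSTEM datacube `dataset` and a Fourier-transformed probe kernel `probe`; the keyword values are illustrative only.
# Hedged usage sketch: `dataset` and `probe` are assumed to exist already;
# extra kwargs such as corrPower are simply forwarded to the per-pattern kernel.
peaks, dask_client = beta_parallel_disk_detection(
    dataset,
    probe,
    probe_type="FT",           # probe is already Fourier transformed
    close_dask_client=False,
    return_dask_client=True,   # keep the client for later calls
    corrPower=1.0,
)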
import time

from dask.dataframe.shuffle import shuffle
from dask.datasets import timeseries
from dask.distributed import Client, wait

if __name__ == "__main__":
    client = Client("127.0.0.1:8786")
    ddf_h = timeseries(start='2000-01-01',
                       end='2000-01-02',
                       partition_freq='1min')
    for i in range(5):
        print("Iteration: ", i)
        result = shuffle(ddf_h, "id", shuffle="tasks")
        ddf = client.persist(result)
        _ = wait(ddf)
        client.cancel(ddf)
        client.cancel(result)
    client.shutdown()
    time.sleep(0.5)
Example #5
import pickle
from functools import reduce

import numpy as np
from scipy import sparse
from tqdm import tqdm
from dask.distributed import Client


def main(vocabSize, tokenFile, outputFileName):

    #### CREATING VOCAB

    with open('wordCount', 'rb') as f:
        wordCount = pickle.load(f)

    vocab = wordCount.most_common(vocabSize)

    ## Creating the Word-ID dictionaries
    id_to_word = {i: x[0] for i, x in enumerate(vocab)}

    word_to_id = {value: key for key, value in id_to_word.items()}

    wordSet = set(word_to_id.keys())

    #### DASK PROCESS

    client = Client()

    print(client)

    def createCMatrix(corpus):

        windowSize = 10

        cooccurrences = sparse.lil_matrix((vocabSize, vocabSize),
                                          dtype=np.float64)

        for doc in corpus:

            for center_index, center_word in enumerate(doc):

                if center_word not in wordSet:
                    continue

                context = doc[max(0, center_index - windowSize):center_index]
                contextLen = len(context)

                for context_index, context_word in enumerate(context):

                    dist = contextLen - context_index

                    inc = 1.0 / float(dist)

                    if context_word in wordSet:

                        cooccurrences[word_to_id[center_word],
                                      word_to_id[context_word]] += inc
                        cooccurrences[word_to_id[context_word],
                                      word_to_id[center_word]] += inc

                        # center_id = word_to_id[center_word]
                        # context_id = word_to_id[context_word]

                        # if center_id<context_id:

                        #     cooccurrences[center_id, context_id] += inc
                        # else:
                        #     cooccurrences[context_id, center_id] += inc

        return cooccurrences

    def split(a, n):
        k, m = divmod(len(a), n)
        return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)]
                for i in range(n))

    with open(tokenFile, 'rb') as f:
        corpus = pickle.load(f)

    matrices = []

    print("Starting process")

    for sub in tqdm(split(corpus, 10)):

        a = client.map(createCMatrix, list(split(sub, 16)))
        b = client.gather(a)

        mat = reduce(lambda x, y: x + y, b)

        matrices.append(mat.copy())

        client.cancel(a)
        client.cancel(b)

        del a
        del b

    print("Creating Final Cooccurence matrix")

    finalMat = reduce(lambda x, y: x + y, matrices)

    with open(outputFileName, 'wb') as f:
        pickle.dump(finalMat, f)

    client.shutdown()
Example #6
import os

from dask.distributed import Client, Security, Worker


class Remote(object):
    """
    Remote.
    
    Args:
        address (str): Remote scheduler address formed by `ip:port`.
        tls_ca_file (str, optional): TLS CA certificate file path. Defaults to None.
        tls_client_cert (str, optional): TLS certificate file path. Defaults to None.
        tls_client_key (str, optional): TLS private key file path. Defaults to None.
        require_encryption (bool, optional): Encrypt data exchange. Defaults to False.
        
    Note:
        TLS will be enabled only if all three TLS arguments are provided. 
        Remember to change network protocol to `tls://<address>`.
    """
    def __init__(self, address: str,
                 tls_ca_file: str = None, tls_client_cert: str = None, tls_client_key: str = None,
                 require_encryption: bool = False):
        # authentication
        sec = None
        if tls_ca_file and tls_client_cert and tls_client_key:
            sec = Security(tls_ca_file=tls_ca_file,
                           tls_client_cert=tls_client_cert,
                           tls_client_key=tls_client_key,
                           require_encryption=require_encryption)

        # init
        self._client = Client(address=address, security=sec)
        self._client.register_worker_callbacks(Remote._worker_startup)

    @staticmethod
    def _worker_startup(dask_worker: Worker):
        os.chdir(dask_worker.local_dir)

    def add_dependencies(self, files):
        """
        Add list of dependencies, order matters.
        
        Args:
            files (list): List of dependent files.
        """
        # TODO: automatically resolve module dependencies
        if isinstance(files, str):
            files = [files]
        for f in files:
            self._client.upload_file(f)

    def scatter(self, *args, **kwargs):
        """
        Scatter data.
        """
        return self._client.scatter(*args, **kwargs)

    def submit(self, func, *args, **kwargs):
        """
        Submit function and data.
        
        Args:
            func (callable): User function.
        """
        return self._client.submit(func, *args, **kwargs)

    def fetch(self, futures_, **kwargs):
        """
        Fetch data of future objects.
        
        Args:
            futures_ (list): Future objects.
        """
        return self._client.gather(futures_, **kwargs)

    def cancel(self, futures_, **kwargs):
        """
        Cancel job of future objects.
        
        Args:
            futures_ (list): Future objects.
        """
        return self._client.cancel(futures_, **kwargs)

    def close(self, *args, **kwargs):
        """
        Close connection.
        """
        return self._client.close(*args, **kwargs)
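A minimal usage sketch (not part of the original example); the scheduler address, dependency file, and task function are placeholders. With TLS, pass all three certificate paths and use the tls:// protocol in the address, as the class docstring notes.
# Hedged usage sketch; the address, file name and task function are hypothetical.
def square(x):
    return x * x

remote = Remote('127.0.0.1:8786')            # or 'tls://host:port' with the TLS arguments
remote.add_dependencies(['helpers.py'])      # upload local modules the tasks need
futures = [remote.submit(square, i) for i in range(10)]
results = remote.fetch(futures)              # blocks until all results are gathered
remote.cancel(futures)
remote.close()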
Example #7
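    # Note: this is a class method; csv, logging, os, numpy as np, pandas as pd,
    # tqdm, the TOOLS helper, and dask.distributed (LocalCluster, Client, progress)
    # are assumed to be imported at module level in the original package.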
    def qPCR_performance(self, deletions = 0, insertions = 0, substitutions = 0,
                         fname = 'pyprimer_benchmark.feather', csv_fname = "pyprimer_summary.csv",):
        def generate_group_summary(group_df, group, col_list):
            v_stats = dict((key,[]) for key in col_list)
            for fversion in group_df["F Primer Version"].unique():
                for rversion in group_df["R Primer Version"].unique():
                    for pversion in group_df["P Probe Version"].unique():
                        mean_ppc = group_df.loc[(group_df["F Primer Version"] == fversion) & (group_df["R Primer Version"] == rversion) & (group_df["P Probe Version"] == pversion), "PPC"].mean()
                        seqs_matched = len(group_df.loc[(group_df["F Primer Version"] == fversion) & (group_df["R Primer Version"] == rversion) & (group_df["P Probe Version"] == pversion) & (group_df["Amplicon Sense Length"] != 0), "Amplicon Sense Length"])
                        n_seqs = group_df.loc[(group_df["F Primer Version"] == fversion) & (group_df["R Primer Version"] == rversion) & (group_df["P Probe Version"] == pversion), :].shape[0]
                        v_stats["Primer Group"].append(group)
                        v_stats["F Version"].append(fversion)
                        v_stats["P Version"].append(pversion)
                        v_stats["R Version"].append(rversion)
                        v_stats["Mean PPC"].append(mean_ppc)
                        try:
                            percent_matched = (seqs_matched / n_seqs) * 100
                        except ZeroDivisionError:
                            percent_matched = 0
                        v_stats["Sequences matched(%)"].append(percent_matched)
            group_stats = pd.DataFrame(v_stats)
            return group_stats

        def analyse(sequences_path, Fs, Rs, Ps, col_list, deletions, insertions, substitutions):
            res = []
            with open(sequences_path, "r", newline='') as csvfile:
                seqreader = csv.reader(csvfile, delimiter = ',', quotechar ='"')
                for sequences in seqreader:
                    if sequences[0] == "Header":
                        pass
                    else:
                        for f in Fs:
                            for r in Rs:
                                for p in Ps:
                                    header = sequences[0]
                                    f_name = f[2]
                                    f_ver = f[5]
                                    p_ver = p[5]
                                    r_ver = r[5]
                                    f_res = TOOLS.match_fuzzily(f_ver, sequences[1], deletions, insertions, substitutions)
                                    r_res = TOOLS.match_fuzzily(r_ver, sequences[2], deletions, insertions, substitutions)

                                    if (f_res is None) or (r_res is None):
                                        start = None
                                        end = None
                                        amplicon = ""
                                        amplicon_length = 0
                                        f_match = ""
                                        r_match = ""
                                        p_match = ""
                                        PPC = 0
                                    
                                    else:
                                        Forwards = {}
                                        if isinstance(f_res, tuple):
                                            Forwards[0] = (f_res[0], f_ver, 0) # (start, match, distance)
                                        else:
                                            for f_i in range(len(f_res)):
                                                Forwards[f_i] = (f_res[f_i].start, f_res[f_i].matched, f_res[f_i].dist)
                                        Reverses = {}
                                        if isinstance(r_res, tuple):
                                            Reverses[0] = (r_res[0], r_ver, 0)
                                        else:
                                            for r_i in range(len(r_res)):
                                                Reverses[r_i] = (r_res[r_i].start, r_res[r_i].matched, r_res[r_i].dist)
                                        matches = {}
                                        for k_f, v_f in Forwards.items():
                                            start = v_f[0]
                                            for k_r, v_r in Reverses.items():
                                                end = (len(sequences[1]) - 1) - v_r[0]
                                                if end < start:
                                                    matches[f"{k_f}:{k_r}"] = False
                                                amplicon = sequences[1][start:end]
                                                if len(amplicon) > 850:
                                                    matches[f"{k_f}:{k_r}"] = False
                                                else:
                                                    p_res = TOOLS.match_fuzzily(p_ver, amplicon, deletions, insertions, substitutions)
                                                    if p_res is None:
                                                        matches[f"{k_f}:{k_r}"] = False
                                                    else:
                                                        matches[f"{k_f}:{k_r}"] = True
                                        target_dist = np.inf
                                        n_match = 0
                                        for k, v in matches.items():
                                            if v:
                                                n_match += 1
                                                klist = k.split(":")
                                                k_f = int(klist[0])
                                                k_r = int(klist[1])
                                                f_good = Forwards[k_f]
                                                r_good = Reverses[k_r]
                                                mean_dist = (f_good[2] + r_good[2] + 1e-6)/2 # 1e-6 for smoothing
                                                if mean_dist < target_dist:
                                                    target_dist = mean_dist
                                                    start = f_good[0]
                                                    f_match = f_good[1]
                                                    end = (len(sequences[1]) - 1) - r_good[0]
                                                    r_match = r_good[1]
                                                    amplicon = sequences[1][start:end]
                                                    amplicon_length = len(amplicon)
                                                    if amplicon_length > 850:
                                                        n_match -= 1
                                                        start = None
                                                        end = None
                                                        amplicon = ""
                                                        amplicon_length = 0
                                                        f_match = ""
                                                        r_match = ""
                                                        PPC = 0
                                        if n_match <= 0:
                                            start = None
                                            end = None
                                            amplicon = ""
                                            amplicon_length = 0
                                            f_match = ""
                                            r_match = ""
                                            PPC = 0
                                        else:
                                            PPC = TOOLS.calculate_PPC(F_primer=f_ver,
                                                                    F_match=f_match,
                                                                    R_primer=r_ver,
                                                                    R_match=r_match)

                                    res.append([f_name, f_ver, p_ver,
                                                r_ver, header, amplicon,
                                                amplicon_length, start, end, PPC])
            res_df = pd.DataFrame(res, columns = col_list)
            del res
            return res_df

        self.fname = fname
        self.csv_fname = csv_fname
        self.deletions = deletions
        self.insertions = insertions
        self.substitutions = substitutions

        unique_groups = self.primers["ID"].unique()
        summary = pd.DataFrame(columns = self.SUMMARY_qPCR_COL_LIST)
        os.makedirs(self.savedir, exist_ok = True)
        print("Running Benchmark")
        cluster = LocalCluster(n_workers = self.nCores, threads_per_worker = 4, silence_logs=logging.ERROR)
        client = Client(cluster, timeout = 120)

        for group in tqdm(unique_groups):
            def help_analyse(x):
                return analyse(x, Fs, Rs, Ps, self.BENCHMARK_qPCR_COL_LIST,
                            self.deletions, self.insertions, self.substitutions)
            Fs = self.primers.loc[(self.primers["ID"] == group) & (self.primers["Type"] == "F"),:].values
            Rs = self.primers.loc[(self.primers["ID"] == group) & (self.primers["Type"] == "R"),:].values
            Ps = self.primers.loc[(self.primers["ID"] == group) & (self.primers["Type"] == "P"),:].values
            print(f"Processing group {group}\n")
            futures = client.map(help_analyse, self.chunkpaths)
            progress(futures)
            result_chunks = client.gather(futures)
            group_df = pd.concat(result_chunks)
            group_df.reset_index(drop = True, inplace = True)
            print("\nPerformance computed, generating group summary\n")
            group_stats = generate_group_summary(group_df, group, self.SUMMARY_qPCR_COL_LIST)
            summary = pd.concat([summary, group_stats], ignore_index=True)
            client.cancel(futures)
            del group_stats
            del result_chunks
            print("Summary generated, saving group benchmark to Feather\n")
            group_df.to_feather(os.path.join(self.tmpdir, f"{group}_"+self.fname), compression = "uncompressed")
            print(f"Benchmark results saved to {os.path.join(self.tmpdir, group_+self.fname)}\n")
            del group_df

        summary.to_csv(os.path.join(self.savedir, self.csv_fname), index = False)
        print(f"Benchmark summary saved to {os.path.join(self.savedir, self.csv_fname)}\n")