Example #1
def cv_confusion_matrix(clf,
                        X,
                        y,
                        data_file,
                        pos_class=None,
                        folds=50,
                        verbose=False):
    skf = StratifiedKFold(n_splits=folds)
    cv_iter = skf.split(X, y)
    cms = []
    cluster = dispy.SharedJobCluster(wrapper,
                                     depends=[data_file],
                                     reentrant=True,
                                     setup=functools.partial(
                                         setup, basename(data_file)),
                                     cleanup=cleanup,
                                     scheduler_node='pomar.aic.uniovi.es',
                                     loglevel=logging.ERROR)
    try:
        jobs = []
        for train, test in cv_iter:
            job = cluster.submit(clone(clf), train, test, pos_class)
            jobs.append(job)
        for job in jobs:
            job()
            if job.exception:
                raise ClusterException(job.exception + job.ip_addr)
            cms.append(job.result)
    except KeyboardInterrupt:
        cluster.close()
    if verbose:
        cluster.print_status()
    cluster.close()
    return np.array(cms)
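The example above relies on helpers that are not shown: the `data_file` shipped to the nodes via `depends` and a `ClusterException` class. A minimal sketch of what they might look like, assuming the node-side `setup` reloads the data with `np.load` as in Example #3 below (names are hypothetical):

import numpy as np

class ClusterException(Exception):
    """Raised when a dispy job reports an exception (hypothetical helper)."""

def save_data_file(X, y, data_file):
    # Persist the training data so dispy can ship it to every node via `depends`;
    # the node-side `setup` function is then expected to reload it with np.load.
    with open(data_file, 'wb') as fh:
        np.savez(fh, X=X, y=y)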
Example #2
    def _startCluster(self, secret):
        ''' 
        Start dispy cluster

        @param secret: Password for dispy nodes
        '''

        import dispy
        
        # Amazon run function
        def amazon_run(data_fetcher, stage_containers, shared_lock=None, run_id=-1, verbose=False):
            global amazon_lock
            import time
            from skdiscovery.utilities.cloud.ssh_reverse import print_verbose

            if not data_fetcher.multirun_enabled():
                with amazon_lock:
                    print_verbose('ID: {}; Entering lock at {}'.format(run_id, time.time()), verbose)
                    data_container = data_fetcher.output()
                    print_verbose('ID: {}; Exiting lock at {}'.format(run_id, time.time()), verbose)

            else:
                data_container = data_fetcher.output()

            data_container.run_id = run_id
            for s in stage_containers:
                s.run(data_container)

            return data_container.getResults()

        self.__cluster = dispy.SharedJobCluster(amazon_run, secret=secret, port=0,
                                                scheduler_node='127.0.0.1',
                                                ip_addr='127.0.0.1', setup=_setupNode)
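The job function above uses a global `amazon_lock` that must be created by the node-side setup routine, which is not shown here. A minimal sketch of what `_setupNode` might look like (an assumption, not the project's actual code): dispy runs the setup function once per node, and globals it declares become visible to the jobs running there.

def _setupNode():
    # Hypothetical node-side setup: create the lock that amazon_run expects.
    # dispy calls this once per node; the global it defines is shared by the
    # jobs executed on that node.
    global amazon_lock
    import threading
    amazon_lock = threading.Lock()
    return 0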
Example #3
    def _parallel_fit(self, X, y, verbose):
        def setup(data_file):
            global X, y
            import numpy as np
            with open(data_file, 'rb') as fh:
                data = np.load(fh)
                X = data['X']
                y = data['y']
            return 0

        def cleanup():
            global X, y
            del X, y

        def wrapper(qnf, n):

            X_sample = X[n]
            y_sample = y[n]
            return qnf._fit_and_get_distributions(X_sample, y_sample, True)

        cluster = dispy.SharedJobCluster(
            wrapper,
            depends=[self.X_y_path_],
            reentrant=True,
            setup=functools.partial(setup, basename(self.X_y_path_)),
            cleanup=cleanup,
            scheduler_node='dhcp015.aic.uniovi.es',
            loglevel=logging.ERROR)
        try:
            jobs = []
            for n in range(len(y)):
                job = cluster.submit(self, n)
                job.id = n
                jobs.append(job)
            for job in jobs:
                job()
                if job.exception:
                    raise ClusterException(job.exception + job.ip_addr)
                self.qnfs[job.id], classes = deepcopy(job.result)
                for cls in classes:
                    self.cls_smp_[cls].append(job.id)

        except KeyboardInterrupt:
            cluster.close()
        if verbose:
            cluster.print_status()
        cluster.close()
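SharedJobCluster requires a dispyscheduler process running on the machine named in scheduler_node. If no shared scheduler is available, the same computation can be run with a plain JobCluster, which schedules directly onto the dispynode daemons; a sketch of the equivalent construction (the node pattern is a placeholder):

        # Sketch: same computation without a shared scheduler.
        cluster = dispy.JobCluster(
            wrapper,
            nodes=['192.168.1.*'],
            depends=[self.X_y_path_],
            reentrant=True,
            setup=functools.partial(setup, basename(self.X_y_path_)),
            cleanup=cleanup,
            loglevel=logging.ERROR)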
Example #4
    def _startCluster(self, secret):
        ''' 
        Start dispy cluster

        @param secret: Password for dispy nodes
        '''

        import dispy

        # Amazon run function
        def amazon_run(data_fetcher, stage_containers, run_id=-1):
            data_container = data_fetcher.output()
            data_container.run_id = run_id
            for s in stage_containers:
                s.run(data_container)

            return data_container.getResults()

        self.__cluster = dispy.SharedJobCluster(amazon_run,
                                                secret=secret,
                                                port=0)
Example #5
def distributed_nbr_events_detected(velocities_lab_list):

    det_list = [det1_circle, det2_circle, det3_circle]

    cluster = dispy.SharedJobCluster(nbr_events_detected, nodes=NODES,
                                     depends=[is_detected, is_part_detected,
                                              detection_info,
                                              velocity_line_point_at_z],
                                     setup=setup_node_nbr_events_detected,
                                     port=port)  # , ip_addr=ip_addr

    nbr_of_jobs = NBR_OF_CPUs*3
    # ceil returns a float under Python 2, so cast before using it as a slice index
    nbr_list_el_per_job = int(math.ceil(len(velocities_lab_list)/float(nbr_of_jobs)))

    jobs = []
    for i in range(0, nbr_of_jobs):
        list_start_idx = i*nbr_list_el_per_job
        list_end_idx_plus_one = min((i+1)*nbr_list_el_per_job, len(velocities_lab_list))
        sub_list = velocities_lab_list[list_start_idx:list_end_idx_plus_one]
        velocities_lab_idx = np.arange(list_start_idx, list_end_idx_plus_one)
        if len(velocities_lab_idx) != 0:
            nbr_events_detected_params = (sub_list, velocities_lab_idx, det_list)
            job = cluster.submit(nbr_events_detected_params)
            jobs.append(job)

    total_count = 0
    velocities_lab_idx = {}
    det_info_list = []
    for job in jobs:
        (local_count, velocities_lab_idx_sublist, det_info_sublist) = job()

        for detected_velocities_lab_idx in velocities_lab_idx_sublist:
            velocities_lab_idx[detected_velocities_lab_idx] = True

        det_info_list = itertools.chain(det_info_list, det_info_sublist)
        total_count = total_count+local_count
    cluster.close()
    return (total_count, velocities_lab_idx, list(det_info_list))
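Each job above receives a single argument, the `nbr_events_detected_params` tuple, and must return a `(count, detected_indices, detection_info)` triple. A skeleton of the node-side function with that interface; the call to `detection_info` and its signature are assumptions, since the real detection logic lives in the `depends` helpers and is not shown:

def nbr_events_detected(params):
    # Skeleton only: unpack the tuple built by the driver and return the triple
    # it expects. The detection_info(...) call below is a placeholder for the
    # real per-event detection logic provided via 'depends'.
    sub_list, velocities_lab_idx, det_list = params
    local_count = 0
    detected_idx_sublist = []
    det_info_sublist = []
    for idx, velocity in zip(velocities_lab_idx, sub_list):
        info = detection_info(velocity, det_list)  # assumed signature
        if info is not None:
            local_count += 1
            detected_idx_sublist.append(idx)
            det_info_sublist.append(info)
    return (local_count, detected_idx_sublist, det_info_sublist)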
Example #6
        return np.array(results)


if __name__ == '__main__':

    def compute(n):
        import time, socket
        time.sleep(n)
        host = socket.gethostname()

        return (host, n)

    import dispy, random

    # distribute 'compute' to nodes; 'compute' does not have any dependencies (needed from client)
    cluster = dispy.SharedJobCluster(compute,
                                     scheduler_node="dhcp015.aic.uniovi.es")
    # run 'compute' with 20 random numbers on available CPUs
    jobs = []
    for i in range(20):
        job = cluster.submit(random.randint(10, 20))
        job.id = i  # associate an ID to identify jobs (if needed later)
        jobs.append(job)
    # cluster.wait() # waits until all jobs finish
    for job in jobs:
        host, n = job()  # waits for job to finish and returns results
        print('%s executed job %s at %s with %s' %
              (host, job.id, job.start_time, n))
        # other fields of 'job' that may be useful:
        # job.stdout, job.stderr, job.exception, job.ip_addr, job.end_time
    cluster.print_status()  # shows which nodes executed how many jobs etc.
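The loop above blocks on each job in submission order. An alternative sketch: wait for the whole cluster first and then inspect each DispyJob's status and fields (the exact handling is illustrative):

    # Alternative: wait for all jobs to finish, then report successes and failures.
    cluster.wait()
    for job in jobs:
        if job.status == dispy.DispyJob.Finished:
            host, n = job.result
            print('%s executed job %s with %s' % (host, job.id, n))
        else:
            print('job %s failed: %s' % (job.id, job.exception))
    cluster.print_status()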
Example #7
    model = GaussianNB()
    model.fit(Xtrain, ytrain)
    y_model = model.predict(Xtest)
    from sklearn.metrics import accuracy_score
    return accuracy_score(ytest, y_model)


if __name__ == '__main__':
    from sklearn.model_selection import train_test_split
    from sklearn.datasets import load_digits
    digits = load_digits()
    import dispy, logging
    import os

    # create a shared cluster for the 'mapper' computation; jobs may run on
    # any node known to the scheduler at 172.20.10.6
    map_cluster = dispy.SharedJobCluster(mapper,
                                         nodes=['*'],
                                         scheduler_node="172.20.10.6")
    #map_cluster = dispy.JobCluster(mapper)

    # submit two mapper jobs and collect their results
    jobs = []
    job1 = map_cluster.submit(digits)
    jobs.append(job1)
    job2 = map_cluster.submit(digits)
    jobs.append(job2)
    for job in jobs:
        result = job()  # blocks until the job finishes
        print(result)
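The `mapper` submitted above is only partially visible (its tail is the GaussianNB block at the top of this example). A plausible self-contained version, assuming the digits bunch is passed in and scikit-learn is installed on the executing node:

def mapper(digits):
    # Hypothetical completion of the truncated function above: split the digits
    # data, fit a Gaussian naive Bayes model and return its test accuracy.
    from sklearn.model_selection import train_test_split
    from sklearn.naive_bayes import GaussianNB
    from sklearn.metrics import accuracy_score
    Xtrain, Xtest, ytrain, ytest = train_test_split(digits.data, digits.target,
                                                    random_state=0)
    model = GaussianNB()
    model.fit(Xtrain, ytrain)
    return accuracy_score(ytest, model.predict(Xtest))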
Example #8
def distributed_events_gen(nbr_events_per_ker, local_kers):

    mass = deuterium_mass

    cluster = dispy.SharedJobCluster(gen_valid_events, nodes=NODES,
                                     depends=[gen_velocity, comp_velocities_lab_list],
                                     setup=setup_gen_valid_events,
                                     port=port)  # , ip_addr=ip_addr

    jobs = []

    #Distribute workload among nodes:
    # - if one ker: distribute over possible kinetic energies generation
    # - if many kers: one ker per node

    if len(local_kers) == 1:
        nbr_of_jobs = min(NBR_OF_CPUs*3, nbr_events_per_ker)
        local_kers = np.ones(nbr_of_jobs)*local_kers[0]

        # integer division: each job generates an equal share of the events
        events_per_job = nbr_events_per_ker//nbr_of_jobs
        effective_nbr_events_per_ker = events_per_job*len(local_kers)

        for ker in local_kers:
            gen_valid_events_params = (ker, mass, events_per_job,
                                       V_cm, speed_to_SI_cm)
            job = cluster.submit(gen_valid_events_params)
            jobs.append(job)
    else:
        effective_nbr_events_per_ker = nbr_events_per_ker
        for ker in local_kers:
            gen_valid_events_params = (ker, mass, nbr_events_per_ker,
                                       V_cm, speed_to_SI_cm)
            job = cluster.submit(gen_valid_events_params)
            jobs.append(job)

    #Retrieve results
    kin_energies_list = []
    velocities_cm_list = []
    velocities_lab_list = []
    ker_list = []

    # enumerate keeps each job paired with its ker, even if some jobs return None
    for job_idx, job in enumerate(jobs):
        job_res = job()
        if job_res is not None:
            (kin_energies_sublist, velocities_cm_sublist, velocities_lab_sublist) = job_res
            kin_energies_list = itertools.chain(kin_energies_list, kin_energies_sublist)
            velocities_cm_list = itertools.chain(velocities_cm_list, velocities_cm_sublist)
            velocities_lab_list = itertools.chain(velocities_lab_list, velocities_lab_sublist)

            ker_sublist = list(np.ones(len(kin_energies_sublist))*local_kers[job_idx])
            ker_list = itertools.chain(ker_list, ker_sublist)

    cluster.close()
    kin_energies_list = list(kin_energies_list)
    velocities_cm_list = list(velocities_cm_list)
    velocities_lab_list = list(velocities_lab_list)
    ker_list = list(ker_list)
    return (ker_list, kin_energies_list, velocities_cm_list, velocities_lab_list,
            effective_nbr_events_per_ker)
Example #9
def compute_cluster(algorithms, nodes, start_scheduler=True):
    """
    Computes Algorithm instances on a compute cluster composed of the given nodes. The first node also runs the
    dispy scheduler; afterwards, a dispy node is started on every machine. On each machine, screen sessions named
    "scheduler" and "node" host the scheduler and the node process, respectively.
    They can be accessed with "screen -rD scheduler" or "screen -rD node" when connected to the machine via ssh.

    Parameters
    ----------
    algorithms : list of Algorithm instances
        Algorithm instances initialized with different gPC problems and/or models
    nodes : str or list of str
        Node names
    start_scheduler : bool
        Starts a scheduler on the first machine in the nodes list or not. Set this to False if a scheduler is already
        running somewhere on the cluster.
    """
    def _algorithm_run(f):
        f.run()

    dispy.MsgTimeout = 90

    for n in nodes:
        # screen/dispy output is sent to devnull to keep the terminal window clean
        with open(os.devnull, 'w') as f:

            # get PIDs for old scheduler and node screens and kill them
            regexp_pid = r"\t(\d*)\."  # after the tab, capture the pid digits up to the '.'

            for name in ["scheduler", "node"]:
                # get screen -list output for correct screen, which also has the pid
                stdout, stderr = subprocess.Popen(
                    ['ssh', n, 'screen -list | grep {}'.format(name)],
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE).communicate()
                subprocess.Popen(['ssh', n, 'screen', "-wipe"]).communicate()
                try:
                    # subprocess output is bytes; decode it and take the captured pid
                    pid = re.search(regexp_pid, stdout.decode()).group(1)
                    subprocess.Popen(['ssh', n, 'kill', pid]).communicate()

                except AttributeError:
                    # no 'scheduler' or 'node' screen session found on host
                    pass

            # start scheduler on first node
            if start_scheduler:
                print("Starting dispy scheduler on " + n)

                # subprocess.Popen("ssh -tt " + n + " screen -R scheduler -d -m python "
                #                  + os.path.join(dispy.__path__[0], "dispyscheduler.py &"), shell=True)

                # ssh -tt: force pseudo-terminal allocation
                #
                # screen -dmS scheduler: start a detached session named
                #        "scheduler" without requiring an attached terminal
                #
                # subprocess
                #        -shell=False: if True, a new shell is opened and the
                #                      call does not return; the list argument
                #                      style requires shell=False.
                #        -stdout=f (devnull): an unredirected pipe would flood
                #                             the terminal.
                #
                subprocess.Popen([
                    "ssh", "-tt", n, "screen", "-dmS", "scheduler", "python " +
                    os.path.join(dispy.__path__[0], "dispyscheduler.py")
                ],
                                 shell=False,
                                 stdout=f)
                time.sleep(5)

            print("Starting dispy node on " + n)
            subprocess.Popen([
                "ssh", "-tt", n, "screen", "-dmS", "node", "python " +
                os.path.join(dispy.__path__[0], "dispynode.py --clean")
            ],
                             shell=False,
                             stdout=f)
            time.sleep(5)

    cluster = dispy.SharedJobCluster(_algorithm_run,
                                     scheduler_node=nodes[0],
                                     reentrant=True,
                                     port=0)

    time.sleep(5)

    # build job list and start computations
    jobs = []
    for a in algorithms:
        job = cluster.submit(a)
        job.id = a
        jobs.append(job)

    # wait until cluster finished the computations
    cluster.wait()
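`compute_cluster` returns right after `cluster.wait()` without examining the jobs. A small follow-up sketch of what a caller, or an extension of the function, might add to surface per-job failures; the fields used here (`exception`, `ip_addr`) are standard DispyJob attributes:

    # Sketch: after cluster.wait(), report which algorithms failed and clean up.
    for job in jobs:
        if job.exception:
            print("algorithm %s failed on %s:\n%s" % (job.id, job.ip_addr, job.exception))
    cluster.print_status()
    cluster.close()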
Example #10
def func():
    # pick one of the two words at random and return it
    # (the import lives inside the function so it is available on the dispy node)
    import random
    words = ['боль', 'страдания']
    word = random.choice(words)
    print(word)
    return word


if __name__ == '__main__':
    import dispy, logging
    # create a shared cluster that can run 'func' on any node known to the
    # scheduler at 192.168.43.181
    cluster = dispy.SharedJobCluster(computation=func,
                                     nodes=['*'],
                                     scheduler_node='192.168.43.181')
    job = cluster.submit()
    word = job()
    print(word)
    print('worked')
Example #11
def clean(s):
    '''working with data'''
    translator = str.maketrans("", "", string.punctuation)
    return s.translate(translator)


if __name__ == '__main__':
    import dispy, logging
    import math
    import csv
    import string
    import sklearn  # shipped to the nodes via 'depends' below
    # bayes
    with open(
            "/home/hei_damn/Documents/python_homeworks/cluster/data/SMSSpamCollection"
    ) as f:  # change the path to your local copy of the dataset
        data = list(csv.reader(f, delimiter="\t"))
    X, y = [], []
    for target, msg in data:
        X.append(msg)
        y.append(target)
    # clean the messages once, after the whole file has been read
    X = [clean(x).lower() for x in X]
    X_train, y_train, X_test, y_test = X[:390], y[:390], X[390:500], y[390:500]
    # cluster. You should choose JobCluster or SharedJobCluster
    # cluster = dispy.JobCluster(bayes_comp)
    cluster = dispy.SharedJobCluster(computation=bayes_comp,
                                     scheduler_node='192.168.43.54',
                                     depends=[sklearn])  # change IP!
    job = cluster.submit(X_train, y_train, X_test, y_test)
    result = job()
    print(result)
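`bayes_comp` itself is not shown. A hypothetical implementation consistent with the call above: vectorize the SMS texts, fit a multinomial naive Bayes model and return the test accuracy (scikit-learn must be installed on the node that runs the job):

def bayes_comp(X_train, y_train, X_test, y_test):
    # Hypothetical node-side computation matching the submit() call above.
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.metrics import accuracy_score
    vectorizer = CountVectorizer()
    Xtr = vectorizer.fit_transform(X_train)
    Xte = vectorizer.transform(X_test)
    model = MultinomialNB()
    model.fit(Xtr, y_train)
    return accuracy_score(y_test, model.predict(Xte))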
Example #12
def func():
    return 'sdsgjf'


if __name__ == "__main__":
    import dispy, logging
    cluster = dispy.SharedJobCluster(computation=func,
                                     scheduler_node='192.168.43.37')
    job = cluster.submit()
    word = job()
    print(word)
    print('worked')