Example 1
def exchange1d_nonblocking(cart1d, u, left, right):
    req = [MPI.Request() for k in range(4)]
    u_left, u_right, u_left_ghost, u_right_ghost = assign_aliases(u)
    req[0] = cart1d.Irecv(u_right_ghost, source=right)
    req[1] = cart1d.Irecv(u_left_ghost, source=left)
    req[2] = cart1d.Isend(u_left, dest=left)
    req[3] = cart1d.Isend(u_right, dest=right)
    return req
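A minimal usage sketch for the function above (the communicator setup, the array size and the overlap step are assumptions; it still relies on the assign_aliases helper from the snippet):

from mpi4py import MPI
import numpy as np

comm = MPI.COMM_WORLD
cart1d = comm.Create_cart([comm.Get_size()], periods=[True])
left, right = cart1d.Shift(0, 1)      # ranks of the left/right neighbours

local_n = 10                          # assumed local interior size
u = np.zeros(local_n + 2)             # one ghost cell on each side
reqs = exchange1d_nonblocking(cart1d, u, left, right)
# ... interior computation can overlap with the communication here ...
MPI.Request.Waitall(reqs)             # ghost cells of u are now filled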
Example 2
    def spawn(self):
        """
        Spawn MPI processes for, and execute, each of the managed targets.
        """

        if self._is_parent:
            # Find the path to the mpi_backend.py script (which should be in the
            # same directory as this module):
            parent_dir = os.path.dirname(__file__)
            mpi_backend_path = os.path.join(parent_dir, 'mpi_backend.py')

            # Spawn processes:
            self._intercomm = MPI.COMM_SELF.Spawn(sys.executable,
                                                  args=[mpi_backend_path],
                                                  maxprocs=len(self))

            # First, transmit twiggy logging emitters to spawned processes so
            # that they can configure their logging facilities:
            for i in self._targets.keys():
                self._intercomm.send(twiggy.emitters, i)

            # Next, serialize the routing table ONCE and then transmit it to all
            # of the child nodes:
            try:
                routing_table = self.routing_table
            except:
                routing_table = RoutingTable()
                self.log_warning(
                    'Routing Table is null, using empty routing table.')

            self._intercomm.bcast(routing_table, root=MPI.ROOT)

            # Transmit class to instantiate, globals required by the class, and
            # the constructor arguments; the backend will wait to receive
            # them and then start running the targets on the appropriate nodes.
            req = MPI.Request()
            r_list = []
            for i in self._targets.keys():
                target_globals = all_global_vars(self._targets[i])

                # Serializing atexit with dill appears to fail in virtualenvs
                # sometimes if atexit._exithandlers contains an unserializable function:
                if 'atexit' in target_globals:
                    del target_globals['atexit']
                data = (self._targets[i], target_globals, self._kwargs[i])
                r_list.append(self._intercomm.isend(data, i))

                # Need to clobber data to prevent all_global_vars from
                # including it in its output:
                del data
            req.Waitall(r_list)
Example 3
    def wait(self):
        """
        Wait for execution to complete.
        """

        # Start listening for control messages:
        r_ctrl = []
        try:
            d = self.intercomm.irecv(source=MPI.ANY_SOURCE,
                                     tag=self._ctrl_tag)
        except TypeError:
            # irecv() in mpi4py 1.3.1 stable uses 'dest' instead of 'source':
            d = self.intercomm.irecv(dest=MPI.ANY_SOURCE,
                                     tag=self._ctrl_tag)
        r_ctrl.append(d)
        workers = list(range(len(self)))  # list() so that remove() below works under Python 3
        req = MPI.Request()
        while True:
            # Check for control messages from workers:
            flag, msg_list = req.testall(r_ctrl)
            if flag:
                msg = msg_list[0]
                if msg[0] == 'done':
                    self.log_info('removing %s from worker list' % msg[1])
                    workers.remove(msg[1])

                # Additional control messages from the workers are processed
                # here:
                else:
                    self.process_worker_msg(msg)

                # Get new control messages:
                r_ctrl = []
                try:
                    d = self.intercomm.irecv(source=MPI.ANY_SOURCE,
                                             tag=self._ctrl_tag)
                except TypeError:
                    # irecv() in mpi4py 1.3.1 stable uses 'dest' instead of 'source':
                    d = self.intercomm.irecv(dest=MPI.ANY_SOURCE,
                                             tag=self._ctrl_tag)
                r_ctrl.append(d)

            if not workers:
                self.log_info('finished running manager')
                break
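For context, a minimal sketch of the worker-side control message this loop waits for (hypothetical; the backend code is not part of this excerpt, and the names are assumed to mirror the manager's attributes):

# Hypothetical worker-side counterpart: tell the manager this worker is done.
intercomm.isend(['done', rank], dest=0, tag=ctrl_tag)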
Example 4
def naive_linear_regression(n_procs, n_samples, n_features, input_dir,
                            n_stragglers, is_real_data, params, add_delay,
                            update_rule):

    assert update_rule in ('GD', 'AGD')

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()

    n_workers = n_procs - 1
    rounds = params[0]

    #beta=np.zeros(n_features)
    beta = np.random.randn(n_features)

    # Loading data on workers
    if (rank):

        if not is_real_data:
            X_current = load_data(input_dir + str(rank) + ".dat")
            y = load_data(input_dir + "label.dat")
        else:
            X_current = load_sparse_csr(input_dir + str(rank))
            y = load_data(input_dir + "label.dat")

        rows_per_worker = X_current.shape[0]
        y_current = y[(rank - 1) * rows_per_worker:rank * rows_per_worker]

    # Initializing relevant variables
    if (rank):

        predy = X_current.dot(beta)
        #g = -X_current.T.dot(np.divide(y_current,np.exp(np.multiply(predy,y_current))+1))
        g = -2 * X_current.T.dot(y_current - predy)
        send_req = MPI.Request()
        recv_reqs = []

    else:

        msgBuffers = [np.zeros(n_features) for i in range(n_procs - 1)]
        g = np.zeros(n_features)
        betaset = np.zeros((rounds, n_features))
        timeset = np.zeros(rounds)
        worker_timeset = np.zeros((rounds, n_procs - 1))

        request_set = []
        recv_reqs = []

        cnt_completed = 0

        status = MPI.Status()

        eta0 = params[2]  # ----- learning rate schedule
        alpha = params[1]  # --- coefficient of l2 regularization
        utemp = np.zeros(n_features)  # for accelerated gradient descent

    # Posting all Irecv requests for master and workers
    if (rank):

        for i in range(rounds):
            req = comm.Irecv([beta, MPI.DOUBLE], source=0, tag=i)
            recv_reqs.append(req)

    else:

        for i in range(rounds):
            recv_reqs = []
            for j in range(1, n_procs):
                req = comm.Irecv([msgBuffers[j - 1], MPI.DOUBLE],
                                 source=j,
                                 tag=i)
                recv_reqs.append(req)
            request_set.append(recv_reqs)

    ########################################################################################################
    comm.Barrier()

    if rank == 0:
        orig_start_time = time.time()
        print("---- Starting Naive Iterations ----")

    for i in range(rounds):

        if rank == 0:

            if (i % 10 == 0):
                print("\t >>> At Iteration %d" % (i))

            start_time = time.time()

            for l in range(1, n_procs):
                comm.Isend([beta, MPI.DOUBLE], dest=l, tag=i)

            g[:] = 0
            cnt_completed = 0

            while cnt_completed < n_procs - 1:
                req_done = MPI.Request.Waitany(request_set[i], status)
                src = status.Get_source()
                worker_timeset[i, src - 1] = time.time() - start_time
                request_set[i].pop(req_done)

                g += msgBuffers[src - 1]  # add the partial gradients
                cnt_completed += 1

            grad_multiplier = eta0[i] / n_samples
            # ---- update step for gradient descent
            if update_rule == "GD":
                np.subtract((1 - 2 * alpha * eta0[i]) * beta,
                            grad_multiplier * g,
                            out=beta)
            elif update_rule == "AGD":
                # ---- updates for accelerated gradient descent
                theta = 2.0 / (i + 2.0)
                ytemp = (1 - theta) * beta + theta * utemp
                betatemp = ytemp - grad_multiplier * g - (2 * alpha *
                                                          eta0[i]) * beta
                utemp = beta + (betatemp - beta) * (1 / theta)
                beta[:] = betatemp
            else:
                raise Exception("Error update rule")

            timeset[i] = time.time() - start_time
            betaset[i, :] = beta

        else:

            recv_reqs[i].Wait()

            # sendTestBuf = send_req.test()
            # if not sendTestBuf[0]:
            #     send_req.Cancel()

            predy = X_current.dot(beta)

            # TODO: gradient of linear regression
            #g = X_current.T.dot(np.divide(y_current,np.exp(np.multiply(predy,y_current))+1))
            g = X_current.T.dot(y_current - predy)
            g *= -2
            ########################################## straggler simulation ###################################################
            if add_delay == 1:
                np.random.seed(seed=i)
                #straggler_indices = np.random.choice([t for t in range(1, n_workers+1)], n_stragglers, replace=False)
                #if rank in straggler_indices:
                #    time.sleep(time_sleep)
                artificial_delays = np.random.exponential(0.5, n_workers)
                delay = artificial_delays[rank - 1]
                time.sleep(delay)
            ###################################################################################################################
            send_req = comm.Isend([g, MPI.DOUBLE], dest=0, tag=i)

    #####################################################################################################
    comm.Barrier()
    if rank == 0:
        elapsed_time = time.time() - orig_start_time
        print("Total Time Elapsed: %.3f" % (elapsed_time))
        # Load all training data
        if not is_real_data:
            X_train = load_data(input_dir + "1.dat")
            print(">> Loaded 1")
            for j in range(2, n_procs - 1):
                X_temp = load_data(input_dir + str(j) + ".dat")
                X_train = np.vstack((X_train, X_temp))
                print(">> Loaded " + str(j))
        else:
            X_train = load_sparse_csr(input_dir + "1")
            for j in range(2, n_procs - 1):
                X_temp = load_sparse_csr(input_dir + str(j))
                X_train = sps.vstack((X_train, X_temp))

        y_train = load_data(input_dir + "label.dat")
        y_train = y_train[0:X_train.shape[0]]

        # Load all testing data
        y_test = load_data(input_dir + "label_test.dat")
        if not is_real_data:
            X_test = load_data(input_dir + "test_data.dat")
        else:
            X_test = load_sparse_csr(input_dir + "test_data")

        n_train = X_train.shape[0]
        n_test = X_test.shape[0]

        training_loss = np.zeros(rounds)
        testing_loss = np.zeros(rounds)
        auc_loss = np.zeros(rounds)

        from sklearn.metrics import roc_curve, auc

        for i in range(rounds):
            beta = np.squeeze(betaset[i, :])
            predy_train = X_train.dot(beta)
            predy_test = X_test.dot(beta)
            training_loss[i] = calculate_mse(y_train, predy_train, n_train)
            testing_loss[i] = calculate_mse(y_test, predy_test, n_test)

            # TODO: for linear regression there is no fpr/tpr any more; report the loss instead
            #fpr, tpr, thresholds = roc_curve(y_test,predy_test, pos_label=1)
            #auc_loss[i] = auc(fpr,tpr)
            print(
                "Iteration %d: Train Loss = %.6f, Test Loss = %.6f, Total time taken =%5.3f"
                % (i, training_loss[i], testing_loss[i], timeset[i]))

        output_dir = input_dir + "results/"
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        #save_vector(training_loss, output_dir+"naive_acc_training_loss.dat")
        #save_vector(testing_loss, output_dir+"naive_acc_testing_loss.dat")
        #save_vector(auc_loss, output_dir+"naive_acc_auc.dat")
        #save_vector(timeset, output_dir+"naive_acc_timeset.dat")
        #save_matrix(worker_timeset, output_dir+"naive_acc_worker_timeset.dat")
        print(">>> Done")

    comm.Barrier()
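The worker-side expression g = -2 * X_current.T.dot(y_current - predy) above is the gradient of the squared loss; a small NumPy-only check of that identity (no MPI, all names chosen for illustration):

import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((20, 5))
y = rng.standard_normal(20)
beta = rng.standard_normal(5)

g = -2 * X.T.dot(y - X.dot(beta))     # same form as the worker computation

# finite-difference check of the first gradient coordinate
loss = lambda b: np.sum((y - X.dot(b)) ** 2)
eps = 1e-6
e0 = np.zeros(5)
e0[0] = eps
assert abs((loss(beta + e0) - loss(beta - e0)) / (2 * eps) - g[0]) < 1e-4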
Example 5
 def setUp(self):
     self.REQUESTS = [MPI.Request() for i in range(5)]
     self.STATUSES = [MPI.Status() for i in range(5)]
Example 6
 def setUp(self):
     self.REQUEST = MPI.Request()
     self.STATUS = MPI.Status()
Example 7
    def _sync(self):
        """
        Send output data and receive input data.
        """

        if self.time_sync:
            start = time.time()
        requests = []

        # For each destination module, extract elements from the current
        # module's port data array, copy them to a contiguous array, and
        # transmit the latter:
        for dest_id, dest_rank in zip(self._out_ids, self._out_ranks):

            # Copy data into destination buffer:
            if self._out_buf['gpot'][dest_id] is not None:
                set_by_inds(self._out_buf['gpot'][dest_id],
                            self._out_port_dict_ids['gpot'][dest_id],
                            self.data['gpot'], 'src')
                if not self.time_sync:
                    self.log_info('gpot data sent to %s: %s' % \
                                  (dest_id, str(self._out_buf['gpot'][dest_id])))
                r = MPI.COMM_WORLD.Isend([
                    self._out_buf_int['gpot'][dest_id],
                    self._out_buf_mtype['gpot'][dest_id]
                ], dest_rank, GPOT_TAG)
                requests.append(r)
            if self._out_buf['spike'][dest_id] is not None:
                set_by_inds(self._out_buf['spike'][dest_id],
                            self._out_port_dict_ids['spike'][dest_id],
                            self.data['spike'], 'src')
                if not self.time_sync:
                    self.log_info('spike data sent to %s: %s' % \
                                  (dest_id, str(self._out_buf['spike'][dest_id])))
                r = MPI.COMM_WORLD.Isend([
                    self._out_buf_int['spike'][dest_id],
                    self._out_buf_mtype['spike'][dest_id]
                ], dest_rank, SPIKE_TAG)
                requests.append(r)
            if not self.time_sync:
                self.log_info('sending to %s' % dest_id)
        if not self.time_sync:
            self.log_info('sent all data from %s' % self.id)

        # For each source module, receive elements and copy them into the
        # current module's port data array:
        for src_id, src_rank in zip(self._in_ids, self._in_ranks):
            if self._in_buf['gpot'][src_id] is not None:
                r = MPI.COMM_WORLD.Irecv([
                    self._in_buf_int['gpot'][src_id],
                    self._in_buf_mtype['gpot'][src_id]
                ],
                                         source=src_rank,
                                         tag=GPOT_TAG)
                requests.append(r)
            if self._in_buf['spike'][src_id] is not None:
                r = MPI.COMM_WORLD.Irecv([
                    self._in_buf_int['spike'][src_id],
                    self._in_buf_mtype['spike'][src_id]
                ],
                                         source=src_rank,
                                         tag=SPIKE_TAG)
                requests.append(r)
            if not self.time_sync:
                self.log_info('receiving from %s' % src_id)
        if requests:
            req = MPI.Request()
            req.Waitall(requests)
        if not self.time_sync:
            self.log_info('all data were received by %s' % self.id)

        # Copy received elements into the current module's data array:
        for src_id in self._in_ids:
            if self._in_buf['gpot'][src_id] is not None:
                if not self.time_sync:
                    self.log_info('gpot data received from %s: %s' % \
                                  (src_id, str(self._in_buf['gpot'][src_id])))
                set_by_inds_from_inds(
                    self.data['gpot'], self._in_port_dict_ids['gpot'][src_id],
                    self._in_buf['gpot'][src_id],
                    self._in_port_dict_buf_ids['gpot'][src_id])
            if self._in_buf['spike'][src_id] is not None:
                if not self.time_sync:
                    self.log_info('spike data received from %s: %s' % \
                                  (src_id, str(self._in_buf['spike'][src_id])))
                set_by_inds_from_inds(
                    self.data['spike'],
                    self._in_port_dict_ids['spike'][src_id],
                    self._in_buf['spike'][src_id],
                    self._in_port_dict_buf_ids['spike'][src_id])

        # Save timing data:
        if self.time_sync:
            stop = time.time()
            n_gpot = 0
            n_spike = 0
            for src_id in self._in_ids:
                n_gpot += len(self._in_buf['gpot'][src_id])
                n_spike += len(self._in_buf['spike'][src_id])
            self.log_info('sent timing data to master')
            self.intercomm.isend(['sync_time',
                                  (self.rank, self.steps, start, stop,
                                   n_gpot*self.pm['gpot'].dtype.itemsize+\
                                   n_spike*self.pm['spike'].dtype.itemsize)],
                                 dest=0, tag=self._ctrl_tag)
        else:
            self.log_info('saved all data received by %s' % self.id)
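The transfers above use mpi4py's buffer-style API ([array, MPI datatype]) followed by a single Waitall; a stripped-down two-rank sketch of the same pattern (standalone, and assumes the script is launched with exactly two ranks):

from mpi4py import MPI
import numpy as np

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
other = 1 - rank                      # partner rank (2 ranks assumed)
TAG = 77

sendbuf = np.full(4, rank, dtype=np.float64)
recvbuf = np.empty(4, dtype=np.float64)

requests = [comm.Isend([sendbuf, MPI.DOUBLE], dest=other, tag=TAG),
            comm.Irecv([recvbuf, MPI.DOUBLE], source=other, tag=TAG)]
MPI.Request.Waitall(requests)
print(rank, recvbuf)                  # each rank prints the partner's values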
Example 8
    def _sync(self):
        """
        Send output data and receive input data.
        """

        if self.time_sync:
            start = time.time()
        req = MPI.Request()
        requests = []

        # For each destination module, extract elements from the current
        # module's port data array, copy them to a contiguous array, and
        # transmit the latter:
        for dest_id, dest_rank in zip(self._out_ids, self._out_ranks):

            # Get source ports in current module that are connected to the
            # destination module:
            data_gpot = self.pm['gpot'].get_by_inds(
                self._out_port_dict_ids['gpot'][dest_id])
            data_spike = self.pm['spike'].get_by_inds(
                self._out_port_dict_ids['spike'][dest_id])

            if not self.time_sync:
                self.log_info('gpot data being sent to %s: %s' % \
                              (dest_id, str(data_gpot)))
                self.log_info('spike data being sent to %s: %s' % \
                              (dest_id, str(data_spike)))
            r = MPI.COMM_WORLD.Isend(
                [data_gpot, dtype_to_mpi(data_gpot.dtype)], dest_rank,
                GPOT_TAG)
            requests.append(r)
            r = MPI.COMM_WORLD.Isend(
                [data_spike, dtype_to_mpi(data_spike.dtype)], dest_rank,
                SPIKE_TAG)
            requests.append(r)

            if not self.time_sync:
                self.log_info('sending to %s' % dest_id)
        if not self.time_sync:
            self.log_info('sent all data from %s' % self.id)

        # For each source module, receive elements and copy them into the
        # current module's port data array:
        received_gpot = []
        received_spike = []
        ind_in_gpot_list = []
        ind_in_spike_list = []
        for src_id, src_rank in zip(self._in_ids, self._in_ranks):
            r = MPI.COMM_WORLD.Irecv(
                [self.data_in['gpot'][src_id],
                 dtype_to_mpi(data_gpot.dtype)],
                source=src_rank,
                tag=GPOT_TAG)
            requests.append(r)
            r = MPI.COMM_WORLD.Irecv([
                self.data_in['spike'][src_id],
                dtype_to_mpi(data_spike.dtype)
            ],
                                     source=src_rank,
                                     tag=SPIKE_TAG)
            requests.append(r)
            if not self.time_sync:
                self.log_info('receiving from %s' % src_id)
        req.Waitall(requests)
        if not self.time_sync:
            self.log_info('received all data received by %s' % self.id)

        # Copy received elements into the current module's data array:
        for src_id in self._in_ids:
            ind_in_gpot = self._in_port_dict_ids['gpot'][src_id]
            self.pm['gpot'].set_by_inds(ind_in_gpot,
                                        self.data_in['gpot'][src_id])
            ind_in_spike = self._in_port_dict_ids['spike'][src_id]
            self.pm['spike'].set_by_inds(ind_in_spike,
                                         self.data_in['spike'][src_id])

        # Save timing data:
        if self.time_sync:
            stop = time.time()
            n_gpot = 0
            n_spike = 0
            for src_id in self._in_ids:
                n_gpot += len(self.data_in['gpot'][src_id])
                n_spike += len(self.data_in['spike'][src_id])
            self.log_info('sent timing data to master')
            self.intercomm.isend(['sync_time',
                                  (self.rank, self.steps, start, stop,
                                   n_gpot*self.pm['gpot'].dtype.itemsize+\
                                   n_spike*self.pm['spike'].dtype.itemsize)],
                                 dest=0, tag=self._ctrl_tag)
        else:
            self.log_info('saved all data received by %s' % self.id)
Example 9
def avoidstragg_logistic_regression(n_procs, n_samples, n_features, input_dir, n_stragglers, is_real_data, params):

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    
    rounds = params[0]

    beta=np.zeros(n_features)

    # Loading data on workers
    if (rank):

        if not is_real_data:
            X_current = load_data(input_dir+str(rank)+".dat")
            y = load_data(input_dir+"label.dat")
        else:
            X_current = load_sparse_csr(input_dir+str(rank))
            y = load_data(input_dir+"label.dat")

        rows_per_worker = X_current.shape[0]
        y_current=y[(rank-1)*rows_per_worker:rank*rows_per_worker]
    
    # Initializing relevant variables
    if (rank):

        predy = X_current.dot(beta)
        g = -X_current.T.dot(np.divide(y_current,np.exp(np.multiply(predy,y_current))+1))
        send_req = MPI.Request()
        recv_reqs = []

    else:

        msgBuffers = [np.zeros(n_features) for i in range(n_procs-1)]
        g=np.zeros(n_features)
        betaset = np.zeros((rounds, n_features))
        timeset = np.zeros(rounds)
        worker_timeset=np.zeros((rounds, n_procs-1))
        
        request_set = []
        recv_reqs = []
        send_set = []


        cnt_completed = 0
        completed_workers = np.ndarray(n_procs-1,dtype=bool)

        status = MPI.Status()

        alpha = params[1] # --- coefficient of l2 regularization
        eta_sequence = params[2] # --- learning rate schedule
        utemp = np.zeros(n_features) # for accelerated gradient descent

    # Posting all Irecv requests for master and workers
    if (rank):

        for i in range(rounds):
            req = comm.Irecv([beta, MPI.DOUBLE], source=0, tag=i)
            recv_reqs.append(req)

    else:

        for i in range(rounds):
            recv_reqs = []
            for j in range(1,n_procs):
                req = comm.Irecv([msgBuffers[j-1], MPI.DOUBLE], source=j, tag=i)
                recv_reqs.append(req)
            request_set.append(recv_reqs)

    ##########################################################################################
    comm.Barrier()

    if rank==0:
        orig_start_time = time.time()
        print("---- Starting AvoidStragg Iterations with " +str(n_stragglers) + " stragglers ----")

    for i in range(rounds):
  
        if rank==0:

            if(i%10 == 0):
                print("\t >>> At Iteration %d" %(i))

            start_time = time.time()
            g[:]=0.0
            cnt_completed = 0
            completed_workers[:]=False

            send_set[:] = []

            for l in range(1,n_procs):
                sreq = comm.Isend([beta, MPI.DOUBLE], dest = l, tag = i)
                send_set.append(sreq)
            
            
            while cnt_completed < n_procs-1-n_stragglers:
                req_done = MPI.Request.Waitany(request_set[i], status)
                src = status.Get_source()
                worker_timeset[i,src-1]=time.time()-start_time
                request_set[i].pop(req_done)

                g += msgBuffers[src-1]   # add the partial gradients
                cnt_completed += 1
                completed_workers[src-1] = True

            grad_multiplier = eta_sequence[i]/(n_samples*(n_procs-1-n_stragglers)/(n_procs-1))
            # ---- update step for gradient descent
            # np.subtract((1-2*alpha*eta_sequence[i])*beta , grad_multiplier*g, out=beta)

            # ---- updates for accelerated gradient descent
            theta = 2.0/(i+2.0)
            ytemp = (1-theta)*beta + theta*utemp
            betatemp = ytemp - grad_multiplier*g - (2*alpha*eta_sequence[i])*beta
            utemp = beta + (betatemp-beta)*(1/theta)
            beta[:] = betatemp

            timeset[i] = time.time() - start_time

            betaset[i,:] = beta
            ind_set = [l for l in range(1,n_procs) if not completed_workers[l-1]]
            for l in ind_set:
                worker_timeset[i,l-1]=-1
            
            #MPI.Request.Waitall(send_set)
            #MPI.Request.Waitall(request_set[i])

        else:

            recv_reqs[i].Wait()

            sendTestBuf = send_req.test()
            if not sendTestBuf[0]:
                send_req.Cancel()
                #print("Worker " + str(rank) + " cancelled send request for Iteration " + str(i))

            predy = X_current.dot(beta)
            g = X_current.T.dot(np.divide(y_current,np.exp(np.multiply(predy,y_current))+1))
            g *= -1
            send_req = comm.Isend([g, MPI.DOUBLE], dest=0, tag=i)

    #########################################################################################
    comm.Barrier()

    if rank==0:
        elapsed_time= time.time() - orig_start_time
        print ("Total Time Elapsed: %.3f" %(elapsed_time))
        # Load all training data
        if not is_real_data:
            X_train = load_data(input_dir+"1.dat")
            for j in range(2,n_procs-1):
                X_temp = load_data(input_dir+str(j)+".dat")
                X_train = np.vstack((X_train, X_temp))
        else:
            X_train = load_sparse_csr(input_dir+"1")
            for j in range(2,n_procs-1):
                X_temp = load_sparse_csr(input_dir+str(j))
                X_train = sps.vstack((X_train, X_temp))

        y_train = load_data(input_dir+"label.dat")
        y_train = y_train[0:X_train.shape[0]]

        # Load all testing data
        y_test = load_data(input_dir + "label_test.dat")
        if not is_real_data:
            X_test = load_data(input_dir+"test_data.dat")
        else:
            X_test = load_sparse_csr(input_dir+"test_data")

        n_train = X_train.shape[0]
        n_test = X_test.shape[0]

        training_loss = np.zeros(rounds)
        testing_loss = np.zeros(rounds)
        auc_loss = np.zeros(rounds)

        from sklearn.metrics import roc_curve, auc

        for i in range(rounds):
            beta = np.squeeze(betaset[i,:])
            predy_train = X_train.dot(beta)
            predy_test = X_test.dot(beta)
            training_loss[i] = calculate_loss(y_train, predy_train, n_train)
            testing_loss[i] = calculate_loss(y_test, predy_test, n_test)
            fpr, tpr, thresholds = roc_curve(y_test,predy_test, pos_label=1)
            auc_loss[i] = auc(fpr,tpr)
            print("Iteration %d: Train Loss = %5.3f, Test Loss = %5.3f, AUC = %5.3f, Total time taken =%5.3f"%(i, training_loss[i], testing_loss[i], auc_loss[i], timeset[i]))
        
        output_dir = input_dir + "results/"
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        save_vector(training_loss, output_dir+"avoidstragg_acc_%d_training_loss.dat"%(n_stragglers))
        save_vector(testing_loss, output_dir+"avoidstragg_acc_%d_testing_loss.dat"%(n_stragglers))
        save_vector(auc_loss, output_dir+"avoidstragg_acc_%d_auc.dat"%(n_stragglers))
        save_vector(timeset, output_dir+"avoidstragg_acc_%d_timeset.dat"%(n_stragglers))
        save_matrix(worker_timeset, output_dir+"avoidstragg_acc_%d_worker_timeset.dat"%(n_stragglers))
        print(">>> Done")

    comm.Barrier()
Example 10
    def __init__(self,
                 sel,
                 sel_in,
                 sel_out,
                 sel_gpot,
                 sel_spike,
                 data_gpot,
                 data_spike,
                 columns=['interface', 'io', 'type'],
                 ctrl_tag=CTRL_TAG,
                 gpot_tag=GPOT_TAG,
                 spike_tag=SPIKE_TAG,
                 id=None,
                 device=None,
                 routing_table=None,
                 rank_to_id=None,
                 debug=False,
                 time_sync=False):

        super(Module, self).__init__(ctrl_tag)
        self.debug = debug
        self.time_sync = time_sync
        self.device = device

        self._gpot_tag = gpot_tag
        self._spike_tag = spike_tag

        # Require several necessary attribute columns:
        if 'interface' not in columns:
            raise ValueError('interface column required')
        if 'io' not in columns:
            raise ValueError('io column required')
        if 'type' not in columns:
            raise ValueError('type column required')

        # Manually register the file close method associated with MPIOutput
        # so that it is called by atexit before MPI.Finalize() (if the file is
        # closed after MPI.Finalize() is called, an error will occur):
        for k, v in iteritems(twiggy.emitters):
            if isinstance(v._output, MPIOutput):
                atexit.register(v._output.close)

        # Ensure that the input and output port selectors respectively
        # select mutually exclusive subsets of the set of all ports exposed by
        # the module:
        if not SelectorMethods.is_in(sel_in, sel):
            raise ValueError(
                'input port selector not in selector of all ports')
        if not SelectorMethods.is_in(sel_out, sel):
            raise ValueError(
                'output port selector not in selector of all ports')
        if not SelectorMethods.are_disjoint(sel_in, sel_out):
            raise ValueError('input and output port selectors not disjoint')

        # Ensure that the graded potential and spiking port selectors
        # respectively select mutually exclusive subsets of the set of all ports
        # exposed by the module:
        if not SelectorMethods.is_in(sel_gpot, sel):
            raise ValueError('gpot port selector not in selector of all ports')
        if not SelectorMethods.is_in(sel_spike, sel):
            raise ValueError(
                'spike port selector not in selector of all ports')
        if not SelectorMethods.are_disjoint(sel_gpot, sel_spike):
            raise ValueError('gpot and spike port selectors not disjoint')

        # Save routing table and mapping between MPI ranks and module IDs:
        self.routing_table = routing_table
        self.rank_to_id = rank_to_id

        # Generate a unique ID if none is specified:
        if id is None:
            self.id = uid()
        else:

            # If a unique ID was specified and the routing table is not empty
            # (i.e., there are connections between multiple modules), the id
            # must be a node in the routing table:
            if routing_table is not None and len(routing_table.ids) and \
                    not routing_table.has_node(id):
                raise ValueError('routing table must contain specified '
                                 'module ID: {}'.format(id))
            self.id = id

        # Reformat logger name:
        LoggerMixin.__init__(self, 'mod %s' % self.id)

        # Create module interface given the specified ports:
        self.interface = Interface(sel, columns)

        # Set the interface ID to 0; we assume that a module only has one interface:
        self.interface[sel, 'interface'] = 0

        # Set the port attributes:
        self.interface[sel_in, 'io'] = 'in'
        self.interface[sel_out, 'io'] = 'out'
        self.interface[sel_gpot, 'type'] = 'gpot'
        self.interface[sel_spike, 'type'] = 'spike'

        # Find the input and output ports:
        self.in_ports = self.interface.in_ports().to_tuples()
        self.out_ports = self.interface.out_ports().to_tuples()

        # Find the graded potential and spiking ports:
        self.gpot_ports = self.interface.gpot_ports().to_tuples()
        self.spike_ports = self.interface.spike_ports().to_tuples()

        self.in_gpot_ports = self.interface.in_ports().gpot_ports().to_tuples()
        self.in_spike_ports = self.interface.in_ports().spike_ports().to_tuples()
        self.out_gpot_ports = self.interface.out_ports().gpot_ports().to_tuples()
        self.out_spike_ports = self.interface.out_ports().spike_ports().to_tuples()

        # Set up mapper between port identifiers and their associated data:
        if len(data_gpot) != len(self.gpot_ports):
            raise ValueError('incompatible gpot port data array length')
        if len(data_spike) != len(self.spike_ports):
            raise ValueError('incompatible spike port data array length')
        self.data = {}
        self.data['gpot'] = data_gpot
        self.data['spike'] = data_spike
        self.pm = {}
        self.pm['gpot'] = PortMapper(sel_gpot,
                                     self.data['gpot'],
                                     make_copy=False)
        self.pm['spike'] = PortMapper(sel_spike,
                                      self.data['spike'],
                                      make_copy=False)

        # MPI Request object for resolving asynchronous transfers:
        self.req = MPI.Request()
Example 11
def coded_logistic_regression(n_procs, n_samples, n_features, input_dir,
                              n_stragglers, is_real_data, params):

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()

    rounds = params[0]

    n_workers = n_procs - 1
    rows_per_worker = n_samples // (n_procs - 1)  # integer division: used as an array size and slice index below

    # Loading the data
    if (rank):

        if not is_real_data:

            y = load_data(input_dir + "label.dat")
            X_current = np.zeros([(1 + n_stragglers) * rows_per_worker,
                                  n_features])
            y_current = np.zeros((1 + n_stragglers) * rows_per_worker)
            for i in range(1 + n_stragglers):
                X_current[i * rows_per_worker:(i + 1) *
                          rows_per_worker, :] = load_data(input_dir + str(
                              (rank - 1 + i) % n_workers + 1) + ".dat")
                y_current[i * rows_per_worker:(i + 1) * rows_per_worker] = y[(
                    (rank - 1 + i) % n_workers) * rows_per_worker:(
                        (rank - 1 + i) % n_workers + 1) * rows_per_worker]
        else:

            y_current = np.zeros((1 + n_stragglers) * rows_per_worker)
            y = load_data(input_dir + "label.dat")
            for i in range(1 + n_stragglers):

                if i == 0:
                    X_current = load_sparse_csr(input_dir + str(
                        (rank - 1 + i) % n_workers + 1))
                else:
                    X_temp = load_sparse_csr(input_dir +
                                             str((rank - 1 + i) % n_workers +
                                                 1))
                    X_current = sps.vstack((X_current, X_temp))

                y_current[i * rows_per_worker:(i + 1) * rows_per_worker] = y[(
                    (rank - 1 + i) % n_workers) * rows_per_worker:(
                        (rank - 1 + i) % n_workers + 1) * rows_per_worker]

    # Initializing relevant variables
    B = np.zeros((n_workers, n_workers))
    beta = np.zeros(n_features)

    if (rank):

        predy = X_current.dot(beta)
        g = -X_current.T.dot(
            np.divide(y_current,
                      np.exp(np.multiply(predy, y_current)) + 1))
        send_req = MPI.Request()
        recv_reqs = []

    else:

        B = getB(n_workers, n_stragglers)
        # A = np.zeros((int(sp.binom(n_workers,n_stragglers)),n_workers))
        # A=getA(B,n_workers,n_stragglers)

        msgBuffers = np.array(
            [np.zeros(n_features) for i in range(n_procs - 1)])

        g = np.zeros(n_features)

        A_row = np.zeros((1, n_procs - 1))

        betaset = np.zeros((rounds, n_features))
        timeset = np.zeros(rounds)
        worker_timeset = np.zeros((rounds, n_procs - 1))

        request_set = []
        recv_reqs = []
        send_set = []

        cnt_completed = 0
        completed_workers = np.ndarray(n_procs - 1, dtype=bool)
        status = MPI.Status()

        eta0 = params[2]  # ----- learning rate
        alpha = params[1]  # --- coefficient of l2 regularization
        utemp = np.zeros(n_features)  # for accelerated gradient descent

    B = comm.bcast(B, root=0)

    # Setting up y_current_mod on all workers
    if (rank):
        y_current_mod = np.zeros((1 + n_stragglers) * rows_per_worker)
        for i in range(1 + n_stragglers):
            y_current_mod[i * rows_per_worker:(i + 1) *
                          rows_per_worker] = B[rank - 1, (
                              (rank - 1 + i) % n_workers
                          )] * y_current[i * rows_per_worker:(i + 1) *
                                         rows_per_worker]

    # Posting all Irecv requests for master and workers
    if (rank):

        for i in range(rounds):
            req = comm.Irecv([beta, MPI.DOUBLE], source=0, tag=i)
            recv_reqs.append(req)
    else:

        for i in range(rounds):
            recv_reqs = []
            for j in range(1, n_procs):
                req = comm.Irecv([msgBuffers[j - 1], MPI.DOUBLE],
                                 source=j,
                                 tag=i)
                recv_reqs.append(req)
            request_set.append(recv_reqs)

    #######################################################################################################
    comm.Barrier()
    if rank == 0:
        print("---- Starting Coded Iterations for " + str(n_stragglers) +
              " stragglers ----")
        orig_start_time = time.time()

    for i in range(rounds):

        if rank == 0:

            if (i % 10 == 0):
                print("\t >>> At Iteration %d" % (i))

            A_row[:] = 0
            send_set[:] = []
            completed_workers[:] = False
            cnt_completed = 0

            start_time = time.time()

            for l in range(1, n_procs):
                sreq = comm.Isend([beta, MPI.DOUBLE], dest=l, tag=i)
                send_set.append(sreq)

            while cnt_completed < n_procs - 1 - n_stragglers:
                req_done = MPI.Request.Waitany(request_set[i], status)
                src = status.Get_source()
                worker_timeset[i, src - 1] = time.time() - start_time
                request_set[i].pop(req_done)

                cnt_completed += 1
                completed_workers[src - 1] = True

            completed_ind_set = [
                l for l in range(n_procs - 1) if completed_workers[l]
            ]
            A_row[0, completed_ind_set] = np.linalg.lstsq(
                B[completed_ind_set, :].T, np.ones(n_workers))[0]
            g = np.squeeze(np.dot(A_row, msgBuffers))

            # case_idx = calculate_indexA(completed_stragglers)
            # g = np.dot(A[case_idx,ind_set],tmpBuff)

            grad_multiplier = eta0[i] / n_samples
            # ---- update step for gradient descent
            # np.subtract((1-2*alpha*eta0[i])*beta , grad_multiplier*g, out=beta)

            # ---- updates for accelerated gradient descent
            theta = 2.0 / (i + 2.0)
            ytemp = (1 - theta) * beta + theta * utemp
            betatemp = ytemp - grad_multiplier * g - (2 * alpha *
                                                      eta0[i]) * beta
            utemp = beta + (betatemp - beta) * (1 / theta)
            beta[:] = betatemp

            timeset[i] = time.time() - start_time
            betaset[i, :] = beta

            ind_set = [
                l for l in range(n_procs - 1) if not completed_workers[l]
            ]
            for l in ind_set:
                worker_timeset[i, l] = -1

        else:
            recv_reqs[i].Wait()

            sendTestBuf = send_req.test()
            if not sendTestBuf[0]:
                send_req.Cancel()
                #print("Worker " + str(rank) + " cancelled send request for Iteration " + str(i))

            predy = X_current.dot(beta)
            g = X_current.T.dot(
                np.divide(y_current_mod,
                          np.exp(np.multiply(predy, y_current)) + 1))
            g *= -1
            send_req = comm.Isend([g, MPI.DOUBLE], dest=0, tag=i)

    #####################################################################################################
    comm.Barrier()
    if rank == 0:
        elapsed_time = time.time() - orig_start_time
        print("Total Time Elapsed: %.3f" % (elapsed_time))
        # Load all training data
        if not is_real_data:
            X_train = load_data(input_dir + "1.dat")
            print(">> Loaded 1")
            for j in range(2, n_procs - 1):
                X_temp = load_data(input_dir + str(j) + ".dat")
                X_train = np.vstack((X_train, X_temp))
                print(">> Loaded " + str(j))
        else:
            X_train = load_sparse_csr(input_dir + "1")
            for j in range(2, n_procs - 1):
                X_temp = load_sparse_csr(input_dir + str(j))
                X_train = sps.vstack((X_train, X_temp))

        y_train = load_data(input_dir + "label.dat")
        y_train = y_train[0:X_train.shape[0]]

        # Load all testing data
        y_test = load_data(input_dir + "label_test.dat")
        if not is_real_data:
            X_test = load_data(input_dir + "test_data.dat")
        else:
            X_test = load_sparse_csr(input_dir + "test_data")

        n_train = X_train.shape[0]
        n_test = X_test.shape[0]

        training_loss = np.zeros(rounds)
        testing_loss = np.zeros(rounds)
        auc_loss = np.zeros(rounds)

        from sklearn.metrics import roc_curve, auc

        for i in range(rounds):
            beta = np.squeeze(betaset[i, :])
            predy_train = X_train.dot(beta)
            predy_test = X_test.dot(beta)
            training_loss[i] = calculate_loss(y_train, predy_train, n_train)
            testing_loss[i] = calculate_loss(y_test, predy_test, n_test)
            fpr, tpr, thresholds = roc_curve(y_test, predy_test, pos_label=1)
            auc_loss[i] = auc(fpr, tpr)
            print(
                "Iteration %d: Train Loss = %5.3f, Test Loss = %5.3f, AUC = %5.3f, Total time taken =%5.3f"
                % (i, training_loss[i], testing_loss[i], auc_loss[i],
                   timeset[i]))

        output_dir = input_dir + "results/"
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        save_vector(
            training_loss,
            output_dir + "coded_acc_%d_training_loss.dat" % (n_stragglers))
        save_vector(
            testing_loss,
            output_dir + "coded_acc_%d_testing_loss.dat" % (n_stragglers))
        save_vector(auc_loss,
                    output_dir + "coded_acc_%d_auc.dat" % (n_stragglers))
        save_vector(timeset,
                    output_dir + "coded_acc_%d_timeset.dat" % (n_stragglers))
        save_matrix(
            worker_timeset,
            output_dir + "coded_acc_%d_worker_timeset.dat" % (n_stragglers))
        print(">>> Done")

    comm.Barrier()
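A small NumPy-only illustration of the master's decoding step above: pick coefficients for the rows of B belonging to the workers that responded so that their combination is the all-ones vector, which turns the received coded gradients into the full gradient (the B below is a hand-made toy, not the output of the project's getB):

import numpy as np

# Toy encoding matrix for 4 workers; each worker combines 2 of the 4 data
# partitions, so any single straggler can be tolerated.
B = np.array([[1., 1., 0., 0.],
              [0., 1., 1., 0.],
              [0., 0., 1., 1.],
              [1., 0., 0., 1.]])

completed = [0, 1, 2]                 # worker 3 is the straggler
a = np.linalg.lstsq(B[completed, :].T, np.ones(4), rcond=None)[0]

# a combines the received rows into the all-ones vector:
print(np.allclose(a @ B[completed, :], np.ones(4)))   # True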
Example 12
    def spawn(self, **kwargs):
        """
        Spawn MPI processes for, and execute, each of the managed targets.

        Parameters
        ----------
        kwargs: dict
                options for the `info` argument in mpi spawn process.
                see https://www.open-mpi.org/doc/v4.0/man3/MPI_Comm_spawn.3.php
        """

        # Typically MPI must have been initialized before spawning.
        if not MPI.Is_initialized():
            MPI.Init()

        if self._is_parent:
            # Find the path to the mpi_backend.py script (which should be in the
            # same directory as this module):
            parent_dir = os.path.dirname(__file__)
            mpi_backend_path = os.path.join(parent_dir, 'mpi_backend.py')

            # Set spawn options; since --oversubscribe is used, disable process binding ('bind_to' = 'none').
            info = Info.Create()
            info.Set('bind_to', "none")

            for k, v in kwargs.items():
                info.Set(k, v)

            # Spawn processes:
            self._intercomm = MPI.COMM_SELF.Spawn(sys.executable,
                                                  args=[mpi_backend_path],
                                                  maxprocs=len(self),
                                                  info=info)

            # First, transmit twiggy logging emitters to spawned processes so
            # that they can configure their logging facilities:
            for i in self._targets:
                self._intercomm.send(twiggy.emitters, i)

            # Next, serialize the routing table ONCE and then transmit it to all
            # of the child nodes:
            try:
                routing_table = self.routing_table
            except:
                routing_table = RoutingTable()
                self.log_warning(
                    'Routing Table is null, using empty routing table.')

            self._intercomm.bcast(routing_table, root=MPI.ROOT)

            # Transmit class to instantiate, globals required by the class, and
            # the constructor arguments; the backend will wait to receive
            # them and then start running the targets on the appropriate nodes.
            req = MPI.Request()
            r_list = []
            for i in self._targets:
                target_globals = all_global_vars(self._targets[i])

                # Serializing atexit with dill appears to fail in virtualenvs
                # sometimes if atexit._exithandlers contains an unserializable function:
                if 'atexit' in target_globals:
                    del target_globals['atexit']
                data = (self._targets[i], target_globals, self._kwargs[i])
                r_list.append(self._intercomm.isend(data, i))

                # Need to clobber data to prevent all_global_vars from
                # including it in its output:
                del data
            req.Waitall(r_list)
Example 13
def controller(lower, upper):

    #Set up the basic MPI stuff
    comm = MPI.COMM_WORLD
    nproc = comm.Get_size()
    rank = comm.Get_rank()

    #Setup values for array of flags
    length = upper - lower
    flags = numpy.zeros(length)
    #Offset of last dispatched value
    current_val = 0

    #Number of in-flight work packets
    inflight = 0

    precheck_num = 0
    #How many primes to process
    precheck_to = 20

    #Arrays holding data per worker:
    #Value last sent to worker
    vals_in_use = numpy.zeros(nproc - 1)

    #Workers stats - how many processed in how long
    processed = numpy.zeros(nproc - 1)
    start_time = numpy.zeros(nproc - 1)
    cum_time = numpy.zeros(nproc - 1)
    end_time = numpy.zeros(nproc - 1)

    #Some things need to have the correct type BEFORE the MPI calls
    info = MPI.Status()
    request = MPI.Request()
    while True:

        #Use non-blocking commands although this variant could just as well use blocking
        #and not post the receive until after it did the pre-check

        # Unlike normal MPI, irecv here takes a buffer size only and
        # the actual result is returned by the wait
        # First param is buffer size in bytes.
        request = comm.irecv(4, source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG)

        #Do some work before waiting
        if precheck_num < precheck_to:
            precheck_flags(lower, length, flags, precheck_num)
            precheck_num = precheck_num + 1

        result = request.wait(status=info)

        if info.tag > 0:
            # Capture stats
            end_time[info.source - 1] = time.time()
            cum_time[info.source - 1] = cum_time[info.source - 1] + (
                end_time[info.source - 1] - start_time[info.source - 1])
            processed[info.source - 1] = processed[info.source - 1] + 1
            offset = vals_in_use[info.source - 1] - lower
            #Store result
            #Cheat - if prime mark as 2, (True + 1), else as composite, 1 (False+1)
            flags[int(offset)] = result + 1
            inflight = inflight - 1

        if current_val < length:
            #If there is still work to do, reply with next package

            #Skip any values that have already been checked i.e. are 1 or 2
            #The precheck_flags routine may mark some things as composite
            while current_val < length and flags[current_val] != 0:
                current_val = current_val + 1

            vals_in_use[info.source - 1] = lower + current_val
            #print("Dispatching ", lower+current_val)

            start_time[info.source - 1] = time.time()
            current_val = current_val + 1
            comm.send(vals_in_use[info.source - 1], dest=info.source, tag=1)
            inflight = inflight + 1
        else:
            #No more work, shut down the worker
            comm.send(1, dest=info.source, tag=0)

        if inflight == 0:
            #Nothing is in flight, all done
            break

    #Summarize findings
    for i in range(0, nproc - 1):
        print("Worker ", i + 1, " processed ", int(processed[i]),
              " packets in ", cum_time[i], "s")

    #Total the number of elements marked prime (==2) and divide by 2 to get number
    print("Found ", int(numpy.sum(flags[flags == 2]) / 2), " primes")
Example 14
def diffusion(U, S, t, tt):
    
    # shortcuts
    discretization = data.discretization
    domain = data.domain
    alpha = discretization.alpha
    beta = discretization.beta
    nx = domain.nx
    ny = domain.ny

    # we initialize the structures for saving the requests
    # and the statuses
    statuses = [MPI.Status() for _ in range(8)]   # distinct Status objects (not 8 references to one)
    requests = [MPI.Request() for _ in range(8)]
    comm_cart = domain.comm_cart
    num_requests = 0

    # !! non-blocking communication !!

    # send the North boundary to the north neighbour
    if domain.neighbour_north >= 0:
        # set tag to be the sender's rank
        # post receive
        requests[num_requests] = comm_cart.Irecv([data.bndN, MPI.DOUBLE], domain.neighbour_north, domain.neighbour_north)
        num_requests += 1

        # pack north buffer
        data.buffN[0, :] = U[ny-1, :]

        requests[num_requests] = comm_cart.Isend([data.buffN, MPI.DOUBLE], domain.neighbour_north, domain.rank)
        num_requests += 1
    
    # same for South
    if domain.neighbour_south >= 0:
        # set tag to be the sender's rank
        # post receive
        requests[num_requests] = comm_cart.Irecv([data.bndS, MPI.DOUBLE], domain.neighbour_south, domain.neighbour_south)
        num_requests += 1

        # pack south buffer
        data.buffS[0, :] = U[0, :]

        requests[num_requests] = comm_cart.Isend([data.buffS, MPI.DOUBLE], domain.neighbour_south, domain.rank)  
        num_requests += 1

    # same for East
    if domain.neighbour_east >= 0:
        # set tag to be the sender's rank
        # post receive
        requests[num_requests] = comm_cart.Irecv([data.bndE, MPI.DOUBLE], domain.neighbour_east, domain.neighbour_east)
        num_requests += 1

        # pack east buffer
        data.buffE[0, :] = U[:, nx-1]
    
        requests[num_requests] = comm_cart.Isend([data.buffE, MPI.DOUBLE], domain.neighbour_east, domain.rank)  
        num_requests += 1

    # same for West
    if domain.neighbour_west >= 0:
        # set tag to be the sender's rank
        # post receive
        requests[num_requests] = comm_cart.Irecv([data.bndW, MPI.DOUBLE], domain.neighbour_west, domain.neighbour_west)
        num_requests += 1

        # pack west buffer
        data.buffW[0, :] = U[:, 0]

        requests[num_requests] = comm_cart.Isend([data.buffW, MPI.DOUBLE], domain.neighbour_west, domain.rank) 
        num_requests += 1

    srow = domain.ny - 1
    scol = domain.nx - 1

    # the non-blocking communication allows the communications to take places
    # and while waiting we can calculate the interior grid points for each domain
    # srow and scol NOT INCLUDED in the slice operator (work like range())
    # S is y^(l+1)
    # U is x^(l)
    # data.x_old is x^(l-1)
    S[1:srow, 1:scol] = ( -(4.0 + alpha) * U[1:srow, 1:scol] 
                        + U[1-1:srow-1, 1:scol] + U[1+1:srow+1, 1:scol] 
                        + U[1:srow, 1-1:scol-1] + U[1:srow, 1+1:scol+1]
                        + beta * U[1:srow, 1:scol] * (1.0 - U[1:srow, 1:scol]) 
                        + alpha * data.x_old[1:srow, 1:scol] )

    # wait for all communication to succeed before calculating the boundaries of each subdomain
    MPI.Request.Waitall(requests, statuses)

    # east boundary
    srow = domain.ny - 1
    scol = domain.nx - 1

    S[1:srow, scol] = ( -(4.0 + alpha) * U[1:srow, scol]
                        + U[1-1:srow-1, scol] + U[1+1:srow+1, scol] 
                        + U[1:srow, scol-1] + data.bndE[0, 1:srow]
                        + beta * U[1:srow, scol] * (1.0 - U[1:srow, scol])
                        + alpha * data.x_old[1:srow, scol]  )

    srow = domain.ny - 1
    scol = 0
    
    # west boundary
    S[1:srow, scol] = ( -(4.0 + alpha) * U[1:srow, scol]
                        + U[1:srow, scol+1] + U[1-1:srow-1, scol] + U[1+1:srow+1, scol]
                        + alpha * data.x_old[1:srow, scol] + data.bndW[0, 1:srow]
                        + beta * U[1:srow, scol] * (1.0 - U[1:srow, scol]) )
    
    # North boundary
    srow = domain.ny - 1
    
    # NW corner
    scol = 0
    S[srow, scol] = ( -(4.0 + alpha) * U[srow, scol]
                        + U[srow-1, scol] + data.bndN[0][scol] 
                        + data.bndW[0, srow] + U[srow, scol+1]
                        + beta * U[srow, scol] * (1.0 - U[srow, scol])
                        + alpha * data.x_old[srow, scol])
    
    # north boundary
    scol = domain.nx - 1
    S[srow, 1:scol] = ( -(4.0 + alpha) * U[srow, 1:scol]
                        + U[srow, 1-1:scol-1] + U[srow, 1+1:scol+1] + U[srow-1, 1:scol]
                        + alpha * data.x_old[srow, 1:scol] + data.bndN[0, 1:scol]
                        + beta * U[srow, 1:scol] * (1.0 - U[srow, 1:scol]) )

    # NE corner
    scol = domain.nx - 1
    S[srow, scol] = ( -(4.0 + alpha) * U[srow, scol]
                        + U[srow, scol-1] + U[srow-1, scol]
                        + alpha * data.x_old[srow, scol] + data.bndE[0, srow] + data.bndN[0, scol]
                        + beta * U[srow, scol] * (1.0 - U[srow, scol]) )

    # South boundary
    srow = 0
    
    # SW corner
    scol = 0
    S[srow, scol] = ( -(4.0 + alpha) * U[srow, scol]
                        + U[srow, scol+1] + U[srow+1, scol]
                        + alpha * data.x_old[srow, scol] + data.bndW[0, srow] + data.bndS[0, scol]
                        + beta * U[srow, scol] * (1.0 - U[srow, scol]) )
    
    # south boundary
    scol = domain.nx - 1
    S[srow, 1:scol] = ( -(4.0 + alpha) * U[srow, 1:scol]
                        + U[srow, 1-1:scol-1] + U[srow, 1+1:scol+1] + U[srow+1, 1:scol]
                        + alpha * data.x_old[srow, 1:scol] + data.bndS[0, 1:scol]
                        + beta * U[srow, 1:scol] * (1.0 - U[srow, 1:scol]) )

    # SE corner
    scol = domain.nx - 1
    S[srow, scol] = ( -(4.0 + alpha) * U[srow, scol]
                        + U[srow, scol-1] + U[srow+1, scol]
                        + alpha * data.x_old[srow, scol] + data.bndE[0, srow] + data.bndS[0, scol]
                        + beta * U[srow, scol] * (1.0 - U[srow, scol]) )

    # Statistics
    # Update the flop counts (per-point counts given below)
    data.flops_count += (
            + 12 * (nx - 2) * (ny - 2)  # interior points
            + 11 * (nx - 2  +  ny - 2)  # all boundaries points
            + 11 * 4 )                  # corner points
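
The stencil above overlaps halo exchange with computation: the ghost-cell sends and
receives are posted first, the interior points (which need no ghost data) are updated,
and only then does MPI.Request.Waitall block before the edge and corner updates that
read bndN/bndS/bndE/bndW. A minimal, self-contained sketch of the same pattern on a 1D
periodic domain (everything below is illustrative and assumes u is a contiguous float64
array with at least two entries; it is not part of the example above):

    import numpy as np
    from mpi4py import MPI

    def overlap_step(comm, u):
        rank, size = comm.Get_rank(), comm.Get_size()
        left, right = (rank - 1) % size, (rank + 1) % size
        ghost = np.empty(2)                          # [left ghost, right ghost]
        reqs = [comm.Irecv(ghost[0:1], source=left, tag=0),
                comm.Irecv(ghost[1:2], source=right, tag=1),
                comm.Isend(u[0:1], dest=left, tag=1),
                comm.Isend(u[-1:], dest=right, tag=0)]
        s = np.empty_like(u)
        s[1:-1] = 0.5 * (u[:-2] + u[2:])             # interior: no ghost data needed
        MPI.Request.Waitall(reqs)                    # ghost values are valid from here on
        s[0] = 0.5 * (ghost[0] + u[1])               # boundary points read the ghosts
        s[-1] = 0.5 * (u[-2] + ghost[1])
        return s
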
Example 15
def replication_logistic_regression(n_procs, n_samples, n_features, input_dir,
                                    n_stragglers, is_real_data, params,
                                    add_delay, update_rule):

    assert update_rule in ('GD', 'AGD')

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()

    n_workers = n_procs - 1

    if (n_workers % (n_stragglers + 1)):
        print("Error: n_workers must be multiple of n_stragglers+1!")
        sys.exit(0)

    rounds = params[0]

    #beta=np.zeros(n_features)
    beta = np.random.randn(n_features)

    #rows_per_worker=n_samples/(n_procs-1)
    rows_per_worker = n_samples // (n_procs - 1)
    n_groups = n_workers // (n_stragglers + 1)

    # Loading the data
    if (rank):

        if not is_real_data:

            X_current = np.zeros(
                ((1 + n_stragglers) * rows_per_worker, n_features))
            y_current = np.zeros((1 + n_stragglers) * rows_per_worker)
            y = load_data(input_dir + "label.dat")

            for i in range(1 + n_stragglers):
                a = (rank - 1) // (n_stragglers + 1)  # index of group
                b = (rank - 1) % (n_stragglers + 1)   # position inside the group
                idx = (n_stragglers + 1) * a + (b + i) % (n_stragglers + 1)

                X_current[i * rows_per_worker:(i + 1) *
                          rows_per_worker, :] = load_data(input_dir +
                                                          str(idx + 1) +
                                                          ".dat")
                y_current[i * rows_per_worker:(i + 1) *
                          rows_per_worker] = y[idx *
                                               rows_per_worker:(idx + 1) *
                                               rows_per_worker]

        else:

            y_current = np.zeros((1 + n_stragglers) * rows_per_worker)
            y = load_data(input_dir + "label.dat")
            for i in range(1 + n_stragglers):
                a = (rank - 1) // (n_stragglers + 1)  # index of group
                b = (rank - 1) % (n_stragglers + 1)   # position inside the group
                idx = (n_stragglers + 1) * a + (b + i) % (n_stragglers + 1)

                if i == 0:
                    X_current = load_sparse_csr(input_dir + str(idx + 1))
                else:
                    X_temp = load_sparse_csr(input_dir + str(idx + 1))
                    X_current = sps.vstack((X_current, X_temp))
                y_current[i * rows_per_worker:(i + 1) *
                          rows_per_worker] = y[idx *
                                               rows_per_worker:(idx + 1) *
                                               rows_per_worker]

    # Initializing relevant variables
    if (rank):

        predy = X_current.dot(beta)
        g = -X_current.T.dot(
            np.divide(y_current,
                      np.exp(np.multiply(predy, y_current)) + 1))
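        # The expression above is the gradient of the logistic loss with labels
        # y in {-1, +1}:  g = -sum_i y_i * x_i / (1 + exp(y_i * x_i . beta)),
        # evaluated on this worker's replicated block of the data.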
        send_req = MPI.Request()
        recv_reqs = []

    else:

        msgBuffers = [np.zeros(n_features) for i in range(n_procs - 1)]
        g = np.zeros(n_features)
        betaset = np.zeros((rounds, n_features))
        timeset = np.zeros(rounds)
        worker_timeset = np.zeros((rounds, n_procs - 1))

        request_set = []
        recv_reqs = []
        send_set = []

        cnt_groups = 0
        completed_groups = np.ndarray(n_groups, dtype=bool)
        completed_workers = np.ndarray(n_procs - 1, dtype=bool)

        status = MPI.Status()

        eta0 = params[2]  # ----- learning rate
        alpha = params[1]  # --- coefficient of l2 regularization
        utemp = np.zeros(n_features)  # for accelerated gradient descent

    # Posting all Irecv requests for master and workers
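    # (The round index doubles as the message tag, so a late message from an
    # earlier round cannot be matched against the current round's request.)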
    if (rank):

        for i in range(rounds):
            req = comm.Irecv([beta, MPI.DOUBLE], source=0, tag=i)
            recv_reqs.append(req)
    else:

        for i in range(rounds):
            recv_reqs = []
            for j in range(1, n_procs):
                req = comm.Irecv([msgBuffers[j - 1], MPI.DOUBLE],
                                 source=j,
                                 tag=i)
                recv_reqs.append(req)
            request_set.append(recv_reqs)

    ###########################################################################################
    comm.Barrier()
    if rank == 0:
        print("---- Starting Replication Iterations for " + str(n_stragglers) +
              " stragglers" + "simulated delay " + str(add_delay) + "-------")
        orig_start_time = time.time()

    for i in range(rounds):
        if rank == 0:

            if (i % 10 == 0):
                print("\t >>> At Iteration %d" % (i))

            send_set[:] = []
            g[:] = 0
            completed_groups[:] = False
            completed_workers[:] = False
            cnt_groups = 0
            cnt_workers = 0

            start_time = time.time()

            # bcast model step
            for l in range(1, n_procs):
                sreq = comm.Isend([beta, MPI.DOUBLE], dest=l, tag=i)
                send_set.append(sreq)

            while cnt_groups < n_groups:
                req_done = MPI.Request.Waitany(request_set[i], status)
                src = status.Get_source()
                worker_timeset[i, src - 1] = time.time() - start_time
                request_set[i].pop(req_done)

                completed_workers[src - 1] = True
                groupid = (src - 1) // (n_stragglers + 1)

                if (not completed_groups[groupid]):
                    completed_groups[groupid] = True
                    g += msgBuffers[src - 1]
                    cnt_groups += 1

            grad_multiplier = eta0[i] / n_samples
            # ---- update step for gradient descent
            if update_rule == "GD":
                np.subtract((1 - 2 * alpha * eta0[i]) * beta,
                            grad_multiplier * g,
                            out=beta)
            elif update_rule == "AGD":
                # ---- updates for accelerated gradient descent
                theta = 2.0 / (i + 2.0)
                ytemp = (1 - theta) * beta + theta * utemp
                betatemp = ytemp - grad_multiplier * g - (2 * alpha *
                                                          eta0[i]) * beta
                utemp = beta + (betatemp - beta) * (1 / theta)
                beta[:] = betatemp
            else:
                raise Exception("Error update rule")

            timeset[i] = time.time() - start_time

            betaset[i, :] = beta
            ind_set = [
                l for l in range(1, n_procs) if not completed_workers[l - 1]
            ]
            for l in ind_set:
                worker_timeset[i, l - 1] = -1

            MPI.Request.Waitall(send_set)
            MPI.Request.Waitall(request_set[i])

        else:

            recv_reqs[i].Wait()

            sendTestBuf = send_req.test()
            if not sendTestBuf[0]:
                send_req.Cancel()
                #print("Worker " + str(rank) + " cancelled send request for Iteration " + str(i))

            predy = X_current.dot(beta)
            g = X_current.T.dot(
                np.divide(y_current,
                          np.exp(np.multiply(predy, y_current)) + 1))
            g *= -1

            ########################################## straggler simulation ###################################################
            if add_delay == 1:
                np.random.seed(seed=i)
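                # Seeding with the round index makes every worker draw the same
                # delay vector; each worker then sleeps for its own entry below.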
                #straggler_indices = np.random.choice([t for t in range(1, n_workers+1)], n_stragglers, replace=False)
                #if rank in straggler_indices:
                #    time.sleep(time_sleep)
                artificial_delays = np.random.exponential(0.5, n_workers)
                delay = artificial_delays[rank - 1]
                time.sleep(delay)
            ###################################################################################################################

            send_req = comm.Isend([g, MPI.DOUBLE], dest=0, tag=i)

    #############################################################################################
    comm.Barrier()
    if rank == 0:
        elapsed_time = time.time() - orig_start_time
        print("Total Time Elapsed: %.3f" % (elapsed_time))
        # Load all training data
        if not is_real_data:
            X_train = load_data(input_dir + "1.dat")
            for j in range(2, n_procs - 1):
                X_temp = load_data(input_dir + str(j) + ".dat")
                X_train = np.vstack((X_train, X_temp))
        else:
            X_train = load_sparse_csr(input_dir + "1")
            for j in range(2, n_procs - 1):
                X_temp = load_sparse_csr(input_dir + str(j))
                X_train = sps.vstack((X_train, X_temp))

        y_train = load_data(input_dir + "label.dat")
        y_train = y_train[0:X_train.shape[0]]

        # Load all testing data
        y_test = load_data(input_dir + "label_test.dat")
        if not is_real_data:
            X_test = load_data(input_dir + "test_data.dat")
        else:
            X_test = load_sparse_csr(input_dir + "test_data")

        n_train = X_train.shape[0]
        n_test = X_test.shape[0]

        training_loss = np.zeros(rounds)
        testing_loss = np.zeros(rounds)
        auc_loss = np.zeros(rounds)

        from sklearn.metrics import roc_curve, auc

        for i in range(rounds):
            beta = np.squeeze(betaset[i, :])
            predy_train = X_train.dot(beta)
            predy_test = X_test.dot(beta)
            training_loss[i] = calculate_loss(y_train, predy_train, n_train)
            testing_loss[i] = calculate_loss(y_test, predy_test, n_test)
            fpr, tpr, thresholds = roc_curve(y_test, predy_test, pos_label=1)
            auc_loss[i] = auc(fpr, tpr)
            print(
                "Iteration %d: Train Loss = %5.3f, Test Loss = %5.3f, AUC = %5.3f, Total time taken =%5.3f"
                % (i, training_loss[i], testing_loss[i], auc_loss[i],
                   timeset[i]))

        output_dir = input_dir + "results/"
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        save_vector(
            training_loss, output_dir +
            "replication_acc_%d_training_loss.dat" % (n_stragglers))
        save_vector(
            testing_loss, output_dir + "replication_acc_%d_testing_loss.dat" %
            (n_stragglers))
        save_vector(auc_loss,
                    output_dir + "replication_acc_%d_auc.dat" % (n_stragglers))
        save_vector(
            timeset,
            output_dir + "replication_acc_%d_timeset.dat" % (n_stragglers))
        save_matrix(
            worker_timeset, output_dir +
            "replication_acc_%d_worker_timeset.dat" % (n_stragglers))
        print(">>> Done")

    comm.Barrier()
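
In the replication scheme above, the workers are split into groups of size
n_stragglers + 1, and every worker in a group loads the same n_stragglers + 1 data
partitions, so the master needs only one reply per group in each round. A small
self-contained sketch of that assignment (the helper name partitions_for_worker is
made up for illustration; it mirrors the index arithmetic in the loading loop above):

    def partitions_for_worker(rank, n_stragglers):
        """Zero-based partition indices loaded by worker `rank` (rank >= 1)."""
        a = (rank - 1) // (n_stragglers + 1)   # group index
        b = (rank - 1) % (n_stragglers + 1)    # position inside the group
        return [(n_stragglers + 1) * a + (b + i) % (n_stragglers + 1)
                for i in range(n_stragglers + 1)]

    # With n_stragglers = 1: workers 1 and 2 form group 0 and both hold
    # partitions {0, 1}, workers 3 and 4 both hold {2, 3}, and so on.
    # partitions_for_worker(1, 1) == [0, 1];  partitions_for_worker(2, 1) == [1, 0]
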
Example 16
    def _sync(self):
        """
        Send output data and receive input data.
        """

        if self.time_sync:
            start = time.time()
        req = MPI.Request()
        requests = []

        # Transmit the entire port data array to each destination module:
        dest_ids = self.routing_table.dest_ids(self.id)
        for dest_id in dest_ids:
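            # rank_to_id appears to be a bidirectional mapping; the slice syntax
            # [:dest_id] performs the inverse (module id -> MPI rank) lookup.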
            dest_rank = self.rank_to_id[:dest_id]
            r = MPI.COMM_WORLD.Isend(
                [self._data_int['gpot'], self._data_mtype['gpot']], dest_rank,
                GPOT_TAG)
            requests.append(r)
            r = MPI.COMM_WORLD.Isend(
                [self._data_int['spike'], self._data_mtype['spike']],
                dest_rank, SPIKE_TAG)
            requests.append(r)

            if not self.time_sync:
                self.log_info('sending to %s' % dest_id)
        if not self.time_sync:
            self.log_info('sent all data from %s' % self.id)

        # For each source module, receive elements and copy them into the
        # current module's port data array:
        src_ids = self.routing_table.src_ids(self.id)
        for src_id in src_ids:
            src_rank = self.rank_to_id[:src_id]
            r = MPI.COMM_WORLD.Irecv([
                self._in_buf_int['gpot'][src_id],
                self._in_buf_mtype['gpot'][src_id]
            ],
                                     source=src_rank,
                                     tag=GPOT_TAG)
            requests.append(r)
            r = MPI.COMM_WORLD.Irecv([
                self._in_buf_int['spike'][src_id],
                self._in_buf_mtype['spike'][src_id]
            ],
                                     source=src_rank,
                                     tag=SPIKE_TAG)
            requests.append(r)
            if not self.time_sync:
                self.log_info('receiving from %s' % src_id)
        req.Waitall(requests)
        if not self.time_sync:
            self.log_info('received all data for %s' % self.id)

        # Copy received elements into the current module's data array:
        n_gpot = 0
        n_spike = 0
        for src_id in src_ids:
            ind_from_gpot = self._from_port_dict_ids['gpot'][src_id]
            ind_in_gpot = self._in_port_dict_ids['gpot'][src_id]
            set_by_inds_from_inds(self.data['gpot'], ind_in_gpot,
                                  self._in_buf['gpot'][src_id], ind_from_gpot)
            n_gpot += len(self._in_buf['gpot'][src_id])
            ind_from_spike = self._from_port_dict_ids['spike'][src_id]
            ind_in_spike = self._in_port_dict_ids['spike'][src_id]
            set_by_inds_from_inds(self.data['spike'], ind_in_spike,
                                  self._in_buf['spike'][src_id],
                                  ind_from_spike)
            n_spike += len(self._in_buf['spike'][src_id])

        # Save timing data:
        if self.time_sync:
            stop = time.time()
            #self.log_info('sent timing data to master')
            self.intercomm.isend(['time', (self.rank, self.steps, start, stop,
                n_gpot*self.pm['gpot'].dtype.itemsize+\
                n_spike*self.pm['spike'].dtype.itemsize)],
                    dest=0, tag=self._ctrl_tag)
        else:
            self.log_info('saved all data received by %s' % self.id)
Example 17
    def run(self, steps=0):
        """
        Main body of worker process.
        """

        #self.pre_run()
        self.catch_exception_run(self.pre_run)
        self.pbar = tqdm(desc=self.progressbar_name(), position=self.rank)

        self.log_info('running body of worker %s' % self.rank)

        # Start listening for control messages from parent process:
        if self.manager:
            r_ctrl = []
            try:
                d = self.intercomm.irecv(source=0, tag=self._ctrl_tag)
            except TypeError:
                # irecv() in mpi4py 1.3.1 stable uses 'dest' instead of 'source':
                d = self.intercomm.irecv(dest=0, tag=self._ctrl_tag)
            r_ctrl.append(d)
            req = MPI.Request()

        running = False
        self.steps = 0
        if not self.manager:
            self.max_steps = steps
            self.pbar.total = self.max_steps
            running = True
        while True:
            if self.manager:
                # Handle control messages (this assumes that only one control
                # message will arrive at a time):
                flag, msg_list = req.testall(r_ctrl)
                if flag:
                    msg = msg_list[0]

                    # Start executing work method:
                    if msg[0] == 'start':
                        self.log_info('starting')
                        running = True

                    # Stop executing work method:
                    elif msg[0] == 'stop':
                        if self.max_steps == float('inf'):
                            self.log_info('stopping')
                            running = False
                        else:
                            self.log_info('max steps set - not stopping')

                    # Set maximum number of execution steps:
                    elif msg[0] == 'steps':
                        if msg[1] == 'inf':
                            self.max_steps = float('inf')
                        else:
                            self.max_steps = int(msg[1])
                        self.pbar.total = self.max_steps
                        self.log_info('setting maximum steps to %s' %
                                      self.max_steps)

                    # Quit:
                    elif msg[0] == 'quit':
                        # if self.max_steps == float('inf'):
                        self.log_info('quitting')
                        break
                        # else:
                        #     self.log_info('max steps set - not quitting')

                    # Get next message:
                    r_ctrl = []
                    try:
                        d = self.intercomm.irecv(source=0, tag=self._ctrl_tag)
                    except TypeError:
                        # irecv() in mpi4py 1.3.1 stable uses 'dest' instead of 'source':
                        d = self.intercomm.irecv(dest=0, tag=self._ctrl_tag)
                    r_ctrl.append(d)

            # Execute work method; the work method may send data back to the master
            # as a serialized control message containing two elements, e.g.,
            # self.intercomm.isend(['foo', str(self.rank)],
            #                      dest=0, tag=self._ctrl_tag)
            if running:
                self.do_work()
                self.steps += 1
                self.pbar.update()
                self.log_info('execution step: %s' % self.steps)

            # Leave loop if maximum number of steps has been reached:
            if self.steps >= self.max_steps:
                running = False
                self.log_info('maximum steps reached')
                break

        #self.post_run()
        self.catch_exception_run(self.post_run)
        if not self.post_run_complete:
            self._finalize()
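
The loop above implements a small control protocol: each control message received from
the parent is a list whose first element is 'start', 'stop', 'steps' or 'quit'. As a
rough illustration of the manager side (this is only a sketch, not the actual manager
implementation; the intercommunicator, worker count and control tag are assumed to be
available):

    def drive_workers(intercomm, n_workers, ctrl_tag, n_steps):
        # Set the step budget for every worker, then start it:
        for w in range(n_workers):
            intercomm.isend(['steps', str(n_steps)], dest=w, tag=ctrl_tag).wait()
            intercomm.isend(['start'], dest=w, tag=ctrl_tag).wait()
        # ... later, ask every worker to shut down:
        for w in range(n_workers):
            intercomm.isend(['quit'], dest=w, tag=ctrl_tag).wait()
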
Example 18
    def spawn(self, part_map):
        """
        Spawn MPI processes for and execute each of the managed targets.

        Parameters
        ----------
        part_map : dict
            Maps GPU ID to list of target MPI ranks.
        """

        if self._is_parent:

            # The number of GPUs over which the targets are partitioned may not
            # exceed the actual number of supported devices:
            n_part_gpus = len(part_map.keys())
            n_avail_gpus = 0
            drv.init()
            for i in range(drv.Device.count()):

                # MPS requires Tesla/Quadro GPUs with compute capability 3.5 or greater:
                if mps_avail:
                    d = drv.Device(i)
                    if d.compute_capability() >= (3, 5) and \
                       re.search('Tesla|Quadro', d.name()):
                        n_avail_gpus += 1
                else:
                    n_avail_gpus += 1
            if n_part_gpus > n_avail_gpus:
                raise RuntimeError('partition size (%s) exceeds '
                                   'number of available GPUs (%s)' % \
                                   (n_part_gpus, n_avail_gpus))

            # Start MPS control daemons (this assumes that the available GPUs
            # are numbered consecutively from 0 onwards - as are the elements of
            # part_map.keys()):
            if self._mps_man:
                self._mps_man.start()
                self.log_info('starting MPS')

            # Find the path to the mpi_backend.py script (which should be in the
            # same directory as this module):
            import neurokernel.mpi
            parent_dir = os.path.dirname(neurokernel.mpi.__file__)
            mpi_backend_path = os.path.join(parent_dir, 'mpi_backend.py')

            # Check that the union of the ranks in the partition corresponds
            # exactly to the ranks of the targets added to the manager:
            n_targets = len(self._targets.keys())
            if set(self._targets.keys()) != \
               set([t for t in itertools.chain.from_iterable(part_map.values())]):
                raise ValueError('partition must contain all target ranks')

            # Invert mapping of GPUs to MPI ranks:
            rank_to_gpu_map = {
                rank: gpu
                for gpu in part_map.keys() for rank in part_map[gpu]
            }

            # Set MPS pipe directory:
            info = MPI.Info.Create()
            if self._mps_man:
                mps_dir = self._mps_man.get_mps_dir(
                    self._mps_man.get_mps_ctrl_proc())
                info.Set('env', 'CUDA_MPS_PIPE_DIRECTORY=%s' % mps_dir)

            # Spawn processes:
            self._intercomm = MPI.COMM_SELF.Spawn(sys.executable,
                                                  args=[mpi_backend_path],
                                                  maxprocs=n_targets,
                                                  info=info)

            # First, transmit twiggy logging emitters to spawned processes so
            # that they can configure their logging facilities:
            for i in self._targets.keys():
                self._intercomm.send(twiggy.emitters, i)

            # Next, serialize the routing table ONCE and then transmit it to all
            # of the child nodes:
            self._intercomm.bcast(self.routing_table, root=MPI.ROOT)

            # Transmit class to instantiate, globals required by the class, and
            # the constructor arguments; the backend will wait to receive
            # them and then start running the targets on the appropriate nodes.
            req = MPI.Request()
            r_list = []
            for i in self._targets.keys():
                target_globals = all_global_vars(self._targets[i])

                # Serializing atexit with dill appears to fail in virtualenvs
                # sometimes if atexit._exithandlers contains an unserializable function:
                if 'atexit' in target_globals:
                    del target_globals['atexit']
                data = (self._targets[i], target_globals, self._kwargs[i])
                r_list.append(self._intercomm.isend(data, i))

                # Need to clobber data to prevent all_global_vars from
                # including it in its output:
                del data
            req.Waitall(r_list)
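
The part_map argument maps a GPU ID to the list of MPI ranks assigned to that GPU; the
dictionary comprehension above simply inverts it. A short illustration with made-up IDs:

    part_map = {0: [0, 1], 1: [2, 3]}        # GPU id -> target MPI ranks
    rank_to_gpu_map = {rank: gpu
                       for gpu in part_map.keys()
                       for rank in part_map[gpu]}
    # rank_to_gpu_map == {0: 0, 1: 0, 2: 1, 3: 1}
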
Example 19
def partial_replication_logistic_regression(n_procs, n_samples, n_features,
                                            input_dir, n_stragglers,
                                            n_partitions, is_real_data,
                                            params):

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()

    rounds = params[0]
    n_workers = n_procs - 1

    if (n_workers % (n_stragglers + 1)):
        print("Error: n_workers must be multiple of n_stragglers+1!")
        sys.exit(0)

    # number of samples per partition
    rows_per_worker = n_samples // ((n_partitions - n_stragglers) * n_workers)
    n_groups = n_workers // (n_stragglers + 1)
    n_separate = n_partitions - n_stragglers - 1
    sep_lim = n_separate * rows_per_worker

    # Loading the data
    if (rank):

        if not is_real_data:

            y = load_data(input_dir + "label.dat")
            X_current = np.zeros([n_partitions * rows_per_worker, n_features])
            y_current = np.zeros(n_partitions * rows_per_worker)

            for i in range(n_separate):
                idx = i + n_separate * (rank - 1)
                X_current[i * rows_per_worker:(i + 1) *
                          rows_per_worker, :] = load_data(input_dir +
                                                          str(idx + 1) +
                                                          ".dat")
                y_current[i * rows_per_worker:(i + 1) *
                          rows_per_worker] = y[idx *
                                               rows_per_worker:(idx + 1) *
                                               rows_per_worker]

            for i in range(n_separate, n_partitions):
                a = (rank - 1) // (n_stragglers + 1)  # index of group
                b = i - n_separate  # position inside the group
                idx = n_separate * n_workers + a * (n_stragglers + 1) + b

                X_current[i * rows_per_worker:(i + 1) *
                          rows_per_worker, :] = load_data(input_dir +
                                                          str(idx + 1) +
                                                          ".dat")
                y_current[i * rows_per_worker:(i + 1) *
                          rows_per_worker] = y[idx *
                                               rows_per_worker:(idx + 1) *
                                               rows_per_worker]

        else:

            y = load_data(input_dir + "label.dat")

            y_current = np.zeros(n_partitions * rows_per_worker)

            for i in range(n_separate):
                idx = i + n_separate * (rank - 1)
                y_current[i * rows_per_worker:(i + 1) *
                          rows_per_worker] = y[idx *
                                               rows_per_worker:(idx + 1) *
                                               rows_per_worker]

                if i == 0:
                    X_current = load_sparse_csr(input_dir + str(idx + 1))
                else:
                    X_temp = load_sparse_csr(input_dir + str(idx + 1))
                    X_current = sps.vstack((X_current, X_temp))

            for i in range(n_separate, n_partitions):
                a = (rank - 1) // (n_stragglers + 1)  # index of group
                b = i - n_separate  # position inside the group
                idx = n_separate * n_workers + a * (n_stragglers + 1) + b

                y_current[i * rows_per_worker:(i + 1) *
                          rows_per_worker] = y[idx *
                                               rows_per_worker:(idx + 1) *
                                               rows_per_worker]

                X_temp = load_sparse_csr(input_dir + str(idx + 1))
                X_current = sps.vstack((X_current, X_temp))

            X_current = X_current.tocsr()

    # Initializing relevant variables
    beta = np.zeros(n_features)
    if (rank):

        predy = X_current.dot(beta)
        g_firstpart = -X_current.T.dot(
            np.divide(y_current,
                      np.exp(np.multiply(predy, y_current)) + 1))
        g_secondpart = -X_current.T.dot(
            np.divide(y_current,
                      np.exp(np.multiply(predy, y_current)) + 1))

        send_req1 = MPI.Request()
        send_req2 = MPI.Request()
        recv_reqs = []

    else:

        print('Stragglers are allowed to be at most %.2f times slower' %
              (n_partitions * 1.0 / (n_partitions - n_stragglers - 1)))

        msgBuffers_firstparts = [
            np.zeros(n_features) for i in range(n_procs - 1)
        ]
        msgBuffers_secondparts = [
            np.zeros(n_features) for i in range(n_procs - 1)
        ]

        g = np.zeros(n_features)
        betaset = np.zeros((rounds, n_features))
        timeset = np.zeros(rounds)
        worker_timeset = np.zeros((rounds, n_procs - 1))

        request_set = []
        recv_reqs = []
        send_set = []

        cnt_groups = 0
        cnt_firstpart = 0
        completed_groups = np.ndarray(n_groups, dtype=bool)
        completed_workers = np.ndarray(n_workers, dtype=bool)
        completed_firstparts = np.ndarray(n_workers, dtype=bool)

        status = MPI.Status()

        eta0 = params[2]  # ----- learning rate
        alpha = params[1]  # --- coefficient of l2 regularization
        utemp = np.zeros(n_features)  # for accelerated gradient descent

    # Posting all Irecv requests for master and workers
    if (rank):

        for i in range(rounds):
            req = comm.Irecv([beta, MPI.DOUBLE], source=0, tag=i)
            recv_reqs.append(req)
    else:

        for i in range(rounds):
            recv_reqs = []
            for j in range(1, n_procs):
                req1 = comm.Irecv([msgBuffers_firstparts[j - 1], MPI.DOUBLE],
                                  source=j,
                                  tag=2 * rounds + i)
                recv_reqs.append(req1)
                req2 = comm.Irecv([msgBuffers_secondparts[j - 1], MPI.DOUBLE],
                                  source=j,
                                  tag=i)
                recv_reqs.append(req2)

            request_set.append(recv_reqs)

    #######################################################################################################################
    comm.Barrier()
    if rank == 0:
        print("---- Starting Partial Replication Iterations for " +
              str(n_stragglers) + " stragglers ----")
        orig_start_time = time.time()

    for i in range(rounds):
        if rank == 0:

            if (i % 10 == 0):
                print("\t >>> At Iteration %d" % (i))

            send_set[:] = []
            g[:] = 0

            cnt_firstpart = 0
            completed_firstparts[:] = False

            completed_groups[:] = False
            cnt_groups = 0
            completed_workers[:] = False

            start_time = time.time()

            for l in range(1, n_procs):
                sreq = comm.Isend([beta, MPI.DOUBLE], dest=l, tag=i)
                send_set.append(sreq)

            while cnt_groups < n_groups or cnt_firstpart < n_workers:

                req_done = MPI.Request.Waitany(request_set[i], status)
                src = status.Get_source()
                tag = status.Get_tag()
                worker_timeset[i, src - 1] = time.time() - start_time
                request_set[i].pop(req_done)

                if tag == i:

                    completed_workers[src - 1] = True
                    groupid = (src - 1) // (n_stragglers + 1)

                    if not completed_groups[groupid]:
                        completed_groups[groupid] = True
                        g += msgBuffers_secondparts[src - 1]
                        cnt_groups += 1

                elif tag == 2 * rounds + i:
                    g += msgBuffers_firstparts[src - 1]
                    completed_firstparts[src - 1] = True
                    cnt_firstpart += 1

            grad_multiplier = eta0[i] / n_samples
            # ---- update step for gradient descent
            # np.subtract((1-2*alpha*eta0[i])*beta , grad_multiplier*g, out=beta)

            # ---- updates for accelerated gradient descent
            theta = 2.0 / (i + 2.0)
            ytemp = (1 - theta) * beta + theta * utemp
            betatemp = ytemp - grad_multiplier * g - (2 * alpha *
                                                      eta0[i]) * beta
            utemp = beta + (betatemp - beta) * (1 / theta)
            beta[:] = betatemp

            timeset[i] = time.time() - start_time

            betaset[i, :] = beta
            ind_set = [
                l for l in range(1, n_procs) if not completed_workers[l - 1]
            ]
            for l in ind_set:
                worker_timeset[i, l - 1] = -1

        else:

            recv_reqs[i].Wait()

            sendTestBuf = send_req1.test()
            if not sendTestBuf[0]:
                send_req1.Cancel()

            sendTestBuf = send_req2.test()
            if not sendTestBuf[0]:
                send_req2.Cancel()

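            # The first sep_lim rows are this worker's unreplicated partitions;
            # their gradient is sent with tag 2*rounds + i. The remaining
            # (replicated) rows are sent with tag i, which the master counts
            # only once per group.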
            predy = X_current[0:sep_lim, :].dot(beta)
            g_firstpart = X_current[0:sep_lim, :].T.dot(
                np.divide(y_current[0:sep_lim],
                          np.exp(np.multiply(predy, y_current[0:sep_lim])) +
                          1))
            g_firstpart *= -1
            send_req1 = comm.Isend([g_firstpart, MPI.DOUBLE],
                                   dest=0,
                                   tag=2 * rounds + i)

            predy = X_current[sep_lim:, :].dot(beta)
            g_secondpart = X_current[sep_lim:, :].T.dot(
                np.divide(y_current[sep_lim:],
                          np.exp(np.multiply(predy, y_current[sep_lim:])) + 1))
            g_secondpart *= -1
            send_req2 = comm.Isend([g_secondpart, MPI.DOUBLE], dest=0, tag=i)

    ######################################################################################################################
    comm.Barrier()
    if rank == 0:
        elapsed_time = time.time() - orig_start_time
        print("Total Time Elapsed: %.3f" % (elapsed_time))
        # Load all training data
        if not is_real_data:
            X_train = load_data(input_dir + "1.dat")
            for j in range(2, n_procs - 1):
                X_temp = load_data(input_dir + str(j) + ".dat")
                X_train = np.vstack((X_train, X_temp))
        else:
            X_train = load_sparse_csr(input_dir + "1")
            for j in range(2, n_procs - 1):
                X_temp = load_sparse_csr(input_dir + str(j))
                X_train = sps.vstack((X_train, X_temp))

        y_train = load_data(input_dir + "label.dat")
        y_train = y_train[0:X_train.shape[0]]

        # Load all testing data
        y_test = load_data(input_dir + "label_test.dat")
        if not is_real_data:
            X_test = load_data(input_dir + "test_data.dat")
        else:
            X_test = load_sparse_csr(input_dir + "test_data")

        n_train = X_train.shape[0]
        n_test = X_test.shape[0]

        training_loss = np.zeros(rounds)
        testing_loss = np.zeros(rounds)
        auc_loss = np.zeros(rounds)

        from sklearn.metrics import roc_curve, auc

        for i in range(rounds):
            beta = np.squeeze(betaset[i, :])
            predy_train = X_train.dot(beta)
            predy_test = X_test.dot(beta)
            training_loss[i] = calculate_loss(y_train, predy_train, n_train)
            testing_loss[i] = calculate_loss(y_test, predy_test, n_test)
            fpr, tpr, thresholds = roc_curve(y_test, predy_test, pos_label=1)
            auc_loss[i] = auc(fpr, tpr)
            print(
                "Iteration %d: Train Loss = %5.3f, Test Loss = %5.3f, AUC = %5.3f, Total time taken =%5.3f"
                % (i, training_loss[i], testing_loss[i], auc_loss[i],
                   timeset[i]))

        output_dir = input_dir + "results/"
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        save_vector(
            training_loss,
            output_dir + "partialreplication_%d_%d_training_loss.dat" %
            (n_stragglers, n_partitions))
        save_vector(
            testing_loss,
            output_dir + "partialreplication_%d_%d_testing_loss.dat" %
            (n_stragglers, n_partitions))
        save_vector(
            auc_loss, output_dir + "partialreplication_%d_%d_auc.dat" %
            (n_stragglers, n_partitions))
        save_vector(
            timeset, output_dir + "partialreplication_%d_%d_timeset.dat" %
            (n_stragglers, n_partitions))
        save_matrix(
            worker_timeset,
            output_dir + "partialreplication_%d_%d_worker_timeset.dat" %
            (n_stragglers, n_partitions))
        print(">>> Done")

    comm.Barrier()