def addManagedData(self, data):
    """
    Set the data of the network, used for managing multiple input data sources for training
    :param data: training data and labels specified as dictionary
    :return: None
    """
    if not isinstance(data, dict):
        raise ValueError("Error: expected dictionary for data!")

    for key in data:
        # check sizes
        if data[key].shape[0] != self.numTrainSamplesMB:
            raise ValueError("Number of samples must be the same as number of labels.")

        if self.getNumMacroBatches() > 1:
            if self.cfgParams.para_load is True:
                setattr(self, key+'DB', sharedmem.copy(data[key][0:(self.getNumMacroBatches()-1)*self.getNumSamplesPerMacroBatch()]))
                setattr(self, key+'DBlast', sharedmem.copy(self.alignData(data[key][(self.getNumMacroBatches()-1)*self.getNumSamplesPerMacroBatch():], fillData=data[key])))
            else:
                # save memory, we do not need extra sharedmem
                setattr(self, key+'DB', data[key][0:(self.getNumMacroBatches()-1)*self.getNumSamplesPerMacroBatch()])
                setattr(self, key+'DBlast', self.alignData(data[key][(self.getNumMacroBatches()-1)*self.getNumSamplesPerMacroBatch():], fillData=data[key]))
            self.managedVar.append(key)
        else:
            if self.cfgParams.para_load is True:
                setattr(self, key+'DB', sharedmem.copy(self.alignData(data[key])))
            else:
                # save memory, we do not need extra sharedmem
                setattr(self, key + 'DB', self.alignData(data[key]))
            self.trainingVar.append(key)

        # shared variable already exists?
        if hasattr(self, key):
            print("Reusing shared variables!")
            if self.trainSize > self.getGPUMemAligned():
                print("Loading {} macro batches of {}MB each".format(self.getNumMacroBatches(), self.getGPUMemAligned()))
                # load first macro batch
                idx = self.getNumSamplesPerMacroBatch()
                self.replaceTrainingData(0, idx)
            else:
                print("Loading single macro batch {}/{}MB".format(self.trainSize, self.getGPUMemAligned()))
                self.replaceTrainingData(0, self.train_data_xDB.shape[0])
        else:
            # load shared data
            if self.trainSize > self.getGPUMemAligned():
                print("Loading {} macro batches of {}MB each".format(self.getNumMacroBatches(), self.getGPUMemAligned()))
                # load first macro batch
                idx = self.getNumSamplesPerMacroBatch()
                setattr(self, key, theano.shared(getattr(self, key+'DB')[:idx], name=key, borrow=True))
            else:
                print("Loading single macro batch {}/{}MB".format(self.trainSize, self.getGPUMemAligned()))
                setattr(self, key, theano.shared(getattr(self, key+'DB'), name=key, borrow=True))
def mapper(f, pars, *argv, **kwarg):
    # create a shared object from each input array
    x_s = [sharedmem.copy(x) for x in argv]

    # parallel process over the shared objects
    t0 = time()
    with mp.Pool(3) as pool:
        res = pool.starmap_async(f, [(p, *x_s, *list(kwarg.values())) for p in pars])
        a = res.get()
        pool.close()
        pool.join()
    return a
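# --- Illustrative usage sketch (not part of the original source) ---
# A minimal driver for `mapper` above, assuming `multiprocessing as mp`, `sharedmem`,
# and `time` are imported as in the snippet. The worker `scaled_sum` is hypothetical;
# it receives one parameter from `pars`, then the shared arrays, then any keyword values,
# in the same order `mapper` builds its task tuples.
import multiprocessing as mp
from time import time

import numpy as np
import sharedmem


def scaled_sum(p, x, y, offset=0.0):
    # the arrays arrive as sharedmem-backed views, per the pattern in `mapper` above
    return p * (x.sum() + y.sum()) + offset


if __name__ == "__main__":
    x = np.arange(1_000_000, dtype=float)
    y = np.ones(1000)
    # one task per parameter; x and y are wrapped with sharedmem.copy inside mapper
    results = mapper(scaled_sum, [0.5, 1.0, 2.0], x, y, offset=3.0)
    print(results)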
def shm_gaukernop_at(x, xp, y, data):
    y = np.ascontiguousarray(y)

    x_shm = shm.copy(x)
    xp_shm = shm.copy(xp)
    y_shm = shm.copy(y)
    data_shm = shm.copy(data)

    nthread = int(os.environ["TENSIGA_NUM_THREADS"])
    result = shm.empty(y.shape, float)
    with shm.MapReduce(np=nthread) as pool:
        def row(k):
            d = x_shm[k, :] - xp_shm
            norm = np.sqrt(np.sum(d**2, axis=1))
            return k, ((data_shm[0]**2) * np.exp(-(norm/(data_shm[1]*data_shm[2]))**2)) @ y_shm

        def reduce(k, coeff):
            result[k] = coeff

        r = pool.map(row, np.arange(x_shm.shape[0]), reduce=reduce)

    return result
def __make_np_arrays_sharable(self):
    """
    Replaces all numpy array object variables with dimension > 0 with a sharedmem array,
    which should have the same behaviour / properties as the numpy array
    """
    varDict = self.__dict__
    for key, var in varDict.items():
        if type(var) is np.ndarray:
            if key not in self.exclude:
                try:
                    varDict[key] = sharedmem.copy(var)
                except AttributeError:
                    share_var = sharedmem.empty(1, type(var))
                    share_var[0] = var
                    varDict[key] = share_var
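# --- Illustrative sketch (not from the original class) ---
# Shows the property the docstring above relies on: an array returned by sharedmem.copy
# behaves like a regular numpy array, and writes made by forked MapReduce workers are
# visible to the parent process without explicit result passing.
import numpy as np
import sharedmem

a = sharedmem.copy(np.zeros(8))

with sharedmem.MapReduce(np=4) as pool:
    def fill(i):
        a[i] = i * i  # workers write straight into the shared buffer
    pool.map(fill, range(len(a)))

print(a)        # [ 0.  1.  4.  9. 16. 25. 36. 49.]
print(a.sum())  # plain ndarray semantics in the parent: 140.0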
def MakeEigenModes(powerspec, templatecorrfunc):
    """ Create fitting eigenstates for a given power spectrum; see the Slosar paper.
        Essentially, extract the multipoles of the power spectrum, then evaluate them
        on the grid given in templatecorrfunc. This is used in the bootstrap (per sample).

        The eigenmodes are ordered first by the correlation functions (QQ, QF, FF),
        then by the multipole order (0, 2, 4).
    """
    dummy = templatecorrfunc
    eigenmodes = []
    N = len(dummy.compress())

    annotation = []
    for i in range(len(dummy)):
        for order in [0, 2, 4]:
            annotation.append((i, order))
            eigenmode = dummy.copy()
            eigenmode.uncompress(numpy.zeros(N))
            eigenmode[i].xi = sharedmem.copy(eigenmode[i].xi)
            eigenmodes.append(eigenmode)

    with sharedmem.MapReduce() as pool:
        def work(j):
            i, order = annotation[j]
            c = numpy.zeros(5)
            c[order] = 1
            # watch out: the eigenmode uses the Wikipedia convention,
            # no negative phase on the 2nd-order pole!
            eigenmode = eigenmodes[j]
            eigenmode[i].xi[...] = \
                powerspec.pole(eigenmode[i].r, order=order)[:, None] \
                * legval(eigenmode[i].mu, c)[None, :]
        pool.map(work, range(len(annotation)))

    # the eigenmodes are ordered first by the correlation functions (QQ, QF, FF),
    # then by the multipole order (0, 2, 4)
    for j in range(len(annotation)):
        i, order = annotation[j]
        eigenmode = eigenmodes[j]
        eigenmode[i].xi = numpy.array(eigenmode[i].xi, copy=True)

    return EigenModes(eigenmodes)
def shm_chunk_gaukernop_at(x, xp, y, data):
    nthread = int(os.environ["TENSIGA_NUM_THREADS"])

    chunk_size = x.shape[0]//nthread
    last_chunk_size = chunk_size + x.shape[0] % nthread

    indices_start = [chunk_size*k for k in range(nthread-1)]
    indices_start.append(chunk_size*(nthread-1))
    indices_start = shm.copy(np.array(indices_start))

    indices_stop = [chunk_size*(k+1) for k in range(nthread-1)]
    indices_stop.append(chunk_size*(nthread-1) + last_chunk_size)
    indices_stop = shm.copy(np.array(indices_stop))

    y = np.ascontiguousarray(y)

    x = shm.copy(x)
    xp = shm.copy(xp)
    y = shm.copy(y)
    data = shm.copy(data)

    result = shm.empty((y.shape[0], 1), float)
    with shm.MapReduce(np=nthread) as pool:
        @jit(fastmath=True)
        def row(k):
            xslice = x[slice(indices_start[k], indices_stop[k]), :]
            res = np.empty((xslice.shape[0], 1))
            for l in range(xslice.shape[0]):
                d = xslice[l, :] - xp
                norm = np.sqrt(np.sum(d**2, axis=1))
                res[l] = ((data[0]**2) * np.exp(-(norm/(data[1]*data[2]))**2)) @ y
            return k, res

        def reduce(k, coeff):
            result[slice(indices_start[k], indices_stop[k])] = coeff

        r = pool.map(row, np.arange(nthread), reduce=reduce)

    return result
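# --- Design note (illustrative, not part of the original module) ---
# The chunked variant above gives each worker one contiguous block of rows instead of
# one row per task (as in shm_gaukernop_at), which reduces scheduling overhead when a
# single row is cheap to compute. A hedged sketch of equivalent start/stop bookkeeping
# using numpy.array_split; unlike the original, the larger chunks come first rather than last.
import numpy as np

def chunk_bounds(n_rows, n_chunks):
    # split row indices into n_chunks nearly equal contiguous blocks
    splits = np.array_split(np.arange(n_rows), n_chunks)
    starts = np.array([s[0] for s in splits])
    stops = np.array([s[-1] + 1 for s in splits])
    return starts, stops

starts, stops = chunk_bounds(10, 3)
print(starts, stops)  # [0 4 7] [ 4  7 10]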
def refineFit(distMat, sample_names, start_s, mean0, mean1, max_move, min_move, slope=2,
              no_local=False, num_processes=1):
    """Try to refine a fit by maximising a network score based on transitivity and density.

    Iteratively move the decision boundary to do this, using starting point from existing model.

    Args:
        distMat (numpy.array)
            n x 2 array of core and accessory distances for n samples
        sample_names (list)
            List of query sequence labels
        start_s (float)
            Point along line to start search
        mean0 (numpy.array)
            Start point to define search line
        mean1 (numpy.array)
            End point to define search line
        max_move (float)
            Maximum distance to move away from start point
        min_move (float)
            Minimum distance to move away from start point
        slope (int)
            Set to 0 for a vertical line, 1 for a horizontal line, or 2 to use a slope
        no_local (bool)
            Turn off the local optimisation step. Quicker, but may be less well refined.
        num_processes (int)
            Number of threads to use in the global optimisation step.
            (default = 1)
    Returns:
        start_point (tuple)
            (x, y) co-ordinates of starting point
        optimal_x (float)
            x-coordinate of refined fit
        optimal_y (float)
            y-coordinate of refined fit
    """
    sys.stderr.write("Initial boundary based network construction\n")
    start_point = transformLine(start_s, mean0, mean1)
    sys.stderr.write("Decision boundary starts at (" + "{:.2f}".format(start_point[0])
                     + "," + "{:.2f}".format(start_point[1]) + ")\n")

    # Boundary is left of line normal to this point and first line
    gradient = (mean1[1] - mean0[1]) / (mean1[0] - mean0[0])

    # ALTERNATIVE - use a single network
    # Move boundary along in steps, and find those samples which have changed
    # Use remove_edges/add_edges with index k lookup (n total) to find sample IDs
    # https://stackoverflow.com/questions/27086195/linear-index-upper-triangular-matrix
    # i = n - 2 - int(sqrt(-8*k + 4*n*(n-1)-7)/2.0 - 0.5)
    # j = k + i + 1 - n*(n-1)/2 + (n-i)*((n-i)-1)/2

    # Optimize boundary - grid search for global minimum
    sys.stderr.write("Trying to optimise score globally\n")
    global_grid_resolution = 40  # Seems to work
    shared_dists = sharedmem.copy(distMat)
    s_range = np.linspace(-min_move, max_move, num=global_grid_resolution)
    with sharedmem.MapReduce(np=num_processes) as pool:
        global_s = pool.map(partial(newNetwork,
                                    sample_names=sample_names,
                                    distMat=shared_dists,
                                    start_point=start_point,
                                    mean1=mean1,
                                    gradient=gradient,
                                    slope=slope),
                            s_range)

    # Local optimisation around global optimum
    min_idx = np.argmin(np.array(global_s))
    if min_idx > 0 and min_idx < len(s_range) - 1 and not no_local:
        sys.stderr.write("Trying to optimise score locally\n")
        local_s = scipy.optimize.minimize_scalar(
            newNetwork,
            bounds=[s_range[min_idx - 1], s_range[min_idx + 1]],
            method='Bounded',
            options={'disp': True},
            args=(sample_names, distMat, start_point, mean1, gradient, slope))
        optimised_s = local_s.x
    else:
        optimised_s = s_range[min_idx]

    optimised_coor = transformLine(optimised_s, start_point, mean1)
    if slope == 2:
        optimal_x, optimal_y = decisionBoundary(optimised_coor, gradient)
    else:
        optimal_x = optimised_coor[0]
        optimal_y = optimised_coor[1]

    if optimal_x < 0 or optimal_y < 0:
        raise RuntimeError(
            "Optimisation failed: produced a boundary outside of allowed range\n")

    return start_point, optimal_x, optimal_y
plt.imshow(np.hstack((img, img2)))

ext = file_name.split('.')[-1]
if ext != 'bz2':
    main_img = imread(file_name, mode="L")
    img_name = file_name.replace('.' + ext, '_{}_{}.{}'.format(scale, sigma, ext))
    test_one(img_name, main_img, scale, sigma)
else:
    import sharedmem
    main_imgs = read_json_bz2(file_name)
    main_imgs = main_imgs.astype('float')
    print main_imgs.dtype
    shrd_imgs = sharedmem.copy(main_imgs)
    wd = ht = int(main_imgs.shape[1] ** .5)
    batch_sz = 100
    h = hpy()
    deformer = Deformer(shrd_imgs, batch_sz, (ht, wd), scale, sigma, 1)
    print deformer

    for ibatch in deformer:
        print 'Processing Imgs : ', ibatch * batch_sz, '-', (ibatch+1) * batch_sz,
        print shrd_imgs[ibatch*batch_sz].max(), shrd_imgs[ibatch*batch_sz].min()
        for i in range(batch_sz):
            iimg = ibatch * batch_sz + i
            img_name = 'tmp/' + file_name.replace('.'+ext, '_{}.{}'.format(iimg, 'tif'))
            composite = np.vstack((main_imgs[iimg].reshape(ht, wd),
                                   shrd_imgs[iimg].reshape(ht, wd)))
def lasso_binary_classification(image_list, diagnosis_list, output_directory,
                                existing_gram_matrix=None, mask_zeros=True,
                                scale_data=False, positive=False,
                                outer_folds=10, inner_folds=10, n_threads=10,
                                alphas=np.arange(0.1, 1.1, 0.1),
                                save_gram_matrix=False,
                                save_subject_classification=False,
                                save_weights=True,
                                save_features_image=True):

    results = dict()
    dx_filter = np.unique(diagnosis_list)

    print 'Loading ' + str(len(image_list)) + ' subjects'
    x0, orig_shape, data_mask = load_data(image_list, mask=mask_zeros)
    print 'Subjects loaded'

    if scale_data:
        x_all = scale(x0)
    else:
        x_all = x0

    # if existing_gram_matrix is not None:
    #     gram_matrix = existing_gram_matrix
    #     if (gram_matrix.shape[0] != gram_matrix.shape[1]) | (gram_matrix.shape[0] != len(image_list)):
    #         raise ValueError('The existing Gram matrix must be a square matrix with number of rows and columns equal to the number of images.')
    # else:
    #     print 'Calculating Gram matrix'
    #     gram_matrix = gram_matrix_linear(x_all)
    #     print 'Gram matrix calculated'
    # if save_gram_matrix:
    #     np.savetxt(join(output_directory, 'gram_matrix.txt'), gram_matrix)

    # BaseManager.register('ndarray', type(x_all))
    # manager = BaseManager()
    # manager.start()
    # shared_x = manager.ndarray(x_all.shape, buffer=x_all)

    shared_x = sharedmem.copy(x_all)
    x_all = None
    gc.collect()

    for i in range(len(dx_filter)):
        for j in range(i + 1, len(dx_filter)):
            dx1 = dx_filter[i]
            dx2 = dx_filter[j]

            ind1 = []
            ind2 = []
            for k in range(len(diagnosis_list)):
                if diagnosis_list[k] == dx1:
                    ind1.append(k)
                if diagnosis_list[k] == dx2:
                    ind2.append(k)

            indices = ind1 + ind2
            indices = np.array(indices)

            current_subjects = [image_list[k] for k in indices]
            current_diagnosis = [diagnosis_list[k] for k in indices]

            # x = x_all[indices, :]
            y = np.array([0] * len(ind1) + [1] * len(ind2))
            # gm = gram_matrix[indices, :][:, indices]

            classification_str = dx1 + '_vs_' + dx2 + ('_positive' if positive else '')
            print 'Running ' + dx1 + ' vs ' + dx2 + ' classification'

            y_hat, coefficients, intersect, alpha = nested_folds(shared_x, indices, y, alphas,
                                                                 positive=positive,
                                                                 outer_folds=outer_folds,
                                                                 inner_folds=inner_folds,
                                                                 n_threads=n_threads)
            evaluation = evaluate_prediction(y, y_hat)

            print '\nTrue positive %0.2f' % len(evaluation['predictions'][0])
            print 'True negative %0.2f' % len(evaluation['predictions'][1])
            print 'False positive %0.2f' % len(evaluation['predictions'][2])
            print 'False negative %0.2f' % len(evaluation['predictions'][3])
            print 'Accuracy %0.2f' % evaluation['accuracy']
            print 'Balanced accuracy %0.2f' % evaluation['balanced_accuracy']
            print 'Sensitivity %0.2f' % evaluation['sensitivity']
            print 'Specificity %0.2f' % evaluation['specificity']
            print 'Positive predictive value %0.2f' % evaluation['ppv']
            print 'Negative predictive value %0.2f \n' % evaluation['npv']

            if save_weights or save_features_image:
                weights_orig = revert_mask(coefficients, data_mask, orig_shape)

            if save_weights:
                np.save(join(output_directory, classification_str + '__intersect'), intersect)
                np.save(join(output_directory, classification_str + '__weights'), weights_orig)

            if save_features_image:
                weights_to_nifti(weights_orig, image_list[0],
                                 join(output_directory, classification_str + '__features_image.nii'))

            if save_subject_classification:
                save_subjects_prediction(current_subjects, current_diagnosis, y, y_hat,
                                         join(output_directory, classification_str + '__subjects.csv'))

            results[(dx1, dx2)] = evaluate_prediction(y, y_hat)

    results_to_csv(results, dx_filter,
                   join(output_directory, 'resume' + ('_positive' if positive else '') + '.csv'))
def svm_binary_classification(input_image_atlas, subjects_visits_tsv, image_list, diagnosis_list,
                              output_directory, kernel_function=None, existing_gram_matrix=None,
                              mask_zeros=True, scale_data=False, balanced=False,
                              outer_folds=10, inner_folds=10, n_threads=10,
                              c_range=np.logspace(-10, 2, 1000), save_gram_matrix=False,
                              save_subject_classification=False, save_dual_coefficients=False,
                              scaler=None, data_mask=None, save_original_weights=False,
                              save_features_image=True):

    if (kernel_function is None and existing_gram_matrix is None) | \
            (kernel_function is not None and existing_gram_matrix is not None):
        raise ValueError('Kernel_function and existing_gram_matrix are mutually exclusive parameters.')

    results = dict()
    dx_filter = np.unique(diagnosis_list)

    print 'Loading ' + str(len(image_list)) + ' subjects'
    x0 = load_data(image_list, subjects_visits_tsv)
    print 'Subjects loaded'

    if scale_data:
        x_all = scale(x0)
    else:
        x_all = x0

    if existing_gram_matrix is None:
        if kernel_function is not None:
            print 'Calculating Gram matrix'
            gram_matrix = kernel_function(x_all)
            print 'Gram matrix calculated'
        else:
            raise ValueError('If a Gram matrix is not provided a function to calculate it (kernel_function) is a required input.')
    else:
        gram_matrix = existing_gram_matrix
        if (gram_matrix.shape[0] != gram_matrix.shape[1]) | (gram_matrix.shape[0] != len(image_list)):
            raise ValueError('The existing Gram matrix must be a square matrix with number of rows and columns equal to the number of images.')

    if save_gram_matrix:
        np.savetxt(join(output_directory, 'gram_matrix.txt'), gram_matrix)

    shared_x = sharedmem.copy(x_all)
    x_all = None
    gc.collect()

    for i in range(len(dx_filter)):
        for j in range(i + 1, len(dx_filter)):
            print j
            dx1 = dx_filter[i]
            dx2 = dx_filter[j]

            ind1 = []
            ind2 = []
            for k in range(len(diagnosis_list)):
                if diagnosis_list[k] == dx1:
                    ind1.append(k)
                if diagnosis_list[k] == dx2:
                    ind2.append(k)

            indices = ind1 + ind2

            current_subjects = [image_list[k] for k in indices]
            current_diagnosis = [diagnosis_list[k] for k in indices]

            y = np.array([0] * len(ind1) + [1] * len(ind2))
            gm = gram_matrix[indices, :][:, indices]

            classification_str = dx1 + '_vs_' + dx2 + ('_balanced' if balanced else '_not_balanced')
            print 'Running ' + dx1 + ' vs ' + dx2 + ' classification'

            y_hat, dual_coefficients, sv_indices, intersect, c, auc = cv_svm(gm, shared_x, np.array(indices), y,
                                                                             c_range, balanced=balanced,
                                                                             outer_folds=outer_folds,
                                                                             inner_folds=inner_folds,
                                                                             n_threads=n_threads)
            evaluation = evaluate_prediction(y, y_hat)
            evaluation['auc'] = auc

            print '\nTrue positive %0.2f' % len(evaluation['predictions'][0])
            print 'True negative %0.2f' % len(evaluation['predictions'][1])
            print 'False positive %0.2f' % len(evaluation['predictions'][2])
            print 'False negative %0.2f' % len(evaluation['predictions'][3])
            print 'AUC %0.2f' % auc
            print 'Accuracy %0.2f' % evaluation['accuracy']
            print 'Balanced accuracy %0.2f' % evaluation['balanced_accuracy']
            print 'Sensitivity %0.2f' % evaluation['sensitivity']
            print 'Specificity %0.2f' % evaluation['specificity']
            print 'Positive predictive value %0.2f' % evaluation['ppv']
            print 'Negative predictive value %0.2f \n' % evaluation['npv']

            if save_dual_coefficients:
                np.save(join(output_directory, classification_str + '__dual_coefficients'), dual_coefficients[0])
                np.save(join(output_directory, classification_str + '__sv_indices'), sv_indices)
                np.save(join(output_directory, classification_str + '__intersect'), intersect)

            if save_original_weights or save_features_image:
                weights_orig = features_weights(current_subjects, dual_coefficients[0], sv_indices, scaler, data_mask)

            if save_original_weights:
                np.save(join(output_directory, classification_str + '__weights'), weights_orig)

            if save_features_image:
                output_image = weights_to_nifti(input_image_atlas, weights_orig)
                output_image.to_filename(join(output_directory, classification_str + '__weights.nii'))

            if save_subject_classification:
                save_subjects_prediction(current_subjects, current_diagnosis, y, y_hat,
                                         join(output_directory, classification_str + '__subjects.tsv'))

            results[(dx1, dx2)] = evaluation  # evaluate_prediction(y, y_hat)

    results_to_tsv(results, dx_filter,
                   join(output_directory, 'resume' + ('_balanced' if balanced else '_not_balanced') + '.tsv'))

    shared_x = None
    gc.collect()
def setData(self, train_data, train_y, val_data, val_y, max_train_size=0):
    """
    Set the data of the network, assuming train size << val size
    :param train_data: training data
    :param train_y: training labels
    :param val_data: validation data
    :param val_y: validation labels
    :param max_train_size: optional if training data has additional large chunk
    :return: None
    """

    # check sizes
    if (train_data.shape[0] != train_y.shape[0]) or (val_data.shape[0] != val_y.shape[0]):
        raise ValueError("Number of samples must be the same as number of labels.")

    # Check if the train_y is the image
    self.trainSize = max(train_data.nbytes, train_y.nbytes, max_train_size) / 1024. / 1024.
    self.numTrainSamplesMB = train_data.shape[0]
    self.numTrainSamples = self.numTrainSamplesMB
    self.numValSamples = val_data.shape[0]
    self.sampleSize = self.trainSize / self.numTrainSamplesMB

    # at least one minibatch per macro batch
    assert self.memorySize > self.sampleSize*self.cfgParams.batch_size, \
        "{} > {}".format(self.memorySize, self.sampleSize*self.cfgParams.batch_size)

    # shrink macro batch size to smallest possible
    if self.getNumMacroBatches() == 1:
        self.memorySize = self.sampleSize * numpy.ceil(self.numTrainSamplesMB/float(self.cfgParams.batch_size)) * self.cfgParams.batch_size

    # keep backup of original data
    # pad last macro batch separately to save memory
    if self.getNumMacroBatches() > 1:
        if self.cfgParams.para_load is True:
            self.train_data_xDB = sharedmem.copy(train_data[0:(self.getNumMacroBatches()-1)*self.getNumSamplesPerMacroBatch()])
            self.train_data_xDBlast = sharedmem.copy(self.alignData(train_data[(self.getNumMacroBatches()-1)*self.getNumSamplesPerMacroBatch():], fillData=train_data))
            self.train_data_yDB = sharedmem.copy(train_y[0:(self.getNumMacroBatches()-1)*self.getNumSamplesPerMacroBatch()])
            self.train_data_yDBlast = sharedmem.copy(self.alignData(train_y[(self.getNumMacroBatches()-1)*self.getNumSamplesPerMacroBatch():], fillData=train_y))
        else:
            # save memory, we do not need extra sharedmem
            self.train_data_xDB = train_data[0:(self.getNumMacroBatches()-1)*self.getNumSamplesPerMacroBatch()]
            self.train_data_xDBlast = self.alignData(train_data[(self.getNumMacroBatches()-1)*self.getNumSamplesPerMacroBatch():], fillData=train_data)
            self.train_data_yDB = train_y[0:(self.getNumMacroBatches()-1)*self.getNumSamplesPerMacroBatch()]
            self.train_data_yDBlast = self.alignData(train_y[(self.getNumMacroBatches()-1)*self.getNumSamplesPerMacroBatch():], fillData=train_y)
        self.managedVar.append('train_data_x')
        self.managedVar.append('train_data_y')
    else:
        if self.cfgParams.para_load is True:
            self.train_data_xDB = sharedmem.copy(self.alignData(train_data))
            self.train_data_yDB = sharedmem.copy(self.alignData(train_y))
        else:
            # save memory, we do not need extra sharedmem
            self.train_data_xDB = self.alignData(train_data)
            self.train_data_yDB = self.alignData(train_y)
        self.trainingVar.append('train_data_x')
        self.trainingVar.append('train_data_y')

    # no need to cache validation data
    self.val_data_xDB = val_data
    self.val_data_yDB = val_y

    print("Train size: {}MB, Memory available: {}MB, sample size: {}MB, aligned memory: {}MB".format(
        self.trainSize, self.memorySize, self.sampleSize, self.getGPUMemAligned()))
    print("{} train samples, {} val samples, batch size {}".format(
        train_data.shape[0], val_data.shape[0], self.cfgParams.batch_size))
    print("{} macro batches, {} mini batches per macro, {} full mini batches total".format(
        self.getNumMacroBatches(), self.getNumMiniBatchesPerMacroBatch(), self.getNumMiniBatches()))
    print("{} data chunks, {} train samples total".format(self.numChunks, self.numTrainSamples))

    # shared variable already exists?
    if hasattr(self, 'train_data_x'):
        print("Reusing shared variables!")
        if self.trainSize > self.getGPUMemAligned():
            print("Loading {} macro batches of {}MB each".format(self.getNumMacroBatches(), self.getGPUMemAligned()))
            # load first macro batch
            idx = self.getNumSamplesPerMacroBatch()
            self.replaceTrainingData(0, idx)
            self.replaceValData(self.val_data_xDB, self.val_data_yDB)
        else:
            print("Loading single macro batch {}/{}MB".format(self.trainSize, self.getGPUMemAligned()))
            self.replaceTrainingData(0, self.train_data_xDB.shape[0])
            self.replaceValData(self.val_data_xDB, self.val_data_yDB)
    else:
        # load shared data
        if self.trainSize > self.getGPUMemAligned():
            print("Loading {} macro batches of {}MB each".format(self.getNumMacroBatches(), self.getGPUMemAligned()))
            # load first macro batch
            idx = self.getNumSamplesPerMacroBatch()
            self.train_data_x = theano.shared(self.train_data_xDB[:idx], name='train_data_x', borrow=True)
            self.train_data_y = theano.shared(self.train_data_yDB[:idx], name='train_data_y', borrow=True)
            self.val_data_x = theano.shared(self.val_data_xDB, name='val_data_x', borrow=True)
            self.val_data_y = theano.shared(self.val_data_yDB, name='val_data_y', borrow=True)
        else:
            print("Loading single macro batch {}/{}MB".format(self.trainSize, self.getGPUMemAligned()))
            self.train_data_x = theano.shared(self.train_data_xDB, name='train_data_x', borrow=True)
            self.train_data_y = theano.shared(self.train_data_yDB, name='train_data_y', borrow=True)
            self.val_data_x = theano.shared(self.val_data_xDB, name='val_data_x', borrow=True)
            self.val_data_y = theano.shared(self.val_data_yDB, name='val_data_y', borrow=True)
def classall(infile, components, rotation, outfile='classout.hdf5', npoints=16384, nn=16384, lim=[12, 6], end=False):
    sys.path.append('/n/ghernquist/kchua/Orbit/201-code-C-Mar2016/Python')
    import taxon
    import solve

    def calcclass(i, t, x, v, nn):
        #taxon(t,x,v,n,jsub,jdim,jcla,jcl,jpan,jlin,jcom,arch)
        tt = np.asfortranarray(t[i])
        xx = np.asfortranarray(x[i].transpose())
        vv = np.asfortranarray(v[i].transpose())*Myr
        if nn == 32768:
            out = taxon32768.taxon(tt, xx, vv, nn, 1, 3, 0, 0, 0, 0, 0, 'test')
        elif nn == 16384:
            #out=taxon.taxon(tt,xx,vv,nn,1,3,0,0,0,0,0,'test')
            out = taxon.taxon(tt, xx, vv, nn, 1, 3, 0, 0, 0, 0, 0, 'test')
        elif nn == 8192:
            out = taxon8192.taxon(tt, xx, vv, nn, 1, 3, 0, 0, 0, 0, 0, 'test')
        return out

    ## N = no. of particles in each bin
    ## nn = no. of points to be used in classification
    with tables.open_file('shape.hdf5', 'r') as rotfile:
        rotmat = rotfile.root.rotmat[rotation]

    with tables.open_file(infile, 'r') as u:
        #t=u.root.t[:]/Myr
        npart = u.root.x.shape[0]/(npoints)/6
        print 'analyzing orbits for', infile
        classout = np.zeros((npart, 2))
        x = u.root.x[:].reshape(npart, npoints, 6)
        t = u.root.t[:].reshape(npart, npoints)/Myr
        try:
            Ncom = u.root.NumComponents[:]
        except tables.NoSuchNodeError:
            Ncom = components
        assert Ncom in [1, 2]

    if end:
        print 'using only ', end, ' points'
        t = t[:end]
        x = x[:, :end, :]
    else:
        print 'using all points'

    interval = npoints/nn
    print 'interval = ', interval

    with tables.open_file(outfile, 'w') as file:
        file.create_carray("/", "classorb", tables.Int32Col(), (npart, 2))
        file.create_carray("/", "classification", tables.Int32Col(), (npart,))
        distout = file.create_carray("/", "avgdist", tables.Float64Col(), (npart, 2))
        file.create_carray("/", "totE", tables.Float64Col(), (npart, 2))
        file.create_carray("/", "Ncomponents", tables.Int32Atom(), (1,))
        file.root.Ncomponents[0] = Ncom

        v = sharedmem.copy(x[:, :, 3:])
        x = sharedmem.copy(x[:, :, :3])
        distout[:, 0] = np.sqrt((x[:, 0]**2).sum(axis=1))

        with sharedmem.MapReduce() as pool:
            def avgr(i, xx, vv):
                d = np.sqrt((xx[i]**2).sum(axis=1))
                vr = abs(np.einsum('ij,ij->i', xx[i], vv[i])/d)
                return np.sum(d/vr)/np.sum(1./vr)
            partialfunc = partial(avgr, xx=x, vv=v)
            out = pool.map(partialfunc, xrange(npart))
        distout[:, 1] = out

        if Ncom == 2:
            A = solve.Problem('varh', nlim=lim[0], llim=lim[1])
            B = solve.Problem('varc', nlim=lim[0], llim=lim[1])
            file.root.totE[:, 0] = A.potential(x[:, 0])[:, 0] + B.potential(x[:, 0])[:, 0] + \
                                   (v[:, 0]**2).sum(axis=1)/2.
            file.root.totE[:, 1] = A.potential(x[:, -1])[:, 0] + B.potential(x[:, -1])[:, 0] + \
                                   (v[:, -1]**2).sum(axis=1)/2.
            del A; del B
        elif Ncom == 1:
            A = solve.Problem('var', nlim=lim[0], llim=lim[1])
            #calculate initial and final energy
            file.root.totE[:, 0] = A.potential(x[:, 0])[:, 0] + (v[:, 0]**2).sum(axis=1)/2.
            file.root.totE[:, 1] = A.potential(x[:, -1])[:, 0] + (v[:, -1]**2).sum(axis=1)/2.
            del A
        elif Ncom == 3:
            A = solve.Problem('varc', nlim=lim[0], llim=lim[1])
            #calculate initial and final energy
            file.root.totE[:, 0] = A.potential(x[:, 0])[:, 0] + (v[:, 0]**2).sum(axis=1)/2.
            file.root.totE[:, 1] = A.potential(x[:, -1])[:, 0] + (v[:, -1]**2).sum(axis=1)/2.
            del A
        else:
            print "Number of components not specified"

        #t_base = Array(ctypes.c_double, npart*nn)
        #tt = np.ctypeslib.as_array(t_base.get_obj()).reshape(npart,nn)
        #x_base = Array(ctypes.c_double, npart*nn*3)
        #xx = np.ctypeslib.as_array(x_base.get_obj()).reshape(npart,nn,3)
        #v_base = Array(ctypes.c_double, npart*nn*3)
        #vv = np.ctypeslib.as_array(v_base.get_obj()).reshape(npart,nn,3)
        #tt[:]=t[:,::interval][:,:nn]
        #xx[:]=x[:,::interval,:3][:,:nn]
        #vv[:]=x[:,::interval,3:][:,:nn]
        #del t
        #del x

        t = sharedmem.copy(t[:, ::interval][:, :nn])
        x = x[:, ::interval][:, :nn]
        v = v[:, ::interval][:, :nn]
        x = np.dot(x, rotmat)
        v = np.dot(v, rotmat)

        # Use 8 threads
        #pool=Pool(8)
        with sharedmem.MapReduce() as pool:
            partialfunc = partial(calcclass, t=t, x=x, v=v, nn=nn)
            out = pool.map(partialfunc, xrange(npart))
        #pool.close()
        #pool.join()
        file.root.classorb[:] = np.array(out)

        with sharedmem.MapReduce() as pool:
            out = pool.map(getclass, file.root.classorb[:, 0])
        file.root.classification[:] = out
    subject2_file_path = '{0}/data/processed/sub{1}_{2}_2d.npy'.format(
        REPO_HOME_PATH, subject2, alignment)
    if not path.exists(subject1_file_path):
        raise ValueError(
            'File missing: {0}, run preprocess.py to generate necessary file'.format(
                subject1_file_path))
    if not path.exists(subject2_file_path):
        raise ValueError(
            'File missing: {0}, run preprocess.py to generate necessary file'.format(
                subject2_file_path))
    return (subject1_file_path, subject2_file_path)


if __name__ == '__main__':
    subject1_file_path, subject2_file_path = check_command_line_arguments(sys.argv)
    correlation_file_name = '{0}/data/processed/r_sub{1}_sub{2}_{3}'.format(
        REPO_HOME_PATH, sys.argv[2], sys.argv[3], sys.argv[1])
    if not path.exists(correlation_file_name + '.npy') or not cf.USE_CACHED_DATA:
        shared_subject1 = sm.copy(np.load(subject1_file_path))
        gc.collect()
        shared_subject2 = sm.copy(np.load(subject2_file_path))
        gc.collect()
        voxel_correlations = parallelize_correlation()
        np.save(correlation_file_name, voxel_correlations)
        print('Saved {0}'.format(correlation_file_name + '.npy'))
    else:
        print('Using cached version of {0}'.format(correlation_file_name + '.npy'))
def negative_binomial(read_dict, peakfilename, swap, parameter):
    '''the main function that test for significant windows.'''
    # Initialize the parameters
    peaktype = parameter.peaktype
    threshold = parameter.threshold
    windowsize = parameter.window_size
    # Indicate the data
    if parameter.difftest is True:
        test_list = parameter.chip1
        control_list = parameter.chip2
    else:
        test_list = parameter.chip1
        control_list = parameter.input1
    num_tests = parameter.get_genome_size()/windowsize
    # compute number of replicates
    test_rep = len(test_list)
    control_rep = len(control_list)
    start1 = 0
    end1 = start2 = test_rep
    end2 = test_rep+control_rep
    # if swap
    if swap is True:
        test_rep, control_rep = control_rep, test_rep
        #test_list, control_list = control_list, test_list
    # initialize basic array structures
    sig_peaks_list = []

    # single-core version.
    if parameter.num_procs < 2:
        for chr in parameter.chr_info:
            read_array = read_dict[chr]
            sig_peaks_list.extend(per_chr_nbtest(read_array, chr, swap, threshold, peaktype, parameter,
                                                 start1, end1, start2, end2, test_rep, control_rep))
    # multi-core version
    else:
        result_list = []

        def log_result(result):
            result_list.append(result)

        try:
            import sharedmem
            for chr in parameter.chr_info:
                read_array = read_dict[chr]
                read_dict[chr] = sharedmem.copy(read_array)
        except ImportError:
            print("Import sharedmem package failed")

        pool = multiprocessing.Pool(processes=parameter.num_procs)  #,maxtasksperchild=1)
        for chr in parameter.chr_info:
            read_array = read_dict[chr]
            pool.apply_async(per_chr_nbtest,
                             (read_array, chr, swap, threshold, peaktype, parameter.difftest,
                              start1, end1, start2, end2, test_rep, control_rep),
                             callback=log_result)
        pool.close()
        pool.join()
        sig_peaks_list = list(itertools.chain(*result_list))

    # calculate the BH FDR.
    debug("begin estimating FDR")
    sig_peaks_list = cal_FDR(sig_peaks_list, num_tests)
    debug("finished estimating FDR")

    # merge adjacent significant peaks.
    info("Merging adjacent significant windows...")
    final_peak_list = []
    for chr in read_dict:
        sig_peak_list_by_chr = \
            [item for item in sig_peaks_list if item.chr == chr]
        if len(sig_peak_list_by_chr) == 0:
            continue  # if there is no significant peak in this chromosome, skip it.
        sig_index = [item.index for item in sig_peak_list_by_chr]
        sig_pval = [item.pvalue for item in sig_peak_list_by_chr]
        sig_qval = [item.qvalue for item in sig_peak_list_by_chr]
        sig_g1_count = [item.g1_count for item in sig_peak_list_by_chr]
        #print sig_g1_count, len(sig_g1_count)
        sig_g2_count = [item.g2_count for item in sig_peak_list_by_chr]

        sig_start, sig_end, sig_fc, sig_pval, sig_qval = merge_sig_window(sig_index, sig_g1_count, sig_g2_count,
                                                                          sig_pval, sig_qval, peaktype)
        for idx in range(len(sig_start)):
            final_peak = [chr, sig_start[idx]*windowsize/2, sig_end[idx]*windowsize/2+windowsize,
                          sig_fc[idx], sig_pval[idx], sig_qval[idx]]
            final_peak_list.append(final_peak)

    # sort the peak list
    final_peak_list = sorted(final_peak_list, key=itemgetter(4))
    info("%d peaks called.", len(final_peak_list))
    if len(final_peak_list) == 0:
        return
    # start output peaks.
    all_fc = [peak[3] for peak in final_peak_list]
    #print all_fc
    max_fc = max(all_fc)

    # write results to peak file.
    peakfile = open(peakfilename, 'w')
    for idx, final_peak in enumerate(final_peak_list):
        chr = final_peak[0]
        start = final_peak[1]
        end = final_peak[2]
        fc = final_peak[3]
        pval = final_peak[4]
        qval = final_peak[5]
        # tentatively, assign the normalized fold change as the score in 5th column
        score = fc/max_fc*1000  # range from 0 to 1000
        peakfile.write('\t'.join([chr, str(int(start)), str(int(end)),
                                  ("chip2" if swap else "chip1") + "_peak_" + str(idx+1),
                                  str(score), '.', str(fc), str(pval), str(qval)]) + '\n')
    return
def tps_multiple(
    target_words: List[str],
    word_to_int: dict,
    neighbourhood_size: int,
    words_vocabulary: Optional[list] = None,
    word_embeddings: np.ndarray = None,
    word_embeddings_normalized: np.ndarray = None,
    word_embeddings_pairwise_dists: np.ndarray = None,
    ann_instance: ApproxNN = None,
    sanity_check: bool = False,
    return_persistence_diagram: bool = False,
    n_jobs: int = 1,
    progressbar_enabled: bool = False,
    verbose: int = 1,
) -> Union[float, tuple]:
    """
    Computes the topological polysemy (TPS) [1] of words with respect to some
    word embeddings and neighbourhood size.

    Parameters
    ----------
    target_words : list of str
        Target words (w).
    word_to_int : dict of str and int
        Dictionary mapping from word to its integer representation.
    neighbourhood_size : int
        Neighbourhood size (n).
    words_vocabulary : list, optional
        List of either words (str) or word integer representations (int), signalizing
        what part of the vocabulary we want to use (defaults to None, i.e., whole vocabulary).
    word_embeddings : np.ndarray
        Word embeddings; either word_embeddings or word_embeddings_normalized
        must be specified (defaults to None).
    word_embeddings_normalized : np.ndarray, optional
        Normalized word embeddings; either word_embeddings_normalized or word_embeddings
        must be specified (defaults to None).
    word_embeddings_pairwise_dists : np.ndarray, optional
        Numpy matrix containing pairwise distances between word embeddings (defaults to None).
    ann_instance : ApproxNN, optional
        Approximate nearest neighbour (ANN) instance, built on the word embeddings
        (defaults to None). If specified, the ANN index is used to find punctured neighbourhoods.
    sanity_check : bool, optional
        Whether or not to run sanity checks (defaults to False).
    return_persistence_diagram : bool, optional
        Whether or not to return persistence diagram (defaults to False).
    n_jobs : int, optional
        Number of processes to use (defaults to 1).
    progressbar_enabled : bool, optional
        Whether or not the progressbar is enabled (defaults to False).
    verbose : int, optional
        Verbosity mode, 0 (silent), 1 (verbose), 2 (semi-verbose). Defaults to 1 (verbose).

    Returns
    -------
    result : float or tuple
        TPS values of `target_words` w.r.t. word_embeddings and neighbourhood_size.
        If return_persistence_diagram is set to true, then a tuple is returned with the
        TPS values as the first value and the zero degree persistence diagram as the second value.

    References
    ----------
    .. [1] Alexander Jakubowski, Milica Gašić, & Marcus Zibrowius. (2020).
       Topology of Word Embeddings: Singularities Reflect Polysemy.
    """
    tps_scores = np.zeros_like(target_words, dtype=float)
    tps_persistence_diagrams = None
    if return_persistence_diagram:
        tps_persistence_diagrams = [None] * len(target_words)

    # Only normalize word embeddings once
    if word_embeddings_normalized is None:
        if words_vocabulary is not None:
            word_vectors = words_to_vectors(
                words_vocabulary=words_vocabulary,
                word_to_int=word_to_int,
                word_embeddings=word_embeddings,
            )
        else:
            word_vectors = word_embeddings

        word_embeddings_normalized = word_vectors / np.linalg.norm(
            word_vectors, axis=1
        ).reshape(-1, 1)

    if n_jobs == -1:
        n_jobs = cpu_count()

    if n_jobs > 1:

        # Prepare data for multiprocessing
        if verbose == 1:
            print("Preparing data for multiprocessing...")
        word_embeddings_normalized_shared = sharedmem.copy(word_embeddings_normalized)
        word_embeddings_pairwise_dists_shared = None
        if word_embeddings_pairwise_dists is not None:
            word_embeddings_pairwise_dists_shared = sharedmem.copy(
                word_embeddings_pairwise_dists
            )
        if verbose == 1:
            print("Done!")

        # Prepare arguments
        num_data_points_per_process = int(len(target_words) // n_jobs)
        mp_args = [
            (
                word_embeddings_normalized_shared,
                word_embeddings_pairwise_dists_shared,
                target_words[target_word_indices_chunk],
                target_word_indices_chunk,
                word_to_int,
                neighbourhood_size,
                sanity_check,
                return_persistence_diagram,
                progressbar_enabled,
            )
            for target_word_indices_chunk in batch_list_gen(
                np.arange(len(target_words)), num_data_points_per_process
            )
        ]

        # Run MP
        if verbose == 1:
            print(f"Computing TPS using {n_jobs} processes...")
        mp_var_dict["ann_instance"] = ann_instance
        with sharedmem.MapReduce(np=n_jobs) as pool:
            mp_results = pool.map(tps_multiple_by_mp_args, mp_args)
            for tps_result, target_word_indices in mp_results:
                if return_persistence_diagram:
                    tps_result_scores, tps_result_pds = tps_result
                    tps_scores[target_word_indices] = tps_result_scores
                    for i, pds_idx in enumerate(target_word_indices):
                        tps_persistence_diagrams[pds_idx] = tps_result_pds[i]
                else:
                    tps_scores[target_word_indices] = tps_result
    else:
        for i, target_word in enumerate(
            tqdm(target_words, disable=not progressbar_enabled)
        ):
            tps_result = tps(
                target_word=target_word,
                word_to_int=word_to_int,
                neighbourhood_size=neighbourhood_size,
                word_embeddings_normalized=word_embeddings_normalized,
                word_embeddings_pairwise_dists=word_embeddings_pairwise_dists,
                ann_instance=ann_instance,
                sanity_check=sanity_check,
                return_persistence_diagram=return_persistence_diagram,
            )
            if return_persistence_diagram:
                tps_scores[i], tps_persistence_diagrams[i] = tps_result
            else:
                tps_scores[i] = tps_result

    if return_persistence_diagram:
        return tps_scores, tps_persistence_diagrams
    else:
        return tps_scores
def negative_binomial(read_dict, peakfilename, swap, parameter):
    '''the main function that test for significant windows.'''
    print len(read_dict)
    # Initialize the parameters
    peaktype = parameter.peaktype
    threshold = parameter.threshold
    windowsize = parameter.window_size
    # Indicate the data
    if parameter.difftest is True:
        test_list = parameter.chip1
        control_list = parameter.chip2
    else:
        test_list = parameter.chip1
        control_list = parameter.input1
    num_tests = parameter.get_genome_size() / windowsize
    # compute number of replicates
    test_rep = len(test_list)
    control_rep = len(control_list)
    start1 = 0
    end1 = start2 = test_rep
    end2 = test_rep + control_rep
    # if swap
    if swap is True:
        test_rep, control_rep = control_rep, test_rep
        #test_list, control_list = control_list, test_list
    # initialize basic array structures
    sig_peaks_list = []

    # single-core version.
    if parameter.num_procs < 2:
        for chr in parameter.chr_info:
            read_array = read_dict[chr]
            sig_peaks_list.extend(
                per_chr_nbtest(read_array, chr, swap, threshold, peaktype, parameter,
                               start1, end1, start2, end2, test_rep, control_rep))
    # multi-core version
    else:
        result_list = []

        def log_result(result):
            result_list.append(result)

        try:
            import sharedmem
            for chr in parameter.chr_info:
                read_array = read_dict[chr]
                read_dict[chr] = sharedmem.copy(read_array)
        except ImportError:
            print "Import sharedmem package failed"

        pool = multiprocessing.Pool(processes=parameter.num_procs)  #,maxtasksperchild=1)
        for chr in parameter.chr_info:
            read_array = read_dict[chr]
            pool.apply_async(per_chr_nbtest,
                             (read_array, chr, swap, threshold, peaktype, parameter.difftest,
                              start1, end1, start2, end2, test_rep, control_rep),
                             callback=log_result)
        pool.close()
        pool.join()
        sig_peaks_list = list(itertools.chain(*result_list))

    # calculate the BH FDR.
    debug("begin estimating FDR")
    sig_peaks_list = cal_FDR(sig_peaks_list, num_tests)
    debug("finished estimating FDR")

    # merge adjacent significant peaks.
    info("Merging adjacent significant windows...")
    final_peak_list = []
    for chr in read_dict:
        sig_peak_list_by_chr = \
            [item for item in sig_peaks_list if item.chr == chr]
        if len(sig_peak_list_by_chr) == 0:
            continue  # if there is no significant peak in this chromosome, skip it.
        sig_index = [item.index for item in sig_peak_list_by_chr]
        sig_pval = [item.pvalue for item in sig_peak_list_by_chr]
        sig_qval = [item.qvalue for item in sig_peak_list_by_chr]
        sig_g1_count = [item.g1_count for item in sig_peak_list_by_chr]
        #print sig_g1_count, len(sig_g1_count)
        sig_g2_count = [item.g2_count for item in sig_peak_list_by_chr]

        sig_start, sig_end, sig_fc, sig_pval, sig_qval = merge_sig_window(
            sig_index, sig_g1_count, sig_g2_count, sig_pval, sig_qval, peaktype)
        for idx in range(len(sig_start)):
            final_peak = [
                chr, sig_start[idx] * windowsize / 2,
                sig_end[idx] * windowsize / 2 + windowsize,
                sig_fc[idx], sig_pval[idx], sig_qval[idx]
            ]
            final_peak_list.append(final_peak)

    # sort the peak list
    final_peak_list = sorted(final_peak_list, key=itemgetter(4))
    info("%d peaks called.", len(final_peak_list))
    if len(final_peak_list) == 0:
        return
    # start output peaks.
    all_fc = [peak[3] for peak in final_peak_list]
    #print all_fc
    max_fc = max(all_fc)

    # write results to peak file.
    peakfile = open(peakfilename, 'w')
    for idx, final_peak in enumerate(final_peak_list):
        chr = final_peak[0]
        start = final_peak[1]
        end = final_peak[2]
        fc = final_peak[3]
        pval = final_peak[4]
        qval = final_peak[5]
        # tentatively, assign the normalized fold change as the score in 5th column
        score = fc / max_fc * 1000  # range from 0 to 1000
        peakfile.write('\t'.join([
            chr, str(start), str(end),
            ("chip2" if swap else "chip1") + "_peak_" + str(idx + 1),
            str(score), '.', str(fc), str(pval), str(qval)
        ]) + '\n')
    return
def compute_gad(
    data_points: np.ndarray,
    manifold_dimension: int,
    annulus_inner_radius: float = None,
    annulus_outer_radius: float = None,
    data_point_ints: list = None,
    data_points_pairwise_distances: np.ndarray = None,
    data_points_approx_nn: ApproxNN = None,
    data_points_distance_metric: Callable = fastdist.euclidean,
    use_ripser_plus_plus: bool = False,
    ripser_plus_plus_threshold: int = 200,
    use_knn_annulus: bool = False,
    knn_annulus_inner: int = None,
    knn_annulus_outer: int = None,
    knn_annulus_metric: Callable = fastdist.euclidean,
    knn_annulus_metric_name: str = "euclidean",
    return_annlus_persistence_diagrams: bool = False,
    progressbar_enabled: bool = False,
    n_jobs: int = 1,
    verbose: int = 1,
) -> dict:
    """
    Computes geometric anomaly detection (GAD) Procedure 1 from [1].

    Parameters
    ----------
    data_points : np.ndarray
        All data points.
    manifold_dimension : int
        Manifold homology dimension (k parameter in [1]).
    annulus_inner_radius : float
        Inner annulus radius.
    annulus_outer_radius : float
        Outer annulus radius.
    data_point_ints : np.ndarray
        Array specifying which data point indices are used from all the data points.
    data_points_pairwise_distances : np.ndarray, optional
        Pairwise distances of data points (defaults to None).
    data_points_approx_nn : ApproxNN, optional
        ApproxNN instance (defaults to None).
    data_points_distance_metric : Callable, optional
        Distance metric callable to compute exact distance between any two data points
        (defaults to euclidean distance, `fastdist.euclidean`).
    use_ripser_plus_plus : bool
        Whether or not to use Ripser++ (GPU acceleration).
    ripser_plus_plus_threshold : int
        The least number of data points in order to use Ripser++; only has an effect
        if `use_ripser_plus_plus` is set to True.
    use_knn_annulus : bool
        Whether or not to use the KNN version of GAD.
    knn_annulus_inner : int
        Number of neighbours to determine inner annulus radius.
    knn_annulus_outer : int
        Number of neighbours to determine outer annulus radius.
    knn_annulus_metric : Callable
        fastdist metric; only required if `data_points_pairwise_distances` and
        `data_points_approx_nn` are None (defaults to fastdist.euclidean).
    knn_annulus_metric_name : str
        String name of the `knn_annulus_metric` callable (defaults to "euclidean").
    return_annlus_persistence_diagrams : bool
        Whether or not to return annulus persistence diagrams.
    progressbar_enabled : bool
        Whether or not the tqdm progressbar is enabled.
    n_jobs : int, optional
        Number of processes to use (defaults 1, -1 denotes all processes).
    verbose : int, optional
        Verbosity mode, 0 (silent), 1 (verbose), 2 (semi-verbose). Defaults to 1 (verbose).

    Returns
    -------
    result : dict
        Result dictionary consisting of:
        "P_man" : list
            List of point indices of k-manifold points.
        "P_bnd" : list
            List of point indices of boundary points.
        "P_int" : list
            List of point indices of intersection points.
        "annlus_persistence_diagrams" : list
            List of persistence diagrams of annulus points, if
            `return_annlus_persistence_diagrams` is set to True.

    References
    ----------
    .. [1] Bernadette J Stolz, Jared Tanner, Heather A Harrington, & Vidit Nanda. (2019).
       Geometric anomaly detection in data.
    """
    if data_point_ints is None:
        data_point_ints = np.arange(len(data_points))

    # Get distance function
    distance_func = get_point_distance_func(
        data_points=data_points,
        pairwise_distances=data_points_pairwise_distances,
        metric_callable=data_points_distance_metric,
    )

    # Get KNN annulus function if use_knn_annulus is True
    knn_func = None
    if use_knn_annulus:
        knn_func = get_knn_func_data_points(
            data_points=data_points,
            pairwise_distances=data_points_pairwise_distances,
            approx_nn=data_points_approx_nn,
            metric=knn_annulus_metric,
            metric_name=knn_annulus_metric_name,
        )

    target_homology_dim = manifold_dimension - 1
    if n_jobs == -1:
        n_jobs = cpu_count()

    if n_jobs > 1:

        # Initialize MP results
        results = {
            "P_bnd": [],
            "P_man": [],
            "P_int": [],
        }
        if return_annlus_persistence_diagrams:
            results["annulus_pds"] = {}

        # Prepare data for multiprocessing
        if verbose == 1:
            print("Preparing data for multiprocessing...")
        data_points_shared = sharedmem.copy(data_points)
        # data_points_raw = Array(
        #     "d", data_points.shape[0] * data_points.shape[1], lock=False
        # )
        # data_points_raw_np = np.frombuffer(data_points_raw).reshape(data_points.shape)
        # np.copyto(data_points_raw_np, data_points)
        if verbose == 1:
            print("Done!")

        # Prepare arguments
        num_data_points_per_process = int(len(data_point_ints) // n_jobs)
        mp_args = [
            (
                data_points_shared,
                data_point_ints_chunk,
                annulus_inner_radius,
                annulus_outer_radius,
                use_knn_annulus,
                knn_annulus_inner,
                knn_annulus_outer,
                target_homology_dim,
                use_ripser_plus_plus,
                ripser_plus_plus_threshold,
                return_annlus_persistence_diagrams,
            )
            for data_point_ints_chunk in batch_list_gen(
                data_point_ints, num_data_points_per_process
            )
        ]
        mp_var_dict["distance_func"] = distance_func
        if knn_func is not None:
            mp_var_dict["knn_func"] = knn_func

        # Run MP
        if verbose == 1:
            print(f"Computing GAD using {n_jobs} processes...")
        with sharedmem.MapReduce(np=n_jobs) as pool:
            mp_results = pool.map(compute_gad_point_indices_mp, mp_args)
            for result in mp_results:
                results["P_man"].extend(result["P_man"])
                results["P_bnd"].extend(result["P_bnd"])
                results["P_int"].extend(result["P_int"])
                if return_annlus_persistence_diagrams:
                    results["annulus_pds"].update(result["annulus_pds"])

        # with Pool(
        #     processes=n_jobs,
        #     initializer=compute_gad_mp_init,
        #     initargs=(data_points_raw_np, data_points.shape, distance_func, knn_func),
        # ) as pool:
        #     for result in tqdm(
        #         pool.imap_unordered(compute_gad_point_indices_mp, grid_search_args),
        #         total=n_jobs,
        #         disable=not progressbar_enabled,
        #     ):
        #         results["P_man"].extend(result["P_man"])
        #         results["P_bnd"].extend(result["P_bnd"])
        #         results["P_int"].extend(result["P_int"])
        #         if return_annlus_persistence_diagrams:
        #             results["annulus_pds"].update(result["annulus_pds"])
    else:

        # Compute GAD using only one processor
        if verbose == 1:
            print("Computing GAD...")
        results = compute_gad_point_indices(
            data_point_indices=data_point_ints,
            data_points=data_points,
            annulus_inner_radius=annulus_inner_radius,
            annulus_outer_radius=annulus_outer_radius,
            distance_func=distance_func,
            use_knn_annulus=use_knn_annulus,
            knn_func=knn_func,
            knn_annulus_inner=knn_annulus_inner,
            knn_annulus_outer=knn_annulus_outer,
            target_homology_dim=target_homology_dim,
            use_ripser_plus_plus=use_ripser_plus_plus,
            ripser_plus_plus_threshold=ripser_plus_plus_threshold,
            return_annlus_persistence_diagrams=return_annlus_persistence_diagrams,
            progressbar_enabled=progressbar_enabled,
        )

    return results