Example #1
    def addManagedData(self, data):
        """
        Set the data of the network, used for managing multiple input data sources for training
        :param data: training data and labels specified as dictionary
        :return: None
        """

        if not isinstance(data, dict):
            raise ValueError("Error: expected dictionary for data!")

        for key in data:
            # check sizes
            if data[key].shape[0] != self.numTrainSamplesMB:
                raise ValueError("Number of samples must be the same as number of labels.")

            if self.getNumMacroBatches() > 1:
                if self.cfgParams.para_load is True:
                    setattr(self, key+'DB', sharedmem.copy(data[key][0:(self.getNumMacroBatches()-1)*self.getNumSamplesPerMacroBatch()]))
                    setattr(self, key+'DBlast', sharedmem.copy(self.alignData(data[key][(self.getNumMacroBatches()-1)*self.getNumSamplesPerMacroBatch():], fillData=data[key])))
                else:
                    # save memory, we do not need extra sharedmem
                    setattr(self, key+'DB', data[key][0:(self.getNumMacroBatches()-1)*self.getNumSamplesPerMacroBatch()])
                    setattr(self, key+'DBlast', self.alignData(data[key][(self.getNumMacroBatches()-1)*self.getNumSamplesPerMacroBatch():], fillData=data[key]))
                self.managedVar.append(key)
            else:
                if self.cfgParams.para_load is True:
                    setattr(self, key+'DB', sharedmem.copy(self.alignData(data[key])))
                else:
                    # save memory, we do not need extra sharedmem
                    setattr(self, key + 'DB', self.alignData(data[key]))
            self.trainingVar.append(key)

            # shared variable already exists?
            if hasattr(self, key):
                print("Reusing shared variables!")
                if self.trainSize > self.getGPUMemAligned():
                    print("Loading {} macro batches a {}MB".format(self.getNumMacroBatches(), self.getGPUMemAligned()))
                    # load first macro batch
                    idx = self.getNumSamplesPerMacroBatch()
                    self.replaceTrainingData(0, idx)
                else:
                    print("Loading single macro batch {}/{}MB".format(self.trainSize, self.getGPUMemAligned()))
                    self.replaceTrainingData(0, self.train_data_xDB.shape[0])
            else:
                # load shared data
                if self.trainSize > self.getGPUMemAligned():
                    print("Loading {} macro batches a {}MB".format(self.getNumMacroBatches(), self.getGPUMemAligned()))
                    # load first macro batch
                    idx = self.getNumSamplesPerMacroBatch()
                    setattr(self, key, theano.shared(getattr(self, key+'DB')[:idx], name=key, borrow=True))
                else:
                    print("Loading single macro batch {}/{}MB".format(self.trainSize, self.getGPUMemAligned()))
                    setattr(self, key, theano.shared(getattr(self, key+'DB'), name=key, borrow=True))
Example #2
import multiprocessing as mp
from time import time

import sharedmem


def mapper(f, pars, *argv, **kwarg):
    # create a shared-memory copy of each input array in argv
    x_s = [sharedmem.copy(x) for x in argv]
    # process the parameter list in parallel over the shared arrays
    t0 = time()  # start time; the elapsed time is not reported in this snippet
    with mp.Pool(3) as pool:
        res = pool.starmap_async(f, [(p, *x_s, *list(kwarg.values()))
                                     for p in pars])
        a = res.get()
    # the with-block terminates the pool on exit, so close()/join() are not needed
    return a
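A minimal usage sketch for the mapper helper above, assuming f takes the parameter first, then the shared arrays, then any keyword values positionally; the function and array below are purely illustrative:

import numpy as np


def scaled_sum(p, x, offset):
    # toy objective: each worker reads the shared copy of x
    return p * x.sum() + offset


if __name__ == '__main__':
    x = np.arange(1000000, dtype=np.float64)
    results = mapper(scaled_sum, [0.5, 1.0, 2.0], x, offset=1.0)
    print(results)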
Example #3
import os

import numpy as np
import sharedmem as shm


def shm_gaukernop_at(x, xp, y, data):
    y = np.ascontiguousarray(y)
    x_shm = shm.copy(x)
    xp_shm = shm.copy(xp)
    y_shm = shm.copy(y)
    data_shm = shm.copy(data)

    nthread = int(os.environ["TENSIGA_NUM_THREADS"])

    result = shm.empty(y.shape, np.float64)  # np.float is removed in modern NumPy
    with shm.MapReduce(np=nthread) as pool:
        def row(k):
            d = x_shm[k,:] - xp_shm
            norm = np.sqrt(np.sum(d**2, axis=1))
            return k, ((data_shm[0]**2) * np.exp(-(norm/(data_shm[1]*data_shm[2]))**2)) @ y_shm

        def reduce(k, coeff):
            result[k] = coeff

        r = pool.map(row, np.arange(x_shm.shape[0]), reduce=reduce)

    return result
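Example #3 and most of the snippets below share the same pattern: copy the inputs into shared memory once with sharedmem.copy, then let MapReduce workers read them without per-task pickling and write into a preallocated shared result via the reduce callback. A minimal self-contained sketch of that pattern, with purely illustrative names:

import numpy as np
import sharedmem

data = np.random.rand(1000, 64)
shared = sharedmem.copy(data)                      # one shared copy, visible to all workers
result = sharedmem.empty(len(shared), np.float64)  # shared output buffer

with sharedmem.MapReduce(np=4) as pool:
    def row_sum(i):
        # workers only read from the shared array
        return i, shared[i].sum()

    def reduce(i, value):
        # collect each row's value into the shared result
        result[i] = value

    pool.map(row_sum, np.arange(len(shared)), reduce=reduce)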
Example #4
    def __make_np_arrays_sharable(self):
        """
        Replaces every numpy array attribute with more than zero
        dimensions by a sharedmem array, which should have the same
        behaviour / properties as the original numpy array.
        """
        varDict = self.__dict__
        for key, var in varDict.items():
            if type(var) is np.ndarray:
                if key not in self.exclude:
                    try:
                        varDict[key] = sharedmem.copy(var)
                    except AttributeError:
                        share_var = sharedmem.empty(1, type(var))
                        share_var[0] = var
                        varDict[key] = share_var
Example #5
def MakeEigenModes(powerspec, templatecorrfunc):
    """ create fitting eigenstates for a given powerspectrum
        see Slosar paper.

        basically getting out the poles of the powerspectrum,
        then evaluate them on the grid given in templatecorrfunc.
        this is used in bootstrap (for per sample)

        usually, the eigenmodes are ordered first by the correlation functions, 
        (QQ QF, FF) then by the multipole order (0, 2, 4)
    """
    dummy = templatecorrfunc

    eigenmodes = []
    N = len(dummy.compress())
    annotation = []

    for i in range(len(dummy)):
        for order in [0, 2, 4]:
            annotation.append((i, order))
            eigenmode = dummy.copy()
            eigenmode.uncompress(numpy.zeros(N))
            eigenmode[i].xi = sharedmem.copy(eigenmode[i].xi)
            eigenmodes.append(eigenmode)

    with sharedmem.MapReduce() as pool:
        def work(j):
            i, order = annotation[j]
            c = numpy.zeros(5)
            c[order] = 1
            # watch out: the eigenmode follows the Wikipedia convention,
            # i.e. no negative phase on the 2nd-order pole!
            eigenmode = eigenmodes[j]
            eigenmode[i].xi[...] = \
                    powerspec.pole(eigenmode[i].r, order=order)[:, None] \
                    * legval(eigenmode[i].mu, c)[None, :]
        pool.map(work, range(len(annotation)))
    # the eigenmodes are ordered first by the correlation functions, (QQ QF, FF)
    # then by the multipole order (0, 2, 4)
    #
    for j in range(len(annotation)):
        i, order = annotation[j]
        eigenmode = eigenmodes[j]
        eigenmode[i].xi = numpy.array(eigenmode[i].xi, copy=True)

    return EigenModes(eigenmodes)
Example #6
import os

import numpy as np
import sharedmem as shm
from numba import jit


def shm_chunk_gaukernop_at(x, xp, y, data):
    nthread = int(os.environ["TENSIGA_NUM_THREADS"])

    chunk_size = x.shape[0]//nthread
    last_chunk_size = chunk_size + x.shape[0] % nthread

    indices_start = [ chunk_size*k for k in range(nthread-1) ]
    indices_start.append(chunk_size*(nthread-1))
    indices_start = shm.copy(np.array(indices_start))

    indices_stop = [ chunk_size*(k+1) for k in range(nthread-1) ]
    indices_stop.append(chunk_size*(nthread-1) + last_chunk_size)
    indices_stop = shm.copy(np.array(indices_stop))
    
    y = np.ascontiguousarray(y)
    x = shm.copy(x)
    xp = shm.copy(xp)
    y = shm.copy(y)
    data = shm.copy(data)

    result = shm.empty((y.shape[0], 1), np.float64)

    with shm.MapReduce(np=nthread) as pool:
        @jit(fastmath=True)
        def row(k):
            xslice = x[slice(indices_start[k], indices_stop[k]),:]
            res = np.empty((xslice.shape[0],1)) 
            for l in range(xslice.shape[0]):
                d = xslice[l,:] - xp
                norm = np.sqrt(np.sum(d**2, axis=1))
                res[l] = ((data[0]**2) * np.exp(-(norm/(data[1]*data[2]))**2)) @ y

            return k, res 

        def reduce(k, coeff):
            result[slice(indices_start[k], indices_stop[k])] = coeff

        r = pool.map(row, np.arange(nthread), reduce=reduce)

    return result
Example #7
def refineFit(distMat,
              sample_names,
              start_s,
              mean0,
              mean1,
              max_move,
              min_move,
              slope=2,
              no_local=False,
              num_processes=1):
    """Try to refine a fit by maximising a network score based on transitivity and density.

    Iteratively move the decision boundary to do this, using starting point from existing model.

    Args:
        distMat (numpy.array)
            n x 2 array of core and accessory distances for n samples
        sample_names (list)
            List of query sequence labels
        start_s (float)
            Point along line to start search
        mean0 (numpy.array)
            Start point to define search line
        mean1 (numpy.array)
            End point to define search line
        max_move (float)
            Maximum distance to move away from start point
        min_move (float)
            Minimum distance to move away from start point
        slope (int)
            Set to 0 for a vertical line, 1 for a horizontal line, or
            2 to use a slope
        no_local (bool)
            Turn off the local optimisation step.
            Quicker, but may be less well refined.
        num_processes (int)
            Number of threads to use in the global optimisation step.

            (default = 1)
    Returns:
        start_point (tuple)
            (x, y) co-ordinates of starting point
        optimal_x (float)
            x-coordinate of refined fit
        optimal_y (float)
            y-coordinate of refined fit
    """
    sys.stderr.write("Initial boundary based network construction\n")
    start_point = transformLine(start_s, mean0, mean1)
    sys.stderr.write("Decision boundary starts at (" +
                     "{:.2f}".format(start_point[0]) + "," +
                     "{:.2f}".format(start_point[1]) + ")\n")

    # Boundary is left of line normal to this point and first line
    gradient = (mean1[1] - mean0[1]) / (mean1[0] - mean0[0])

    # ALTERNATIVE - use a single network
    # Move boundary along in steps, and find those samples which have changed
    # Use remove_edges/add_edges with index k lookup (n total) to find sample IDs
    # https://stackoverflow.com/questions/27086195/linear-index-upper-triangular-matrix
    # i = n - 2 - int(sqrt(-8*k + 4*n*(n-1)-7)/2.0 - 0.5)
    # j = k + i + 1 - n*(n-1)/2 + (n-i)*((n-i)-1)/2

    # Optimize boundary - grid search for global minimum
    sys.stderr.write("Trying to optimise score globally\n")
    global_grid_resolution = 40  # Seems to work
    shared_dists = sharedmem.copy(distMat)
    s_range = np.linspace(-min_move, max_move, num=global_grid_resolution)
    with sharedmem.MapReduce(np=num_processes) as pool:
        global_s = pool.map(
            partial(newNetwork,
                    sample_names=sample_names,
                    distMat=shared_dists,
                    start_point=start_point,
                    mean1=mean1,
                    gradient=gradient,
                    slope=slope), s_range)

    # Local optimisation around global optimum
    min_idx = np.argmin(np.array(global_s))
    if min_idx > 0 and min_idx < len(s_range) - 1 and not no_local:
        sys.stderr.write("Trying to optimise score locally\n")
        local_s = scipy.optimize.minimize_scalar(
            newNetwork,
            bounds=[s_range[min_idx - 1], s_range[min_idx + 1]],
            method='Bounded',
            options={'disp': True},
            args=(sample_names, distMat, start_point, mean1, gradient, slope))
        optimised_s = local_s.x
    else:
        optimised_s = s_range[min_idx]

    optimised_coor = transformLine(optimised_s, start_point, mean1)
    if slope == 2:
        optimal_x, optimal_y = decisionBoundary(optimised_coor, gradient)
    else:
        optimal_x = optimised_coor[0]
        optimal_y = optimised_coor[1]

    if optimal_x < 0 or optimal_y < 0:
        raise RuntimeError(
            "Optimisation failed: produced a boundary outside of allowed range\n"
        )

    return start_point, optimal_x, optimal_y
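The global-grid-search-then-local-refinement structure of refineFit can be mirrored with a toy score function; a hedged sketch of that pattern only (the real newNetwork score and its helpers are not reproduced here):

import numpy as np
import scipy.optimize
import sharedmem
from functools import partial


def toy_score(s, offset=0.123):
    # stand-in for newNetwork: any scalar score to be minimised over s
    return (s - offset) ** 2


s_range = np.linspace(-0.5, 0.5, num=40)
with sharedmem.MapReduce(np=2) as pool:
    global_s = pool.map(partial(toy_score, offset=0.123), s_range)

min_idx = int(np.argmin(np.array(global_s)))
if 0 < min_idx < len(s_range) - 1:
    local_s = scipy.optimize.minimize_scalar(
        toy_score,
        bounds=[s_range[min_idx - 1], s_range[min_idx + 1]],
        method='Bounded',
        args=(0.123,))
    optimised_s = local_s.x
else:
    optimised_s = s_range[min_idx]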
Example #8
        plt.imshow(np.hstack((img, img2)))

    ext = file_name.split('.')[-1]
    
    if ext != 'bz2':
        main_img = imread(file_name, mode="L")
        img_name = file_name.replace(
            '.' + ext, '_{}_{}.{}'.format(scale, sigma, ext))
        test_one(img_name, main_img, scale, sigma)
    
    else:
        import sharedmem
        main_imgs = read_json_bz2(file_name)
        main_imgs = main_imgs.astype('float')
        print main_imgs.dtype
        shrd_imgs = sharedmem.copy(main_imgs)
        wd = ht = int(main_imgs.shape[1] ** .5)

        batch_sz = 100
        h = hpy()
        deformer = Deformer(shrd_imgs, batch_sz, (ht, wd), scale, sigma, 1)
        print deformer
        for ibatch in deformer:
            print 'Processing Imgs : ', ibatch * batch_sz, '-', (ibatch+1 )* batch_sz,
            print shrd_imgs[ibatch*batch_sz].max(), shrd_imgs[ibatch*batch_sz].min()
            for i in range(batch_sz):
                iimg = ibatch * batch_sz + i
                img_name = 'tmp/' + file_name.replace(
                                        '.'+ext, '_{}.{}'.format(iimg, 'tif'))
                composite = np.vstack((main_imgs[iimg].reshape(ht, wd),
                                       shrd_imgs[iimg].reshape(ht, wd)))
Example #9
def lasso_binary_classification(image_list,
                                diagnosis_list,
                                output_directory,
                                existing_gram_matrix=None,
                                mask_zeros=True,
                                scale_data=False,
                                positive=False,
                                outer_folds=10,
                                inner_folds=10,
                                n_threads=10,
                                alphas=np.arange(0.1, 1.1, 0.1),
                                save_gram_matrix=False,
                                save_subject_classification=False,
                                save_weights=True,
                                save_features_image=True):

    results = dict()
    dx_filter = np.unique(diagnosis_list)

    print 'Loading ' + str(len(image_list)) + ' subjects'
    x0, orig_shape, data_mask = load_data(image_list, mask=mask_zeros)
    print 'Subjects loaded'

    if scale_data:
        x_all = scale(x0)
    else:
        x_all = x0

    # if existing_gram_matrix is not None:
    #     gram_matrix = existing_gram_matrix
    #     if (gram_matrix.shape[0] != gram_matrix.shape[1]) | (gram_matrix.shape[0] != len(image_list)):
    #         raise ValueError('The existing Gram matrix must be a square matrix with number of rows and columns equal to the number of images.')
    # else:
    #     print 'Calculating Gram matrix'
    #     gram_matrix = gram_matrix_linear(x_all)
    #     print 'Gram matrix calculated'

    # if save_gram_matrix:
    #     np.savetxt(join(output_directory, 'gram_matrix.txt'), gram_matrix)

    # BaseManager.register('ndarray', type(x_all))
    # manager = BaseManager()
    # manager.start()
    # shared_x = manager.ndarray(x_all.shape, buffer=x_all)

    shared_x = sharedmem.copy(x_all)
    x_all = None
    gc.collect()

    for i in range(len(dx_filter)):
        for j in range(i + 1, len(dx_filter)):
            dx1 = dx_filter[i]
            dx2 = dx_filter[j]

            ind1 = []
            ind2 = []
            for k in range(len(diagnosis_list)):
                if diagnosis_list[k] == dx1:
                    ind1.append(k)
                if diagnosis_list[k] == dx2:
                    ind2.append(k)

            indices = ind1 + ind2
            indices = np.array(indices)

            current_subjects = [image_list[k] for k in indices]
            current_diagnosis = [diagnosis_list[k] for k in indices]

            # x = x_all[indices, :]
            y = np.array([0] * len(ind1) + [1] * len(ind2))
            # gm = gram_matrix[indices, :][:, indices]

            classification_str = dx1 + '_vs_' + dx2 + ('_positive'
                                                       if positive else '')
            print 'Running ' + dx1 + ' vs ' + dx2 + ' classification'

            y_hat, coefficients, intersect, alpha = nested_folds(
                shared_x,
                indices,
                y,
                alphas,
                positive=positive,
                outer_folds=outer_folds,
                inner_folds=inner_folds,
                n_threads=n_threads)
            evaluation = evaluate_prediction(y, y_hat)

            print '\nTrue positive %0.2f' % len(evaluation['predictions'][0])
            print 'True negative %0.2f' % len(evaluation['predictions'][1])
            print 'False positive %0.2f' % len(evaluation['predictions'][2])
            print 'False negative %0.2f' % len(evaluation['predictions'][3])

            print 'Accuracy %0.2f' % evaluation['accuracy']
            print 'Balanced accuracy %0.2f' % evaluation['balanced_accuracy']
            print 'Sensitivity %0.2f' % evaluation['sensitivity']
            print 'Specificity %0.2f' % evaluation['specificity']
            print 'Positive predictive value %0.2f' % evaluation['ppv']
            print 'Negative predictive value %0.2f \n' % evaluation['npv']

            if save_weights or save_features_image:
                weights_orig = revert_mask(coefficients, data_mask, orig_shape)

            if save_weights:
                np.save(
                    join(output_directory, classification_str + '__intersect'),
                    intersect)
                np.save(
                    join(output_directory, classification_str + '__weights'),
                    weights_orig)

            if save_features_image:
                weights_to_nifti(
                    weights_orig, image_list[0],
                    join(output_directory,
                         classification_str + '__features_image.nii'))

            if save_subject_classification:
                save_subjects_prediction(
                    current_subjects, current_diagnosis, y, y_hat,
                    join(output_directory,
                         classification_str + '__subjects.csv'))

            results[(dx1, dx2)] = evaluate_prediction(y, y_hat)

    results_to_csv(
        results, dx_filter,
        join(output_directory,
             'resume' + ('_positive' if positive else '') + '.csv'))
Example #10
def svm_binary_classification(input_image_atlas,
                              subjects_visits_tsv,
                              image_list,
                              diagnosis_list,
                              output_directory,
                              kernel_function=None,
                              existing_gram_matrix=None,
                              mask_zeros=True,
                              scale_data=False,
                              balanced=False,
                              outer_folds=10,
                              inner_folds=10,
                              n_threads=10,
                              c_range=np.logspace(-10, 2, 1000),
                              save_gram_matrix=False,
                              save_subject_classification=False,
                              save_dual_coefficients=False,
                              scaler=None,
                              data_mask=None,
                              save_original_weights=False,
                              save_features_image=True):

    if (kernel_function is None and existing_gram_matrix is None) | (
            kernel_function is not None and existing_gram_matrix is not None):
        raise ValueError(
            'Kernel_function and existing_gram_matrix are mutually exclusive parameters.'
        )

    results = dict()
    dx_filter = np.unique(diagnosis_list)

    print 'Loading ' + str(len(image_list)) + ' subjects'
    x0 = load_data(image_list, subjects_visits_tsv)
    print 'Subjects loaded'
    if scale_data:
        x_all = scale(x0)
    else:
        x_all = x0

    if existing_gram_matrix is None:
        if kernel_function is not None:
            print 'Calculating Gram matrix'
            gram_matrix = kernel_function(x_all)
            print 'Gram matrix calculated'
        else:
            raise ValueError(
                'If a Gram matrix is not provided a function to calculate it (kernel_function) is a required input.'
            )
    else:
        gram_matrix = existing_gram_matrix
        if (gram_matrix.shape[0] != gram_matrix.shape[1]) | (
                gram_matrix.shape[0] != len(image_list)):
            raise ValueError(
                'The existing Gram matrix must be a square matrix with number of rows and columns equal to the number of images.'
            )

    if save_gram_matrix:
        np.savetxt(join(output_directory, 'gram_matrix.txt'), gram_matrix)

    shared_x = sharedmem.copy(x_all)
    x_all = None
    gc.collect()

    for i in range(len(dx_filter)):
        for j in range(i + 1, len(dx_filter)):
            print j
            dx1 = dx_filter[i]
            dx2 = dx_filter[j]

            ind1 = []
            ind2 = []
            for k in range(len(diagnosis_list)):
                if diagnosis_list[k] == dx1:
                    ind1.append(k)
                if diagnosis_list[k] == dx2:
                    ind2.append(k)

            indices = ind1 + ind2

            current_subjects = [image_list[k] for k in indices]
            current_diagnosis = [diagnosis_list[k] for k in indices]

            y = np.array([0] * len(ind1) + [1] * len(ind2))
            gm = gram_matrix[indices, :][:, indices]

            classification_str = dx1 + '_vs_' + dx2 + ('_balanced' if balanced
                                                       else '_not_balanced')
            print 'Running ' + dx1 + ' vs ' + dx2 + ' classification'

            y_hat, dual_coefficients, sv_indices, intersect, c, auc = cv_svm(
                gm,
                shared_x,
                np.array(indices),
                y,
                c_range,
                balanced=balanced,
                outer_folds=outer_folds,
                inner_folds=inner_folds,
                n_threads=n_threads)

            evaluation = evaluate_prediction(y, y_hat)
            evaluation['auc'] = auc

            print '\nTrue positive %0.2f' % len(evaluation['predictions'][0])
            print 'True negative %0.2f' % len(evaluation['predictions'][1])
            print 'False positive %0.2f' % len(evaluation['predictions'][2])
            print 'False negative %0.2f' % len(evaluation['predictions'][3])

            print 'AUC %0.2f' % auc
            print 'Accuracy %0.2f' % evaluation['accuracy']
            print 'Balanced accuracy %0.2f' % evaluation['balanced_accuracy']
            print 'Sensitivity %0.2f' % evaluation['sensitivity']
            print 'Specificity %0.2f' % evaluation['specificity']
            print 'Positive predictive value %0.2f' % evaluation['ppv']
            print 'Negative predictive value %0.2f \n' % evaluation['npv']

            if save_dual_coefficients:
                np.save(
                    join(output_directory,
                         classification_str + '__dual_coefficients'),
                    dual_coefficients[0])
                np.save(
                    join(output_directory,
                         classification_str + '__sv_indices'), sv_indices)
                np.save(
                    join(output_directory, classification_str + '__intersect'),
                    intersect)

            if save_original_weights or save_features_image:
                weights_orig = features_weights(current_subjects,
                                                dual_coefficients[0],
                                                sv_indices, scaler, data_mask)

            if save_original_weights:
                np.save(
                    join(output_directory, classification_str + '__weights'),
                    weights_orig)

            if save_features_image:
                output_image = weights_to_nifti(input_image_atlas,
                                                weights_orig)
                output_image.to_filename(
                    join(output_directory,
                         classification_str + '__weights.nii'))

            if save_subject_classification:
                save_subjects_prediction(
                    current_subjects, current_diagnosis, y, y_hat,
                    join(output_directory,
                         classification_str + '__subjects.tsv'))

            results[(dx1, dx2)] = evaluation  # evaluate_prediction(y, y_hat)

    results_to_tsv(
        results, dx_filter,
        join(
            output_directory, 'resume' +
            ('_balanced' if balanced else '_not_balanced') + '.tsv'))
    shared_x = None
    gc.collect()
Example #11
    def setData(self, train_data, train_y, val_data, val_y, max_train_size=0):
        """
        Set the data of the network, assuming train size << val size
        :param train_data: training data
        :param train_y: training labels
        :param val_data: validation data
        :param val_y: validation labels
        :param max_train_size: optional if training data has additional large chunk
        :return: None
        """

        # check sizes
        if (train_data.shape[0] != train_y.shape[0]) or (val_data.shape[0] != val_y.shape[0]):
            raise ValueError("Number of samples must be the same as number of labels.")

        # Check if the train_y is the image
        self.trainSize = max(train_data.nbytes, train_y.nbytes, max_train_size) / 1024. / 1024.
        self.numTrainSamplesMB = train_data.shape[0]
        self.numTrainSamples = self.numTrainSamplesMB
        self.numValSamples = val_data.shape[0]
        self.sampleSize = self.trainSize / self.numTrainSamplesMB

        # at least one minibatch per macro
        assert self.memorySize > self.sampleSize*self.cfgParams.batch_size, "{} > {}".format(self.memorySize, self.sampleSize*self.cfgParams.batch_size)

        # shrink macro batch size to smallest possible
        if self.getNumMacroBatches() == 1:
            self.memorySize = self.sampleSize * numpy.ceil(self.numTrainSamplesMB/float(self.cfgParams.batch_size)) * self.cfgParams.batch_size

        # keep backup of original data
        # pad last macro batch separately to save memory
        if self.getNumMacroBatches() > 1:
            if self.cfgParams.para_load is True:
                self.train_data_xDB = sharedmem.copy(train_data[0:(self.getNumMacroBatches()-1)*self.getNumSamplesPerMacroBatch()])
                self.train_data_xDBlast = sharedmem.copy(self.alignData(train_data[(self.getNumMacroBatches()-1)*self.getNumSamplesPerMacroBatch():], fillData=train_data))
                self.train_data_yDB = sharedmem.copy(train_y[0:(self.getNumMacroBatches()-1)*self.getNumSamplesPerMacroBatch()])
                self.train_data_yDBlast = sharedmem.copy(self.alignData(train_y[(self.getNumMacroBatches()-1)*self.getNumSamplesPerMacroBatch():], fillData=train_y))
            else:
                # save memory, we do not need extra sharedmem
                self.train_data_xDB = train_data[0:(self.getNumMacroBatches()-1)*self.getNumSamplesPerMacroBatch()]
                self.train_data_xDBlast = self.alignData(train_data[(self.getNumMacroBatches()-1)*self.getNumSamplesPerMacroBatch():], fillData=train_data)
                self.train_data_yDB = train_y[0:(self.getNumMacroBatches()-1)*self.getNumSamplesPerMacroBatch()]
                self.train_data_yDBlast = self.alignData(train_y[(self.getNumMacroBatches()-1)*self.getNumSamplesPerMacroBatch():], fillData=train_y)
            self.managedVar.append('train_data_x')
            self.managedVar.append('train_data_y')
        else:
            if self.cfgParams.para_load is True:
                self.train_data_xDB = sharedmem.copy(self.alignData(train_data))
                self.train_data_yDB = sharedmem.copy(self.alignData(train_y))
            else:
                # save memory, we do not need extra sharedmem
                self.train_data_xDB = self.alignData(train_data)
                self.train_data_yDB = self.alignData(train_y)
        self.trainingVar.append('train_data_x')
        self.trainingVar.append('train_data_y')

        # no need to cache validation data
        self.val_data_xDB = val_data
        self.val_data_yDB = val_y

        print("Train size: {}MB, Memory available: {}MB, sample size: {}MB, aligned memory: {}MB".format(
            self.trainSize, self.memorySize, self.sampleSize, self.getGPUMemAligned()))
        print("{} train samples, {} val samples, batch size {}".format(
            train_data.shape[0], val_data.shape[0], self.cfgParams.batch_size))
        print("{} macro batches, {} mini batches per macro, {} full mini batches total".format(
            self.getNumMacroBatches(), self.getNumMiniBatchesPerMacroBatch(), self.getNumMiniBatches()))
        print("{} data chunks, {} train samples total".format(self.numChunks, self.numTrainSamples))

        # shared variable already exists?
        if hasattr(self, 'train_data_x'):
            print("Reusing shared variables!")
            if self.trainSize > self.getGPUMemAligned():
                print("Loading {} macro batches a {}MB".format(self.getNumMacroBatches(), self.getGPUMemAligned()))
                # load first macro batch
                idx = self.getNumSamplesPerMacroBatch()
                self.replaceTrainingData(0, idx)
                self.replaceValData(self.val_data_xDB, self.val_data_yDB)
            else:
                print("Loading single macro batch {}/{}MB".format(self.trainSize, self.getGPUMemAligned()))
                self.replaceTrainingData(0, self.train_data_xDB.shape[0])
                self.replaceValData(self.val_data_xDB, self.val_data_yDB)
        else:
            # load shared data
            if self.trainSize > self.getGPUMemAligned():
                print("Loading {} macro batches a {}MB".format(self.getNumMacroBatches(), self.getGPUMemAligned()))
                # load first macro batch
                idx = self.getNumSamplesPerMacroBatch()
                self.train_data_x = theano.shared(self.train_data_xDB[:idx], name='train_data_x', borrow=True)
                self.train_data_y = theano.shared(self.train_data_yDB[:idx], name='train_data_y', borrow=True)
                self.val_data_x = theano.shared(self.val_data_xDB, name='val_data_x', borrow=True)
                self.val_data_y = theano.shared(self.val_data_yDB, name='val_data_y', borrow=True)
            else:
                print("Loading single macro batch {}/{}MB".format(self.trainSize, self.getGPUMemAligned()))
                self.train_data_x = theano.shared(self.train_data_xDB, name='train_data_x', borrow=True)
                self.train_data_y = theano.shared(self.train_data_yDB, name='train_data_y', borrow=True)
                self.val_data_x = theano.shared(self.val_data_xDB, name='val_data_x', borrow=True)
                self.val_data_y = theano.shared(self.val_data_yDB, name='val_data_y', borrow=True)
Example #12
def classall(infile,components,rotation,outfile='classout.hdf5',npoints=16384,nn=16384,lim=[12,6],end=False):
    sys.path.append('/n/ghernquist/kchua/Orbit/201-code-C-Mar2016/Python')
    import taxon
    import solve
    def calcclass(i,t,x,v,nn):
        #taxon(t,x,v,n,jsub,jdim,jcla,jcl,jpan,jlin,jcom,arch)
        tt=np.asfortranarray(t[i])
        xx=np.asfortranarray(x[i].transpose())
        vv=np.asfortranarray(v[i].transpose())*Myr
        if nn==32768:
            out=taxon32768.taxon(tt,xx,vv,nn,1,3,0,0,0,0,0,'test')
        elif nn==16384:
            #out=taxon.taxon(tt,xx,vv,nn,1,3,0,0,0,0,0,'test')
            out=taxon.taxon(tt,xx,vv,nn,1,3,0,0,0,0,0,'test')
        elif nn==8192:
            out=taxon8192.taxon(tt,xx,vv,nn,1,3,0,0,0,0,0,'test')
        return out
    ## N = no. of particles in each bin
    ## nn = no. of points to be used in classification
    with tables.open_file('shape.hdf5','r') as rotfile:
        rotmat=rotfile.root.rotmat[rotation]
    with tables.open_file(infile,'r') as u:
        #t=u.root.t[:]/Myr
        npart=u.root.x.shape[0]/(npoints)/6
        print 'analyzing orbits for',infile
        classout=np.zeros((npart,2))
        x=u.root.x[:].reshape(npart,npoints,6)
        t=u.root.t[:].reshape(npart,npoints)/Myr
        try:
            Ncom=u.root.NumComponents[:]
        except tables.NoSuchNodeError:
            Ncom=components
        assert Ncom in [1,2]
    if end:
        print 'using only ',end,' points'
        t=t[:end]
        x=x[:,:end,:]
    else:
        print 'using all points'

    interval=npoints/nn
    print 'interval = ', interval
    with tables.open_file(outfile,'w') as file:
        file.create_carray("/","classorb",tables.Int32Col(),(npart,2))
        file.create_carray("/","classification",tables.Int32Col(),(npart,))
        distout=file.create_carray("/","avgdist",tables.Float64Col(),(npart,2))
        file.create_carray("/","totE",tables.Float64Col(),(npart,2))
        file.create_carray("/","Ncomponents",tables.Int32Atom(),(1,))
        file.root.Ncomponents[0]=Ncom
        v=sharedmem.copy(x[:,:,3:])
        x=sharedmem.copy(x[:,:,:3])

        distout[:,0]=np.sqrt((x[:,0]**2).sum(axis=1))
        with sharedmem.MapReduce() as pool:
            def avgr(i,xx,vv):
                d=np.sqrt((xx[i]**2).sum(axis=1))
                vr=abs(np.einsum('ij,ij->i',xx[i],vv[i])/d)
                return np.sum(d/vr)/np.sum(1./vr)
            partialfunc=partial(avgr,xx=x,vv=v)
            out=pool.map(partialfunc,xrange(npart))
            distout[:,1]=out

        if Ncom==2:
            A=solve.Problem('varh',nlim=lim[0],llim=lim[1])
            B=solve.Problem('varc',nlim=lim[0],llim=lim[1])
            file.root.totE[:,0]=A.potential(x[:, 0])[:,0]+ B.potential(x[:, 0])[:,0] + \
                    (v[:,0]**2).sum(axis=1)/2.
            file.root.totE[:,1]=A.potential(x[:,-1])[:,0]+ B.potential(x[:,-1])[:,0] + \
                    (v[:,-1]**2).sum(axis=1)/2.
            del A; del B
        elif Ncom==1:
            A=solve.Problem('var',nlim=lim[0],llim=lim[1]) #calculate initial and final energy
            file.root.totE[:,0]=A.potential(x[:,0])[:,0]+(v[:,0]**2).sum(axis=1)/2.
            file.root.totE[:,1]=A.potential(x[:,-1])[:,0]+(v[:,-1]**2).sum(axis=1)/2.
            del A
        elif Ncom==3:
            A=solve.Problem('varc',nlim=lim[0],llim=lim[1]) #calculate initial and final energy
            file.root.totE[:,0]=A.potential(x[:,0])[:,0]+(v[:,0]**2).sum(axis=1)/2.
            file.root.totE[:,1]=A.potential(x[:,-1])[:,0]+(v[:,-1]**2).sum(axis=1)/2.
            del A
        else:
            print "Number of components not specified"

        #t_base = Array(ctypes.c_double, npart*nn)
        #tt = np.ctypeslib.as_array(t_base.get_obj()).reshape(npart,nn)
        #x_base = Array(ctypes.c_double, npart*nn*3)
        #xx = np.ctypeslib.as_array(x_base.get_obj()).reshape(npart,nn,3)
        #v_base = Array(ctypes.c_double, npart*nn*3)
        #vv = np.ctypeslib.as_array(v_base.get_obj()).reshape(npart,nn,3)
        #tt[:]=t[:,::interval][:,:nn]
        #xx[:]=x[:,::interval,:3][:,:nn]
        #vv[:]=x[:,::interval,3:][:,:nn]
        #del t
        #del x
        t=sharedmem.copy(t[:,::interval][:,:nn])
        x=x[:,::interval][:,:nn]
        v=v[:,::interval][:,:nn]
        x=np.dot(x,rotmat)
        v=np.dot(v,rotmat)
        # Use 8 threads
        #pool=Pool(8)
        with sharedmem.MapReduce() as pool:
            partialfunc=partial(calcclass,t=t,x=x,v=v,nn=nn)
            out=pool.map(partialfunc,xrange(npart))
        #pool.close()
        #pool.join()
            file.root.classorb[:]=np.array(out)

        with sharedmem.MapReduce() as pool:
            out=pool.map(getclass,file.root.classorb[:,0])
            file.root.classification[:]=out
Example #13
    subject2_file_path = '{0}/data/processed/sub{1}_{2}_2d.npy'.format(
        REPO_HOME_PATH, subject2, alignment)
    if not path.exists(subject1_file_path):
        raise ValueError(
            'File missing: {0}, run preprocess.py to generate necessary file'.format(
                subject1_file_path))
    if not path.exists(subject2_file_path):
        raise ValueError(
            'File missing: {0}, run preprocess.py to generate necessary file'.format(
                subject2_file_path))
    return (subject1_file_path, subject2_file_path)


if __name__ == '__main__':
    subject1_file_path, subject2_file_path = check_command_line_arguments(
        sys.argv)
    correlation_file_name = '{0}/data/processed/r_sub{1}_sub{2}_{3}'.format(
        REPO_HOME_PATH, sys.argv[2], sys.argv[3], sys.argv[1])
    if not path.exists(correlation_file_name +
                       '.npy') or not cf.USE_CACHED_DATA:
        shared_subject1 = sm.copy(np.load(subject1_file_path))
        gc.collect()
        shared_subject2 = sm.copy(np.load(subject2_file_path))
        gc.collect()
        voxel_correlations = parallelize_correlation()
        np.save(correlation_file_name, voxel_correlations)
        print('Saved {0}'.format(correlation_file_name + '.npy'))
    else:
        print('Using cached version of {0}'.format(correlation_file_name +
                                                   '.npy'))
Example #14
def negative_binomial(read_dict,peakfilename, swap, parameter):
    '''the main function that tests for significant windows.'''
    # Initialize the parameters
    peaktype = parameter.peaktype
    threshold = parameter.threshold
    windowsize = parameter.window_size
    # Indicate the data 
    if parameter.difftest is True: 
        test_list = parameter.chip1
        control_list = parameter.chip2
    else: 
        test_list = parameter.chip1
        control_list = parameter.input1
    num_tests = parameter.get_genome_size()/windowsize
    

    #compute number of replicates
    test_rep = len(test_list)
    control_rep = len(control_list)
    start1 = 0
    end1 = start2 = test_rep
    end2 = test_rep+control_rep
    # if swap 
    if swap is True: 
        test_rep, control_rep = control_rep, test_rep
        #test_list, control_list = control_list, test_list

    # initialize basic array structures
    sig_peaks_list = []    
    
    # single-core version. 
    if parameter.num_procs <2:
        for chr in parameter.chr_info:
            read_array = read_dict[chr]
            sig_peaks_list.extend(per_chr_nbtest(read_array, chr, swap,threshold, peaktype,parameter, start1,end1,start2,end2,test_rep,control_rep))
    # multi-core version
    else: 
        result_list = []
        def log_result(result):
            result_list.append(result)
        try: 
            import sharedmem
            for chr in parameter.chr_info:
                read_array = read_dict[chr]
                read_dict[chr] = sharedmem.copy(read_array)
        except ImportError:
            print("Import sharedmem package failed")
            
        pool = multiprocessing.Pool(processes=parameter.num_procs)#,maxtasksperchild=1)
        for chr in parameter.chr_info:
            read_array = read_dict[chr]
            pool.apply_async(per_chr_nbtest, (read_array, chr, swap,threshold, peaktype, parameter.difftest, 
                       start1,end1,start2,end2,test_rep,control_rep),callback=log_result)
        pool.close()
        pool.join()
        sig_peaks_list = list(itertools.chain(*result_list))
            
    #calculate the BH FDR. 
    debug("begin estimating FDR")
    sig_peaks_list = cal_FDR(sig_peaks_list, num_tests)
    debug("finished estimating FDR")

    # merge adjacent significant peaks. 
    info ("Merging adjacent significant windows...")
    final_peak_list = []
    for chr in read_dict:
        sig_peak_list_by_chr = \
                [item for item in sig_peaks_list if item.chr==chr]
        if len(sig_peak_list_by_chr) == 0:
            continue  # if there is no significant peak in this chromosome, skip it.
        sig_index = [item.index for item in sig_peak_list_by_chr]
        sig_pval = [item.pvalue for item in sig_peak_list_by_chr]
        sig_qval = [item.qvalue for item in sig_peak_list_by_chr]
        sig_g1_count = [item.g1_count for item in sig_peak_list_by_chr]
        #print sig_g1_count, len(sig_g1_count)
        sig_g2_count = [item.g2_count for item in sig_peak_list_by_chr]
        sig_start, sig_end, sig_fc, sig_pval, sig_qval = merge_sig_window(sig_index,
                sig_g1_count, sig_g2_count, sig_pval, sig_qval, peaktype)
        for idx in range(len(sig_start)):
            final_peak = [chr, sig_start[idx]*windowsize/2, 
                          sig_end[idx]*windowsize/2+windowsize, sig_fc[idx],
                          sig_pval[idx], sig_qval[idx]]
            final_peak_list.append(final_peak) 
    # sort the peak list
    final_peak_list = sorted(final_peak_list, key=itemgetter(4))
    info("%d peaks called.", len(final_peak_list))
    if len(final_peak_list) == 0:
        return 
    #start output peaks. 
    all_fc = [peak[3] for peak in final_peak_list] 
    #print all_fc
    max_fc = max(all_fc)
    # write results to peak file.
    peakfile = open(peakfilename, 'w')
    for idx, final_peak in enumerate(final_peak_list): 
        chr = final_peak[0]
        start = final_peak[1]
        end = final_peak[2]
        fc = final_peak[3]
        pval = final_peak[4]
        qval = final_peak[5]
        # tentatively, assign the normalized fold change as the score in 5th column
        score = fc/max_fc*1000 # range from 0 to 1000
        peakfile.write( '\t'.join([chr, str(int(start)), str(int(end)), 
                        ("chip2" if swap else "chip1") + "_peak_" +str(idx+1), 
                        str(score), '.', str(fc), str(pval), str(qval)]) + '\n')
    return 
Example #15
def tps_multiple(
    target_words: List[str],
    word_to_int: dict,
    neighbourhood_size: int,
    words_vocabulary: Optional[list] = None,
    word_embeddings: np.ndarray = None,
    word_embeddings_normalized: np.ndarray = None,
    word_embeddings_pairwise_dists: np.ndarray = None,
    ann_instance: ApproxNN = None,
    sanity_check: bool = False,
    return_persistence_diagram: bool = False,
    n_jobs: int = 1,
    progressbar_enabled: bool = False,
    verbose: int = 1,
) -> Union[float, tuple]:
    """
    Computes the topological polysemy (TPS) [1] of words with respect
    to some word embeddings and neighbourhood size.

    Parameters
    ----------
    target_words : list of str
        Target words (w)
    word_to_int : dict of str and int
        Dictionary mapping from word to its integer representation.
    neighbourhood_size : int
        Neighbourhood size (n)
    words_vocabulary : list, optional
        List of either words (str) or word integer representations (int), indicating
        which part of the vocabulary to use (defaults to None, i.e., the whole vocabulary).
    word_embeddings : np.ndarray
        Word embeddings; either word_embeddings or word_embeddings_normalized
        must be specified (defaults to None).
    word_embeddings_normalized : np.ndarray, optional
        Normalized word embeddings; either word_embeddings_normalized or word_embeddings
        must be specified (defaults to None).
    word_embeddings_pairwise_dists : np.ndarray, optional
        Numpy matrix containing pairwise distances between word embeddings
        (defaults to None).
    ann_instance : ApproxNN, optional
        Approximate nearest neighbour (ANN) instance, built on the word embeddings
        (defaults to None). If specified, the ANN index is used to find punctured
        neighbourhoods.
    sanity_check : bool, optional
        Whether or not to run sanity checks (defaults to False).
    return_persistence_diagram : bool, optional
        Whether or not to return persistence diagram (defaults to False).
    n_jobs : int, optional
        Number of processes to use (defaults to 1).
    progressbar_enabled: bool, optional
        Whether or not the progressbar is enabled (defaults to False).
    verbose : int, optional
        Verbosity mode, 0 (silent), 1 (verbose), 2 (semi-verbose). Defaults to 1 (verbose).

    Returns
    -------
    result : float or tuple
        TPS values of `target_words` w.r.t. word_embeddings and neighbourhood_size.
        If return_persistence_diagram is set to true, then a tuple is returned
        with the TPS values as the first value and the zero degree persistence diagram
        as the second value.

    References
    ----------
    .. [1] Alexander Jakubowski, Milica Gašić, & Marcus Zibrowius. (2020).
       Topology of Word Embeddings: Singularities Reflect Polysemy.
    """
    tps_scores = np.zeros_like(target_words, dtype=float)
    tps_persistence_diagrams = None
    if return_persistence_diagram:
        tps_persistence_diagrams = [None] * len(target_words)

    # Only normalize word embeddings once
    if word_embeddings_normalized is None:
        if words_vocabulary is not None:
            word_vectors = words_to_vectors(
                words_vocabulary=words_vocabulary,
                word_to_int=word_to_int,
                word_embeddings=word_embeddings,
            )
        else:
            word_vectors = word_embeddings

        word_embeddings_normalized = word_vectors / np.linalg.norm(
            word_vectors, axis=1).reshape(-1, 1)
    if n_jobs == -1:
        n_jobs = cpu_count()
    if n_jobs > 1:

        # Prepare data for multiprocessing
        if verbose == 1:
            print("Preparing data for multiprocessing...")
        word_embeddings_normalized_shared = sharedmem.copy(
            word_embeddings_normalized)
        word_embeddings_pairwise_dists_shared = None
        if word_embeddings_pairwise_dists is not None:
            word_embeddings_pairwise_dists_shared = sharedmem.copy(
                word_embeddings_pairwise_dists)
        if verbose == 1:
            print("Done!")

        # Prepare arguments
        num_data_points_per_process = int(len(target_words) // n_jobs)
        mp_args = [(
            word_embeddings_normalized_shared,
            word_embeddings_pairwise_dists_shared,
            target_words[target_word_indices_chunk],
            target_word_indices_chunk,
            word_to_int,
            neighbourhood_size,
            sanity_check,
            return_persistence_diagram,
            progressbar_enabled,
        ) for target_word_indices_chunk in batch_list_gen(
            np.arange(len(target_words)), num_data_points_per_process)]

        # Run MP
        if verbose == 1:
            print(f"Computing TPS using {n_jobs} processes...")
        mp_var_dict["ann_instance"] = ann_instance
        with sharedmem.MapReduce(np=n_jobs) as pool:
            mp_results = pool.map(tps_multiple_by_mp_args, mp_args)
            for tps_result, target_word_indices in mp_results:
                if return_persistence_diagram:
                    tps_result_scores, tps_result_pds = tps_result
                    tps_scores[target_word_indices] = tps_result_scores
                    for i, pds_idx in enumerate(target_word_indices):
                        tps_persistence_diagrams[pds_idx] = tps_result_pds[i]
                else:
                    tps_scores[target_word_indices] = tps_result
    else:
        for i, target_word in enumerate(
                tqdm(target_words, disable=not progressbar_enabled)):
            tps_result = tps(
                target_word=target_word,
                word_to_int=word_to_int,
                neighbourhood_size=neighbourhood_size,
                word_embeddings_normalized=word_embeddings_normalized,
                word_embeddings_pairwise_dists=word_embeddings_pairwise_dists,
                ann_instance=ann_instance,
                sanity_check=sanity_check,
                return_persistence_diagram=return_persistence_diagram,
            )
            if return_persistence_diagram:
                tps_scores[i], tps_persistence_diagrams[i] = tps_result
            else:
                tps_scores[i] = tps_result

    if return_persistence_diagram:
        return tps_scores, tps_persistence_diagrams
    else:
        return tps_scores
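A hedged usage sketch for tps_multiple, assuming the function is importable from its module; random vectors stand in for trained word embeddings and the vocabulary below is purely illustrative:

import numpy as np

vocab = ["bank", "river", "money", "cash", "shore", "loan", "water", "coast"]
word_to_int = {w: i for i, w in enumerate(vocab)}
embeddings = np.random.default_rng(0).normal(size=(len(vocab), 50))

tps_scores = tps_multiple(
    target_words=["bank", "coast"],
    word_to_int=word_to_int,
    neighbourhood_size=3,
    word_embeddings=embeddings,
    n_jobs=1,  # single process; n_jobs > 1 switches to the sharedmem path above
)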
Example #16
def negative_binomial(read_dict, peakfilename, swap, parameter):
    '''the main function that tests for significant windows.'''
    print len(read_dict)
    # Initialize the parameters
    peaktype = parameter.peaktype
    threshold = parameter.threshold
    windowsize = parameter.window_size
    # Indicate the data
    if parameter.difftest is True:
        test_list = parameter.chip1
        control_list = parameter.chip2
    else:
        test_list = parameter.chip1
        control_list = parameter.input1
    num_tests = parameter.get_genome_size() / windowsize

    #compute number of replicates
    test_rep = len(test_list)
    control_rep = len(control_list)
    start1 = 0
    end1 = start2 = test_rep
    end2 = test_rep + control_rep
    # if swap
    if swap is True:
        test_rep, control_rep = control_rep, test_rep
        #test_list, control_list = control_list, test_list

    # initialize basic array structures
    sig_peaks_list = []

    # single-core version.
    if parameter.num_procs < 2:
        for chr in parameter.chr_info:
            read_array = read_dict[chr]
            sig_peaks_list.extend(
                per_chr_nbtest(read_array, chr, swap, threshold, peaktype,
                               parameter, start1, end1, start2, end2, test_rep,
                               control_rep))
    # multi-core version
    else:
        result_list = []

        def log_result(result):
            result_list.append(result)

        try:
            import sharedmem
            for chr in parameter.chr_info:
                read_array = read_dict[chr]
                read_dict[chr] = sharedmem.copy(read_array)
        except ImportError:
            print "Import sharedmem package failed"

        pool = multiprocessing.Pool(
            processes=parameter.num_procs)  #,maxtasksperchild=1)
        for chr in parameter.chr_info:
            read_array = read_dict[chr]
            pool.apply_async(per_chr_nbtest,
                             (read_array, chr, swap, threshold, peaktype,
                              parameter.difftest, start1, end1, start2, end2,
                              test_rep, control_rep),
                             callback=log_result)
        pool.close()
        pool.join()
        sig_peaks_list = list(itertools.chain(*result_list))

    #calculate the BH FDR.
    debug("begin estimating FDR")
    sig_peaks_list = cal_FDR(sig_peaks_list, num_tests)
    debug("finished estimating FDR")

    # merge adjacent significant peaks.
    info("Merging adjacent significant windows...")
    final_peak_list = []
    for chr in read_dict:
        sig_peak_list_by_chr = \
                [item for item in sig_peaks_list if item.chr==chr]
        if len(sig_peak_list_by_chr) == 0:
            continue  # if there is no significant peak in this chromosome, skip it.
        sig_index = [item.index for item in sig_peak_list_by_chr]
        sig_pval = [item.pvalue for item in sig_peak_list_by_chr]
        sig_qval = [item.qvalue for item in sig_peak_list_by_chr]
        sig_g1_count = [item.g1_count for item in sig_peak_list_by_chr]
        #print sig_g1_count, len(sig_g1_count)
        sig_g2_count = [item.g2_count for item in sig_peak_list_by_chr]
        sig_start, sig_end, sig_fc, sig_pval, sig_qval = merge_sig_window(
            sig_index, sig_g1_count, sig_g2_count, sig_pval, sig_qval,
            peaktype)
        for idx in range(len(sig_start)):
            final_peak = [
                chr, sig_start[idx] * windowsize / 2,
                sig_end[idx] * windowsize / 2 + windowsize, sig_fc[idx],
                sig_pval[idx], sig_qval[idx]
            ]
            final_peak_list.append(final_peak)
    # sort the peak list
    final_peak_list = sorted(final_peak_list, key=itemgetter(4))
    info("%d peaks called.", len(final_peak_list))
    if len(final_peak_list) == 0:
        return
    #start output peaks.
    all_fc = [peak[3] for peak in final_peak_list]
    #print all_fc
    max_fc = max(all_fc)
    # write results to peak file.
    peakfile = open(peakfilename, 'w')
    for idx, final_peak in enumerate(final_peak_list):
        chr = final_peak[0]
        start = final_peak[1]
        end = final_peak[2]
        fc = final_peak[3]
        pval = final_peak[4]
        qval = final_peak[5]
        # tentatively, assign the normalized fold change as the score in 5th column
        score = fc / max_fc * 1000  # range from 0 to 1000
        peakfile.write('\t'.join([
            chr,
            str(start),
            str(end), ("chip2" if swap else "chip1") + "_peak_" + str(idx + 1),
            str(score), '.',
            str(fc),
            str(pval),
            str(qval)
        ]) + '\n')
    return
Example #17
def compute_gad(
    data_points: np.ndarray,
    manifold_dimension: int,
    annulus_inner_radius: float = None,
    annulus_outer_radius: float = None,
    data_point_ints: list = None,
    data_points_pairwise_distances: np.ndarray = None,
    data_points_approx_nn: ApproxNN = None,
    data_points_distance_metric: Callable = fastdist.euclidean,
    use_ripser_plus_plus: bool = False,
    ripser_plus_plus_threshold: int = 200,
    use_knn_annulus: bool = False,
    knn_annulus_inner: int = None,
    knn_annulus_outer: int = None,
    knn_annulus_metric: Callable = fastdist.euclidean,
    knn_annulus_metric_name: str = "euclidean",
    return_annlus_persistence_diagrams: bool = False,
    progressbar_enabled: bool = False,
    n_jobs: int = 1,
    verbose: int = 1,
) -> dict:
    """
    Computes geometric anomaly detection (GAD) Procedure 1 from [1].

    Parameters
    ----------
    data_points : np.ndarray
        All data points.
    manifold_dimension : int
        Manifold homology dimension (k parameter in [1]).
    annulus_inner_radius : float
        Inner annulus radius.
    annulus_outer_radius : float
        Outer annulus radius.
    data_point_ints : np.ndarray
        Array specifying which data point indices are used from all the data points.
    data_points_pairwise_distances : np.ndarray, optional
        Pairwise distances of data points (defaults to None).
    data_points_approx_nn : ApproxNN, optional
        ApproxNN instance (defaults to None).
    data_points_distance_metric : Callable, optional
        Distance metric callable to compute exact distance between any two data
        points (defaults to euclidean distance, `fastdist.euclidean`).
    use_ripser_plus_plus : bool
        Whether or not to use Ripser++ (GPU acceleration).
    ripser_plus_plus_threshold : int
        The least number of data points in order to use Ripser++, only has an effect
        if `use_ripser_plus_plus` is set to True.
    use_knn_annulus : bool
        Whether or not to use the KNN version of GAD.
    knn_annulus_inner : int
        Number of neighbours to determine inner annulus radius.
    knn_annulus_outer : int
        Number of neighbours to determine outer annulus radius.
    knn_annulus_metric : Callable
        fastdist metric; only required if `data_points_pairwise_distances` and
        `data_points_approx_nn` are None (defaults to fastdist.euclidean).
    knn_annulus_metric_name : str
        String name of the `knn_annulus_metric` callable (defaults to "euclidean").
    return_annlus_persistence_diagrams : bool
        Whether or not to return annulus persistence diagrams.
    progressbar_enabled : bool
        Whether or not the tqdm progressbar is enabled.
    n_jobs : int, optional
        Number of processes to use (defaults 1, -1 denotes all processes).
    verbose : int, optional
        Verbosity mode, 0 (silent), 1 (verbose), 2 (semi-verbose). Defaults to 1 (verbose).

    Returns
    -------
    result : dict
        Result dictionary consisting of:
            "P_man" : list
                List of point indices of k-manifold points.
            "P_bnd" : list
                List of point indices of boundary points.
            "P_int" : list
                List of point indices of intersection points.
            "annlus_persistence_diagrams" : list
                List of persistence diagrams of annulus points, if
                `return_annlus_persistence_diagrams` is set to True.

    References
    ----------
    .. [1] Bernadette J Stolz, Jared Tanner, Heather A Harrington, & Vidit Nanda.
       (2019). Geometric anomaly detection in data.
    """
    if data_point_ints is None:
        data_point_ints = np.arange(len(data_points))

    # Get distance function
    distance_func = get_point_distance_func(
        data_points=data_points,
        pairwise_distances=data_points_pairwise_distances,
        metric_callable=data_points_distance_metric,
    )

    # Get KNN annulus function if use_knn_annulus is True
    knn_func = None
    if use_knn_annulus:
        knn_func = get_knn_func_data_points(
            data_points=data_points,
            pairwise_distances=data_points_pairwise_distances,
            approx_nn=data_points_approx_nn,
            metric=knn_annulus_metric,
            metric_name=knn_annulus_metric_name,
        )

    target_homology_dim = manifold_dimension - 1
    if n_jobs == -1:
        n_jobs = cpu_count()
    if n_jobs > 1:

        # Initialize MP results
        results = {
            "P_bnd": [],
            "P_man": [],
            "P_int": [],
        }
        if return_annlus_persistence_diagrams:
            results["annulus_pds"] = {}

        # Prepare data for multiprocessing
        if verbose == 1:
            print("Preparing data for multiprocessing...")
        data_points_shared = sharedmem.copy(data_points)
        # data_points_raw = Array(
        #     "d", data_points.shape[0] * data_points.shape[1], lock=False
        # )
        # data_points_raw_np = np.frombuffer(data_points_raw).reshape(data_points.shape)
        # np.copyto(data_points_raw_np, data_points)
        if verbose == 1:
            print("Done!")

        # Prepare arguments
        num_data_points_per_process = int(len(data_point_ints) // n_jobs)
        mp_args = [
            (
                data_points_shared,
                data_point_ints_chunk,
                annulus_inner_radius,
                annulus_outer_radius,
                use_knn_annulus,
                knn_annulus_inner,
                knn_annulus_outer,
                target_homology_dim,
                use_ripser_plus_plus,
                ripser_plus_plus_threshold,
                return_annlus_persistence_diagrams,
            )
            for data_point_ints_chunk in batch_list_gen(
                data_point_ints, num_data_points_per_process
            )
        ]
        mp_var_dict["distance_func"] = distance_func
        if knn_func is not None:
            mp_var_dict["knn_func"] = knn_func

        # Run MP
        if verbose == 1:
            print(f"Computing GAD using {n_jobs} processes...")
        with sharedmem.MapReduce(np=n_jobs) as pool:
            mp_results = pool.map(compute_gad_point_indices_mp, mp_args)
            for result in mp_results:
                results["P_man"].extend(result["P_man"])
                results["P_bnd"].extend(result["P_bnd"])
                results["P_int"].extend(result["P_int"])
                if return_annlus_persistence_diagrams:
                    results["annulus_pds"].update(result["annulus_pds"])

        # with Pool(
        #     processes=n_jobs,
        #     initializer=compute_gad_mp_init,
        #     initargs=(data_points_raw_np, data_points.shape, distance_func, knn_func),
        # ) as pool:
        #     for result in tqdm(
        #         pool.imap_unordered(compute_gad_point_indices_mp, grid_search_args),
        #         total=n_jobs,
        #         disable=not progressbar_enabled,
        #     ):
        #         results["P_man"].extend(result["P_man"])
        #         results["P_bnd"].extend(result["P_bnd"])
        #         results["P_int"].extend(result["P_int"])
        #         if return_annlus_persistence_diagrams:
        #             results["annulus_pds"].update(result["annulus_pds"])
    else:

        # Compute GAD using only one processor
        if verbose == 1:
            print("Computing GAD...")
        results = compute_gad_point_indices(
            data_point_indices=data_point_ints,
            data_points=data_points,
            annulus_inner_radius=annulus_inner_radius,
            annulus_outer_radius=annulus_outer_radius,
            distance_func=distance_func,
            use_knn_annulus=use_knn_annulus,
            knn_func=knn_func,
            knn_annulus_inner=knn_annulus_inner,
            knn_annulus_outer=knn_annulus_outer,
            target_homology_dim=target_homology_dim,
            use_ripser_plus_plus=use_ripser_plus_plus,
            ripser_plus_plus_threshold=ripser_plus_plus_threshold,
            return_annlus_persistence_diagrams=return_annlus_persistence_diagrams,
            progressbar_enabled=progressbar_enabled,
        )

    return results
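Similarly, a hedged sketch of calling compute_gad on a toy point cloud (a noisy circle), assuming the function is importable; the annulus radii are illustrative and not tuned:

import numpy as np

rng = np.random.default_rng(0)
angles = rng.uniform(0.0, 2.0 * np.pi, size=500)
points = np.column_stack((np.cos(angles), np.sin(angles)))
points += rng.normal(scale=0.01, size=points.shape)

gad_result = compute_gad(
    data_points=points,
    manifold_dimension=1,      # k = 1 for a curve
    annulus_inner_radius=0.1,
    annulus_outer_radius=0.4,
    n_jobs=1,                  # single process; n_jobs > 1 uses sharedmem.copy as above
)
print(len(gad_result["P_man"]), len(gad_result["P_bnd"]), len(gad_result["P_int"]))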