def describe(self) -> str:
    """Assemble a multi-line text report covering every OMIC of this object.

    The report opens with a ``SingleCellOMICs: <name>`` header. For each
    entry of ``self.omics`` it then appends, in order: the OMIC name with a
    "binary"/"continuous" tag (from ``self.is_binary``), the sparsity
    (from ``self.sparsity``), and the module-level ``describe`` summary of
    the non-zero values, the per-cell counts, the per-gene counts, the
    log-counts, the local mean and the local variance.

    Returns:
        The complete report as one string.
    """
    indent = "\n "
    line_break = indent[:-1]
    # (row template, name of the accessor on ``self``); rendered in this
    # order after the non-zero summary. Accessors are looked up lazily so
    # nothing is touched when there is no OMIC to report.
    stat_rows = (
        ('Cell : %s', 'counts_per_cell'),
        ('Gene : %s', 'counts_per_gene'),
        ('LogCount : %s', 'log_counts'),
        ('LocalMean : %s', 'local_mean'),
        ('LocalVar : %s', 'local_var'),
    )

    pieces = [f"SingleCellOMICs: {self.name}"]
    for omic in self.omics:
        data = self.numpy(omic)
        # Collect the non-zero entries one batch of rows at a time; the
        # indexing uses the first two index arrays returned by np.nonzero.
        collected = []
        for start, end in batching(n=self.n_obs, batch_size=BATCH_SIZE):
            chunk = data[start:end]
            nz = np.nonzero(chunk)
            collected.append(chunk[nz[0], nz[1]])
        nonzeros = np.concatenate(collected)

        pieces.append(line_break + "OMIC: '%s' - dtype: '%s'" % (
            omic.name, "binary" if self.is_binary(omic) else "continuous"))
        pieces.append(indent + 'Sparsity : %.2f' % self.sparsity(omic))
        pieces.append(indent + 'Nonzeros : %s' % describe(
            nonzeros, shorten=True, float_precision=2))
        for template, accessor in stat_rows:
            values = getattr(self, accessor)(omic)
            pieces.append(indent + template % describe(
                values, shorten=True, float_precision=2))
    return "".join(pieces)
X, y = X[perm], y[perm] X_train, y_train = X[:int(0.8 * n)], y[:int(0.8 * n)] X_test, y_test = X[int(0.8 * n):], y[int(0.8 * n):] elif 'X_train' in ds and 'X_test' in ds: X_train, y_train = ds['X_train'], ds['y_train'] X_test, y_test = ds['X_test'], ds['y_test'] else: raise RuntimeError('No support for dataset: "%s"' % args.ds) # ====== post processing ====== # if y_train.ndim > 1: y_train = np.argmax(y_train, axis=-1) if y_test.ndim > 1: y_test = np.argmax(y_test, axis=-1) input_shape = (None, ) + X_train.shape[1:] n_classes = len(np.unique(y_train)) print("Train:", ctext(X_train.shape, 'cyan'), describe(X_train, shorten=True)) print("Test :", ctext(X_test.shape, 'cyan'), describe(X_test, shorten=True)) # ====== create basic tensor ====== # X = K.placeholder(shape=(None, ) + input_shape[1:], name='X') W = K.placeholder(shape=(None, ) + input_shape[1:], name='W') y = K.placeholder(shape=(None, ), name='y') nsample = K.placeholder(shape=(), dtype='int32', name='nsample') # =========================================================================== # Create the network # =========================================================================== index = [0] def dense_creator(): net = N.Sequence([ N.Dense(int(args.hdim),
override=True, identifier='name', log_path=os.path.join( EXP_DIR, 'processor.log'), stop_on_failure=True) processor.run() # pp.validate_features(processor, # nb_samples=12, # path=os.path.join(EXP_DIR, 'feature_validation'), # override=True) ds = F.Dataset(PATH_ACOUSTIC_FEATURES, read_only=True) print(ds) indices = list(ds['indices_%s' % args.feat].items()) print("Utterances length:") print(" ", describe([end - start for name, (start, end) in indices], shorten=True)) # =========================================================================== # Basic path for GMM, T-matrix and I-vector # =========================================================================== EXP_DIR = os.path.join(EXP_DIR, '%s_%d_%d' % (FEAT, NMIX, TV_DIM)) LOG_PATH = get_logpath(name='log.txt', override=False, root=EXP_DIR, odin_base=False) stdio(LOG_PATH) print("Exp-dir:", ctext(EXP_DIR, 'cyan')) print("Log path:", ctext(LOG_PATH, 'cyan')) # ====== ivec path ====== # GMM_PATH = os.path.join(EXP_DIR, 'gmm') TMAT_PATH = os.path.join(EXP_DIR, 'tmat') # zero order statistics
print(protein.shape)
print(protein_name)
# ===========================================================================
# Probabilistic Embedding
# ===========================================================================
# Options handed to the embedding model, kept together for readability.
_pb_options = dict(
    n_components_per_class=2,
    positive_component=1,
    log_norm=True,
    clip_quartile=0.,
    remove_zeros=True,
    ci_threshold=-0.68,
    random_state=5218,
    verbose=True,
)
pb = ProbabilisticEmbedding(**_pb_options)
pb.fit(protein)

# hard assignment: output of `predict` on the protein matrix
y_bin = pb.predict(protein)
print(describe(y_bin))

# soft assignment: output of `predict_proba` on the protein matrix
y_prob = pb.predict_proba(protein)
print(describe(y_prob))

# ====== save the analysis and diagnosis ====== #
# Each plotting call is chained on the result of the previous one; the
# accumulated figures are written out by the final `save_figures` call.
(
    pb.boxplot(protein, protein_name)
    .plot_diagnosis(protein, protein_name)
    .plot_distribution(protein, protein_name)
    .save_figures(FIGURE_PATH, verbose=True)
)
dict(ds['path'].items()), ] # ====== print log ====== # print("Acoustic features:") for dsname, (X, indices, y, path) in sorted(acoustic_features.items(), key=lambda x: x[0]): all_utt_length = dict([(name, end - start) for name, (start, end) in indices.items()]) print(" %s" % ctext(dsname, 'yellow')) print(" #Files :", ctext(len(indices), 'cyan')) print(" #Noise : %s/%s" % (ctext(len([i for i in indices if '/' in i]), 'lightcyan'), ctext(len(indices), 'cyan'))) print(" Loaded features:", ctext(X.path, 'cyan')) print(" Utt length :", describe(list(all_utt_length.values()), shorten=True)) print(" Min length(+8) :") min_length = min(all_utt_length.values()) for name, length in all_utt_length.items(): if length <= min_length + 8: print(' %s | %s' % (name.split('/')[0], path[name])) # =========================================================================== # All system must extract following information # =========================================================================== # mapping from # dataset_name -> 'name': 1-D array [n_samples], # # (path to original audio) # 'path': 1-D array [n_samples], # # Extracted latent vectors # 'X': 2-D array [n_samples, n_latent_dim]} # # speaker label or meta-data (e.g. 'test', 'enroll', 'unlabeled')
def main(y_prot, y_prot_names, n_components=2, index=1, log_norm=True,
         clip_quartile=0., remove_zeros=True, ci_threshold=-0.68,
         outpath=None, figpath=None, verbose=False):
    """Threshold a protein matrix with ``ProbabilisticEmbedding``.

    Fits a ``ProbabilisticEmbedding`` on ``y_prot``, computes its
    ``predict`` (binarized) and ``predict_proba`` (probabilized) outputs,
    and optionally pickles both and saves diagnostic figures.

    Args:
        y_prot: protein matrix; must expose ``.shape`` and be accepted by
            ``np.unique`` and by the embedding model.
        y_prot_names: iterable of protein label strings (joined with
            ``', '`` for logging and passed to the plotting methods).
        n_components: forwarded as ``n_components_per_class``.
        index: forwarded as ``positive_component``.
        log_norm: forwarded unchanged to the model.
        clip_quartile: forwarded unchanged to the model.
        remove_zeros: forwarded unchanged to the model.
        ci_threshold: forwarded unchanged to the model.
        outpath: directory in which the files ``y_bin`` and ``y_prob`` are
            written with ``pickle``; nothing is saved when ``None``.
        figpath: forwarded to ``save_figures``; no figure is produced when
            ``None``.
        verbose: print progress information and forward to the model.

    Raises:
        SystemExit: when ``y_prot`` holds exactly two unique values, i.e.
            it is treated as already binarized (a warning is issued first).
    """
    # Function-scope import: `sys.exit` is used instead of the `exit`
    # helper injected by the `site` module, which is meant for interactive
    # sessions and is absent under `python -S` or in frozen builds.
    import sys

    if outpath is not None:
        bin_path = os.path.join(outpath, 'y_bin')
        prob_path = os.path.join(outpath, 'y_prob')
    if verbose:
        print("Start label thresholding:")
        print(" Output path:", ctext(outpath, 'yellow'))
        print(" Figure path:", ctext(figpath, 'yellow'))
    # ====== protein.count ====== #
    if verbose:
        print(" Protein labels:", ctext(', '.join(y_prot_names), 'cyan'))
        print(" Protein matrix:", ctext(y_prot.shape, 'cyan'))
    # ====== already binarized ====== #
    if len(np.unique(y_prot)) == 2:
        warnings.warn("y is already binarized!")
        sys.exit()
    # ====== PB ====== #
    pb = ProbabilisticEmbedding(n_components_per_class=n_components,
                                positive_component=index,
                                log_norm=log_norm,
                                clip_quartile=clip_quartile,
                                remove_zeros=remove_zeros,
                                ci_threshold=ci_threshold,
                                verbose=verbose)
    pb.fit(y_prot)
    y_bin = pb.predict(y_prot)
    y_prob = pb.predict_proba(y_prot)
    if verbose:
        print(" Thresholded values:")
        print(" Original :", ctext(describe(y_prot, shorten=True), 'lightcyan'))
        print(" Binarized :", ctext(describe(y_bin, shorten=True), 'lightcyan'))
        print(" Probabilities:", ctext(describe(y_prob, shorten=True), 'lightcyan'))
    # ====== save the results ====== #
    # `bin_path` / `prob_path` exist only when `outpath` was given, hence
    # the same guard as at the top of the function.
    if outpath is not None:
        with open(bin_path, 'wb') as f:
            pickle.dump(y_bin, f)
            if verbose:
                print(" Save binarized data to:", ctext(bin_path, 'yellow'))
        with open(prob_path, 'wb') as f:
            pickle.dump(y_prob, f)
            if verbose:
                print(" Save probabilized data to:", ctext(prob_path, 'yellow'))
    # ====== save figure ====== #
    if figpath is not None:
        pb.boxplot(y_prot, y_prot_names).plot_diagnosis(
            y_prot, y_prot_names).plot_distribution(
                y_prot, y_prot_names).save_figures(path=figpath, verbose=verbose)
nb = NegativeBinomialDisp(loc=mean, disp=disp_row) llk1 = tf.reduce_sum(nb.log_prob(x), axis=1).numpy() llk2 = log_nb_positive(x=torch.Tensor(x), mu=torch.Tensor(mean), theta=torch.Tensor(disp_row)).numpy() print(np.all(np.isclose(llk1, llk2))) except: print("NOT POSSIBLE TO BROADCAST the first dimension") # all disp available nb = NegativeBinomialDisp(loc=mean, disp=disp) llk1 = tf.reduce_sum(nb.log_prob(x), axis=1).numpy() llk2 = log_nb_positive(x=torch.Tensor(x), mu=torch.Tensor(mean), theta=torch.Tensor(disp)).numpy() print(np.all(np.isclose(llk1, llk2))) s1 = nb.sample().numpy() s2 = torch_nb(mean, disp).numpy() print(describe(s1)) print(describe(s2)) zinb = ZeroInflated(nb, probs=pi) llk1 = tf.reduce_sum(zinb.log_prob(x), axis=1).numpy() llk2 = log_zinb_positive(x=torch.Tensor(x), mu=torch.Tensor(mean), theta=torch.Tensor(disp), pi=torch.Tensor(pi)).numpy() print(llk1) print(llk2)
n_cache=120, ncpu=None, override=True, identifier='name', log_path=os.path.join(EXP_DIR, 'processor.log'), stop_on_failure=True) processor.run() # pp.validate_features(processor, # nb_samples=12, # path=os.path.join(EXP_DIR, 'feature_validation'), # override=True) ds = F.Dataset(PATH_ACOUSTIC_FEATURES, read_only=True) print(ds) indices = list(ds['indices_%s' % args.feat].items()) print("Utterances length:") print(" ", describe([end - start for name, (start, end) in indices], shorten=True)) # =========================================================================== # Basic path for GMM, T-matrix and I-vector # =========================================================================== EXP_DIR = os.path.join(EXP_DIR, '%s_%d_%d' % (FEAT, NMIX, TV_DIM)) LOG_PATH = get_logpath(name='log.txt', override=False, root=EXP_DIR, odin_base=False) stdio(LOG_PATH) print("Exp-dir:", ctext(EXP_DIR, 'cyan')) print("Log path:", ctext(LOG_PATH, 'cyan')) # ====== ivec path ====== # GMM_PATH = os.path.join(EXP_DIR, 'gmm') TMAT_PATH = os.path.join(EXP_DIR, 'tmat') # zero order statistics Z_PATH = ( os.path.join(EXP_DIR, 'Z_train'), os.path.join(EXP_DIR, 'Z_test'))