def test_boundary_case_ch2():
    """Boundary case: two features, and every selection strategy keeps exactly one."""
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])

    # chi2 statistics and p-values for the two candidate features.
    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4.0, 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    # Each selector, with these settings, must retain only the first feature.
    expected = np.array([True, False])
    for selector in (
        SelectFdr(chi2, alpha=0.1),
        SelectKBest(chi2, k=1),
        SelectPercentile(chi2, percentile=50),
        SelectFpr(chi2, alpha=0.1),
        SelectFwe(chi2, alpha=0.1),
    ):
        selector.fit(X, y)
        assert_array_equal(selector.get_support(), expected)
class f_regressionFDRPrim(primitive):
    """FDR-controlled univariate feature selection (F-test) for regression tasks."""

    def __init__(self, random_state=0):
        super(f_regressionFDRPrim, self).__init__(name='f_regressionFDR')
        self.id = 34
        self.PCA_LAPACK_Prim = []
        self.type = 'feature selection'
        self.description = "Filter: Select the p-values for an estimated false discovery rate with F-value between label/feature for regression tasks. This uses the Benjamini-Hochberg procedure. alpha is an upper bound on the expected false discovery rate."
        self.hyperparams_run = {'default': True}
        self.selector = None  # populated by fit()
        self.accept_type = 'c_r'

    def can_accept(self, data):
        # Continuous-data acceptance check for regression problems.
        return self.can_accept_c(data, 'Regression')

    def is_needed(self, data):
        # Selection is pointless with fewer than three feature columns.
        return data['X'].shape[1] >= 3

    def fit(self, data):
        prepared = handle_data(data)
        self.selector = SelectFdr(f_regression)
        self.selector.fit(prepared['X'], prepared['Y'])

    def produce(self, data):
        output = handle_data(data)
        # Keep only the column names whose features survived selection.
        mask = self.selector.get_support(indices=False)
        surviving = list(compress(list(output['X'].columns), mask))
        output['X'] = pd.DataFrame(self.selector.transform(output['X']), columns=surviving)
        return {0: output}
def test_boundary_case_ch2():
    # Test boundary case, and always aim to select 1 feature.
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])

    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4., 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    def check_support(selector):
        # Only the first feature should survive any of the strategies.
        selector.fit(X, y)
        assert_array_equal(selector.get_support(), np.array([True, False]))

    check_support(SelectFdr(chi2, alpha=0.1))
    check_support(SelectKBest(chi2, k=1))
    check_support(SelectPercentile(chi2, percentile=50))
    check_support(SelectFpr(chi2, alpha=0.1))
    check_support(SelectFwe(chi2, alpha=0.1))
class UnivariateSelectChiFDRPrim(primitive):
    """Chi-square FDR feature-selection primitive for classification tasks."""

    def __init__(self, random_state=0):
        super(UnivariateSelectChiFDRPrim, self).__init__(name='UnivariateSelectChiFDR')
        self.id = 31
        self.PCA_LAPACK_Prim = []
        self.type = 'feature selection'
        self.description = "Filter: Select the p-values for an estimated false discovery rate with Chi-square. This uses the Benjamini-Hochberg procedure. alpha is an upper bound on the expected false discovery rate."
        self.hyperparams_run = {'default': True}
        self.selector = None  # populated by fit()
        self.accept_type = 'd'

    def can_accept(self, data):
        # Discrete-data acceptance check for classification problems.
        return self.can_accept_d(data, 'Classification')

    def is_needed(self, data):
        # Selection is pointless with fewer than three feature columns.
        return data['X'].shape[1] >= 3

    def fit(self, data):
        prepared = handle_data(data)
        self.selector = SelectFdr(chi2, alpha=0.05)
        self.selector.fit(prepared['X'], prepared['Y'])

    def produce(self, data):
        output = handle_data(data)
        column_names = list(output['X'].columns)
        try:
            # Keep only the column names whose features survived selection.
            mask = self.selector.get_support(indices=False)
            surviving = list(compress(column_names, mask))
            output['X'] = pd.DataFrame(self.selector.transform(output['X']), columns=surviving)
        except Exception as e:
            # Best-effort: on failure, pass the data through untransformed.
            print(e)
        return {0: output}
def test_select_fdr_int(self):
    """Convert a fitted SelectFdr to ONNX with int64 inputs and validate it."""
    X, y = load_breast_cancer(return_X_y=True)
    model = SelectFdr()
    model.fit(X, y)
    input_types = [("input", Int64TensorType([None, X.shape[1]]))]
    model_onnx = convert_sklearn(
        model, "select fdr", input_types, target_opset=TARGET_OPSET)
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X.astype(np.int64), model, model_onnx,
        basename="SklearnSelectFdr")
def test_select_fdr_int(self):
    """Check ONNX conversion of SelectFdr for integer feature matrices."""
    X, y = load_breast_cancer(return_X_y=True)
    model = SelectFdr()
    model.fit(X, y)
    model_onnx = convert_sklearn(
        model,
        'select fdr',
        [('input', Int64TensorType([1, X.shape[1]]))],
    )
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X.astype(np.int64),
        model,
        model_onnx,
        basename="SklearnSelectFdr",
        # Older onnx releases are known to fail this conversion.
        allow_failure="StrictVersion(onnx.__version__) < StrictVersion('1.2')",
    )
def test_select_fdr_classif():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple classification problem
    with the fpr heuristic
    """
    X, Y = make_classification(
        n_samples=200, n_features=20, n_informative=3, n_redundant=2,
        n_repeated=0, n_classes=8, n_clusters_per_class=1, flip_y=0.0,
        class_sep=10, shuffle=False, random_state=0)

    # SelectFdr and its GenericUnivariateSelect twin must agree exactly.
    univariate_filter = SelectFdr(f_classif, alpha=0.0001)
    X_r = univariate_filter.fit(X, Y).transform(X)
    generic = GenericUnivariateSelect(f_classif, mode="fdr", param=0.0001)
    assert_array_equal(X_r, generic.fit(X, Y).transform(X))

    # Only the five signal-bearing features (informative + redundant) survive.
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(univariate_filter.get_support(), gtruth)
def single_fdr(alpha, n_informative, random_state):
    """Return the empirical false discovery rate of SelectFdr on one
    synthetic regression problem."""
    X, y = make_regression(n_samples=150, n_features=20,
                           n_informative=n_informative, shuffle=False,
                           random_state=random_state, noise=10)

    with warnings.catch_warnings(record=True):
        # Warnings can be raised when no features are selected
        # (low alpha or very noisy data)
        univariate_filter = SelectFdr(f_regression, alpha=alpha)
        X_r = univariate_filter.fit(X, y).transform(X)
        generic = GenericUnivariateSelect(f_regression, mode='fdr', param=alpha)
        X_r2 = generic.fit(X, y).transform(X)
        assert_array_equal(X_r, X_r2)

        # Split the support mask into true hits (informative prefix) and
        # false hits (everything after).
        support = univariate_filter.get_support()
        num_false_positives = np.sum(support[n_informative:] == 1)
        num_true_positives = np.sum(support[:n_informative] == 1)

    if num_false_positives == 0:
        return 0.
    return num_false_positives / (num_true_positives + num_false_positives)
def test_select_fdr_classif():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple classification problem
    with the fpr heuristic
    """
    X, y = make_classification(n_samples=200, n_features=20,
                               n_informative=3, n_redundant=2, n_repeated=0,
                               n_classes=8, n_clusters_per_class=1,
                               flip_y=0.0, class_sep=10, shuffle=False,
                               random_state=0)

    fitted_filter = SelectFdr(f_classif, alpha=0.0001).fit(X, y)
    X_r = fitted_filter.transform(X)
    X_r2 = GenericUnivariateSelect(
        f_classif, mode='fdr', param=0.0001).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)

    # Ground truth: only the first five features carry signal.
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(fitted_filter.get_support(), gtruth)
def single_fdr(alpha, n_informative, random_state):
    """Empirical FDR of SelectFdr(f_regression) on a random regression task."""
    X, y = make_regression(
        n_samples=150,
        n_features=20,
        n_informative=n_informative,
        shuffle=False,
        random_state=random_state,
        noise=10,
    )

    with warnings.catch_warnings(record=True):
        # Warnings can be raised when no features are selected
        # (low alpha or very noisy data)
        fdr_filter = SelectFdr(f_regression, alpha=alpha)
        reduced = fdr_filter.fit(X, y).transform(X)
        reduced_generic = GenericUnivariateSelect(
            f_regression, mode="fdr", param=alpha).fit(X, y).transform(X)
        assert_array_equal(reduced, reduced_generic)

        # First n_informative columns are signal; the rest are noise.
        support = fdr_filter.get_support()
        n_false = np.sum(support[n_informative:] == 1)
        n_true = np.sum(support[:n_informative] == 1)

    return 0.0 if n_false == 0 else n_false / (n_true + n_false)
def test_select_fdr_float(self):
    """Check ONNX conversion of SelectFdr for float32 feature matrices."""
    X, y = load_breast_cancer(return_X_y=True)
    model = SelectFdr()
    model.fit(X, y)
    model_onnx = convert_sklearn(
        model, "select fdr", [("input", FloatTensorType([1, X.shape[1]]))])
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X.astype(np.float32),
        model,
        model_onnx,
        basename="SklearnSelectFdr",
        # Known failures on old onnx / onnxruntime releases.
        allow_failure="StrictVersion(onnx.__version__)"
                      " < StrictVersion('1.2') or "
                      "StrictVersion(onnxruntime.__version__)"
                      " <= StrictVersion('0.2.1')",
    )
def feature_SelectFdr(x_data, y_data):
    """Rank features by ANOVA F-score (FDR-controlled, alpha=0.01) and
    return a DataFrame of the 20 highest-scoring features."""
    selector = SelectFdr(f_classif, alpha=0.01)
    fitted = selector.fit(x_data, y_data)
    featureScores = pd.concat(
        [pd.DataFrame(x_data.columns), pd.DataFrame(fitted.scores_)],
        axis=1)
    featureScores.columns = ['Specs', 'Score']  # naming the dataframe columns
    return featureScores.nlargest(20, 'Score')
def select_f(self, score_function, has_pvalue):
    """ Select features using FDR (False Discovery Rate) or K-best """
    fname = score_function.__name__
    if has_pvalue:
        self._debug(f"Select FDR: '{fname}'")
        select = SelectFdr(score_func=score_function)
    else:
        self._debug(f"Select K-Best: '{fname}'")
        select = SelectKBest(score_func=score_function, k='all')

    self._debug(
        f"Select '{fname}': x.shape={self.x.shape}, y.shape={self.y.shape}"
    )
    select.fit(self.x, self.y)
    keep = select.get_support()

    # Only the FDR path carries p-values and a meaningful support mask.
    if has_pvalue:
        return (fname, select.scores_, select.pvalues_, keep)
    return (fname, select.scores_, None, None)
def svm_cv(data, data_target):
    """Train a linear SVM on FDR-selected features and print evaluation metrics.

    Splits the data, fits an f_classif-based SelectFdr on the training half,
    trains an SVC on the reduced features, then prints the selected-feature
    mask, accuracy, and a classification report for the test half.

    Bug fixes vs. the original:
    - ``predict_proba`` was called on ``pred`` (an ndarray) instead of the
      classifier ``clf``;
    - ``selector.transfrom`` typo would raise AttributeError;
    - Python-2 print statements converted to print() calls.
    """
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(data, data_target)
    print("*" * 79)
    print("Training...")
    # selector = SelectFdr(chi2)
    selector = SelectFdr(f_classif)
    selector.fit(X_train, y_train)
    clf = svm.SVC(kernel='linear', probability=True)
    clf.fit(selector.transform(X_train), y_train)
    print("Testing...")
    pred = clf.predict(selector.transform(X_test))
    # Probabilities come from the classifier, not the prediction array.
    probs = clf.predict_proba(selector.transform(X_test))
    accuracy_score = metrics.accuracy_score(y_test, pred)
    classification_report = metrics.classification_report(y_test, pred)
    support = selector.get_support()
    print(support)
    print(accuracy_score)
    print(classification_report)
    # Column 1 holds the probability of the positive class.
    precision, recall, thresholds = precision_recall_curve(y_test, probs[:, 1])
def SelectComorbidTraits(self, FDR, modifyDataset=False, useChi2=True):
    """
    Selects features (symptoms) correlated with some dichotomous variable (disease diagnosis), hence co-morbid. This dichotomous variable is automatically inferred from ClinicalDatasetSampler, as it is whatever the sampler is conditioned on.

    Parameters
    ----------
    FDR : float
        False discovery rate cut off for feature selection
    modifyDataset : bool
        If True, then features that fail to be selected will be dropped from the dataset.
    useChi2 : bool
        By default, uses chi-sq test to estimate co-morbidity between featureVector and features. If False, then Fisher's exact test is used.

    Returns
    -------
    tuple of arrays
        (Index of selected features, Feature Scores, Feature P-values)
    """
    assert self.sampler.isConditioned==True,"Cannot perform feature selection without being conditioned on some disease of interest"

    # Remember the sampler's array representation so it can be restored at the end.
    previousArrayType = self.sampler.returnArrays
    if self.sampler.returnArrays!='Sparse':
        self.sampler.ChangeArrayType('Sparse')

    sparseTrainingData=self.sampler.ReturnFullTrainingDataset(randomize=False)
    dataMatrix=sparseTrainingData[0]
    # Element 2 is the dichotomous diagnosis indicator the sampler is conditioned on.
    incidenceVec =sparseTrainingData[2]

    # Choose the association test: chi-square (default) or Fisher's exact.
    if useChi2==False:
        fdr=SelectFdr(fisher_exact, alpha=FDR)
    else:
        fdr=SelectFdr(chi2, alpha=FDR)
    fdr_fit = fdr.fit(dataMatrix,incidenceVec.toarray())
    discIndx=np.where(fdr_fit.get_support()==True)[0]

    if modifyDataset:
        # Drop all features that failed selection from the clinical dataset.
        self.sampler.currentClinicalDataset.IncludeOnly([self.sampler.currentClinicalDataset.dataIndexToDxCodeMap[x] for x in discIndx])

    # Restore the sampler's original array representation.
    if previousArrayType!='Sparse':
        self.sampler.ChangeArrayType(previousArrayType)
    return discIndx, fdr_fit.scores_[discIndx],fdr_fit.pvalues_[discIndx]
def test_select_fdr_regression():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple regression problem
    with the fdr heuristic
    """
    X, Y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0)

    fdr_selector = SelectFdr(f_regression, alpha=0.01)
    X_r = fdr_selector.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(
        f_regression, mode="fdr", param=0.01).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)

    # Only the first five (informative) features should survive.
    expected = np.zeros(20)
    expected[:5] = 1
    assert_array_equal(fdr_selector.get_support(), expected)
def SelectComorbidTraits_ContinuousFeature(self,featureVector,FDR,modifyDataset=False,use_ttest=False):
    """
    Selects features correlated with some continuous variable.

    Parameters
    ----------
    featureVector : [float]
        Vector of floating values for feature selection. Must be sorted in the same order as the index for the ClinicalDatasetSampler training dataset.
    FDR : float
        False discovery rate cut off for feature selection
    modifyDataset : bool
        If True, then features that fail to be selected will be dropped from the dataset.
    use_ttest : bool
        By default, uses F-test to estimate correlation between featureVector and features. If True, instead uses T-test to perform association.

    Returns
    -------
    tuple of arrays
        (Index of selected features, Feature Scores, Feature P-values)
    """
    # Remember the sampler's array representation so it can be restored at the end.
    previousArrayType = self.sampler.returnArrays
    if self.sampler.returnArrays!='Sparse':
        self.sampler.ChangeArrayType('Sparse')

    sparseTrainingData=self.sampler.ReturnFullTrainingDataset(randomize=False)
    dataMatrix=sparseTrainingData[0]

    # Choose the association test: F-test (default) or T-test.
    if use_ttest:
        fdr=SelectFdr(T_test, alpha=FDR)
    else:
        fdr=SelectFdr(f_regression, alpha=FDR)
    fdr_fit = fdr.fit(dataMatrix,featureVector.ravel())
    discIndx=np.where(fdr_fit.get_support()==True)[0]

    if modifyDataset:
        # Drop all features that failed selection from the clinical dataset.
        self.sampler.currentClinicalDataset.IncludeOnly([self.sampler.currentClinicalDataset.dataIndexToDxCodeMap[x] for x in discIndx])

    # Restore the sampler's original array representation.
    if previousArrayType!='Sparse':
        self.sampler.ChangeArrayType(previousArrayType)
    return discIndx, fdr_fit.scores_[discIndx],fdr_fit.pvalues_[discIndx]
def test_select_fdr_regression():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple regression problem
    with the fdr heuristic
    """
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0)

    # The dedicated SelectFdr transformer and the generic selector in
    # 'fdr' mode must produce identical reduced matrices.
    univariate_filter = SelectFdr(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    generic = GenericUnivariateSelect(f_regression, mode='fdr', param=0.01)
    assert_array_equal(X_r, generic.fit(X, y).transform(X))

    gtruth = np.zeros(20)
    gtruth[:5] = 1  # exactly the informative features
    assert_array_equal(univariate_filter.get_support(), gtruth)
###### MASKING FOR SELECTED STIMS targetNames = ['bottle', 'face', 'scissors'] # the ttims of interest stimMask = targetData.labels.isin(targetNames) # indices for the stim of interest X_fMRI_selected = X_fMRI[stimMask] # features (for selected stimuli only) y = np.array(targetData.labelInd)[stimMask] # labels ###### FEATURE SELECTION # FDR feature selector selector = SelectFdr(f_classif, alpha=0.01) # FDR selector object selector.fit(X_fMRI_selected, y) # learning from the data X = selector.transform(X_fMRI_selected) # Selected features only indVoxels = selector.get_support(indices=True) # indices of surviving voxels ###### VISUALIZING FEATURE LOCATIONS # binary vector with 1s indicating selected voxels bROI = np.zeros(X_fMRI.shape[-1]) bROI[indVoxels] = 1 # reverse masking bROI_img = masker.inverse_transform(bROI) # Create the figure plot_stat_map(bROI_img, imgAnat, title='Voxels surviving FDR')
def remove_drugs_with_low_effect_univariate(
        feat, meta, threshold=0.05, fdr=0.05, test_each_dose=False,
        keep_names=('DMSO', 'NoCompound'), return_nonsignificant=False,
        drugname_column='drug_type', drugdose_column='drug_dose'):
    """
    Remove drugs when the number of features significantly different to DMSO
    for any dose is lower than the threshold.

    The statistical significance of the difference between a compound dose and
    the DMSO is assessed based on individual ANOVA tests for each feature. The
    Benjamini-Hochberg method is used to control the false discovery rate.

    param:
        feat : dataframe
            feature dataframe
        meta : dataframe
            dataframe with metadata
        threshold : float < 1.0 and > 0.0
            percentage of significant features detected to consider that the
            compound has significant effect
        fdr : float < 1.0 and > 0.0
            false discovery rate parameter in Benjamini-Hochberg method
        test_each_dose : bool, optional
            If True, each dose of each drug is tested for statistical
            significance compared to DMSO, and the drug is considered to have
            a significant effect if any of the doses satisfies the conditions
            set by the fdr and threshold parameters. If False, an ANOVA test
            is performed comparing the DMSO with all the doses (as separate
            classes) and the conditions are checked once for each drug.
        keep_names : sequence or None, optional
            names from the drugname_column to keep without checking for
            significance
        return_nonsignificant : bool, optional
            return the names of the drugs that are removed from the dataset
        drugname_column : string
            the name of the column in meta that contains the individual
            compound names
        drugdose_column : string
            the name of the column in meta that contains the drug doses

    return:
        feat = feature dataframe with low-potency drugs removed
        samples = dataframe with sample identification data corresponding to
            returned feat dataframe

    Fixes vs. the original: ``keep_names=None`` no longer raises TypeError,
    the ``pdb.set_trace()`` debugging leftover is removed (a failing ANOVA
    now raises with context), and the mutable default argument is replaced
    by a tuple.
    """
    import numpy as np
    from sklearn.feature_selection import SelectFdr, f_classif

    n_feat = feat.shape[1]
    drug_names = meta[drugname_column].unique()
    # Normalize keep_names so membership tests work when it is None.
    protected = set(keep_names) if keep_names is not None else set()

    significant_drugs = []
    for drug in drug_names:
        if drug in protected:
            continue
        # Compare this drug (all doses) against the DMSO control.
        mask = meta[drugname_column].isin([drug, 'DMSO'])
        X = feat[mask]
        y = meta.loc[mask, drugdose_column]

        # Benjamini-Hochberg-controlled per-feature ANOVA.
        selector = SelectFdr(score_func=f_classif, alpha=fdr)

        if not test_each_dose:
            try:
                selector.fit(X, y)
            except ValueError as err:
                raise ValueError(
                    "ANOVA feature selection failed for drug "
                    "'{}': {}".format(drug, err)) from err
            n_sign_feat = np.sum(selector.get_support())
            if n_sign_feat > threshold * n_feat:
                significant_drugs.append(drug)
        else:
            # Test each non-zero dose separately against dose 0 (DMSO).
            n_sign_feat = []
            for dose in y.unique():
                if dose == 0:
                    continue
                dose_mask = np.isin(y, [0, dose])
                selector.fit(X[dose_mask], y[dose_mask])
                n_sign_feat.append(np.sum(selector.get_support()))
            if np.any([n > threshold * n_feat for n in n_sign_feat]):
                significant_drugs.append(drug)

    # If DMSO was in drug list, include it in the final dataframe
    # (default behaviour)
    if keep_names is not None:
        significant_drugs.extend(keep_names)

    feat = feat[meta[drugname_column].isin(significant_drugs)]
    meta = meta[meta[drugname_column].isin(significant_drugs)]

    if return_nonsignificant:
        return feat, meta, list(
            set(drug_names).difference(set(significant_drugs))
        )
    else:
        return feat, meta
import sys
import numpy as np
import tools.formatter
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn import preprocessing, svm
from sklearn.feature_selection import SelectFdr, f_classif, f_regression

# Load and split the fertility dataset via the project-specific formatter.
features_train, features_test, labels_train, labels_test, features, labels, data_dict = tools.formatter.preprocess(
    './datasets/fertility.txt')

# Standardize the features before univariate scoring.
scaler = preprocessing.StandardScaler().fit(features)
selector = SelectFdr(f_classif)
selector.fit(scaler.transform(features), np.array(labels))

# Bar-plot -log10(p-value) per feature: taller bars = stronger association.
features_names = data_dict.keys()
scores = -np.log10(selector.pvalues_)
plt.bar(range(len(features_names)), np.array(scores))
plt.xticks(range(len(features_names)), features_names, rotation='vertical')
plt.tight_layout()
plt.show()
# Load pre-trained word embeddings and keep only in-vocabulary tokens.
ft_model = load_embedding(FLAGS.embedfile)
docs = [c.split(' ') for c in comments_text]
for i in range(len(docs)):
    docs[i] = [t for t in docs[i] if t in ft_model.vocab]

print('Building dictionary...')
comments_dictionary = Dictionary(docs)
comments_corpus = [comments_dictionary.doc2bow(d) for d in docs]

print("Creating tfidf model...")
model_tfidf = TfidfModel(comments_corpus)

print("Converting to tfidf vectors...")
comments_tfidf = model_tfidf[comments_corpus]
# Sparse document-term matrix: one row per comment.
comments_vecs = corpus2csc(comments_tfidf).T

print('Finding important terms...')
# First two columns of `data` are presumably id/text; the rest are labels
# — TODO confirm against the loading code.
labelcols = data.columns.tolist()[2:]
terms = Counter()
for l in labelcols:
    cl = data[l]
    # chi2 with FDR control selects terms associated with this label.
    model_fdr = SelectFdr(chi2, alpha=0.025)
    model_fdr.fit(comments_vecs, cl)
    ids = model_fdr.get_support(indices=True)
    for i in ids:
        # Accumulate chi2 scores per term across all labels.
        terms[comments_dictionary[i]] += model_fdr.scores_[i]

print('Saving results...')
with open(FLAGS.chi2file, 'wb') as f:
    pickle.dump(terms, f, protocol=pickle.HIGHEST_PROTOCOL)
def run_main(args): ################################################# START SECTION OF LOADING PARAMETERS ################################################# # Read parameters epochs = args.epochs dim_au_out = args.bottleneck #8, 16, 32, 64, 128, 256,512 na = args.missing_value data_path = DATA_MAP[args.target_data] test_size = args.test_size select_drug = args.drug freeze = args.freeze_pretrain valid_size = args.valid_size g_disperson = args.var_genes_disp min_n_genes = args.min_n_genes max_n_genes = args.max_n_genes source_model_path = args.source_model_path target_model_path = args.target_model_path log_path = args.logging_file batch_size = args.batch_size encoder_hdims = args.source_h_dims.split(",") encoder_hdims = list(map(int, encoder_hdims)) source_data_path = args.source_data pretrain = args.pretrain prediction = args.predition data_name = args.target_data label_path = args.label_path reduce_model = args.dimreduce predict_hdims = args.p_h_dims.split(",") predict_hdims = list(map(int, predict_hdims)) leiden_res = args.cluster_res load_model = bool(args.load_target_model) # Misc now = time.strftime("%Y-%m-%d-%H-%M-%S") # Initialize logging and std out out_path = log_path + now + ".err" log_path = log_path + now + ".log" out = open(out_path, "w") sys.stderr = out #Logging infomaion logging.basicConfig( level=logging.INFO, filename=log_path, filemode='a', format= '%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s' ) logging.getLogger('matplotlib.font_manager').disabled = True logging.info(args) # Save arguments args_df = ut.save_arguments(args, now) ################################################# END SECTION OF LOADING PARAMETERS ################################################# ################################################# START SECTION OF SINGLE CELL DATA REPROCESSING ################################################# # Load data and preprocessing adata = pp.read_sc_file(data_path) if data_name == 'GSE117872': adata = 
ut.specific_process(adata, dataname=data_name, select_origin=args.batch_id) elif data_name == 'GSE122843': adata = ut.specific_process(adata, dataname=data_name) elif data_name == 'GSE110894': adata = ut.specific_process(adata, dataname=data_name) elif data_name == 'GSE112274': adata = ut.specific_process(adata, dataname=data_name) elif data_name == 'GSE116237': adata = ut.specific_process(adata, dataname=data_name) elif data_name == 'GSE108383': adata = ut.specific_process(adata, dataname=data_name) elif data_name == 'GSE140440': adata = ut.specific_process(adata, dataname=data_name) elif data_name == 'GSE129730': adata = ut.specific_process(adata, dataname=data_name) elif data_name == 'GSE149383': adata = ut.specific_process(adata, dataname=data_name) else: adata = adata sc.pp.filter_cells(adata, min_genes=200) sc.pp.filter_genes(adata, min_cells=3) adata = pp.cal_ncount_ngenes(adata) # Show statisctic after QX sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt-'], jitter=0.4, multi_panel=True, save=data_name, show=False) sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt-', show=False) sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts', show=False) if args.remove_genes == 0: r_genes = [] else: r_genes = REMOVE_GENES #Preprocess data by filtering if data_name not in ['GSE112274', 'GSE140440']: adata = pp.receipe_my(adata, l_n_genes=min_n_genes, r_n_genes=max_n_genes, filter_mincells=args.min_c, filter_mingenes=args.min_g, normalize=True, log=True, remove_genes=r_genes) else: adata = pp.receipe_my(adata, l_n_genes=min_n_genes, r_n_genes=max_n_genes, filter_mincells=args.min_c, percent_mito=100, filter_mingenes=args.min_g, normalize=True, log=True, remove_genes=r_genes) # Select highly variable genes sc.pp.highly_variable_genes(adata, min_disp=g_disperson, max_disp=np.inf, max_mean=6) sc.pl.highly_variable_genes(adata, save=data_name, show=False) adata.raw = adata adata = adata[:, adata.var.highly_variable] # Preprocess data if 
spcific process is required data = adata.X # PCA # Generate neighbor graph sc.tl.pca(adata, svd_solver='arpack') sc.pp.neighbors(adata, n_neighbors=10) # Generate cluster labels sc.tl.leiden(adata, resolution=leiden_res) sc.tl.umap(adata) sc.pl.umap(adata, color=['leiden'], save=data_name + 'umap' + now, show=False) adata.obs['leiden_origin'] = adata.obs['leiden'] adata.obsm['X_umap_origin'] = adata.obsm['X_umap'] data_c = adata.obs['leiden'].astype("long").to_list() ################################################# END SECTION OF SINGLE CELL DATA REPROCESSING ################################################# ################################################# START SECTION OF LOADING SC DATA TO THE TENSORS ################################################# #Prepare to normailize and split target data mmscaler = preprocessing.MinMaxScaler() try: data = mmscaler.fit_transform(data) except: logging.warning("Only one class, no ROC") # Process sparse data data = data.todense() data = mmscaler.fit_transform(data) # Split data to train and valid set # Along with the leiden conditions for CVAE propose Xtarget_train, Xtarget_valid, Ctarget_train, Ctarget_valid = train_test_split( data, data_c, test_size=valid_size, random_state=42) # Select the device of gpu device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # Assuming that we are on a CUDA machine, this should print a CUDA device: logging.info(device) try: torch.cuda.set_device(device) except: logging.warning("No GPU detected, will apply cpu to process") # Construct datasets and data loaders Xtarget_trainTensor = torch.FloatTensor(Xtarget_train).to(device) Xtarget_validTensor = torch.FloatTensor(Xtarget_valid).to(device) # Use leiden label if CVAE is applied Ctarget_trainTensor = torch.LongTensor(Ctarget_train).to(device) Ctarget_validTensor = torch.LongTensor(Ctarget_valid).to(device) X_allTensor = torch.FloatTensor(data).to(device) C_allTensor = torch.LongTensor(data_c).to(device) train_dataset = 
TensorDataset(Xtarget_trainTensor, Ctarget_trainTensor) valid_dataset = TensorDataset(Xtarget_validTensor, Ctarget_validTensor) Xtarget_trainDataLoader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True) Xtarget_validDataLoader = DataLoader(dataset=valid_dataset, batch_size=batch_size, shuffle=True) dataloaders_pretrain = { 'train': Xtarget_trainDataLoader, 'val': Xtarget_validDataLoader } ################################################# START SECTION OF LOADING SC DATA TO THE TENSORS ################################################# ################################################# START SECTION OF LOADING BULK DATA ################################################# # Read source data data_r = pd.read_csv(source_data_path, index_col=0) label_r = pd.read_csv(label_path, index_col=0) label_r = label_r.fillna(na) # Extract labels selected_idx = label_r.loc[:, select_drug] != na label = label_r.loc[selected_idx, select_drug] label = label.values.reshape(-1, 1) if prediction == "regression": lbscaler = preprocessing.MinMaxScaler() label = lbscaler.fit_transform(label) dim_model_out = 1 else: le = preprocessing.LabelEncoder() label = le.fit_transform(label) dim_model_out = 2 # Process source data mmscaler = preprocessing.MinMaxScaler() source_data = mmscaler.fit_transform(data_r) # Split source data Xsource_train_all, Xsource_test, Ysource_train_all, Ysource_test = train_test_split( source_data, label, test_size=test_size, random_state=42) Xsource_train, Xsource_valid, Ysource_train, Ysource_valid = train_test_split( Xsource_train_all, Ysource_train_all, test_size=valid_size, random_state=42) # Transform source data # Construct datasets and data loaders Xsource_trainTensor = torch.FloatTensor(Xsource_train).to(device) Xsource_validTensor = torch.FloatTensor(Xsource_valid).to(device) if prediction == "regression": Ysource_trainTensor = torch.FloatTensor(Ysource_train).to(device) Ysource_validTensor = torch.FloatTensor(Ysource_valid).to(device) else: 
# --- Bulk (source-domain) label tensors and DataLoaders ---------------------
# Wrap the bulk labels as LongTensors and pair them with the feature tensors
# (Xsource_trainTensor / Xsource_validTensor are built earlier, outside this
# section) into shuffled train/validation loaders used during transfer.
Ysource_trainTensor = torch.LongTensor(Ysource_train).to(device)
Ysource_validTensor = torch.LongTensor(Ysource_valid).to(device)

sourcetrain_dataset = TensorDataset(Xsource_trainTensor, Ysource_trainTensor)
sourcevalid_dataset = TensorDataset(Xsource_validTensor, Ysource_validTensor)

Xsource_trainDataLoader = DataLoader(dataset=sourcetrain_dataset,
                                     batch_size=batch_size,
                                     shuffle=True)
Xsource_validDataLoader = DataLoader(dataset=sourcevalid_dataset,
                                     batch_size=batch_size,
                                     shuffle=True)

dataloaders_source = {
    'train': Xsource_trainDataLoader,
    'val': Xsource_validDataLoader
}
#################################################
# END SECTION OF LOADING BULK DATA
#################################################

#################################################
# START SECTION OF MODEL CONSTRUCTION
#################################################
# Construct the target (single-cell) encoder according to the chosen
# dimension-reduction model.
if reduce_model == "AE":
    encoder = AEBase(input_dim=data.shape[1],
                     latent_dim=dim_au_out,
                     h_dims=encoder_hdims)
    # NOTE: loss_function_e is re-assigned unconditionally below, so this
    # assignment is redundant but harmless.
    loss_function_e = nn.MSELoss()
elif reduce_model == "VAE":
    encoder = VAEBase(input_dim=data.shape[1],
                      latent_dim=dim_au_out,
                      h_dims=encoder_hdims)
elif reduce_model == "CVAE":
    # Number of conditions is equal to the number of clusters.
    encoder = CVAEBase(input_dim=data.shape[1],
                       n_conditions=len(set(data_c)),
                       latent_dim=dim_au_out,
                       h_dims=encoder_hdims)
# NOTE(review): if reduce_model is none of "AE"/"VAE"/"CVAE", `encoder` is
# never bound and the lines below raise NameError — presumably upstream
# argument parsing restricts the value; confirm.

if torch.cuda.is_available():
    encoder.cuda()

logging.info("Target encoder structure is: ")
logging.info(encoder)

encoder.to(device)
optimizer_e = optim.Adam(encoder.parameters(), lr=1e-2)
loss_function_e = nn.MSELoss()
exp_lr_scheduler_e = lr_scheduler.ReduceLROnPlateau(optimizer_e)

# Output width of the source predictor: one value for regression, two
# class logits otherwise.
if prediction == "regression":
    dim_model_out = 1
else:
    dim_model_out = 2

# Load the pretrained source model (trained on bulk data) before transfer.
# AE flavour:
if reduce_model == "AE":
    source_model = PretrainedPredictor(input_dim=Xsource_train.shape[1],
                                       latent_dim=dim_au_out,
                                       h_dims=encoder_hdims,
                                       hidden_dims_predictor=predict_hdims,
                                       output_dim=dim_model_out,
                                       pretrained_weights=None,
                                       freezed=freeze)
    source_model.load_state_dict(torch.load(source_model_path))
    source_encoder = source_model
# VAE / CVAE flavour:
elif reduce_model in ["VAE", "CVAE"]:
    source_model = PretrainedVAEPredictor(
        input_dim=Xsource_train.shape[1],
        latent_dim=dim_au_out,
        h_dims=encoder_hdims,
        hidden_dims_predictor=predict_hdims,
        output_dim=dim_model_out,
        pretrained_weights=None,
        freezed=freeze,
        z_reparam=bool(args.VAErepram))
    source_model.load_state_dict(torch.load(source_model_path))
    source_encoder = source_model
logging.info("Load pretrained source model from: " + source_model_path)
source_encoder.to(device)
#################################################
# END SECTION OF MODEL CONSTRUCTION
#################################################

#################################################
# START SECTION OF SC MODEL PRETRAINING
#################################################
# Pretrain the target encoder as a plain autoencoder unless pretraining is
# disabled (pretrain == '0'). `pretrain` doubles as the checkpoint path.
if (str(pretrain) != '0'):
    # Reuse a stored encoder checkpoint from disk when one exists.
    train_flag = True
    pretrain = str(pretrain)
    if (os.path.exists(pretrain) == True):
        try:
            encoder.load_state_dict(torch.load(pretrain))
            logging.info("Load pretrained target encoder from " + pretrain)
            train_flag = False
        except:
            # Best-effort load; any failure (shape mismatch, corrupt file)
            # falls through to re-training.
            logging.warning("Loading failed, procceed to re-train model")
    if train_flag == True:
        if reduce_model == "AE":
            encoder, loss_report_en = t.train_AE_model(
                net=encoder,
                data_loaders=dataloaders_pretrain,
                optimizer=optimizer_e,
                loss_function=loss_function_e,
                n_epochs=epochs,
                scheduler=exp_lr_scheduler_e,
                save_path=pretrain)
        elif reduce_model == "VAE":
            encoder, loss_report_en = t.train_VAE_model(
                net=encoder,
                data_loaders=dataloaders_pretrain,
                optimizer=optimizer_e,
                n_epochs=epochs,
                scheduler=exp_lr_scheduler_e,
                save_path=pretrain)
        elif reduce_model == "CVAE":
            encoder, loss_report_en = t.train_CVAE_model(
                net=encoder,
                data_loaders=dataloaders_pretrain,
                optimizer=optimizer_e,
                n_epochs=epochs,
                scheduler=exp_lr_scheduler_e,
                save_path=pretrain)
    logging.info("Pretrained finished")

# Before transfer learning, record the no-transfer baseline: predict drug
# response from the pretrain-only embeddings.
# CVAE encoders additionally take the condition tensor.
if (args.dimreduce != "CVAE"):
    embeddings_pretrain = encoder.encode(X_allTensor)
else:
    embeddings_pretrain = encoder.encode(X_allTensor, C_allTensor)
pretrain_prob_prediction = source_model.predict(
    embeddings_pretrain).detach().cpu().numpy()
# Column 1 is taken as the "sensitive" probability — assumes the predictor
# orders classes (resistant, sensitive); TODO confirm against training code.
adata.obs["sens_preds_pret"] = pretrain_prob_prediction[:, 1]
adata.obs["sens_label_pret"] = pretrain_prob_prediction.argmax(axis=1)

# # Use umap result to predict ## This section is removed because the dim
# # problem and the performance problem
# sc.tl.pca(adata, n_comps=max(50,2*dim_au_out),svd_solver='arpack')
# sc.tl.umap(adata, n_components=dim_au_out)
# embeddings_umap = torch.FloatTensor(adata.obsm["X_umap"]).to(device)
# umap_prob_prediction = source_model.predict(embeddings_umap).detach().cpu().numpy()
# adata.obs["sens_preds_umap"] = umap_prob_prediction[:,1]
# adata.obs["sens_label_umap"] = umap_prob_prediction.argmax(axis=1)
# # Use tsne result to predict
# #sc.tl.tsne(adata, n_pcs=dim_au_out)
# X_pca = adata.obsm["X_pca"]
# # Replace tsne by pca because TSNE is very slow
# X_tsne = adata.obsm["X_umap"]
# #X_tsne = TSNE(n_components=dim_au_out,method='exact').fit_transform(X_pca)
# embeddings_tsne = torch.FloatTensor(X_tsne).to(device)
# tsne_prob_prediction = source_model.predict(embeddings_tsne).detach().cpu().numpy()
# adata.obs["sens_preds_tsne"] = tsne_prob_prediction[:,1]
# adata.obs["sens_label_tsne"] = tsne_prob_prediction.argmax(axis=1)
# adata.obsm["X_tsne_pret"] = X_tsne

# Store the pretrain embeddings on the AnnData object for later comparison.
embeddings_pretrain = embeddings_pretrain.detach().cpu().numpy()
adata.obsm["X_pre"] = embeddings_pretrain
#################################################
# END SECTION OF SC MODEL PRETRAINING
#################################################

#################################################
# START SECTION OF TRANSFER LEARNING TRAINING
#################################################
# --- ADDA (adversarial) transfer learning -----------------------------------
if args.transfer == 'ADDA':
    # Domain discriminator: maps a latent embedding to source/target logits.
    discriminator = Predictor(input_dim=dim_au_out, output_dim=2)
    discriminator.to(device)
    loss_d = nn.CrossEntropyLoss()
    # NOTE(review): this optimizer is built over encoder.parameters() but is
    # also passed below in the discriminator-optimizer position, so the
    # discriminator weights may never receive updates — confirm against the
    # signature of t.train_ADDA_model before changing.
    optimizer_d = optim.Adam(encoder.parameters(), lr=1e-2)
    exp_lr_scheduler_d = lr_scheduler.ReduceLROnPlateau(optimizer_d)

    # Adversarial training of target encoder vs. discriminator.
    discriminator, encoder, report_, report2_ = t.train_ADDA_model(
        source_encoder,
        encoder,
        discriminator,
        dataloaders_source,
        dataloaders_pretrain,
        loss_d,
        loss_d,
        # Should here be all optimizer d?
        optimizer_d,
        optimizer_d,
        exp_lr_scheduler_d,
        exp_lr_scheduler_d,
        epochs,
        device,
        target_model_path)
    logging.info("Transfer ADDA finished")

# --- DaNN (MMD-based) transfer learning --------------------------------------
elif args.transfer == 'DaNN':
    # Predictor loss and an optimizer over the target-encoder parameters.
    loss_d = nn.CrossEntropyLoss()
    optimizer_d = optim.Adam(encoder.parameters(), lr=1e-2)
    exp_lr_scheduler_d = lr_scheduler.ReduceLROnPlateau(optimizer_d)

    # DaNN couples the pretrained source model with the target encoder.
    DaNN_model = DaNN(source_model=source_encoder, target_model=encoder)
    DaNN_model.to(device)

    def loss(x, y, GAMMA=args.GAMMA_mmd):
        # Maximum-mean-discrepancy distance between source/target batches.
        result = mmd.mmd_loss(x, y, GAMMA)
        return result

    loss_disrtibution = loss

    # Train the DaNN model.
    DaNN_model, report_ = t.train_DaNN_model(
        DaNN_model,
        dataloaders_source,
        dataloaders_pretrain,
        # Should here be all optimizer d?
        optimizer_d,
        loss_d,
        epochs,
        exp_lr_scheduler_d,
        dist_loss=loss_disrtibution,
        load=load_model,
        weight=args.mmd_weight,
        save_path=target_model_path + "_DaNN.pkl")
    encoder = DaNN_model.target_model
    source_model = DaNN_model.source_model
    logging.info("Transfer DaNN finished")
    if (load_model == False):
        # Only plot training curves when the model was actually trained
        # (not loaded from disk).
        ut.plot_loss(report_[0], path="figures/train_loss_" + now + ".pdf")
        ut.plot_loss(report_[1],
                     path="figures/mmd_loss_" + now + ".pdf",
                     set_ylim=False)

# --- Integrated-gradient attribution (skipped for CVAE) ----------------------
if (args.dimreduce != 'CVAE'):
    # Build an end-to-end target model (encoder + predictor) so gradients
    # can flow from class logits back to the input genes.
    target_model = TargetModel(source_model, encoder)
    X_allTensor.requires_grad_()
    # Hard labels predicted by the transferred model are used as IG targets.
    ytarget_allPred = target_model(X_allTensor).detach().cpu().numpy()
    ytarget_allPred = ytarget_allPred.argmax(axis=1)
    # Positive attributions -> genes driving the "sensitive" call.
    adata, attrp1, senNeu_c0_genes, senNeu_c1_genes = ut.integrated_gradient_differential(
        net=target_model,
        input=X_allTensor,
        clip="positive",
        target=ytarget_allPred,
        adata=adata,
        ig_fc=1,
        save_name=reduce_model + args.predictor + prediction + select_drug +
        "sensNeuron" + now)
    # Negative attributions -> genes driving the "resistant" call.
    adata, attrn1, resNeu_c0_genes, resNeu_c1_genes = ut.integrated_gradient_differential(
        net=target_model,
        input=X_allTensor,
        clip="negative",
        target=ytarget_allPred,
        adata=adata,
        ig_fc=1,
        save_name=reduce_model + args.predictor + prediction + select_drug +
        "restNeuron" + now)
    sc.pl.heatmap(attrp1, senNeu_c0_genes, groupby='sensitive', cmap='RdBu_r',
                  save=data_name + args.transfer + args.dimreduce + "_seNc0_" + now,
                  show=False)
    sc.pl.heatmap(attrp1, senNeu_c1_genes, groupby='sensitive', cmap='RdBu_r',
                  save=data_name + args.transfer + args.dimreduce + "_seNc1_" + now,
                  show=False)
    sc.pl.heatmap(attrn1, resNeu_c0_genes, groupby='sensitive', cmap='RdBu_r',
                  save=data_name + args.transfer + args.dimreduce + "_reNc0_" + now,
                  show=False)
    sc.pl.heatmap(attrn1, resNeu_c1_genes, groupby='sensitive', cmap='RdBu_r',
                  save=data_name + args.transfer + args.dimreduce + "_reNc1_" + now,
                  show=False)

    # CHI2 test on predictive features.
    # NOTE(review): chi2 requires non-negative features — assumes adata.raw.X
    # holds raw (count-like) values; confirm upstream preprocessing.
    SFD = SelectFdr(chi2)
    SFD.fit(adata.raw.X, ytarget_allPred)
    adata.raw.var['chi2_pval'] = SFD.pvalues_
    adata.raw.var['chi2_score'] = SFD.scores_
    df_chi2_genes = adata.raw.var[
        (SFD.pvalues_ < 0.05)
        & (adata.raw.var.highly_variable == True)
        & (adata.raw.var.n_cells > args.min_c)]
    # NOTE(review): in-place sort on a DataFrame slice can trigger pandas'
    # SettingWithCopyWarning; behavior kept as-is.
    df_chi2_genes.sort_values(by="chi2_pval", ascending=True, inplace=True)
    df_chi2_genes.to_csv("saved/results/chi2_pval_genes" + args.predictor +
                         prediction + select_drug + now + '.csv')
else:
    print()
#################################################
# END SECTION OF TRANSFER LEARNING TRAINING
#################################################

#################################################
# START SECTION OF PREPROCESSING FEATURES
#################################################
# Extract the transferred feature embeddings and prediction probabilities.
if (args.dimreduce != "CVAE"):
    embedding_tensors = encoder.encode(X_allTensor)
else:
    embedding_tensors = encoder.encode(X_allTensor, C_allTensor)
prediction_tensors = source_model.predictor(embedding_tensors)
embeddings = embedding_tensors.detach().cpu().numpy()
predictions = prediction_tensors.detach().cpu().numpy()

# Convert prediction probabilities to 0/1 labels (classification only).
if (prediction == "regression"):
    adata.obs["sens_preds"] = predictions
else:
    adata.obs["sens_preds"] = predictions[:, 1]
    adata.obs["sens_label"] = predictions.argmax(axis=1)
    adata.obs["sens_label"] = adata.obs["sens_label"].astype('category')
    adata.obs["rest_preds"] = predictions[:, 0]

# Snapshot the annotated object before the analysis/post-processing stage.
adata.write("saved/adata/before_ann" + data_name + now + ".h5ad")
#################################################
# END SECTION OF PREPROCESSING FEATURES
#################################################

#################################################
# START SECTION OF ANALYSIS AND POST PROCESSING
#################################################
# Scanpy analysis pipeline follows.
# Add the transfer-learning embeddings to the adata object.
adata.obsm["X_Trans"] = embeddings
#sc.tl.umap(adata)
sc.pp.neighbors(adata, n_neighbors=10, use_rep="X_Trans")
# t-SNE on the transfer-learning features.
sc.tl.tsne(adata, use_rep="X_Trans")
# Leiden on the data
# sc.tl.leiden(adata)
# Plot tsne
sc.pl.tsne(adata, save=data_name + now, color=["leiden"], show=False)

# Differential-expression genes per leiden cluster.
# NOTE(review): 'leiden' is not computed in this visible section — presumably
# produced earlier in the pipeline; confirm it exists in adata.obs.
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')
sc.pl.rank_genes_groups(adata,
                        n_genes=args.n_DE_genes,
                        sharey=False,
                        save=data_name + now,
                        show=False)

# Differential-expression genes across the predicted 0/1 classes.
sc.tl.rank_genes_groups(adata, 'sens_label', method='wilcoxon')
adata = ut.de_score(adata, clustername='sens_label')

# Save DE genes for each predicted class; best-effort because a run may
# predict only one class.
for label in [0, 1]:
    try:
        df_degs = get_de_dataframe(adata, label)
        df_degs.to_csv("saved/results/DEGs_class_" + str(label) +
                       args.predictor + prediction + select_drug + now + '.csv')
    except:
        logging.warning("Only one class, no two calsses critical genes")

# Generate the run report, starting from the argument table.
report_df = args_df

# Baseline (pretrain-only) predictions recorded earlier.
sens_pb_pret = adata.obs['sens_preds_pret']
lb_pret = adata.obs['sens_label_pret']
# sens_pb_umap = adata.obs['sens_preds_umap']
# lb_umap = adata.obs['sens_label_umap']
# sens_pb_tsne = adata.obs['sens_preds_tsne']
# lb_tsne = adata.obs['sens_label_tsne']

# Benchmark against ground truth when the dataset carries a 'sensitive'
# annotation; otherwise fall back to cluster-based coloring only.
if ('sensitive' in adata.obs.keys()):
    report_df = report_df.T
    Y_test = adata.obs['sensitive']
    sens_pb_results = adata.obs['sens_preds']
    lb_results = adata.obs['sens_label']

    # Human-readable ground-truth labels.
    le_sc = LabelEncoder()
    le_sc.fit(['Resistant', 'Sensitive'])
    label_descrbie = le_sc.inverse_transform(Y_test)
    adata.obs['sens_truth'] = label_descrbie

    color_list = ["sens_truth", "sens_label", 'sens_preds']
    color_score_list = [
        "Sensitive_score", "Resistant_score", "1_score", "0_score"
    ]

    # Pearson correlation between predicted probabilities and gene-set scores.
    sens_score = pearsonr(adata.obs["sens_preds"],
                          adata.obs["Sensitive_score"])[0]
    resistant_score = pearsonr(adata.obs["rest_preds"],
                               adata.obs["Resistant_score"])[0]
    report_df['prob_sens_pearson'] = sens_score
    report_df['prob_rest_pearson'] = resistant_score

    # Per-class score correlations; fill zeros when a class score is missing.
    try:
        cluster_score_sens = pearsonr(adata.obs["1_score"],
                                      adata.obs["Sensitive_score"])[0]
        report_df['sens_pearson'] = cluster_score_sens
    except:
        logging.warning(
            "Prediction score 1 not exist, fill adata with 0 values")
        adata.obs["1_score"] = np.zeros(len(adata))
    try:
        cluster_score_resist = pearsonr(adata.obs["0_score"],
                                        adata.obs["Resistant_score"])[0]
        report_df['rest_pearson'] = cluster_score_resist
    except:
        logging.warning(
            "Prediction score 0 not exist, fill adata with 0 values")
        adata.obs["0_score"] = np.zeros(len(adata))

    #if (data_name in ['GSE110894','GSE117872']):
    # Average precision of the transferred and the pretrain-only predictions.
    ap_score = average_precision_score(Y_test, sens_pb_results)
    ap_pret = average_precision_score(Y_test, sens_pb_pret)
    # ap_umap = average_precision_score(Y_test, sens_pb_umap)
    # ap_tsne = average_precision_score(Y_test, sens_pb_tsne)

    report_dict = classification_report(Y_test, lb_results, output_dict=True)
    f1score = report_dict['weighted avg']['f1-score']
    report_df['f1_score'] = f1score
    classification_report_df = pd.DataFrame(report_dict).T
    classification_report_df.to_csv("saved/results/clf_report_" +
                                    reduce_model + args.predictor +
                                    prediction + select_drug + now + '.csv')
    # report_dict_umap = classification_report(Y_test, lb_umap, output_dict=True)
    # classification_report_umap_df = pd.DataFrame(report_dict_umap).T
    # classification_report_umap_df.to_csv("saved/results/clf_umap_report_" + reduce_model + args.predictor+ prediction + select_drug+now + '.csv')

    report_dict_pret = classification_report(Y_test, lb_pret, output_dict=True)
    classification_report_pret_df = pd.DataFrame(report_dict_pret).T
    classification_report_pret_df.to_csv("saved/results/clf_pret_report_" +
                                         reduce_model + args.predictor +
                                         prediction + select_drug + now +
                                         '.csv')
    # report_dict_tsne = classification_report(Y_test, lb_tsne, output_dict=True)
    # classification_report_tsne_df = pd.DataFrame(report_dict_tsne).T
    # classification_report_tsne_df.to_csv("saved/results/clf_tsne_report_" + reduce_model + args.predictor+ prediction + select_drug+now + '.csv')

    try:
        auroc_score = roc_auc_score(Y_test, sens_pb_results)
        # FIX: this value is reported as 'auroc_pret' but was previously
        # computed with average_precision_score (duplicating ap_pret above);
        # use roc_auc_score so the metric matches its name.
        auroc_pret = roc_auc_score(Y_test, sens_pb_pret)
        # auroc_umap = roc_auc_score(Y_test, sens_pb_umap)
        # auroc_tsne = roc_auc_score(Y_test, sens_pb_tsne)
    except:
        # roc_auc_score raises when Y_test contains a single class.
        logging.warning("Only one class, no ROC")
        auroc_pret = auroc_umap = auroc_tsne = auroc_score = 0

    report_df['auroc_score'] = auroc_score
    report_df['ap_score'] = ap_score
    report_df['auroc_pret'] = auroc_pret
    report_df['ap_pret'] = ap_pret
    # report_df['auroc_umap'] = auroc_umap
    # report_df['ap_umap'] = ap_umap
    # report_df['auroc_tsne'] = auroc_tsne
    # report_df['ap_tsne'] = ap_tsne

    # Titles with metrics rounded to 4 decimal places.
    ap_title = "ap: " + str(Decimal(ap_score).quantize(Decimal('0.0000')))
    auroc_title = "roc: " + str(
        Decimal(auroc_score).quantize(Decimal('0.0000')))
    title_list = ["Ground truth", "Prediction", "Probability"]
else:
    # No ground truth available: color by cluster instead.
    color_list = ["leiden", "sens_label", 'sens_preds']
    title_list = ['Cluster', "Prediction", "Probability"]
    color_score_list = color_list

# Simple analysis: neighbors on default (PCA) representation, then UMAP.
#sc.pp.neighbors(adata)
sc.pp.neighbors(adata)
sc.tl.umap(adata)
# sc.tl.leiden(adata,resolution=leiden_res)
# sc.pl.umap(adata,color=[color_list[0],'sens_label_umap','sens_preds_umap'],save=data_name+args.transfer+args.dimreduce+now,show=False,title=title_list)

# Plot transfer-learning results on UMAP.
sc.pl.umap(adata,
           color=color_list + color_score_list,
           save=data_name + args.transfer + args.dimreduce + "umap_all" + now,
           show=False)
sc.settings.set_figure_params(dpi=100,
                              frameon=False,
                              figsize=(4, 3),
                              facecolor='white')
# NOTE(review): this plot reads adata.obs['sensitivity'] while the benchmark
# branch above checks for 'sensitive' — confirm both keys exist upstream.
sc.pl.umap(adata,
           color=['sensitivity', 'leiden', 'sens_label', 'sens_preds'],
           title=[
               'Cell sensitivity', 'Cell clusters',
               'Transfer learning prediction', 'Prediction probability'
           ],
           save=data_name + args.transfer + args.dimreduce + "umap_pred" + now,
           show=False,
           ncols=4)
sc.pl.umap(adata,
           color=color_score_list,
           title=[
               'Sensitive gene score', 'Resistant gene score',
               'Sensitive gene score (prediction)',
               'Resistant gene score (prediction)'
           ],
           save=data_name + args.transfer + args.dimreduce + "umap_scores" +
           now,
           show=False,
           ncols=2)
# sc.pl.umap(adata,color=['Sample name'],
#            save=data_name+args.transfer+args.dimreduce+"umap_sm"+now,show=False,ncols=4)

# Plot the top integrated-gradient genes per class (best-effort: IG columns
# exist only when the attribution step ran).
try:
    sc.pl.umap(adata,
               color=adata.var.sort_values(
                   "integrated_gradient_sens_class0").head().index,
               save=data_name + args.transfer + args.dimreduce + "_cgenes0_" +
               now,
               show=False)
    sc.pl.umap(adata,
               color=adata.var.sort_values(
                   "integrated_gradient_sens_class1").head().index,
               save=data_name + args.transfer + args.dimreduce + "_cgenes1_" +
               now,
               show=False)
    # c0_genes = df_11_genes.loc[df_11_genes.pval<0.05].head().index
    # c1_genes = df_00_genes.loc[df_00_genes.pval<0.05].head().index
    # sc.pl.umap(adata,color=c0_genes,neighbors_key="Trans",save=data_name+args.transfer+args.dimreduce+"_cgenes0_TL"+now,show=False)
    # sc.pl.umap(adata,color=c1_genes,neighbors_key="Trans",save=data_name+args.transfer+args.dimreduce+"_cgenes1_TL"+now,show=False)
except:
    logging.warning("IG results not avaliable")

# Re-embed using the transferred representation and cluster it.
sc.pp.neighbors(adata, use_rep='X_Trans', key_added="Trans")
sc.tl.umap(adata, neighbors_key="Trans")
sc.tl.leiden(adata,
             neighbors_key="Trans",
             key_added="leiden_trans",
             resolution=leiden_res)
sc.pl.umap(adata,
           color=color_list,
           neighbors_key="Trans",
           save=data_name + args.transfer + args.dimreduce + "_TL" + now,
           show=False,
           title=title_list)
# Plot cell scores on the transferred UMAP.
sc.pl.umap(adata,
           color=color_score_list,
           neighbors_key="Trans",
           save=data_name + args.transfer + args.dimreduce + "_score_TL" + now,
           show=False,
           title=color_score_list)
# This t-SNE is based on the transfer-learning features.
sc.pl.tsne(adata,
           color=color_list,
           neighbors_key="Trans",
           save=data_name + args.transfer + args.dimreduce + "_TL" + now,
           show=False,
           title=title_list)
# Use the original-feature t-SNE to visualize.
sc.tl.tsne(adata)
# sc.pl.tsne(adata,color=[color_list[0],'sens_label_tsne','sens_preds_tsne'],save=data_name+args.transfer+args.dimreduce+"_original_tsne"+now,show=False,title=title_list)

# Embed and cluster the pretrained (autoencoder-only) embeddings for
# comparison against the transferred ones.
sc.pp.neighbors(adata, use_rep='X_pre', key_added="Pret")
sc.tl.umap(adata, neighbors_key="Pret")
sc.tl.leiden(adata,
             neighbors_key="Pret",
             key_added="leiden_Pret",
             resolution=leiden_res)
sc.pl.umap(adata,
           color=[color_list[0], 'sens_label_pret', 'sens_preds_pret'],
           neighbors_key="Pret",
           save=data_name + args.transfer + args.dimreduce + "_umap_Pretrain_" +
           now,
           show=False)

# Adjusted Rand index between clusterings and predicted labels.
# NOTE(review): 'leiden_origin' is not created in this visible section —
# presumably computed earlier in the pipeline; confirm it exists in adata.obs.
ari_score_trans = adjusted_rand_score(adata.obs['leiden_trans'],
                                      adata.obs['sens_label'])
ari_score = adjusted_rand_score(adata.obs['leiden'], adata.obs['sens_label'])
pret_ari_score = adjusted_rand_score(adata.obs['leiden_origin'],
                                     adata.obs['leiden_Pret'])
transfer_ari_score = adjusted_rand_score(adata.obs['leiden_origin'],
                                         adata.obs['leiden_trans'])
sc.pl.umap(adata,
           color=['leiden_origin', 'leiden_trans', 'leiden_Pret'],
           save=data_name + args.transfer + args.dimreduce + "_comp_Pretrain_" +
           now,
           show=False)

#report_df = args_df
report_df['ari_score'] = ari_score
report_df['ari_trans_score'] = ari_score_trans
report_df['ari_pre_umap'] = pret_ari_score
report_df['ari_trans_umap'] = transfer_ari_score

# Pseudotime trajectory of adata rooted at the sensitive annotation,
# visualizing the top sensitive-neuron genes.
adata, corelations = trajectory(adata,
                                root_key='sensitive',
                                genes_vis=senNeu_c0_genes[:5],
                                root=1,
                                now=now,
                                plot=True)
gene_cor = {}
# Correlate each sensitive-neuron gene's expression with pseudotime.
# NOTE: gene_cor is built here but not consumed in this visible section.
for g in np.array(senNeu_c0_genes):
    gene = g
    express_vec = adata[:, gene].X
    corr = pearsonr(
        np.array(express_vec).ravel(),
        np.array(adata.obs["dpt_pseudotime"]))[0]
    gene_cor[gene] = corr
# Best-effort: copy trajectory correlations into the report.
try:
    for k in corelations.keys():
        report_df['cor_dpt_' + k] = corelations[k][0]
        report_df['cor_pvl_' + k] = corelations[k][1]
except:
    logging.warning(
        "Some of the coorelation cannot be reterived from the dictional")
#################################################
# END SECTION OF ANALYSIS AND POST PROCESSING
#################################################

#################################################
# START SECTION OF ANALYSIS FOR BULK DATA
#################################################
# (bulk-level analysis below is intentionally disabled; kept for reference)
# bdata = sc.AnnData(data_r)
# bdata.obs = label_r
# bulk_degs={}
# sc.tl.rank_genes_groups(bdata, select_drug, method='wilcoxon')
# bdata = ut.de_score(bdata,select_drug)
# for label in set(label_r.loc[:,select_drug]):
#     try:
#         df_degs = get_de_dataframe(bdata,label)
#         bulk_degs[label] = df_degs.iloc[:50,:].names
#         df_degs.to_csv("saved/results/DEGs_bulk_" +str(label)+ args.predictor+ prediction + select_drug+now + '.csv')
#     except:
#         logging.warning("Only one class, no two calsses critical genes")
# Xsource_allTensor = torch.FloatTensor(data_r.values).to(device)
# Ysource_preTensor = source_model(Xsource_allTensor)
# Ysource_prediction = Ysource_preTensor.detach().cpu().numpy()
# bdata.obs["sens_preds"] = Ysource_prediction[:,1]
# bdata.obs["sens_label"] = Ysource_prediction.argmax(axis=1)
# bdata.obs["sens_label"] = bdata.obs["sens_label"].astype('category')
# bdata.obs["rest_preds"] = Ysource_prediction[:,0]
# sc.tl.score_genes(adata, bulk_degs['sensitive'],score_name="bulk_sens_score" )
# sc.tl.score_genes(adata, bulk_degs['resistant'],score_name="bulk_rest_score" )
# sc.pl.umap(adata,color=['bulk_sens_score','bulk_rest_score'],save=data_name+args.transfer+args.dimreduce+"umap_bg_all"+now,show=False)
# try:
#     bulk_score_sens = pearsonr(adata.obs["1_score"],adata.obs["bulk_sens_score"])[0]
#     report_df['bulk_sens_pearson'] = bulk_score_sens
#     cluster_score_resist = pearsonr(adata.obs["0_score"],adata.obs["bulk_rest_score"])[0]
#     report_df['bulk_rest_pearson'] = cluster_score_resist
# except:
#     logging.warning("Bulk level gene score not exist")

# Save the fully annotated AnnData object for this run.
adata.write("saved/adata/" + data_name + now + ".h5ad")

# Transpose so metrics become rows before writing the report CSV.
report_df = report_df.T
# Persist the final benchmark report for this run under saved/results/,
# with the file name encoding model, predictor, task, drug and timestamp.
report_path = ("saved/results/report" + reduce_model + args.predictor +
               prediction + select_drug + now + '.csv')
report_df.to_csv(report_path)