def phenotype_imputation(data, config):
    '''
    Impute the labels on II based on the classifier learned on I.

    For every fold a bagged logistic-regression classifier is fitted on the
    entries selected by ``romans_used_for_learning`` and used to predict the
    positive-class probability ("soft label") of the entries selected by
    ``romans_used_for_imputing``. The ROC AUC of those predictions against
    ``data.labels`` is computed per fold.

    Parameters
    ----------
    data : an object of class Dataset that contains: genotypes, covariates,
        labels and information about random folds. This function reads
        ``num_folds``, ``folds``, ``clin_covariate`` and ``labels``.
    config : an object of class ConfigState. It contains the user-entered
        parameters in a YAML format. See the config_file parameter in the
        main script for more details.

    Returns
    -------
    None. The results are stored through ``config.save_variable`` under the
    names ``soft_labels`` (one column per fold) and ``roc_auc`` (one value
    per fold).
    '''
    # Parameters for this task
    num_folds = data.num_folds
    task_name = "phenotype_imputation"
    n_estimators = config.get_entry(task_name, "n_estimators")
    romans_trn = config.get_entry(task_name, "romans_used_for_learning")
    romans_tst = config.get_entry(task_name, "romans_used_for_imputing")

    # The output is sized from the first fold only, so every fold is expected
    # to select the same number of entries for imputation.
    # NOTE(review): here romans_tst is passed as-is, while inside the loop it
    # is wrapped in a list -- confirm which form the helper expects.
    size_of_two = find_vec_entries_that_contain(data.folds[:, 0], romans_tst).shape[0]
    soft_labels = np.zeros((size_of_two, num_folds))
    roc_auc = np.zeros(num_folds)

    # clin_covariate is indexed as [:, selection] below, so it is transposed
    # for scaling and transposed back to keep that orientation.
    X_scaled = preprocessing.scale(data.clin_covariate.transpose()).transpose()

    # Iterate through the folds (one column of data.folds per fold)
    for fold_idx, fold in enumerate(data.folds.transpose()):
        logging.info("Fold=%d", fold_idx + 1)
        sel_trn = find_vec_entries_that_contain(fold, [romans_trn])
        sel_tst = find_vec_entries_that_contain(fold, [romans_tst])

        # For a small set I the original author noted the alternative
        # settings max_samples=0.8 and bootstrap=False.
        # NOTE(review): the source formatting was lost, so it is ambiguous
        # whether max_features=5 was active or part of that commented-out
        # alternative -- confirm against version control.
        # NOTE(review): newer scikit-learn releases rename `base_estimator`
        # to `estimator`; confirm against the pinned version.
        model = BaggingClassifier(
            base_estimator=linear_model.LogisticRegression(),
            n_estimators=n_estimators,
            max_samples=0.632,
            max_features=5,
            bootstrap=True,
            bootstrap_features=True,
            oob_score=False,
            n_jobs=1,
            random_state=None,
            verbose=0)
        model.fit(X_scaled[:, sel_trn].transpose(),
                  data.labels[:, sel_trn].transpose())

        # Column 1 of predict_proba is the probability of the second class.
        soft_labels[:, fold_idx] = model.predict_proba(
            X_scaled[:, sel_tst].transpose())[:, 1]

        # Only the AUC is kept; the ROC curve itself is not saved.
        fpr, tpr, _ = metrics.roc_curve(data.labels[0, sel_tst],
                                        soft_labels[:, fold_idx])
        roc_auc[fold_idx] = metrics.auc(fpr, tpr)

    # Save the output of this task
    config.save_variable(task_name, "%f", soft_labels=soft_labels, roc_auc=roc_auc)
def univ_feature_sel(data, config):
    '''
    Do univariate feature selection.

    For every fold, each genotype column is correlated (``pearsonr``) with
    the training labels, and the columns are ranked by ascending p-value.
    The training labels are the labels from ``data.labels`` for the "golden"
    entries followed by the soft labels produced by the
    ``phenotype_imputation`` task for the "silver" entries.

    Parameters
    ----------
    data : Dataset-like object. This function reads ``folds``, ``genotype``
        and ``labels``.
    config : ConfigState-like object used to read the task parameters, load
        the output of ``phenotype_imputation`` and save this task's output.

    Returns
    -------
    None. The ranking is stored through ``config.save_variable`` under the
    name ``feature_ranking`` (one row per fold).
    '''
    # Parameters
    task_name = "univ_feature_sel"
    romans_trn_gold = config.get_entry(task_name, "golden_romans_used_for_learning")
    romans_trn_silver = config.get_entry(task_name, "silver_romans_used_for_learning")

    # Load the output of the previous task(s)
    soft_labels = config.load_variable("phenotype_imputation", "soft_labels")

    # One row per fold, one column per genotype feature. Entries default to a
    # p-value of 1 until they are overwritten below.
    feature_pval = np.ones((data.folds.shape[1], data.genotype.shape[1]))

    # Iterate through the folds (one column of data.folds per fold)
    for fold_idx, fold in enumerate(data.folds.transpose()):
        logging.info("Fold=%d", fold_idx + 1)
        sel_trn_gold = find_vec_entries_that_contain(fold, romans_trn_gold)
        sel_trn_silver = find_vec_entries_that_contain(fold, romans_trn_silver)
        sel_trn = np.concatenate([sel_trn_gold, sel_trn_silver])

        # The first len(sel_trn_silver) soft labels of this fold are used.
        # NOTE(review): this presumes the rows of soft_labels are in the same
        # order as sel_trn_silver -- confirm against phenotype_imputation.
        trn_labels = np.concatenate([
            data.labels[0, sel_trn_gold],
            soft_labels[range(len(sel_trn_silver)), fold_idx],
        ])

        # pearsonr(...)[1] is the p-value of the correlation test.
        for snp_idx, genotype_snp in enumerate(data.genotype.transpose()):
            feature_pval[fold_idx, snp_idx] = pearsonr(genotype_snp[sel_trn],
                                                       trn_labels)[1]

    # Per fold: feature indices ordered from smallest to largest p-value.
    feature_ranking = feature_pval.argsort()

    # Save the output of this task
    config.save_variable(task_name, "%d", feature_ranking=feature_ranking)