def phenotype_imputation(data, config):
    '''
    Impute soft labels for the held-out samples of every fold.

    For each fold, a bagged logistic-regression classifier is fitted on the
    samples selected by "romans_used_for_learning" (set I) and used to predict
    class probabilities for the samples selected by "romans_used_for_imputing"
    (set II). The per-fold probabilities and the per-fold ROC AUC are stored
    through ``config.save_variable`` under the task name
    "phenotype_imputation".

    Parameters
    ----------
    data : an object of class Dataset that contains: genotypes, covariates,
        labels and information about random folds. This function reads
        ``num_folds``, ``folds`` (one column per fold), ``clin_covariate``
        (indexed as [covariate, sample]) and ``labels`` (row 0 holds the
        labels used for the ROC curve).

    config : an object of class ConfigState. It contains the user-entered
        parameters in a YAML format.
        See the config_file parameter in the main script for more details.

    Returns
    -------
    None. The results are saved via ``config.save_variable``:
        soft_labels : array of shape (number of set-II samples, num_folds)
        roc_auc     : array of shape (num_folds,)
    '''
    # Parameters for this task
    num_folds = data.num_folds
    task_name = "phenotype_imputation"
    n_estimators = config.get_entry(task_name, "n_estimators")
    romans_trn = config.get_entry(task_name, "romans_used_for_learning")
    romans_tst = config.get_entry(task_name, "romans_used_for_imputing")

    # Size the output from the first fold, using exactly the same selector
    # form ([romans_tst]) as the per-fold selection below so that the row
    # count always matches what the loop writes.
    # Assumes every fold holds the same number of set-II samples -- the
    # column assignment in the loop requires it.
    size_of_two = find_vec_entries_that_contain(data.folds[:, 0], [romans_tst]).shape[0]
    soft_labels = np.zeros((size_of_two, num_folds))

    # clin_covariate is indexed [covariate, sample]; scale() works column-wise,
    # hence the transpose before and after.
    X_scaled = preprocessing.scale(data.clin_covariate.transpose()).transpose()
    roc_auc = np.zeros(num_folds)

    # Iterate through the folds:
    for i, fold in enumerate(data.folds.transpose()):
        logging.info("Fold=%d", i + 1)
        sel_trn = find_vec_entries_that_contain(fold, [romans_trn])
        sel_tst = find_vec_entries_that_contain(fold, [romans_tst])

        # A fresh, unseeded ensemble per fold.
        # Alternative settings previously noted for a small set I:
        #   max_samples=0.8 and bootstrap=False (other arguments unchanged).
        # NOTE(review): recent scikit-learn releases rename `base_estimator`
        # to `estimator` -- confirm against the pinned version.
        model = BaggingClassifier(base_estimator=linear_model.LogisticRegression(),
                                  n_estimators=n_estimators, max_samples=0.632,
                                  max_features=5,
                                  bootstrap=True, bootstrap_features=True, oob_score=False,
                                  n_jobs=1, random_state=None, verbose=0)

        model.fit(X_scaled[:, sel_trn].transpose(), data.labels[:, sel_trn].transpose())

        # Column 1 of predict_proba is the probability of the second class in
        # the model's class ordering.
        soft_labels[:, i] = model.predict_proba(X_scaled[:, sel_tst].transpose())[:, 1]

        # The ROC curve points are only needed to compute this fold's AUC.
        fpr, tpr, _ = metrics.roc_curve(data.labels[0, sel_tst], soft_labels[:, i])
        roc_auc[i] = metrics.auc(fpr, tpr)

    # Save the output of this task
    config.save_variable(task_name, "%f", soft_labels=soft_labels, roc_auc=roc_auc)
def univ_feature_sel(data, config):
    '''
    Do univariate feature selection.

    For every fold, each SNP (column of ``data.genotype``) is correlated with
    the training labels using Pearson's r, and the SNPs are ranked by the
    resulting p-value (smallest first). The training labels are the true
    labels of the "golden" samples followed by the imputed soft labels of the
    "silver" samples, as produced by the "phenotype_imputation" task.

    Parameters
    ----------
    data : Dataset object. This function reads ``folds`` (one column per
        fold), ``genotype`` (indexed as [sample, SNP]) and ``labels`` (row 0).

    config : ConfigState object. Supplies the entries
        "golden_romans_used_for_learning" and
        "silver_romans_used_for_learning" of the "univ_feature_sel" task, and
        the saved ``soft_labels`` of the "phenotype_imputation" task.

    Returns
    -------
    None. ``feature_ranking`` (shape: number of folds x number of SNPs, each
    row holding SNP indices ordered by increasing p-value) is saved via
    ``config.save_variable``.
    '''
    # Parameters
    task_name = "univ_feature_sel"
    romans_trn_gold = config.get_entry(task_name, "golden_romans_used_for_learning")
    romans_trn_silver = config.get_entry(task_name, "silver_romans_used_for_learning")

    # Load the output of the previous task(s)
    soft_labels = config.load_variable("phenotype_imputation", "soft_labels")

    # One p-value per (fold, SNP). Cells default to 1 until they are filled in.
    feature_pval = np.ones((data.folds.shape[1], data.genotype.shape[1]))

    # Loop-invariant: iterating the transpose yields one SNP vector at a time.
    snp_vectors = data.genotype.transpose()

    # Iterate through the folds:
    for i, fold in enumerate(data.folds.transpose()):
        logging.info("Fold=%d", i + 1)
        sel_trn_gold = find_vec_entries_that_contain(fold, romans_trn_gold)
        sel_trn_silver = find_vec_entries_that_contain(fold, romans_trn_silver)
        sel_trn = np.concatenate([sel_trn_gold, sel_trn_silver])

        # NOTE(review): the soft labels are taken as the first
        # len(sel_trn_silver) rows of this fold's column; this presumes those
        # rows are in the same order as the silver samples -- confirm against
        # how phenotype_imputation fills soft_labels.
        silver_rows = range(len(sel_trn_silver))
        trn_labels = np.concatenate([data.labels[0, sel_trn_gold],
                                     soft_labels[silver_rows, i]])

        for j, genotype_snp in enumerate(snp_vectors):
            # pearsonr returns (r, p-value); only the p-value is kept.
            feature_pval[i, j] = pearsonr(genotype_snp[sel_trn], trn_labels)[1]

    # argsort works along the last axis, i.e. independently within each fold.
    feature_ranking = feature_pval.argsort()

    # Save the output of this task
    config.save_variable(task_name, "%d", feature_ranking=feature_ranking)