import math

from skrebate import MultiSURF


def select_multisurf(X, y, percentile=10):
    # Keep the top `percentile` percent of the features (columns of X).
    num = math.ceil(X.shape[1] * percentile / 100)
    selector = MultiSURF(n_features_to_select=num,
                         discrete_threshold=3,
                         n_jobs=-1)
    selector.fit(X, y)
    return selector
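A minimal usage sketch for this helper, assuming a small synthetic dataset built with scikit-learn's make_classification (the demo data and the percentile value are illustrative, not part of the original):

from sklearn.datasets import make_classification

# Hypothetical demo data: 100 samples, 20 features, 5 informative.
X_demo, y_demo = make_classification(n_samples=100, n_features=20,
                                     n_informative=5, random_state=0)
sel = select_multisurf(X_demo, y_demo, percentile=20)
print(sel.top_features_[:4])        # feature indices, highest MultiSURF score first
print(sel.transform(X_demo).shape)  # expected (100, 4): top 20% of 20 features kept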
Example #2
    def fit(self, X, y=None, **kwargs):

        X, y = self.check_X_y(X, y)
        self.check_params(X, y)

        selector = MultiSURF(n_features_to_select=self.num_features)
        selector.fit(X, y)

        _support = selector.top_features_[:self.num_features]
        self.support = self.check_support(_support)

        return self
def multisurf_fs(X_df, X_train_all, X_test_all, y_train):
    '''MultiSURF for feature selection'''
    fs = MultiSURF(discrete_threshold=1000, n_jobs=1)
    fs.fit(X_train_all, y_train)

    feature_scores = fs.feature_importances_
    feature_ids = np.where(feature_scores >= 0)[0]
    selected_features = np.array(X_df.columns[feature_ids])

    # New X_train and X_test matrices
    X_train = X_train_all[:, feature_ids]
    X_test = X_test_all[:, feature_ids]

    return selected_features, feature_scores, X_train, X_test
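A hedged usage sketch for multisurf_fs, assuming X_df is a pandas DataFrame whose columns correspond to the columns of the train/test arrays (the toy data and split below are illustrative):

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Hypothetical inputs: a small random dataset with named columns.
X_df = pd.DataFrame(np.random.rand(60, 8),
                    columns=['feat_%d' % i for i in range(8)])
y = np.random.randint(0, 2, size=60)
X_tr, X_te, y_tr, y_te = train_test_split(X_df.values, y, random_state=0)

selected, scores, X_train, X_test = multisurf_fs(X_df, X_tr, X_te, y_tr)
print(selected)        # names of the columns whose MultiSURF score is >= 0
print(X_train.shape)   # same rows as X_tr, columns restricted to those features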
Example #4
def test_multisurf_pandas_inputs():
    """Check: Data (pandas DataFrame/Series): MultiSURF works with pandas DataFrame and Series inputs"""
    np.random.seed(320931)
    clf = make_pipeline(MultiSURF(n_features_to_select=2),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))
    assert np.mean(cross_val_score(clf, features_df, labels_s, cv=3,
                                   n_jobs=-1)) > 0.7
Example #5
def test_multisurf_pipeline():
    """Ensure that MultiSURF works in a sklearn pipeline when it is parallelized"""
    np.random.seed(320931)

    clf = make_pipeline(MultiSURF(n_features_to_select=2, n_jobs=-1),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))

    assert np.mean(cross_val_score(clf, features, labels, cv=3)) > 0.7
Example #6
def test_multisurf_pipeline_parallel():
    """Check: Data (Binary Endpoint, Discrete Features): MultiSURF works in a sklearn pipeline when MultiSURF is parallelized"""
    np.random.seed(320931)

    clf = make_pipeline(MultiSURF(n_features_to_select=2, n_jobs=-1),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))

    assert np.mean(cross_val_score(clf, features, labels, cv=3)) > 0.7
Example #7
def test_multisurf_pipeline_cont_endpoint():
    """Check: Data (Continuous Endpoint): MultiSURF works in a sklearn pipeline"""
    np.random.seed(320931)

    clf = make_pipeline(MultiSURF(n_features_to_select=2),
                        RandomForestRegressor(n_estimators=100, n_jobs=-1))

    assert abs(np.mean(cross_val_score(clf, features_cont_endpoint,
                                       labels_cont_endpoint, cv=3, n_jobs=-1))) < 0.5
Example #8
def test_multisurf_pipeline_mixed_attributes():
    """Check: Data (Mixed Attributes): MultiSURF works in a sklearn pipeline"""
    np.random.seed(320931)

    clf = make_pipeline(MultiSURF(n_features_to_select=2),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))

    assert np.mean(cross_val_score(clf, features_mixed_attributes,
                                   labels_mixed_attributes, cv=3, n_jobs=-1)) > 0.7
Example #9
def test_multisurf_pipeline_missing_values():
    """Ensure that MultiSURF works in a sklearn pipeline with missing values"""
    np.random.seed(320931)

    clf = make_pipeline(MultiSURF(n_features_to_select=2, n_jobs=-1),
                        Imputer(),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))

    assert np.mean(
        cross_val_score(
            clf, features_missing_values, labels_missing_values, cv=3)) > 0.7
Example #10
def run_multisurf(xTrain, yTrain, cv_count, data_name, output_folder, randSeed,
                  ordered_feature_names, algorithm):
    # Run MultiSURF
    filename = output_folder + '/' + algorithm + '_' + data_name + '_' + str(
        cv_count) + '_Train.txt'

    clf = MultiSURF().fit(xTrain, yTrain)
    scores = clf.feature_importances_

    scoreDict, score_sorted_features = sort_save_fi_scores(
        scores, ordered_feature_names, algorithm, filename)

    return scores, scoreDict, score_sorted_features
def rebate(df, target, n_features):
    """
    Run the ReBATE relief algorithm on a dataframe, returning the reduced df.

    Args:
        df (pandas.DataFrame): A dataframe
        target (str): The target key (must be present in df)
        n_features (int): The number of features desired to be returned.

    Returns:
        pandas.DataFrame: The dataframe reduced to the n_features selected
            feature columns (the target column is dropped).
    """
    X = df.drop(target, axis=1)
    y = df[target]
    rf = MultiSURF(n_features_to_select=n_features, n_jobs=-1)
    matrix = rf.fit_transform(X.values, y.values)
    feats = []
    for c in matrix.T:
        for f in X.columns.values:
            if np.array_equal(c, X[f].values) and f not in feats:
                feats.append(f)
    return df[feats]
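An illustrative call, assuming a DataFrame that contains a 'target' column (toy data with hypothetical column names):

import numpy as np
import pandas as pd

# Hypothetical toy dataframe with six features and a binary target.
df = pd.DataFrame(np.random.rand(50, 6), columns=['x%d' % i for i in range(6)])
df['target'] = np.random.randint(0, 2, size=50)

reduced = rebate(df, target='target', n_features=3)
print(reduced.columns.tolist())   # the three columns MultiSURF ranked highest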
Example #12
def get_selector(name, estimator=None, n_features_to_select=None, **params):
    if name == 'RobustSelector':
        return RobustSelector(estimator, n_features_to_select=n_features_to_select,
                              **search_dict(params, ('cv', 'verbose')))
    elif name == 'MaxFeatures':
        return SelectFromModel(estimator, threshold=-np.inf,
                               max_features=n_features_to_select)
    elif name == 'RandomSubsetSelector':
        return RandomSubsetSelector(estimator, n_features_to_select=n_features_to_select,
                                    **search_dict(params, ('n_subsets', 'subset_size', 'random_state')))
    elif name == 'FeatureImportanceThreshold':
        return SelectFromModel(estimator, **search_dict(params, 'threshold'))
    elif name == 'RFE':
        return RFE(estimator, n_features_to_select=n_features_to_select,
                   **search_dict(params, ('step', 'verbose')))
    elif name == 'RFECV':
        # RFECV picks the feature count by cross-validation and only exposes a
        # lower bound, min_features_to_select (it has no n_features_to_select).
        return RFECV(estimator, min_features_to_select=n_features_to_select,
                     **search_dict(params, ('step', 'cv', 'verbose')))
    elif name == 'FoldChangeFilter':
        return FoldChangeFilter(**search_dict(params, ('threshold', 'direction', 'below', 'pseudo_count')))
    elif name == 'ZeroFractionFilter':
        return ZeroFractionFilter(**search_dict(params, ('threshold',)))
    elif name == 'RpkmFilter':
        return RpkmFilter(**search_dict(params, ('threshold',)))
    elif name == 'RpmFilter':
        return RpmFilter(**search_dict(params, ('threshold',)))
    elif name == 'DiffExpFilter':
        return DiffExpFilter(max_features=n_features_to_select,
                             **search_dict(params, ('threshold', 'script', 'temp_dir', 'score_type', 'method')))
    elif name == 'ReliefF':
        from skrebate import ReliefF
        # skrebate's parameter for the discrete/continuous cutoff is discrete_threshold.
        return ReliefF(n_features_to_select=n_features_to_select,
                       **search_dict(params, ('n_jobs', 'n_neighbors', 'discrete_threshold')))
    elif name == 'SURF':
        from skrebate import SURF
        return SURF(n_features_to_select=n_features_to_select,
                    **search_dict(params, ('n_jobs', 'discrete_threshold')))
    elif name == 'MultiSURF':
        from skrebate import MultiSURF
        return MultiSURF(n_features_to_select=n_features_to_select,
                         **search_dict(params, ('n_jobs', 'discrete_threshold')))
    elif name == 'SIS':
        return SIS(n_features_to_select=n_features_to_select,
                   **search_dict(params, ('temp_dir', 'sis_params')))
    elif name == 'NullSelector':
        return NullSelector()
    else:
        raise ValueError('unknown selector: {}'.format(name))
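get_selector relies on a search_dict helper that is not shown in this snippet; it presumably filters the keyword arguments down to the keys each selector understands. A minimal sketch of that assumed behavior, followed by a hypothetical call:

def search_dict(params, keys):
    # Assumed behavior (not the original implementation): keep only the
    # requested keys; `keys` may be a single key name or a tuple of names.
    if isinstance(keys, str):
        keys = (keys,)
    return {k: v for k, v in params.items() if k in keys}

# Keyword arguments not listed for the chosen selector (cv here) are dropped,
# so each estimator only receives parameters it accepts.
selector = get_selector('MultiSURF', n_features_to_select=10, n_jobs=-1, cv=5)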
def job(cv_train_path, experiment_path, random_state, class_label,
        instance_label, instance_subset, algorithm, njobs, use_TURF, TURF_pct):
    job_start_time = time.time()
    random.seed(random_state)
    np.random.seed(random_state)

    dataset_name = cv_train_path.split('/')[-3]
    data = pd.read_csv(cv_train_path, sep=',')
    if instance_label != 'None':
        dataFeatures = data.drop([class_label, instance_label], axis=1).values
    else:
        dataFeatures = data.drop([class_label], axis=1).values
    dataOutcome = data[class_label].values
    header = data.columns.values.tolist()
    header.remove(class_label)
    if instance_label != 'None':
        header.remove(instance_label)
    cvCount = cv_train_path.split('/')[-1].split("_")[-2]

    use_TURF = use_TURF != 'False'

    #Mutual Information
    if algorithm == 'mi':
        #Run Mutual Information
        outname = "mutualinformation"
        outpath = experiment_path + '/' + dataset_name + "/" + outname + "/scores_cv_" + str(
            cvCount) + '.csv'
        scores = mutual_info_classif(dataFeatures,
                                     dataOutcome,
                                     random_state=random_state)

    #MultiSURF
    elif algorithm == 'ms':
        # Format instance-sampled dataset (prevents MultiSURF from running a very long time in large instance spaces)
        formatted = np.insert(dataFeatures, dataFeatures.shape[1], dataOutcome, 1)
        choices = np.random.choice(formatted.shape[0],
                                   min(instance_subset, formatted.shape[0]),
                                   replace=False)
        formatted = formatted[choices]
        dataFeatures = np.delete(formatted, -1, axis=1)
        dataPhenotypes = formatted[:, -1]

        #Run MultiSURF
        outname = "multisurf"
        outpath = experiment_path + '/' + dataset_name + "/" + outname + "/scores_cv_" + str(
            cvCount) + '.csv'
        if use_TURF:
            clf = TURF(MultiSURF(n_jobs=njobs),
                       pct=TURF_pct).fit(dataFeatures, dataPhenotypes)
        else:
            clf = MultiSURF(n_jobs=njobs).fit(dataFeatures, dataPhenotypes)
        scores = clf.feature_importances_
    else:
        raise Exception("Feature importance algorithm not found")

    #Save sorted feature importance scores:
    scoreDict, score_sorted_features = sort_save_fi_scores(
        scores, header, outpath, outname)

    #Save CV feature importance scores to a pickled file
    if not os.path.exists(experiment_path + '/' + dataset_name + "/" +
                          outname + "/pickledForPhase4"):
        os.mkdir(experiment_path + '/' + dataset_name + "/" + outname +
                 "/pickledForPhase4")

    outfile = open(
        experiment_path + '/' + dataset_name + "/" + outname +
        "/pickledForPhase4/" + str(cvCount), 'wb')
    pickle.dump([scores, scoreDict, score_sorted_features], outfile)
    outfile.close()

    #Save Runtime
    runtime_file = open(
        experiment_path + '/' + dataset_name + '/runtime/runtime_' + outname +
        '_CV_' + str(cvCount) + '.txt', 'w')
    runtime_file.write(str(time.time() - job_start_time))
    runtime_file.close()

    # Print completion
    print(dataset_name + " CV" + str(cvCount) + " phase 3 " + outname +
          " evaluation complete")
    job_file = open(
        experiment_path + '/jobsCompleted/job_' + outname + '_' +
        dataset_name + '_' + str(cvCount) + '.txt', 'w')
    job_file.write('complete')
    job_file.close()
Example #14
def rank_features_by_rebate_methods(data_split_list,
                                    fs_method,
                                    iterate,
                                    remove_percent=0.1,
                                    verbose=False):
    ## 0. Input arguments:
    # data_split_list: data frame that contains the learning data
    # fs_method: feature ranking methods to be used: 'SURF', 'SURFstar', 'MultiSURF', or 'MultiSURFstar'
    # iterate: whether to implement TURF: True or False (TURF will remove low-ranking features after each iteration, effective when #features is large)
    # remove_percent: percentage of features removed at each iteration (only applied when iterate = True)
    # verbose: whether to show progress by each fold: True or False

    ## 1. Define function for feature ranking method
    # SURF
    if fs_method == 'SURF':
        # Implement TURF extension when 'iterate == True'
        if iterate == True:
            fs = TuRF(core_algorithm='SURF', pct=remove_percent)
        else:
            fs = SURF()
    # SURFstar
    if fs_method == 'SURFstar':
        if iterate == True:
            fs = TuRF(core_algorithm='SURFstar', pct=remove_percent)
        else:
            fs = SURFstar()
    # MultiSURF
    if fs_method == 'MultiSURF':
        if iterate == True:
            fs = TuRF(core_algorithm='MultiSURF', pct=remove_percent)
        else:
            fs = MultiSURF()
    # MultiSURFstar
    if fs_method == 'MultiSURFstar':
        if iterate == True:
            fs = TuRF(core_algorithm='MultiSURFstar', pct=remove_percent)
        else:
            fs = MultiSURFstar()

    ## 2. Perform feature ranking on each fold of training data
    # iterate by folds
    feat_impt_dict = {}
    for i in range(0, len(data_split_list)):
        # intermediate output
        if verbose == True:
            print('Computing feature importance scores using data from fold ' +
                  str(i) + '\n')
        # obtain training feature matrix and response vector
        feat_train, label_train, _, _ = data_split_list[i]
        # fit feature ranking model using the specified method
        if iterate == True:
            fs.fit(feat_train.values, label_train.values, list(feat_train))
        else:
            fs.fit(feat_train.values, label_train.values)
        # output feature importance scores in a data frame
        fold_name = 'Fold_' + str(i)
        feat_impt_dict[fold_name] = fs.feature_importances_
    # aggregate results from multiple folds into one data frame
    feat_impt_df = pd.DataFrame(feat_impt_dict)
    feat_impt_df.index = feat_train.columns

    return feat_impt_df
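A hedged usage sketch, assuming each entry of data_split_list is a (feat_train, label_train, feat_test, label_test) tuple of pandas objects, one per fold (the toy data and KFold split below are illustrative):

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

# Hypothetical learning data: 40 samples, 5 named features, binary labels.
X = pd.DataFrame(np.random.rand(40, 5), columns=['g%d' % i for i in range(5)])
y = pd.Series(np.random.randint(0, 2, size=40))

data_split_list = []
for train_idx, test_idx in KFold(n_splits=3).split(X):
    data_split_list.append((X.iloc[train_idx], y.iloc[train_idx],
                            X.iloc[test_idx], y.iloc[test_idx]))

feat_impt_df = rank_features_by_rebate_methods(data_split_list, 'MultiSURF',
                                               iterate=False, verbose=True)
print(feat_impt_df)   # one column of MultiSURF scores per fold, indexed by feature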
Example #15
        clf = RandomForestClassifier().fit(X_train, y_train)
        tmp_score = clf.score(X_test, y_test)
        acc_arr.append(tmp_score)
    return np.mean(acc_arr), chosen_names


try:
    X, y, names, groups, _ = ml_data_parser(argv[1])
except IndexError:
    X, y, names, groups, _ = ml_data_parser('30_data.csv')
# X,y,names,groups,_=ml_data_parser('74_data.csv')
# print('graph_threshold,multisurf_threshold,model_type,model_acc,num_vars')
X = np.array(X)
y = np.array(y)
num_vars = len(names)
fs = MultiSURF().fit(X, y)
ms_array = list(fs.feature_importances_)
feature_importance = {}
num_dic = {}
trans_x = np.transpose(X)
max_val = 0
for i in range(num_vars):
    feature_importance[names[i]] = ms_array[i]
    num_dic[i] = ms_array[i]
    if max_val < num_dic[i]:
        max_val = num_dic[i]
        best_feature = i
for a in range(10):
    x1 = X[:, best_feature].reshape(-1, 1)
    group_kfold = GroupKFold(n_splits=10)
    group_kfold.get_n_splits(x1, y, groups)
Example #16
def job(experiment_path, cv):
    job_start_time = time.time()

    file = open(experiment_path+'/phase1pickle', 'rb')
    phase1_pickle = pickle.load(file)
    file.close()

    cv_info = phase1_pickle[0]
    learning_iterations = phase1_pickle[3]
    N = phase1_pickle[4]
    nu = phase1_pickle[5]
    attribute_tracking_method = phase1_pickle[6]
    random_state = phase1_pickle[7]
    class_label = phase1_pickle[8]
    feature_selection_sample_size = phase1_pickle[10]
    rule_compaction_method = phase1_pickle[11]
    data_headers = phase1_pickle[1][2]

    train_data_features = cv_info[cv][0]
    train_data_phenotypes = cv_info[cv][1]
    train_instance_labels = cv_info[cv][2]
    train_group_labels = cv_info[cv][3]
    test_data_features = cv_info[cv][4]
    test_data_phenotypes = cv_info[cv][5]
    test_instance_labels = cv_info[cv][6]
    test_group_labels = cv_info[cv][7]
    inst_label = cv_info[cv][8]
    group_label = cv_info[cv][9]

    # Create CV directory
    if not os.path.exists(experiment_path + '/CV_' + str(cv)):
        os.mkdir(experiment_path + '/CV_' + str(cv))

    # MultiSURF Feature Scoring (on a random instance subsample to bound runtime)
    merged = np.insert(train_data_features, train_data_features.shape[1], train_data_phenotypes, 1)
    rb_sample = np.random.choice(merged.shape[0],
                                 min(feature_selection_sample_size, merged.shape[0]),
                                 replace=False)
    new_data = merged[rb_sample]
    data_featuresR = np.delete(new_data, -1, axis=1)
    data_phenotypesR = new_data[:, -1]
    featureimportance_model = MultiSURF()
    featureimportance_model.fit(data_featuresR, data_phenotypesR)
    scores = featureimportance_model.feature_importances_

    # Train ExSTraCS Model
    model = ExSTraCS(learning_iterations=learning_iterations, N=N, nu=nu,attribute_tracking_method=attribute_tracking_method,
                     rule_compaction=rule_compaction_method,random_state=random_state,do_correct_set_subsumption=False,expert_knowledge=scores)
    model.fit(train_data_features, train_data_phenotypes)

    outfile = open(experiment_path + '/CV_' + str(cv) + '/model', 'wb')
    pickle.dump(model, outfile)
    outfile.close()

    # Export Testing Accuracy for each instance
    predicted_data_phenotypes = model.predict(test_data_features)
    equality = np.equal(predicted_data_phenotypes, test_data_phenotypes)
    with open(experiment_path + '/CV_' + str(cv) + '/instTestingAccuracy.csv', mode='w') as file:
        writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow([inst_label, 'isCorrect'])
        for i in range(len(test_instance_labels)):
            writer.writerow([test_instance_labels[i], 1 if equality[i] else 0])

    # Export Aggregate Testing Accuracy
    outfile = open(experiment_path + '/CV_' + str(cv) + '/testingAccuracy.txt', mode='w')
    outfile.write(str(model.score(test_data_features, test_data_phenotypes)))
    outfile.close()

    # Save train and testing datasets into csvs
    with open(experiment_path + '/CV_' + str(cv) + '/trainDataset.csv', mode='w') as file:
        writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(list(data_headers) + [class_label, inst_label, group_label])
        for i in range(len(train_instance_labels)):
            writer.writerow(list(train_data_features[i]) + [train_data_phenotypes[i]] + [train_instance_labels[i]] + [
                train_group_labels[i]])

    with open(experiment_path + '/CV_' + str(cv) + '/testDataset.csv', mode='w') as file:
        writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(list(data_headers) + [class_label, inst_label, group_label])
        for i in range(len(test_instance_labels)):
            writer.writerow(list(test_data_features[i]) + [test_data_phenotypes[i]] + [test_instance_labels[i]] + [
                test_group_labels[i]])

    # Get AT Scores for each instance
    AT_scores = model.get_attribute_tracking_scores(instance_labels=np.array(train_instance_labels))

    # Normalize AT Scores
    normalized_AT_scores = []
    for i in range(len(AT_scores)):
        normalized = AT_scores[i][1]
        max_score = max(normalized)
        for j in range(len(normalized)):
            if max_score != 0:
                normalized[j] /= max_score
            else:
                normalized[j] = 0
        normalized_AT_scores.append(list(normalized))

    # Save Normalized AT Scores
    with open(experiment_path + '/CV_' + str(cv) + '/normalizedATScores.csv', mode='w') as file:
        writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow([inst_label] + list(data_headers))
        for i in range(len(train_instance_labels)):
            writer.writerow([train_instance_labels[i]] + normalized_AT_scores[i])

    # Save Runtime
    runtime_file = open(experiment_path + '/CV_' + str(cv) + '/runtime.txt', 'w')
    runtime_file.write(str(time.time() - job_start_time))
    runtime_file.close()

    # Print completion
    print('CV '+str(cv) + " phase 1 complete")
Example #17
    def __init__(self, estimator=MultiSURF(), k=None):
        self.estimator = estimator
        self.k = k