def interactive_pipeline(X, Y, pca_n_components, random_forest_n):

    # remove columns that contain missing values
    X.dropna(axis=1, inplace=True)

    # standardize X
    X = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(X))

    # cut off low-variance features (keep only the selected columns)
    variance_threshold = 0.03
    variance_cutoff = VarianceThreshold(threshold=variance_threshold)
    variance_cutoff.fit(X)
    X = X.loc[:, variance_cutoff.get_support()]

    # cut off highly correlated features
    corr_matrix = X.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > 0.7)]
    X.drop(columns=to_drop, inplace=True)
    #random forest
    k_best_features = random_forest_n
    feature_importance = random_forest_selection.get_feature_importance(X,Y)
    processed_dataframe, X = random_forest_selection.get_k_most_important_features(X,Y,k_best_features,feature_importance)
    #PCA
    pca = PCA_Obj(X)
    X = pca.create_pca(pca_n_components)
    print("X.shape", X.shape)
    return X, Y

#feature_selection_pipeline_from_file()
Example #2
def feature_selection(features, ideal_num=None):
    from sklearn.feature_selection import VarianceThreshold
    # apply a variance filter to each of the eight feature blocks and keep the reduced blocks
    reduced = []
    for i in range(8):
        sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
        reduced.append(sel.fit_transform(features[i]))
    return reduced
Example #3
def recursive_feature_selection(info_humans, info_bots, params, scale=False):

    X, y, features, scaler = get_Xy(info_humans, info_bots, scale=scale)

    print "first feature selection by variance test"
    skb = VarianceThreshold(threshold=(.8 * (1 - .8)))
    X_new = skb.fit_transform(X)
    features_1 = features[skb.get_support()]

    print "second feature selection by ch2 test"
    skb = SelectKBest(chi2, k=200)
    # skb = SelectFpr(chi2, alpha=0.005)
    X_new = skb.fit_transform(X_new, y)
    features_2 = features_1[skb.get_support()]

    # skb = PCA(n_components=250)
    # X_new = skb.fit_transform(X_new, y)
    
    print "third feature selection by recursive featue elimination (RFECV)"
    clf = LogisticRegression(penalty=params['penalty'],
                             C=params['C'])
    # clf = SVC(kernel="linear")
    rfecv = RFECV(estimator=clf, step=1,
                  cv=cross_validation.StratifiedKFold(y, 5),
                  scoring='roc_auc', verbose=1)
    rfecv.fit(X_new, y)

    print("Optimal number of features : %d" % rfecv.n_features_)
    
    return skb, rfecv
def feature_selection_pipeline_from_file():
    #get data
    dataset = refactor_labels(get_data(path, 'Sheet1'), group_column)

    # all the visualizations
    auto_visualize_features(dataset.drop(subject_number_column, axis = 1))

    # remove columns with too many missing values
    non_missing_values_threshold = len(dataset.index) * 0.99
    dataset.dropna(axis=1, thresh=non_missing_values_threshold, inplace=True)

    #impute missing values
    dataset.fillna(dataset.mean(), inplace=True)

    #set X
    X = dataset.drop([group_column, subject_number_column], axis=1)
    sbj = dataset[subject_number_column]
    Y = dataset[group_column]
    names = list(X)

    # standardize X
    X = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(X))
    X.columns = names
    print("p0", X.shape)

    # cut off low-variance features (keep only the selected columns)
    variance_threshold = 0.05
    variance_cutoff = VarianceThreshold(threshold=variance_threshold)
    variance_cutoff.fit(X)
    X = X.loc[:, variance_cutoff.get_support()]

    print("p1", X.shape)

    # cut off highly correlated features
    corr_matrix = X.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    to_drop = [column for column in upper.columns if any(upper[column] > 0.7)]

    X.drop(to_drop, axis = 1, inplace=True)
    print("p2",X.shape)


    #random forest
    k_best_features = 42
    feature_importance = random_forest_selection.get_feature_importance(X,Y)
    random_forest_selection.save_feature_importance(feature_importance_txt_path, feature_importance, list(X))
    processed_dataframe, X = random_forest_selection.get_k_most_important_features(X,Y,k_best_features,feature_importance)
    print("p3", processed_dataframe.shape)
    processed_dataframe.to_csv(processed_dataframe_path)


    #PCA
    pca = PCA_Obj(X)
    pca.explained_variance_graph(pca_explained_variance_graph_path)
    pca.print_components()
    n_components = 12
    X = pca.create_pca(n_components)
    pca.save_pca_data(features_after_pca, Y=Y)
    print("p4", X.shape)
Example #5
 def varianceSelection(self, df, threshold=.8):
     if not isinstance(df, pandas.core.frame.DataFrame):
         logger.error('[%s] : [ERROR] Variance selection only possible on Dataframe not %s',
                                      datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(df))
         sys.exit(1)
     sel = VarianceThreshold(threshold=(threshold * (1 - threshold)))
     sel.fit_transform(df)
     return df[[c for (s, c) in zip(sel.get_support(), df.columns.values) if s]]
Example #6
 def print_variance(self, path=None):
     df = self.dataset.dropna(axis=1)
     df = df.drop('group', axis=1)
     standard_scaler = preprocessing.MinMaxScaler()
     data = standard_scaler.fit_transform(df)
     selector = VarianceThreshold()
     selector.fit_transform(data)
     result = sorted(zip(list(df), selector.variances_), key=lambda x: x[1])
     print("Variance")
     print(*result, sep="\n")
     if path:
         with open(os.path.join(path, "variance.txt"),"w") as f:
             for i in result:
                 f.write(str(i)+"\n")
    def test_same_transform_with_threshold(self):
        local = VarianceThreshold(.03)
        dist = SparkVarianceThreshold(.03)

        X, X_rdd = self.generate_dataset()
        result_local = local.fit_transform(X)
        result_dist = np.vstack(dist.fit_transform(X_rdd).collect())
        assert_array_almost_equal(result_local, result_dist)

        X, X_rdd = self.generate_sparse_dataset()
        result_local = local.fit_transform(X)
        result_dist = sp.vstack(dist.fit_transform(X_rdd).collect())
        assert_array_almost_equal(result_local.toarray(),
                                  result_dist.toarray())
def feature_selection_pipeline_from_file():
    #get data
    dataset = refactor_labels(get_data(path, 'Sheet1'), group_column)

    # all the visualizations
    #auto_visualize_features(dataset.drop([subject_number_column], 1))

    # remove columns with too many missing values
    non_missing_values_threshold = len(dataset.index) * 0.99
    dataset.dropna(axis=1, thresh=non_missing_values_threshold, inplace=True)

    #impute missing values
    dataset.fillna(dataset.mean(), inplace=True)

    #set X
    X = dataset.drop([group_column, subject_number_column], axis=1)
    sbj = dataset[subject_number_column]
    Y = dataset[group_column]

    # standardize X
    X = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(X))

    # cut off low-variance features (keep only the selected columns)
    variance_threshold = 0.03
    variance_cutoff = VarianceThreshold(threshold=variance_threshold)
    variance_cutoff.fit(X)
    X = X.loc[:, variance_cutoff.get_support()]

    print("p1", X.shape)

    # cut off highly correlated features
    corr_matrix = X.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > 0.8)]
    X.drop(columns=to_drop, inplace=True)

    print("p2",X.shape)

    #save new df
    processed_dataframe = pd.concat([X, Y, sbj], axis=1)
    processed_dataframe.to_csv(processed_dataframe_path)

    #random forest
    if random_forest:
        k_best_features = 31
        feature_importance = random_forest_selection.get_feature_importance(X,Y)
        random_forest_selection.save_feature_importance(feature_importance_txt_path, feature_importance)
        processed_dataframe, X = random_forest_selection.get_k_most_important_features(X,Y,k_best_features,feature_importance)
        processed_dataframe.to_csv(processed_dataframe_path)
    print("p4", processed_dataframe.shape)
    def vectorize_EX(self, columns, variance_thresh=0, train_only=False):

        print('Start vectorizing')
        start_time = time.time()
        hasher = CountVectorizer(binary=True, tokenizer=LemmaTokenizer(), stop_words='english')

        train_dtm = hasher.fit_transform(
            self.ga_bm_train[columns].apply(lambda x: ','.join(x), axis=1))
        print(hasher.get_feature_names())
        print('dtm train shape: ', train_dtm.shape)

        selector = VarianceThreshold(variance_thresh)
        train_dtm = selector.fit_transform(train_dtm)
        print('dtm train shape after variance thresh: ', train_dtm.shape)

        if not train_only:
            test_dtm = hasher.transform(
                self.ga_bm_test[columns].apply(lambda x: ','.join(x), axis=1))

            print('dtm test shape: ', test_dtm.shape)
            test_dtm = selector.transform(test_dtm)
            print('dtm test shape after variance thresh: ', test_dtm.shape)

        print("Time: ", round(((time.time() - start_time)/60), 2))
        print('Complete vectorizing')
        if train_only:
            return train_dtm
        else:
            return (train_dtm, test_dtm)
Example #10
def variance_cutoff(X,cutoff=0.8):
    """
    Set variance cutoff for variables
    """
    sel = VarianceThreshold(threshold=(cutoff * (1 - cutoff)))
    X = sel.fit_transform(X)
    return X
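# The 0.8 * (1 - 0.8) pattern used above comes from the variance of a
# Bernoulli variable, Var[X] = p * (1 - p): with cutoff=0.8 the threshold is
# 0.16, so boolean features that take the same value in more than 80% of the
# samples are removed. A small hedged sketch with toy data (the array below
# is an assumption, not part of the original example):
import numpy as np

X_demo = np.array([[0, 1, 0],
                   [0, 1, 1],
                   [0, 1, 0],
                   [0, 1, 1],
                   [1, 0, 0]])
X_reduced = variance_cutoff(X_demo, cutoff=0.8)
print(X_reduced.shape)  # only the third column, with variance 0.24, survives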
Example #11
def main():
    parser = argparse.ArgumentParser(description='Normalize the feature values')
    required = parser.add_argument_group('required options')

    required.add_argument('-x', '--outlist', required=True, help='File containing feature values')
    required.add_argument('-y', '--execlist', required=True, help='File containing exec list')
    
    args = parser.parse_args()

    #X = np.loadtxt(args.outlist, skiprows=1)
    np.set_printoptions(precision=2)
    X = np.genfromtxt(args.outlist, skip_header=1)
    X=np.nan_to_num(X)
    Y = np.loadtxt(args.execlist, ndmin=2)

    #f = open("trainlist","wb")
    #newResult = X/Y
    #sel = VarianceThreshold(threshold=(.8*(1-.8)))
    sel = VarianceThreshold(threshold=(.8*(1-.8)))
    result1 = sel.fit_transform(X)
    newResult = result1/Y
    #result2 = sel.fit_transform(newResult)

    #feature collection for test programs
    if os.path.isfile('eventlist'):
       features = np.genfromtxt('eventlist',dtype='str')
       featureFromVariance = sel.get_support(indices=True)
       text_file = open("variancefeatures.txt","w")
       for i in featureFromVariance:
           text_file.write(features[i])
           text_file.write("\n")
       text_file.close()

    np.savetxt('normfeaturelist', newResult, fmt='%.2f', delimiter='\t')
def main():
    args = getOptions()
    print args

    print "train file read"
    train_x, train_y = readfile_noid(args.train,'train',4)
    train_x_new, id = extractID(train_x)
    del train_x
    train_x_clean, contentdict = cityclean(train_x_new)
    del id, train_x_new
    
    #remove feature with no distinction and less important
    print "remove feature with no distinction and less important"
    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_clean)
    del train_x_clean
    
    #normalization
    print "normalization"
    train_x_nor, mean, std = normalize(train_x_uniq)
    del train_x_uniq
    
    #feature selection and modeling
    print "feature selection and modeling"
    exclusivefs(train_x_nor, train_y)
 def doFeatureSelection(self,features,target,k):
     features_int = np.array(features,dtype=float)
     target_int = np.array(target,dtype=float)
     sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
     features_new = sel.fit_transform(features_int)
     #features_new = SelectKBest(chi2,k=10).fit_transform(features_int,target_int)
     return features_new
def main():
    args = getOptions()
    print args

    print "train file read"
    train_x, train_y = readfile_noid(args.train,'train')
    train_x_new, id = extractID(train_x)
    del id
    print "test file read"
    test_x, test_y = readfile_noid(args.test,'test')
    test_x_new, id = extractID(test_x)
    
    #remove feature with no distinction and less important
    print "remove feature with no distinction and less important"
    
    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_new)
    test_x_uniq = sel.transform(test_x_new)

    #normalization
    print "normalization"
    train_x_nor, mean, std = normalize(train_x_uniq)
    test_x_nor, mean, std = normalize(test_x_uniq, mean, std)

    #feature selection
    print "feature selection"
    # Create the RFE object and compute a cross-validated score.
    svc = SVC(kernel="linear")
    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(train_y, 10),
                  scoring='accuracy')
    rfecv.fit(train_x_nor, train_y)
    
    print("Optimal number of features : %d" % rfecv.n_features_)
def main():
    args = getOptions()
    print args
    fn = "destreeSub.csv"
    print fn
    print "train file read"
    train_x, train_y = readfile_noid(args.train,'train')
    train_x_new, id = extractID(train_x)
    del id
    print "test file read"
    test_x, test_y = readfile_noid(args.test,'test')
    test_x_new, id = extractID(test_x)
    
    #remove feature with no distinction and less important
    print "remove feature with no distinction and less important"
    
    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_new)
    test_x_uniq = sel.transform(test_x_new)
#     indices = [i for i in range(len(train_x[0]))]
#     frqIndex = trimfrq(train_x)
#     for i in frqIndex:
#         indices.remove(i)
#     train_x_uniq = indexTodata(train_x, indices)
#     test_x_uniq = indexTodata(test_x, indices)

    #normalization
    print "normalization"
    train_x_nor, mean, std = normalize(train_x_uniq)
    test_x_nor, mean, std = normalize(test_x_uniq, mean, std)

    #feature selection
    print "feature selection"
    if args.fts == 'cor':
        train_x_sel, test_x_sel = correlationSelect(train_x_nor, train_y, test_x_nor)
    elif args.fts == 'extraTrees':
        train_x_sel, test_x_sel = ExtraTreesSelect(train_x_nor, train_y, test_x_nor)
    else:
        train_x_sel = copy.deepcopy(train_x_nor)
        test_x_sel = copy.deepcopy(test_x_nor)
    del train_x_nor, test_x_nor, train_x_uniq, test_x_uniq
    print "modelsing"
    clf = ExtraTreesClassifier(max_depth=3, n_estimators=10, random_state=0, class_weight='auto')
    clf.fit(train_x_sel, train_y)
    train_pdt = clf.predict(train_x_sel)
    MCC, Acc_p , Acc_n, Acc_all = get_Accs(train_y, train_pdt) 
    print "MCC, Acc_p , Acc_n, Acc_all(train): "
    print "%s,%s,%s,%s" % (str(MCC), str(Acc_p) , str(Acc_n), str(Acc_all))
    test_pdt = clf.predict_proba(test_x_sel)
#     MCC, Acc_p , Acc_n, Acc_all = get_Accs(test_y, test_pdt) 
#     print "MCC, Acc_p , Acc_n, Acc_all(test): "
#     print "%s,%s,%s,%s" % (str(MCC), str(Acc_p) , str(Acc_n), str(Acc_all))   
    
    fout=open(fn,'w')
    fout.write("ID,target\n")
    for index, eachline in enumerate(test_pdt):
        fout.write("%s,%s\n" % (str(int(id[index])),str(test_pdt[index][1])))
    fout.close()
def main():
    args = getOptions()
    fn = ("submission_cor_%s_%s_%s.csv" % (str(args.lrate).replace('.','dian'),str(args.nest),str(args.maxdepth)))
    print fn
    print "train file read"
    train_x, train_y = readfile_noid(args.train,'train')
    train_x_new, id = extractID(train_x)
    del id
    print "test file read"
    test_x, test_y = readfile_noid(args.test,'test')
    test_x_new, id = extractID(test_x)
    
    #remove feature with no distinction and less important
    print "remove feature with no distinction and less important"
    
    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_new)
    test_x_uniq = sel.transform(test_x_new)
#     indices = [i for i in range(len(train_x[0]))]
#     frqIndex = trimfrq(train_x)
#     for i in frqIndex:
#         indices.remove(i)
#     train_x_uniq = indexTodata(train_x, indices)
#     test_x_uniq = indexTodata(test_x, indices)

    #normalization
    print "normalization"
    train_x_nor, mean, std = normalize(train_x_uniq)
    test_x_nor, mean, std = normalize(test_x_uniq, mean, std)

    #feature selection
    print "feature selection"
    train_x_sel, test_x_sel = correlationSelect(train_x_nor, train_y, test_x_nor)
#     ftsel = correlationSel()
#     ftsel.dosel(train_x_nor,train_y)
#     train_x_sel = ftsel.transform(train_x_nor)
#     test_x_sel = ftsel.transform(test_x_nor)
    print "modelsing"
    clf = GradientBoostingClassifier(loss='deviance', 
                                     learning_rate=args.lrate,
                                     n_estimators=args.nest,
                                     max_depth=args.maxdepth,
                                     verbose=1)
    clf.fit(train_x_sel, train_y)
    train_pdt = clf.predict(train_x_sel)
    MCC, Acc_p , Acc_n, Acc_all = get_Accs(train_y, train_pdt) 
    print "MCC, Acc_p , Acc_n, Acc_all(train): "
    print "%s,%s,%s,%s" % (str(MCC), str(Acc_p) , str(Acc_n), str(Acc_all))
    test_pdt = clf.predict_proba(test_x_sel)
#     MCC, Acc_p , Acc_n, Acc_all = get_Accs(test_y, test_pdt) 
#     print "MCC, Acc_p , Acc_n, Acc_all(test): "
#     print "%s,%s,%s,%s" % (str(MCC), str(Acc_p) , str(Acc_n), str(Acc_all))   
    
    fout=open(fn,'w')
    fout.write("ID,target\n")
    for index, eachline in enumerate(test_pdt):
        fout.write("%s,%s\n" % (str(int(id[index])),str(test_pdt[index][1])))
    fout.close()
def featureSelectionVarianceThreshold(data, probability = 0.8):
    dataRaw = data[:, 2:]
    sel = VarianceThreshold(threshold=(probability*(1 - probability)))
    dataNew = sel.fit_transform(dataRaw)
    fd = open('History.txt','a')
    history = 'Feature Selection: Variance Threshold' + '\n' + 'Selected Feature: ' + str(sel.get_support(True)) + '\n'
    fd.write(history)
    fd.close()
    return np.c_[data[:, :2], dataNew]
def select_centroids(centroids):
    """
    :param centroids: learned centroids
    :return: new_centroids: (without centroids with variance < avg_variance(centroids))
    """
    sel = VarianceThreshold(threshold=np.var(centroids))
    new_centroids = sel.fit_transform(centroids.T)
    new_centroids = new_centroids.T
    return new_centroids
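# A hedged usage sketch of select_centroids with hand-made centroids (the
# array below is illustrative; in practice these would be, e.g., k-means cluster centers).
import numpy as np

centroids_demo = np.array([[0.0, 0.0, 0.0, 0.0],   # flat centroid, variance 0
                           [0.0, 1.0, 0.0, 1.0],   # spread-out centroid, variance 0.25
                           [0.2, 0.1, 0.2, 0.1]])  # nearly flat centroid, variance 0.0025
# np.var(centroids_demo) is about 0.128, so only the second centroid is kept
print(select_centroids(centroids_demo))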
Example #19
 def featureReduction(self, data,threshold_input = 0.99):
     '''
     feature reduction that only keeps variables whose variance
     is greater than the threshold.
     '''
     selector = VarianceThreshold(threshold = threshold_input)
     data = selector.fit_transform(data)
     print 'Feature Selected with threshold ', threshold_input, data.shape
     return data
    def test_same_transform_with_threshold(self):
        local = VarianceThreshold(.03)
        dist = SparkVarianceThreshold(.03)

        X_dense, X_dense_rdd = self.make_dense_rdd()
        X_sparse, X_sparse_rdd = self.make_sparse_rdd()
        Z_rdd = DictRDD([X_sparse_rdd, X_dense_rdd], columns=('X', 'Y'))

        result_local = local.fit_transform(X_dense)
        result_dist = np.vstack(dist.fit_transform(X_dense_rdd).collect())
        assert_array_almost_equal(result_local, result_dist)

        result_local = local.fit_transform(X_sparse)
        result_dist = sp.vstack(dist.fit_transform(X_sparse_rdd).collect())
        assert_array_almost_equal(result_local.toarray(),
                                  result_dist.toarray())
        result_dist = sp.vstack(dist.fit_transform(Z_rdd)[:, 'X'].collect())
        assert_array_almost_equal(result_local.toarray(),
                                  result_dist.toarray())
Example #21
def conv2matrix(patients, target, sta_type='z-score', sel_type='var', threshold=.8, feature_k=30):
    '''
    extract features from the patient list; these are not raw features, they were preprocessed by an expert
    missing value: set to 0 when the style is 'z-score', and to nan when the style is 'min-max'
    binary value feature: 0 means negative, 1 means positive
    category value feature: e.g. sex is one-hot encoded, one value per feature
    real value feature: e.g. age should be normalized
    rank value feature: can be mapped to integers from 1 to n
    the parameter sta_type is one of 'z-score' and 'min-max'
    the parameter threshold is between 0 and 1
    return patients_matrix_std: the standardized patients matrix
    return features_dict: a dictionary with the names and indexes of the features fed to the classifier
    '''    
    
    #process the missing value and the rank value feature
    for i in range(len(patients)):
        for key in patients[i]:
            if patients[i][key] == '':
                patients[i][key] = 0  #process missing value 
            patients[i][key] = rank2int(patients[i][key])   #process rank value feature
    #process the category value feature and convert patients' dictionary to matrix
    vec = DictVectorizer()
    patients_matrix = vec.fit_transform(patients).toarray()          

    #feature selection
    if sel_type == 'var':
        sel = VarianceThreshold(threshold=(threshold * (1 - threshold)))
        patients_matrix_sel = sel.fit_transform(patients_matrix)
    elif sel_type == 'uni':
        sel = SelectKBest(chi2, feature_k)
        patients_matrix_sel = sel.fit_transform(patients_matrix, target)  
            
    features_dict = get_features(vec, sel)    
    #print(features_dict)
    
    #feature standardization
    if sta_type == 'z-score':
        patients_matrix_std = preprocessing.scale(patients_matrix)        
    elif sta_type == 'min-max':
        min_max_scaler = preprocessing.MinMaxScaler()
        patients_matrix_std = min_max_scaler.fit_transform(patients_matrix_sel)

    return patients_matrix_std, features_dict, patients_matrix
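# A minimal, self-contained sketch of what conv2matrix does internally
# (DictVectorizer one-hot encodes category features, then a variance filter
# drops near-constant columns); the toy patient records are assumptions,
# not data from the original example.
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import VarianceThreshold

patients_demo = [
    {'sex': 'male', 'age': 63, 'smoker': 1, 'enrolled': 1},
    {'sex': 'female', 'age': 58, 'smoker': 0, 'enrolled': 1},
    {'sex': 'female', 'age': 71, 'smoker': 0, 'enrolled': 1},
]
vec_demo = DictVectorizer()
matrix_demo = vec_demo.fit_transform(patients_demo).toarray()
sel_demo = VarianceThreshold(threshold=0.8 * (1 - 0.8))
print(sel_demo.fit_transform(matrix_demo).shape)  # the constant 'enrolled' column is dropped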
def feature_selection_with_scikit():
    """
    1-VarianceThreshold is a simple baseline approach to feature selection. It removes all features whose variance doesn’t
     meet some threshold. By default, it removes all zero-variance features, i.e. features that have the same value in
     all samples.
    2-Univariate feature selection works by selecting the best features based on univariate statistical tests.
     It can be seen as a preprocessing step to an estimator
    """
    p=0.8
    selector = VarianceThreshold(threshold=(p * (1 - p)))
    c=selector.fit_transform(X)
    print  "Number of the attribute before: ",X.shape[1]
    print "number of the attribute after:",c.shape[1]

    # selecting k best attribute instead of chi2, f_classif can also be used
    skb=SelectKBest(chi2, k=10)
    X_new=skb.fit_transform(X, y)
    attr = np.where(skb.get_support(), attributeNames, '-1')

    print("Best attributes chosen with SelectKBest:")
    i = 1
    for att in attr:
        if att != '-1':
            print(i, ": ", att)
            i += 1

    # using ExtraTreesClassifier
    print("Using feature importance...")
    etc = ExtraTreesClassifier()
    etc.fit(X, y)
    print(etc.feature_importances_)
    print(etc.max_features)
    print(etc.max_depth)

    print("Recursive feature selection:")
    from sklearn.svm import SVC
    import sklearn.linear_model as lm
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFECV
    # Create the RFE object and compute a cross-validated score.
    estim=lm.LinearRegression()
    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    rfecv = RFECV(estimator=estim, step=1, cv=StratifiedKFold(y, 2),
                  scoring='accuracy')
    rfecv.fit(X, y)

    print("Optimal number of features : %d" % rfecv.n_features_)

    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
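# A hedged sketch of the "preprocessing step to an estimator" idea from the
# docstring above: chain the selectors and a classifier in a single Pipeline
# (iris is only stand-in data here, not the dataset used by this function).
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, VarianceThreshold, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

X_iris, y_iris = load_iris(return_X_y=True)
pipe = Pipeline([
    ('variance', VarianceThreshold(threshold=0.8 * (1 - 0.8))),
    ('kbest', SelectKBest(chi2, k=2)),
    ('clf', LogisticRegression(max_iter=1000)),
])
print(pipe.fit(X_iris, y_iris).score(X_iris, y_iris))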
Example #23
    def run_pca_fct(X_file, data_str):
        from sklearn.decomposition import PCA
        import numpy as np
        import pylab as plt
        import os

        from sklearn.feature_selection import VarianceThreshold
        from sklearn.grid_search import GridSearchCV
        from sklearn.preprocessing import MinMaxScaler
        from sklearn.preprocessing import Imputer

        X = np.load(X_file)

        # fixme pipe
        imp = Imputer()
        X = imp.fit_transform(X)

        # fixme? pipe
        # remove low variance features
        var_thr = VarianceThreshold()
        X = var_thr.fit_transform(X)


        # fixme pipe normalize
        normalize = MinMaxScaler()
        X = normalize.fit_transform(X)

        # fixme?
        # remove low variance features
        var_thr = VarianceThreshold()
        X = var_thr.fit_transform(X)

        pca = PCA()
        p = pca.fit_transform(X)
        explained_var = sum(pca.explained_variance_ratio_)
        plt.plot(range(1, len(pca.explained_variance_ratio_) + 1), np.cumsum(pca.explained_variance_ratio_))
        plt.title('explained var: %s' % explained_var)
        plt.xlabel('n_components')
        plt.tight_layout()
        out_fig = os.path.join(os.getcwd(), 'pca_var_%s.pdf' % data_str)
        plt.savefig(out_fig)
        return out_fig
Example #24
def feat_selec(tra_val_data, testing_data, thred=0.8):
    """
    Feature selection.
    """
    num_tv = tra_val_data.shape[0]

    total_data = np.vstack((tra_val_data, testing_data))

    selec = VarianceThreshold(threshold=thred)
    total_selected_data = selec.fit_transform(total_data)

    return total_selected_data[:num_tv, :], total_selected_data[num_tv:, :]
 def FeatureSelection( self ):
     """Main feature selection method"""
     
     if 'Variance' in self.FeatureSelectionMethod:
         selector = VarianceThreshold(threshold=0.0001)
         self.Features = selector.fit_transform(self.Features)
 #         pyplot.figure(), pyplot.hist(numpy.var(features, axis = 0), bins = 64), pyplot.show()
     elif 'Trees' in self.FeatureSelectionMethod:
         forestFeatures = ExtraTreesClassifier(n_estimators = 512, random_state = 32)
         forestFeaturesFit = forestFeatures.fit(self.Features, self.Classes)
         featureImportance = 0.001
         featureBool = (forestFeaturesFit.feature_importances_ > featureImportance)
         self.Features = self.Features[:,featureBool]
def test_variancethreshold_vs_sklearn():
    trajectories = AlanineDipeptide().get_cached().trajectories
    fs = FeatureSelector(FEATS)

    vt = VarianceThreshold(0.1)
    vtr = VarianceThresholdR(0.1)

    y = fs.partial_transform(trajectories[0])

    z1 = vt.fit_transform([y])[0]
    z_ref1 = vtr.fit_transform(y)

    np.testing.assert_array_almost_equal(z_ref1, z1)
def main():
    args = getOptions()
    print "train file read"
    train_x, train_y = readfile_noid(args.train,'train')
    train_x_new, id = extractID(train_x)
    del id
    print "test file read"
    test_x, test_y = readfile_noid(args.test,'test')
    test_x_new, id = extractID(test_x)

    #remove feature with no distinction and less important
    print "remove feature with no distinction and less important"
    
    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_new)
    test_x_uniq = sel.transform(test_x_new)
    
    #normalization
    print "normalization"
    train_x_nor, mean, std = normalize(train_x_uniq)
    test_x_nor, mean, std = normalize(test_x_uniq, mean, std)
    
    #feature selection
    print "feature selection"
    ftsel = ExtraTreesClassifier()
    ftsel.fit(train_x_nor, train_y)
#     importances = ftsel.feature_importances_
#     indices_test = np.argsort(importances)[::-1]
#     indices_test = indices_test.tolist()
    train_x_trans = ftsel.transform(train_x_nor)
    test_x_trans = ftsel.transform(test_x_nor)
    
    #modelsing
    print "modelsing"
    train = xgb.DMatrix(train_x_trans,label=train_y)
    test = xgb.DMatrix(test_x_trans,label=test_y)
    gbm = xgb.train({'max_depth':3, 'n_estimators':1500, 'learning_rate':0.1 ,'objective':'binary:logistic','eval_metric':'auc'},train)
    train_pdt = gbm.predict(train)
    MCC, Acc_p , Acc_n, Acc_all = get_Accs(train_y, train_pdt) 
    print "MCC, Acc_p , Acc_n, Acc_all(train): "
    print "%s,%s,%s,%s" % (str(MCC), str(Acc_p) , str(Acc_n), str(Acc_all))
    test_pdt = gbm.predict(test)
    MCC, Acc_p , Acc_n, Acc_all = get_Accs(test_y, test_pdt) 
    print "MCC, Acc_p , Acc_n, Acc_all(test): "
    print "%s,%s,%s,%s" % (str(MCC), str(Acc_p) , str(Acc_n), str(Acc_all))   
    
    fout=open("submission_xgbtrain.csv",'w')
    fout.write("ID,target\n")
    for index, eachline in enumerate(test_pdt):
        fout.write("%s,%s\n" % (str(int(id[index])),str(test_pdt[index])))
    fout.close()
    def test_same_transform_with_threshold(self):
        local = VarianceThreshold(.03)
        dist = SparkVarianceThreshold(.03)

        X_dense, X_dense_rdd = self.make_dense_rdd()
        X_sparse, X_sparse_rdd = self.make_sparse_rdd()
        Z_rdd = DictRDD([X_sparse_rdd, X_dense_rdd], columns=('X', 'Y'))

        result_local = local.fit_transform(X_dense)
        result_dist = dist.fit_transform(X_dense_rdd)
        assert_true(check_rdd_dtype(result_dist, (np.ndarray,)))
        assert_array_almost_equal(result_local, result_dist.toarray())

        result_local = local.fit_transform(X_sparse)
        result_dist = dist.fit_transform(X_sparse_rdd)
        assert_true(check_rdd_dtype(result_dist, (sp.spmatrix,)))
        assert_array_almost_equal(result_local.toarray(),
                                  result_dist.toarray())

        result_dist = dist.fit_transform(Z_rdd)[:, 'X']
        assert_true(check_rdd_dtype(result_dist, (sp.spmatrix,)))
        assert_array_almost_equal(result_local.toarray(),
                                  result_dist.toarray())
def getDataRisk(filepath, description, remove2gram):
   dictByUsers = load_object(filepath, "smoking_1_analytic_data_mapreduce.pkl")
   transformer = TfidfTransformer(use_idf=False)
   if dictByUsers[0] is not None:
      varSelector1 = VarianceThreshold(threshold=0.001)
      if remove2gram:
         mattmp0, mattmp1, tmp = rmv2gram(dictByUsers)
         sparseArrayVarianceFilter1 = varSelector1.fit_transform(mattmp0)
         sparseArrayVarianceFilter4 = varSelector1.fit_transform(mattmp1)
      else:
         sparseArrayVarianceFilter1 = varSelector1.fit_transform(dictByUsers[0])
         sparseArrayVarianceFilter4 = varSelector1.fit_transform(dictByUsers[1])
      transformer = TfidfTransformer(use_idf=False)
      sparseArrayRowNorm1 = transformer.fit_transform(sparseArrayVarianceFilter1)
      sparseArrayRowNorm4 = transformer.fit_transform(sparseArrayVarianceFilter4)
      y_all=np.array(dictByUsers[4])
      sparseArrayRowNorm = [hstack([sparseArrayRowNorm1,sparseArrayRowNorm4],format='csr')]
   if description:
      dictByUsers2 = load_object(filepath+"description_data/", "smoking_1_analytic_data_mapreduce.pkl")
      if dictByUsers2[0] is not None:
         varSelector1 = VarianceThreshold(threshold=0.001)
         if remove2gram:
            mattmp0, mattmp1, tmp = rmv2gram(dictByUsers2)
            mat1=matchDict(dictByUsers[0], mattmp0, dictByUsers[3], dictByUsers2[3])
            mat4=matchDict(dictByUsers[1], mattmp1, dictByUsers[3], dictByUsers2[3])
            del mattmp0
            del mattmp1
         else:
            mat1=matchDict(dictByUsers[0], dictByUsers2[0], dictByUsers[3], dictByUsers2[3])
            mat4=matchDict(dictByUsers[1], dictByUsers2[1], dictByUsers[3], dictByUsers2[3])
         sparseArrayVarianceFilter1 = varSelector1.fit_transform(mat1)
         sparseArrayVarianceFilter4 = varSelector1.fit_transform(mat4)
         sparseArrayRowNorm1_2=transformer.fit_transform(sparseArrayVarianceFilter1)
         sparseArrayRowNorm4_2=transformer.fit_transform(sparseArrayVarianceFilter4)
         sparseArrayRowNorm= sparseArrayRowNorm + [sparseArrayRowNorm1_2,sparseArrayRowNorm4_2]
   return processYX(y_all, sparseArrayRowNorm)
def test_variancethreshold_vs_sklearn():
    dataset = fetch_data()
    trajectories = dataset["trajectories"]

    fs = FeatureSelector(FEATS)

    vt = VarianceThreshold(0.1)
    vtr = VarianceThresholdR(0.1)

    y = fs.partial_transform(trajectories[0])

    z1 = vt.fit_transform([y])[0]
    z_ref1 = vtr.fit_transform(y)

    np.testing.assert_array_almost_equal(z_ref1, z1)
Example #31
def problem3_3_2(data):
    selector = VarianceThreshold(threshold=(.8 * (1 - .8)))
    selector.fit_transform(data)
    newdata = data.loc[:, selector.get_support()]
    return newdata.columns, newdata
Example #32
PDXC = PDXC.drop_duplicates(keep='last')
PDXC = pd.DataFrame.transpose(PDXC)
PDXC = PDXC.loc[:,~PDXC.columns.duplicated()]

GDSCM = pd.read_csv("GDSC_mutations.Gemcitabine.tsv", 
                    sep = "\t", index_col=0, decimal = ",")
GDSCM = pd.DataFrame.transpose(GDSCM)


GDSCC = pd.read_csv("GDSC_CNV.Gemcitabine.tsv", 
                    sep = "\t", index_col=0, decimal = ",")
GDSCC = GDSCC.drop_duplicates(keep='last')
GDSCC = pd.DataFrame.transpose(GDSCC)

selector = VarianceThreshold(0.05)
selector.fit_transform(GDSCE)
GDSCE = GDSCE[GDSCE.columns[selector.get_support(indices=True)]]

PDXC = PDXC.fillna(0)
PDXC[PDXC != 0.0] = 1
PDXM = PDXM.fillna(0)
PDXM[PDXM != 0.0] = 1
GDSCM = GDSCM.fillna(0)
GDSCM[GDSCM != 0.0] = 1
GDSCC = GDSCC.fillna(0)
GDSCC[GDSCC != 0.0] = 1

ls = GDSCE.columns.intersection(GDSCM.columns)
ls = ls.intersection(GDSCC.columns)
ls = ls.intersection(PDXE.columns)
ls = ls.intersection(PDXM.columns)
Example #33
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.feature_selection import VarianceThreshold

data = pd.read_csv(open("E:/ML/feature_selection _methods/santander-train.csv",
                        'rb'),
                   nrows=20000)

#data.head

X = data.iloc[:, :-1]

y = data.iloc[:, -1]

c_f = VarianceThreshold(threshold=0.01)

X_c_f = c_f.fit_transform(X)

X_c_f_T = X_c_f.T

X_c_f_T = pd.DataFrame(X_c_f_T)

a = X_c_f_T.duplicated().sum()

duplicated_features = X_c_f_T.duplicated()

features_to_keep = [not index for index in duplicated_features]

X_c_f_u = X_c_f_T[features_to_keep].T

# =============================================================================
# now calculating roc and auc score
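# =============================================================================
# The excerpt stops here; a hedged continuation of the "roc and auc" step
# could look like this (RandomForestClassifier and the split parameters are
# assumptions, not part of the original snippet).
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(X_c_f_u, y, test_size=0.2, random_state=0)
model = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_tr, y_tr)
print("accuracy:", accuracy_score(y_te, model.predict(X_te)))
print("roc auc:", roc_auc_score(y_te, model.predict_proba(X_te)[:, 1]))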
Example #34
#    i += 1
#plt.savefig(graphsDir + 'HFCR Feature Selection - VarianceThreshold')


original_data = data.copy()

labels_t = []
values = []
count = 0
for t in threshold_list:
    subDir = graphsDir + 'Threshold = ' + str(t) + '/'
    if not os.path.exists(subDir):
        os.makedirs(subDir)
    labels_t.append(t)
    sel = VarianceThreshold(threshold=t)
    sel.fit_transform(original_data.values)
    f_to_accept = sel.get_support()
    new_features = []
    for i in range(len(f_to_accept)):
        if f_to_accept[i]:
            new_features.append(i)
    print('t = ' + str(t) + ' / n_features = ' + str(len(new_features)))
    features_file.write('t = ' + str(t) + ": " + str(new_features) + "\n")
    values.append(len(new_features))

    data = original_data.copy()[new_features]

    variables = data.columns.values
    eixo_x = 0
    eixo_y = 4
    eixo_z = 7
# In this section, we will be removing columns that have a low variance using VarianceThreshold().
# This is an example of applying the variance threshold. Note that our data is entirely numerical, so a
# threshold of 2 might be reasonable for a categorical variable, but it would be unreasonable for a numerical one!
from pandas import read_csv
from sklearn.feature_selection import VarianceThreshold
# define the location of the dataset
path = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/oil-spill.csv'
# load the dataset
df = read_csv(path, header=None)
# split data into inputs and outputs.
data = df.values
X = data[:, :-1]
y = data[:, -1]
print(X.shape, y.shape)
# Here we are defining the VarianceThreshold
transform = VarianceThreshold()
# We apply it to the X variable; we can see that one constant column is removed.
X_sel = transform.fit_transform(X)
print(X_sel.shape)
Example #36
        min_array = np.append(min_array, v_min)
    return max_array, min_array

files = [i for i in os.listdir("../data/mipas_pd")]
files = files[19:24]
for file in files:
    #SVM classifier only PC
    df_data = pd.read_hdf(os.path.join('../data/mipas_pd', file),'df_btd')

new_file = h5py.File("../data/csdb_new/csdb_complete.h5", "r")
btd_csdb = new_file["btd_complete"][:]
labels = new_file["labels"][:]
new_file.close()

selector = VarianceThreshold(threshold=100)
mipas_var_sel = selector.fit_transform(df_data.iloc[:, 0:10011])
ind = selector.get_support()

df_mipas_var_sel = pd.DataFrame(mipas_var_sel)
csdb_var_sel = btd_csdb[:, ind]
df_csdb_var_sel = pd.DataFrame(csdb_var_sel)
var_mipas = df_mipas_var_sel.var()
var_csdb = df_csdb_var_sel.var()
mean_mipas = df_mipas_var_sel.mean()
mean_csdb = df_csdb_var_sel.mean()

#instantiate an SVM classifier
clf = svm.SVC()
clf.fit(csdb_var_sel, labels.ravel())

files = [i for i in os.listdir("../data/mipas_pd")]
Example #37
# -*- coding: utf-8 -*-
# load the data
from sklearn.datasets import load_iris
iris = load_iris()
print("iris feature names\n", iris.feature_names)
print("iris feature matrix\n", iris.data)

# feature selection -- variance threshold method
from sklearn.feature_selection import VarianceThreshold
vt = VarianceThreshold(threshold=1)  # threshold is the variance cutoff
vt = vt.fit_transform(iris.data)  # the return value is the data after feature selection
print("features selected by the variance threshold method\n", vt)
Example #38
def varianceSelection(X, THRESHOLD=10):
    from sklearn.feature_selection import VarianceThreshold
    sel = VarianceThreshold(threshold=THRESHOLD)
    sel.fit_transform(X)
    return X[[c for (s, c) in zip(sel.get_support(), X.columns.values) if s]]
Example #39
def example():
    """
    ========
    Ensemble methods
    ========
    """
    # bagging
    # samples are drawn uniformly, each example gets equal weight, and predictions are averaged
    from sklearn.ensemble import BaggingClassifier
    from sklearn.neighbors import KNeighborsClassifier
    bagging = BaggingClassifier(KNeighborsClassifier(),
                                max_samples=0.5,
                                max_features=0.5)

    # random forest (bagging + decision trees, training samples drawn at random)
    from sklearn.ensemble import RandomForestClassifier
    X = [[0, 0], [1, 1]]
    Y = [0, 1]
    clf = RandomForestClassifier(n_estimators=10)
    clf = clf.fit(X, Y)

    # extremely randomized trees (each tree is trained on all samples, but split thresholds are chosen completely at random)
    from sklearn.model_selection import cross_val_score
    from sklearn.datasets import make_blobs
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.tree import DecisionTreeClassifier

    X, y = make_blobs(n_samples=10000,
                      n_features=10,
                      centers=100,
                      random_state=0)

    clf = DecisionTreeClassifier(max_depth=None,
                                 min_samples_split=2,
                                 random_state=0)
    scores = cross_val_score(clf, X, y, cv=5)
    print(scores.mean())

    clf = RandomForestClassifier(n_estimators=10,
                                 max_depth=None,
                                 min_samples_split=2,
                                 random_state=0)
    scores = cross_val_score(clf, X, y, cv=5)
    print(scores.mean())

    clf = ExtraTreesClassifier(n_estimators=10,
                               max_depth=None,
                               min_samples_split=2,
                               random_state=0)
    scores = cross_val_score(clf, X, y, cv=5)
    print(scores.mean())

    # #######################################################
    # boosting (usually a combination of weak classifiers; cannot be parallelized)
    # take part of the data as the first training set, then use the misclassified samples plus the
    # remaining data as the next training set, and repeat; better classifiers get larger weights

    # AdaBoost
    from sklearn.model_selection import cross_val_score
    from sklearn.datasets import load_iris
    from sklearn.ensemble import AdaBoostClassifier

    iris = load_iris()
    clf = AdaBoostClassifier(n_estimators=100)
    scores = cross_val_score(clf, iris.data, iris.target, cv=5)
    print(scores.mean())

    # GradientBoosting: gradient boosting (usually a combination of weak classifiers)
    # example: https://www.cnblogs.com/peizhe123/p/5086128.html
    from sklearn.datasets import make_hastie_10_2
    from sklearn.ensemble import GradientBoostingClassifier

    X, y = make_hastie_10_2(random_state=0)
    X_train, X_test = X[:2000], X[2000:]
    y_train, y_test = y[:2000], y[2000:]

    clf = GradientBoostingClassifier(n_estimators=100,
                                     learning_rate=1.0,
                                     max_depth=1,
                                     random_state=0).fit(X_train, y_train)
    print(clf.score(X_test, y_test))
    """
    =============
    Multi-label and multi-class classification
    =============
    """
    # multi-label format
    from sklearn.preprocessing import MultiLabelBinarizer
    y = [[2, 3, 4], [2], [0, 1, 3], [0, 1, 2, 3, 4], [0, 1, 2]]
    print(MultiLabelBinarizer().fit_transform(y))

    # 利用ovr,ovo多分类
    from sklearn import datasets
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.multiclass import OneVsOneClassifier
    from sklearn.svm import LinearSVC
    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    clf_ovr = OneVsRestClassifier(LinearSVC())
    print(clf_ovr.fit(X, y).predict(X))
    # 利用ovr可以进行多标签预测
    # also supports multilabel classification. To use this feature,
    # feed the classifier an indicator matrix, in which cell [i, j] indicates the presence of label j in sample i

    clf_ovo = OneVsOneClassifier(LinearSVC())
    print(clf_ovo.fit(X, y).predict(X))
    """
    =============
    Feature selection
    =============
    """

    # 1. removing low-variance features
    from sklearn.feature_selection import VarianceThreshold
    X = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]
    sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
    print(sel.fit_transform(X))

    # 2. univariate feature selection
    #   SelectKBest
    # 	SelectPercentile
    # 	SelectFpr, SelectFdr, SelectFwe
    from sklearn.datasets import load_iris
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import chi2
    iris = load_iris()
    X, y = iris.data, iris.target
    print(X.shape)

    X_new = SelectKBest(chi2, k=2).fit_transform(X, y)
    print(X_new.shape)

    # scores_ and pvalues_
    # the smaller the p-value, the stronger the evidence to reject the null hypothesis that the feature is unrelated to y
    skb = SelectKBest(chi2, k=2).fit(X, y)
    print(skb.scores_)
    print(skb.pvalues_)

    # source function
    """
    For regression: f_regression, mutual_info_regression
    For classification: chi2, f_classif, mutual_info_classif
    """

    # 3. recursive feature elimination
    from sklearn.svm import SVC
    from sklearn.datasets import load_digits
    from sklearn.feature_selection import RFE
    import matplotlib.pyplot as plt

    # Load the digits dataset
    digits = load_digits()
    X = digits.images.reshape((len(digits.images), -1))
    y = digits.target

    # Create the RFE object and rank each pixel
    svc = SVC(kernel="linear", C=1)
    rfe = RFE(estimator=svc, n_features_to_select=1, step=1)
    rfe.fit(X, y)
    ranking = rfe.ranking_.reshape(digits.images[0].shape)

    # Plot pixel ranking
    plt.matshow(ranking, cmap=plt.cm.Blues)
    plt.colorbar()
    plt.title("Ranking of pixels with RFE")
    plt.show()

    # 4.1 SelectFromModel
    from sklearn.svm import LinearSVC
    from sklearn.datasets import load_iris
    from sklearn.feature_selection import SelectFromModel
    iris = load_iris()
    X, y = iris.data, iris.target
    print(X.shape)

    lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
    model = SelectFromModel(lsvc, prefit=True)
    X_new = model.transform(X)
    print(X_new.shape)

    # 4.2 tree-based feature_importances_
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.datasets import load_iris
    from sklearn.feature_selection import SelectFromModel
    iris = load_iris()
    X, y = iris.data, iris.target
    print(X.shape)

    clf = ExtraTreesClassifier(n_estimators=50)
    clf = clf.fit(X, y)
    print(clf.feature_importances_)

    model = SelectFromModel(clf, prefit=True)
    X_new = model.transform(X)
    print(X_new.shape)
    """
    =============
    Neural networks
    =============
    """
    # neural network, with its main parameters
    from sklearn.neural_network import MLPClassifier
    X = [[0., 0.], [1., 1.]]
    y = [0, 1]
    clf = MLPClassifier(solver='lbfgs',
                        alpha=1e-5,
                        hidden_layer_sizes=(5, 2),
                        random_state=1)

    print(clf.fit(X, y))
    print(clf.predict([[2., 2.], [-1., -2.]]))
    print(clf.coefs_)
    print(clf.intercepts_)
    print(clf.loss_)
Example #40
    for row in readCSV:
        X.append(list(map(float, row[2:])))
        Y.append(float(row[1]))
csvfile.close()

with open('test_vale.csv') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    next(readCSV, None)  # skip the header
    for row in readCSV:
        X_test.append(list(map(float, row[1:])))
csvfile.close()
print("files loaded")

# simple variance based feature selection
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
Xselected = sel.fit_transform(X)
Xselected_test = sel.transform(X_test)  # reuse the selector fitted on the training data

# do SVM with degree d
clf = svm.SVC(kernel="poly", degree=3, cache_size=200
              )  #linear,poly,rbf,sigmoid, precomputed  kernel='poly',degree=3
clf.fit(Xselected, Y)
Y_test_predict = clf.predict(Xselected_test)

f = open('output_degree' + '.csv', 'w')
f.write("Id,y\n")
q = 2000
for v in Y_test_predict:
    st = str(q) + "," + str(int(round(v))) + "\n"
    f.write(st)
    #print(st)
    q += 1
f.close()
# Display new class counts
df_downsampled.click_out.value_counts()

########rachel copy ends here

#determining which columns are part of data and which is the prediction. Removed the user_id and session_id.
x = data.loc[:, 'mobile':'duration_sec']
y = data.loc[:, 'click_out']

#number of features before feature selection
len(x)
np.size(x, 1)

selector = VarianceThreshold(threshold=0.9)
x = selector.fit_transform(x)
x

#split data sets into training and testing
x = preprocessing.scale(x)
test_size = 0.5
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.5,
                                                    random_state=1)

##number of features after feature selection
len(x_train)
np.size(x_train, 1)

#choosing a K Value
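# A hedged sketch of the "choosing a K value" step announced above
# (KNeighborsClassifier and the scoring loop are assumptions, not part of
# the original excerpt).
from sklearn.neighbors import KNeighborsClassifier

k_scores = []
for k in range(1, 26):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train, y_train)
    k_scores.append((k, knn.score(x_test, y_test)))
best_k, best_score = max(k_scores, key=lambda pair: pair[1])
print("best k:", best_k, "accuracy:", best_score)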
Example #42
# GDSC_exprs_B = pd.DataFrame.transpose(GDSC_exprs_B)
# GSE55145_exprs_B = pd.DataFrame.transpose(GSE55145_exprs_B)
# GSE9782_exprs_B = pd.DataFrame.transpose(GSE9782_exprs_B)

GDSC_exprs_z = pd.DataFrame.transpose(GDSC_exprs_z)
GSE1_exprs_z = pd.DataFrame.transpose(GSE1_exprs_z)
GSE2_exprs_z = pd.DataFrame.transpose(GSE2_exprs_z)
GSE3_exprs_z = pd.DataFrame.transpose(GSE3_exprs_z)
GSE4_exprs_z = pd.DataFrame.transpose(GSE4_exprs_z)
TCGA_exprs_z = pd.DataFrame.transpose(TCGA_exprs_z)
#

# Remove genes with low signal (i.e. below the variance threshold) from expression data

selector = VarianceThreshold(0.05)
selector.fit_transform(GDSC_exprs_z)
GDSC_exprs_z = GDSC_exprs_z[GDSC_exprs_z.columns[selector.get_support(
    indices=True)]]
ls = GSE1_exprs_z.columns.intersection(GDSC_exprs_z.columns)
ls = ls.intersection(GSE2_exprs_z.columns)
ls = ls.intersection(GSE3_exprs_z.columns)
ls = ls.intersection(GSE4_exprs_z.columns)
ls = ls.intersection(TCGA_exprs_z.columns)
GSE1_exprs_z = GSE1_exprs_z.loc[:, ls]
GSE2_exprs_z = GSE2_exprs_z.loc[:, ls]
GSE3_exprs_z = GSE3_exprs_z.loc[:, ls]
GSE4_exprs_z = GSE4_exprs_z.loc[:, ls]
TCGA_exprs_z = TCGA_exprs_z.loc[:, ls]

# Obtain selected genes
GDSC_exprs_z_genes = list(GDSC_exprs_z.columns.values)
            seq[i] = replacement
            


train_data = np.loadtxt('Train_Data.csv', dtype=np.float32, delimiter=',')
train_labels = np.loadtxt('Train_Labels.csv', dtype=np.int32, delimiter=',')
test_data = np.loadtxt('Test_Data.csv', dtype=np.float32, delimiter=',')
test_labels = np.loadtxt('Test_Labels.csv', dtype=np.int32, delimiter=',')
class_names = ['1', '2', '3']


# Feature Selection
all_data = np.vstack((train_data,test_data))
all_data_labels=np.hstack((train_labels,test_labels))
sel = VarianceThreshold(threshold=0.90*(1-0.90))
all_data = sel.fit_transform(all_data)
all_data_size, _ = all_data.shape
_, feature_size = all_data.shape

clustering = AffinityPropagation(preference= -1200,damping=0.92).fit(all_data)



tmp = clustering.labels_
replace_all(tmp,0,10)
replace_all(tmp,1,20)
replace_all(tmp,2,30)


replace_all(tmp,10,1)
replace_all(tmp,20,3)
# COMMAND ----------

# MAGIC %md Create an instance of the `VarianceThreshold` class and store it in an object `selector`. Set threshold to default (`0`). 

# COMMAND ----------

selector = VarianceThreshold()

# COMMAND ----------

# MAGIC %md Fit the transformer class to the sample data array `X` and return a transformed version of `X`. The `fit` method records the variance of each feature from `X`. The `transform` method returns the selected features from `X`.

# COMMAND ----------

selector.fit_transform(X)

# COMMAND ----------

# MAGIC %md With the default setting for threshold, the two column features with variance above 0 are selected.

# COMMAND ----------

# MAGIC %md The transformer class `VarianceThreshold` has attribute `variances_`. Use it to see variances of individual features of `X`. They are the same as in the output of `np.var(X, axis=0)`. 

# COMMAND ----------

selector.variances_

# COMMAND ----------
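# COMMAND ----------

# MAGIC %md A hedged, self-contained sketch of the steps above (the sample array `X` is an assumption; the original notebook defines it in an earlier cell not shown in this excerpt).

# COMMAND ----------

import numpy as np
from sklearn.feature_selection import VarianceThreshold

X = np.array([[0, 2, 0, 3],
              [0, 1, 4, 3],
              [0, 1, 1, 3]])
selector = VarianceThreshold()
print(selector.fit_transform(X))  # only the two columns with non-zero variance remain
print(selector.variances_)        # identical to np.var(X, axis=0)
print(np.var(X, axis=0))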
Example #45
# load libraries
from sklearn import datasets
from sklearn.feature_selection import VarianceThreshold

# load the test data
iris = datasets.load_iris()

# create features and target
features = iris.data
target = iris.target

# create the thresholder
thresholder = VarianceThreshold(threshold=.5)

# create a feature matrix keeping only the high-variance features
features_high_variance = thresholder.fit_transform(features)

# show the high-variance feature matrix
features_high_variance[0:3]

##########

# show the variances
thresholder.fit(features).variances_

##########

# load library
from sklearn.preprocessing import StandardScaler

# standardize the feature matrix
Example #46
Created on Wed Dec  4 03:54:27 2019

@author: 43884
"""
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from imblearn.over_sampling import SMOTE

df = pd.read_excel('./processed_data2.xlsx').values


x = df[:, :-1]
labels = df[:, -1]
sel = VarianceThreshold(threshold=0.2)
x_transform = sel.fit_transform(x)
x_transform.shape
sel.get_support(indices = False)



x_train, x_rest, y_train, y_rest = train_test_split(df[:, :-1], df[:, -1], test_size = 0.4)
x_val, x_test, y_val, y_test = train_test_split(x_rest, y_rest, test_size = 0.5) 

#before oversampling
print(pd.Series(y_train).value_counts()/len(y_train))
smo = SMOTE(random_state=42)
x_smo, y_smo = smo.fit_sample(x_train, y_train)
#after oversampling
print(pd.Series(y_smo).value_counts()/len(y_smo))
Example #47
print("accuracy for test set is: ", accuracy_score(y_test, xgb_r_test))
xgb_imp_r = xgb_r.feature_importances_
plt.hist(xgb_imp_r, bins='auto')
plt.title("Histogram with 'feature_importances'")
plt.xscale('log')
plt.xlabel("Impotance")
plt.ylabel("Counts of Features")
plt.savefig('his_feature_importance.png')
plt.show()
plt.bar(range(len(xgb_imp_r)), xgb_imp_r, 2)
plt.yscale('symlog')
plt.show()
from sklearn.feature_selection import VarianceThreshold
# delete features that have same values in all class
sel = VarianceThreshold()  # can specify: 'threshold=(.8 * (1 - .8))'
x = sel.fit_transform(x_train_2)
x.shape
# Calculate pearson correlation Coefficient
pd_train = pd.DataFrame(x_train_2)
pd_test = pd.DataFrame(x_test_2)
pd_Test = pd.DataFrame(X_test_2)  # for final prediction, or 'X_test'
pd_Train = pd.DataFrame(X_train_2)  # for final prediction, or 'X_train'
pd_train_pear = pd_train.corr(method="pearson")
plt.figure(figsize=(30, 30))
sb.set(font_scale=0.7)
sb.heatmap(abs(pd_train_pear), cmap="YlGn", annot=False)
plt.title("A Quick Look at the Correlations among Predictors", fontsize=20)
plt.savefig('heatmap_pearson.pdf')
plt.show()
pd_train_1 = pd_train.copy()
pd_test_1 = pd_test.copy()
# Load library
from sklearn.feature_selection import VarianceThreshold
# Create feature matrix with:
# Feature 0: 80% class 0
# Feature 1: 80% class 1
# Feature 2: 60% class 0, 40% class 1
features = [[0, 1, 0],
            [0, 1, 1],
            [0, 1, 0],
            [0, 1, 1],
            [1, 0, 0]]

# Run threshold by variance
thresholder = VarianceThreshold(threshold=(.75 * (1 - .75)))
thresholder.fit_transform(features)
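# With threshold 0.75 * (1 - 0.75) = 0.1875, features 0 and 1 (variance
# 0.8 * 0.2 = 0.16) are dropped, and only feature 2 (variance 0.4 * 0.6 = 0.24) is kept.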
Example #49
    rus = RandomUnderSampler(random_state=seeds, replacement=True)
    X_train, y_train = rus.fit_sample(X_train, y_train)
    radioFeat_train = copy.deepcopy(X_train[:, :1692])
    clinical_semanticFeat_train = copy.deepcopy(X_train[:, 1692:])
    radioFeat_test = copy.deepcopy(X_test.iloc[:, :1692])
    clinical_semanticFeat_test = copy.deepcopy(X_test.iloc[:, 1692:])
    print('------------------ starting feature selection ---------------------')
    print('number of original radiomics features: {}'.format(radioFeat_train.shape[1]))
    print('number of original clinical_semantic features: {}'.format(
        clinical_semanticFeat_train.shape[1]))

    ################## variance-based feature selection ################
    from sklearn.feature_selection import VarianceThreshold  # import the relevant module
    vad = VarianceThreshold(
        threshold=0.01)  # removing features with low variance (below the threshold)
    radioFeat_train = vad.fit_transform(radioFeat_train)  # returns the matrix of selected features
    radioFeat_test = vad.transform(radioFeat_test)
    print('train_test_split_seed={}: number of radiomics features kept by the variance filter: {}'.format(
        seeds, radioFeat_train.shape[1]))

    ###################### scale the features to [-1, 1] #####################
    # max_abs_scaler = preprocessing.MaxAbsScaler()
    # max_abs_scaler.fit(xmantrain)
    # xabstrain = max_abs_scaler.transform(xmantrain)
    # xabstest = max_abs_scaler.transform(xmantest)
    ################## variance-based feature selection ################
    # from sklearn.feature_selection import VarianceThreshold  # import the relevant module
    # sel = VarianceThreshold(threshold=0.01)  # removing features with low variance (below the threshold)
    # ss = sel.fit(xmantrain)  # returns the fitted selector
    # xvartrain = sel.transform(xmantrain)
    # xvartest = sel.transform(xmantest)
Example #50
- Filter methods
- Wrapper methods
- Embedded methods

Filter methods fall broadly into three groups:
- based only on the feature values themselves
- based on the correlation between features
- based on statistical test scores

# feature values only
- zero variance => every sample has the same value => drop the feature

from sklearn.feature_selection import VarianceThreshold
X = desc_df.values
select = VarianceThreshold()
X_new = select.fit_transform(X)

np.array(descs)[select.get_support()==False]  # check which descriptors were removed

- near-zero variance => inspect the data carefully before deciding whether to drop

- features that are an exact duplicate of another feature


# correlation between features
Benefits:
- dropping one of each pair of highly correlated features lowers the dimensionality of the feature space with little impact on accuracy
- it also improves the interpretability of linear models

Pearson correlation coefficient (the ordinary correlation coefficient)
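A minimal sketch of the correlation filter described above, assuming desc_df is a pandas DataFrame of descriptors and numpy is available; the 0.95 cutoff is an illustrative assumption. It drops one feature of every pair whose absolute Pearson correlation exceeds the cutoff.

import numpy as np

corr = desc_df.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
to_drop = [col for col in upper.columns if (upper[col] > 0.95).any()]
desc_df_reduced = desc_df.drop(columns=to_drop)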
Example #51
print('First white-matter anatomy image (3D) is located at: %s' %
      oasis_dataset.white_matter_maps[0])  # 3D data

#############################################################################
# Preprocess data
# ----------------
nifti_masker = NiftiMasker(standardize=False,
                           smoothing_fwhm=2,
                           memory='nilearn_cache')  # cache options
gm_maps_masked = nifti_masker.fit_transform(gm_imgs_train)

# The features with too low between-subject variance are removed using
# :class:`sklearn.feature_selection.VarianceThreshold`.
from sklearn.feature_selection import VarianceThreshold
variance_threshold = VarianceThreshold(threshold=.01)
gm_maps_thresholded = variance_threshold.fit_transform(gm_maps_masked)
gm_maps_masked = variance_threshold.inverse_transform(gm_maps_thresholded)

# Then we convert the data back to the mask image in order to use it for
# decoding process
mask = nifti_masker.inverse_transform(variance_threshold.get_support())

############################################################################
# Prediction pipeline with ANOVA and SVR using
# :class:`nilearn.decoding.DecoderRegressor` Object

# In nilearn we can benefit from the built-in DecoderRegressor object to
# do ANOVA with SVR instead of manually defining the whole pipeline.
# This estimator also uses Cross Validation to select best models and ensemble
# them. Furthermore, you can pass n_jobs=<some_high_value> to the
# DecoderRegressor class to take advantage of a multi-core system.
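# A hedged sketch of the DecoderRegressor step described above (the target
# vector age_train and the parameter values are illustrative assumptions,
# not part of this excerpt).
from nilearn.decoding import DecoderRegressor

decoder = DecoderRegressor(estimator='svr', mask=mask,
                           scoring='neg_mean_absolute_error',
                           screening_percentile=1,
                           n_jobs=1)
decoder.fit(gm_imgs_train, age_train)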
def feature_selection_with_plots(X_train, y_train):
    # initial point
    print("Current N of features:", len(X_train.columns))
    print(" ")

    # removing features with zero variance
    print("\033[1m" + "Remove features with zero variance" + "\033[0m")
    selector = VarianceThreshold()
    selector.fit_transform(X_train)
    selected_columns = X_train.columns[(selector.get_support())]
    print("N of dropped columns:", len(set(X_train.columns) - set(selected_columns)))

    X_train = X_train[selected_columns]
    print("Current N of features:", len(X_train.columns))

    # Tree-based feature selection
    clf = ExtraTreesClassifier(n_estimators=50, random_state=333)
    clf = clf.fit(X_train, y_train)

    # feature importance
    feature_importance = clf.feature_importances_.ravel()
    feature_names = X_train.columns
    data_tuples = list(zip(feature_names, feature_importance))
    features = pd.DataFrame(data_tuples, columns=["feature_names", "feature_importance"])

    # plot top n features sorted by feature importance
    n = 30
    fe = features.sort_values(["feature_importance"], ascending=False).reset_index(drop=True)
    fe = fe.head(n)
    fe = fe.sort_values(["feature_importance"], ascending=True).reset_index(drop=True)

    fig = plt.figure(figsize = [12,7])
    ax = fig.add_axes([0,0,1,1])

    data = fe["feature_importance"].values
    names = fe["feature_names"].values
    y_pos = np.arange(len(names))

    plt.barh(y_pos, data, color = "darkgreen")
    plt.yticks(y_pos, names)

    plt.title("Top "+str(n)+ " features")
    plt.xlabel("feature importance")
    plt.ylabel("column name")
    plt.savefig("figures/Top"+str(n)+ "features.png", bbox_inches = "tight")
    plt.show()
    
    print("\033[1m" + "Tree-based feature selection" + "\033[0m")
    selector = SelectFromModel(clf, prefit=True)
    selected_columns = X_train.columns[(selector.get_support())]
    print("N of dropped columns:", len(set(X_train.columns) - set(selected_columns)))

    X_train = X_train[selected_columns]
    print("Current N of features:", len(X_train.columns))

    corr = abs(X_train.corr())

    plt.figure(figsize=(12,12))
    sns.heatmap(corr, square = True)
    plt.title("Correlation Matrix after tree-based feature selection", fontsize = 15)
    plt.savefig("figures/cm_after_1stFS.png", bbox_inches = "tight")
    plt.show()
    
    # drop columns that are highly correlated with each other and keep the one with the higher feature importance
    print("\033[1m" + "Drop highly correlated features" + "\033[0m")
    correlations = []
    feature_tuples = []
    for col in X_train.columns:
        for row in X_train.columns:
            correlation = corr.loc[row, col]
            if row == col:
                pass
            elif (col, row) in feature_tuples:
                pass
            elif correlation >= 0.7:
                correlations.append(correlation)
                feature_tuples.append((row, col))

    drop = []
    for tup in feature_tuples:
        f0 = tup[0]
        f1 = tup[1]
        imp_f0 = features[features["feature_names"] == f0]["feature_importance"].values
        imp_f1 = features[features["feature_names"] == f1]["feature_importance"].values
        if imp_f0 <= imp_f1:
            drop.append(f0)
        else:
            drop.append(f1)
    drop = set(drop)

    print("N of dropped features:", len(drop))

    selected_columns = list(set(X_train.columns) - set(drop))
    X_train = X_train[selected_columns]

    print("Current N of features:", len(X_train.columns))

    corr = abs(X_train.corr())
    plt.figure(figsize=(12,12))
    sns.heatmap(corr, square = True, annot = True, fmt = ".2")
    plt.title("Final Correlation Matrix", fontsize = 15)
    plt.savefig("figures/cm_after_2ndFS.png", bbox_inches = "tight")
    plt.show()
    
    return X_train
Beispiel #53
0
from sklearn.feature_selection import VarianceThreshold

def VarianceThreshold_demo():
    X = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]
    sel = VarianceThreshold(threshold=0.2)  # remove low-variance features
    y = sel.fit_transform(X)
    print(y)
def main():
    args = getOptions()
    print args
    if args.model == 'gBoosting':
        fn = ("submissionv4_%s_gBoosting_%s_%s_%s_%s_%s.csv" %
              (args.fts, args.loss, str(args.minsamplessplit), str(
                  args.lrate).replace('.', 'dian'), str(
                      args.nest), str(args.maxdepth)))
    elif args.model == 'randomForest':
        fn = ("submissionv4_%s_randomForest_%s.csv" % (args.fts, args.nest))
    print fn
    print "train file read"
    train_x, train_y = readfile_noid(args.train, 'train')
    train_x_new, id = extractID(train_x)
    train_x_clean, contentdict = cityclean(train_x_new)

    del id
    print "test file read"
    test_x, test_y = readfile_noid(args.test, 'test')
    test_x_new, id = extractID(test_x)
    test_x_clean, contentdict = cityclean(test_x_new, contentdict)
    del contentdict
    #remove features that show no variation and are of low importance
    print "remove features that show no variation and are of low importance"

    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_clean)
    test_x_uniq = sel.transform(test_x_clean)
    #     indices = [i for i in range(len(train_x[0]))]
    #     frqIndex = trimfrq(train_x)
    #     for i in frqIndex:
    #         indices.remove(i)
    #     train_x_uniq = indexTodata(train_x, indices)
    #     test_x_uniq = indexTodata(test_x, indices)

    #normalization
    print "normalization"
    train_x_nor, mean, std = normalize(train_x_uniq)
    test_x_nor, mean, std = normalize(test_x_uniq, mean, std)

    #feature selection
    print "feature selection"
    if args.fts == 'cor':
        train_x_sel, test_x_sel = correlationSelect(train_x_nor, train_y,
                                                    test_x_nor)
    elif args.fts == 'extraTrees':
        train_x_sel, test_x_sel = ExtraTreesSelect(train_x_nor, train_y,
                                                   test_x_nor)
    elif args.fts == 'randomTree':
        train_x_sel, test_x_sel = randomTreesSelect(train_x_nor, train_y,
                                                    test_x_nor)
    else:
        train_x_sel = copy.deepcopy(train_x_nor)
        test_x_sel = copy.deepcopy(test_x_nor)
    print len(train_x_nor[0])
    print len(train_x_sel[0])

    del train_x_nor, test_x_nor, train_x_uniq, test_x_uniq
    print "modelsing"
    if args.model == 'gBoosting':
        clf = GradientBoostingClassifier(
            loss=args.loss,
            learning_rate=args.lrate,
            n_estimators=args.nest,
            max_depth=args.maxdepth,
            min_samples_split=args.minsamplessplit,
            verbose=1)
    elif args.model == 'randomForest':
        clf = RandomForestClassifier(n_estimators=args.nest,
                                     class_weight='auto')
    clf.fit(train_x_sel, train_y)
    train_pdt = clf.predict(train_x_sel)
    MCC, Acc_p, Acc_n, Acc_all = get_Accs(train_y, train_pdt)
    print "MCC, Acc_p , Acc_n, Acc_all(train): "
    print "%s,%s,%s,%s" % (str(MCC), str(Acc_p), str(Acc_n), str(Acc_all))
    test_pdt = clf.predict_proba(test_x_sel)
    #     MCC, Acc_p , Acc_n, Acc_all = get_Accs(test_y, test_pdt)
    #     print "MCC, Acc_p , Acc_n, Acc_all(test): "
    #     print "%s,%s,%s,%s" % (str(MCC), str(Acc_p) , str(Acc_n), str(Acc_all))

    fout = open(fn, 'w')
    fout.write("ID,target\n")
    for index, eachline in enumerate(test_pdt):
        fout.write("%s,%s\n" % (str(int(id[index])), str(test_pdt[index][1])))
    fout.close()
Beispiel #55
0
    def var(self, threshold):
        sel = VarianceThreshold(threshold)
        self.X = sel.fit_transform(self.x)
        return self.X
Beispiel #56
0
    def randomForestClassifier(self,
                               train_cols,
                               test_cols,
                               targets,
                               feature_selction_var,
                               min_abundance_threshold,
                               shuffle=False):
        """ run random forest classification """
        from sklearn.ensemble import RandomForestClassifier
        #from sklearn.ensemble import RandomForestRegressor

        #train = self.abundance_df.loc[:,train_cols] #train.as_matrix(cols)
        train = self.abundance_df[
            self.abundance_df['masked'] ==
            False].loc[:, train_cols]  #train.as_matrix(cols)
        #test = self.abundance_df.loc[:,test_cols] #.as_matrix(test_cols)
        test = self.abundance_df[self.abundance_df['masked'] ==
                                 False].loc[:,
                                            test_cols]  #.as_matrix(test_cols)
        #names = list(self.abundance_df.loc[:, 'species'])
        names = list(self.abundance_df[self.abundance_df['masked'] ==
                                       False].loc[:, 'species'])

        #most_common_species_set = set()
        #for col in train_cols:
        #    sorted_series = self.abundance_df.loc[:, col].sort_values(ascending=False)[:100]
        #    most_common_species_set |= set(list(sorted_series.index))
        #most_common_species_list = []
        #for id0 in most_common_species_set:
        #    #print(max(self.abundance_df.loc[id0,train_cols]))
        #    if max(self.abundance_df.loc[id0,train_cols]) >= min_abundance_threshold:
        #        most_common_species_list.append(id0)
        ##print(len(most_common_species_list))
        #most_common_species_set = set(most_common_species_list)
        #train = train.loc[list(most_common_species_set),:]
        #test = test.loc[list(most_common_species_set),:]
        #names = list(self.abundance_df.loc[list(most_common_species_set),'species'])

        #feature selection by variance
        from sklearn.feature_selection import VarianceThreshold
        sel = VarianceThreshold(threshold=(0.999 * (1 - 0.999)))
        if feature_selction_var:
            #ds1 = np.transpose(ds10.as_matrix())
            #ds1 = sel.fit_transform(np.transpose(ds10.as_matrix()))
            #ds2 = np.transpose(ds20.as_matrix())
            #train = sel.fit_transform(np.transpose(train.as_matrix()))
            train = sel.fit_transform(np.transpose(train.values))

            #names = list(self.abundance_df.loc[:, 'species'].as_matrix()[sel.get_support()])
            #names = list(self.abundance_df[self.abundance_df['masked']==False].loc[:, 'species'].as_matrix()[sel.get_support()])
            names = list(
                self.abundance_df[self.abundance_df['masked'] == False].
                loc[:, 'species'].values[sel.get_support()])
            #test = sel.fit_transform(np.transpose(test.as_matrix()))
            test = sel.fit_transform(np.transpose(test.values))
            ds10 = np.asmatrix(
                train)[[i for i, j in enumerate(targets) if j == 0], :]
            ds1 = np.transpose(sel.fit_transform(np.transpose(ds10)))
        else:

            #train = np.transpose(train.as_matrix())
            train = np.transpose(train.values)
            #test = np.transpose(test.as_matrix())
            test = np.transpose(test.values)
            # `train` is already a NumPy array here, so select the group-0
            # samples (rows after the transpose) directly; .iloc would fail
            ds1 = train[[i for i, j in enumerate(targets) if j == 0], :]

        if shuffle == 'index':
            from random import shuffle
            shuffle(names)

        #rf = RandomForestClassifier(n_estimators=10)
        target = targets
        #group1 = list(self.abundance_df.loc[:,train_cols].columns[:target.count(0)])
        group1 = list(
            self.abundance_df[self.abundance_df['masked'] ==
                              False].loc[:,
                                         train_cols].columns[:target.count(0)])
        #group2 = list(self.abundance_df.loc[:,train_cols].columns[target.count(0):])
        group2 = list(
            self.abundance_df[self.abundance_df['masked'] ==
                              False].loc[:,
                                         train_cols].columns[target.count(0):])

        #rf = RandomForestRegressor(n_estimators=1000)#, class_weight="balanced")
        rf = RandomForestClassifier(n_estimators=1000)  # bootstrap=False
        #, max_features=100)#, min_sample_leaf=50)
        #rf = RandomForestRegressor(n_estimators=20, max_features=2)
        #class_weight="balanced" #{class_label: weight}
        #n_estimators=1000,
        rf.fit(train, target)

        #from sklearn.metrics import roc_auc_score
        #for l in leaf:
        #model = RandomForestRegressor(min_samples_split=2, max_depth=None, bootstrap=False, min_samples_leaf=2)
        #    #n_estimator=200, oob_score=True, min_samples_leaf=10,max_features=f,
        #model.fit(train,target)
        #    #print("AUC - ROC : ")
        #    #print(roc_auc_score(target,model.oob_prediction_))
        #    #print(model.feature_importances_)

        #from sklearn.ensemble import ExtraTreesClassifier
        #model = ExtraTreesClassifier()
        #model.fit(train, target)

        from treeinterpreter import treeinterpreter as ti
        prediction, bias, contributions = ti.predict(rf, np.array(train))

        #for i in range(len(train)):
        #    j = 0
        # #   print(i)
        #    #print("\tBias (trainset mean)")
        #    #print(bias[i])
        # #   print(contributions[0][0])
        #    #for c, feature in sorted(zip(contributions[i],
        #    #                             names),
        #    #                            #self.abundance_df.index),
        #    #                         key=lambda x: -abs(x[0])):
        #    for c, feature in zip(contributions[i], list(self.abundance_df.index)):
        #        if c[0] != 0:
        #        #print feature, ':\t', "{:.2e}".format(c), '\t', self.abundance_df.loc[feature, 'species']
        #            if j <10:
        #  #              print()'\t' + self.abundance_df.loc[feature, 'species'], '\t', "{:.2e}".format(c[0]))
        #                j += 1
        totalc = np.mean(contributions, axis=0)

        #from sklearn import model_selection
        #from sklearn.model_selection import cross_val_score
        #clf = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
        #scores = cross_val_score(clf, X, y)

        ##compare 2 groups of samples
        prediction1, bias1, contributions1 = ti.predict(rf, np.array(ds1))

        mean_contri = [0 for i in range(len(names))]
        for s in range(len(ds1)):
            for i in range(len(names)):
                mean_contri[i] += contributions1[s][i][0]
        mean_contri = [x / len(ds1) for x in mean_contri]

        names_list = []
        #for c, org in sorted(zip(mean_contri, list(self.abundance_df.loc[:,'species'])), reverse=True):
        for c, org in sorted(zip(mean_contri, names), reverse=True):
            if c != 0:
                #print(self.abundance_df.loc[i,group1])
                #idx = self.abundance_df[self.abundance_df['species'] == org].index.tolist()[0]
                idx = self.abundance_df[self.abundance_df['masked'] == False][
                    self.abundance_df['species'] == org].index.tolist()[0]
                if shuffle:
                    #print(names.index(org))
                    #idx = list(self.abundance_df.index)[names.index(org)]
                    idx = list(
                        self.abundance_df[self.abundance_df['masked'] ==
                                          False].index)[names.index(org)]
                #maximum = max(self.abundance_df.loc[idx,group1 + group2])
                maximum = max(self.abundance_df[self.abundance_df['masked'] ==
                                                False].loc[idx,
                                                           group1 + group2])
                #print(str(round(c, 3)) + '\t' + org + '\t' + str(round(maximum,3)))
                names_list.append([round(c, 3), org, round(maximum, 3)])

        return names_list
def do_gleason(t_data, filenames, mode):
	# FEATURE SELECTION
	# Select K-Best Features, Scale, VarianceThreshold and Select From RF Model
	kbest = SelectKBest(score_func=chi2, k=15000)
	scaler = StandardScaler()
	thresholding = VarianceThreshold()
	fs_data = []
	for i, d in enumerate(t_data):
		print("\nFILENAME: {}".format(filenames[i]))
		t_rows = list(d.index)
		t_columns = d.columns[:-3]
		# K-best
		selector = kbest.fit(d.iloc[:, :-3], d.iloc[:, -2])
		t_columns = t_columns[selector.get_support()]
		fs_data.append(pd.DataFrame(selector.transform(t_data[i].iloc[:, :-3]), columns = t_columns, index=t_rows))
		if mode == 'show':
			print("Selecting k best features -\n", fs_data[i].head())
		# Scale 
		t_columns = fs_data[i].columns
		fs_data[i] = pd.DataFrame(scaler.fit_transform(fs_data[i]), columns=t_columns, index=t_rows)
		if mode == 'show':
			print("Scaling data -\n", fs_data[i].head())
		# Variance Threshold
		fs_data[i] = pd.DataFrame(thresholding.fit_transform(fs_data[i]), columns=t_columns, index=t_rows)
		if mode == 'show':
			print("After variance thresholding -\n", fs_data[i].head())
		# Select from RF
		classifier = RandomForestClassifier(n_estimators=1)
		classifier = classifier.fit(fs_data[i], d['Gleason'])
		selector = SelectFromModel(classifier, prefit=True)
		t_columns = t_columns[selector.get_support()]
		fs_data[i] = pd.DataFrame(selector.transform(fs_data[i]), columns=t_columns, index=t_rows)
		fs_data[i]['Gleason'] = d['Gleason']
		if mode == 'show':
			print("Selecting data from RF model -\n", fs_data[i].head())
		print("Shape after feature selection: {}".format(fs_data[i].shape), end="\n\n")
	# RESAMPLING data - SMOTEENN
	balanced_data = [[] for _ in range(2)]
	for i, d in enumerate(fs_data):
		sme = SMOTEENN(random_state=42, smote=SMOTE(random_state=42, k_neighbors=1))
		x, y = sme.fit_resample(fs_data[i], t_data[i]['Gleason'])
		# x are the features and y are the targets
		balanced_data[i].append(x)
		balanced_data[i].append(y)
		if mode == 'show':
			print("FILENAME: {}".format(filenames[i]), Counter(balanced_data[i][1]))
	# DIMENSIONALITY REDUCTION
	# Kernel PCA and LDA (can be toggled on or off)
	pca = False
	pca_dim = 31
	lda = True
	lda_dim = 3
	if pca or lda:
		dr_data = []
		for i in range(len(filenames)):
			print("\nFILENAME: {}".format(filenames[i]))
			if pca:
				decomposer = KernelPCA(n_components=pca_dim, kernel='rbf', gamma=0.05, degree=7)
				dr_data.append(decomposer.fit_transform(balanced_data[i][0]))
				print("Shape and type after PCA: ", dr_data[i].shape, type(dr_data[i]))
			else:
				dr_data.append(balanced_data[i][0])
			if lda:
				decomposer = LinearDiscriminantAnalysis(n_components=lda_dim)
				dr_data[i] = decomposer.fit_transform(dr_data[i], balanced_data[i][1])
				print("Shape and type after LDA: ", dr_data[i].shape, type(dr_data[i]))
	else:
		# no dimensionality reduction requested; pass the resampled features through unchanged
		dr_data = [balanced_data[0][0], balanced_data[1][0]]
	# CLASSIFICATION
	splits = 10
	seed = 7
	kfold = KFold(n_splits=splits, random_state=seed, shuffle=True)
	results = {'SVM': [],
				'RF': [],
				'KNN': [],
				'NB': []
				}
	for i, d in enumerate(dr_data):
		# SVM
		res = []
		classifier = SVC(gamma='auto')
		results['SVM'].append(cross_val_score(classifier, pd.DataFrame(dr_data[i]), balanced_data[i][1], cv=kfold))
		results['SVM'][i] = results['SVM'][i].mean()
		# RF
		# rf = RandomForestClassifier(n_estimators=100,n_jobs=-1,max_depth=10,max_features='auto')
		classifier = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=7, max_features='auto', criterion='gini') #, n_jobs=-1
		results['RF'].append(cross_val_score(classifier, pd.DataFrame(dr_data[i]), balanced_data[i][1], cv=kfold))
		results['RF'][i] = results['RF'][i].mean()
		# KNN
		k_scores = []
		for n in range(1, 16):
			knn = KNeighborsClassifier(n_neighbors=n)  # use the loop's n rather than a fixed value
			scores = (cross_val_score(knn, pd.DataFrame(dr_data[i]), balanced_data[i][1], cv=kfold))
			k_scores.append(scores.mean())
		results['KNN'].append(max(k_scores))
		# NB
		nb = GaussianNB()
		results['NB'].append(cross_val_score(nb, pd.DataFrame(dr_data[i]), balanced_data[i][1], cv=kfold))
		results['NB'][i] = results['NB'][i].mean()
	print("\nFinal Results for datasets: {0}, {1} -".format(filenames[0], filenames[1]))
	pprint(results)
	# PLOTTING
	# PCA
	pca = PCA(n_components = 3)
	x_pca = pca.fit_transform(balanced_data[0][0])
	fig = plt.figure(figsize=(13, 7))
	plt.suptitle("3-D plot for resampled data using dimesnionality reduction (Gleason Score)\n\n")
	ax = fig.add_subplot(121, projection='3d')
	ax.set_title("PCA\n\n")
	ax.view_init(elev=177,azim=-96)
	for i in range(len(balanced_data[0][1])):
		if balanced_data[0][1][i] == 6:
			six = ax.scatter(x_pca[i][0], x_pca[i][1], x_pca[i][2], c='y', label=balanced_data[0][1][i])
		elif balanced_data[0][1][i] == 7:
			seven = ax.scatter(x_pca[i][0], x_pca[i][1], x_pca[i][2], c='g', label=balanced_data[0][1][i])
		elif balanced_data[0][1][i] == 8:
			eight = ax.scatter(x_pca[i][0], x_pca[i][1], x_pca[i][2], c='b', label=balanced_data[0][1][i])
		elif balanced_data[0][1][i] == 9:
			nine = ax.scatter(x_pca[i][0], x_pca[i][1], x_pca[i][2], c='r', label=balanced_data[0][1][i])
		elif balanced_data[0][1][i] == 10:
			ten = ax.scatter(x_pca[i][0], x_pca[i][1], x_pca[i][2], c='m', label=balanced_data[0][1][i])
	plt.legend((six, seven, eight, nine, ten),
		('6', '7', '8','9','10'),
		scatterpoints=1,
		loc='upper right',
		ncol=1,
		fontsize=10)
	# PCA + LDA
	pca = PCA(n_components = 10)
	x_pca = pca.fit_transform(balanced_data[0][0])
	lda = LinearDiscriminantAnalysis(n_components = 3)
	x_lda = lda.fit_transform(x_pca, balanced_data[0][1])
	ax = fig.add_subplot(122, projection='3d')
	plt.title("PCA & LDA\n\n")
	ax.view_init(elev=10,azim=-112)
	for i in range(len(balanced_data[0][1])):
		if balanced_data[0][1][i] == 6:
			six = ax.scatter(x_lda[i][0], x_lda[i][1], x_lda[i][2], c='y', label=balanced_data[0][1][i])
		elif balanced_data[0][1][i] == 7:
			seven = ax.scatter(x_lda[i][0], x_lda[i][1], x_lda[i][2], c='g', label=balanced_data[0][1][i])
		elif balanced_data[0][1][i] == 8:
			eight = ax.scatter(x_lda[i][0], x_lda[i][1], x_lda[i][2], c='b', label=balanced_data[0][1][i])
		elif balanced_data[0][1][i] == 9:
			nine = ax.scatter(x_lda[i][0], x_lda[i][1], x_lda[i][2], c='r', label=balanced_data[0][1][i])
		elif balanced_data[0][1][i] == 10:
			ten = ax.scatter(x_lda[i][0], x_lda[i][1], x_lda[i][2], c='m', label=balanced_data[0][1][i])
	plt.legend((six, seven, eight, nine, ten),
				('6', '7', '8','9','10'),
				scatterpoints=1,
				loc='upper right',
				ncol=1,
				fontsize=10)
	#plt.show()
	return results
Beispiel #58
0
from sklearn.datasets import load_iris
import algorithm_1
import numpy as np
from sklearn.feature_selection import VarianceThreshold

if __name__ == '__main__':
    from datetime import datetime
    startTime = datetime.now()
    data = load_iris()
    X = data.data
    y = data.target

    for kNumOfNeighbors in range(15, 16):
        weightMat = algorithm_1.buildWeightMat(X, [], kNumOfNeighbors)
        #        print datetime.now() - startTime
        diagleMat = algorithm_1.buildDiagleMat(weightMat)
        laplaMat = algorithm_1.buildLaplacianMat(weightMat, diagleMat)

        featureRowMat = np.transpose(X)
        listLaplacianScore = [
            algorithm_1.computeLaplacianScore(np.transpose([featureVec]),
                                              laplaMat, diagleMat)
            for featureVec in featureRowMat
        ]
        algorithm_1.saveSortedLaplaFeatureIndexes(listLaplacianScore,
                                                  filename=("aaa"))

    #variance
    sel = VarianceThreshold(threshold=(.9 * (1 - .9)))
    new_X = sel.fit_transform(X)
Beispiel #59
0
def variance(obs, num_obs):
    # fit on the transposed observations and return the indices of the
    # num_obs highest-variance features
    selector = VarianceThreshold()
    selector.fit(obs.transpose())
    indices = np.argsort(selector.variances_)[-num_obs:]
    return indices
            fmri_data, df_data = utils.groupby_average(fmri_data,
                                                       df_data.reset_index(),
                                                       groupby=['id'])
            df_data = df_data.reset_index()

        # something we need for defining the cross validation method
        BOLD = fmri_data.copy()
        targets = np.array(
            [label_map[item] for item in df_data['targets'].values])
        groups = df_data['words'].values

        # to remove the low variant voxels and standardize the BOLD signal
        from sklearn.feature_selection import VarianceThreshold
        from sklearn.preprocessing import StandardScaler
        variance_threshold = VarianceThreshold()
        BOLD = variance_threshold.fit_transform(BOLD)
        scaler = StandardScaler()
        BOLD = scaler.fit_transform(BOLD)

        # word embedding
        # convert the words into embedding features
        for word2vec_vec, word2vec_name in zip(word2vec_vecs, word2vec_names):
            csv_filename = os.path.join(
                saving_dir,
                '{} {} {} {} {} {}.csv'.format(experiment, here, sub_name,
                                               roi_name, condition,
                                               word2vec_name))
            processed = glob(os.path.join(saving_dir, '*.csv'))
            if csv_filename in processed:  # don't repeat what has already been done
                print(csv_filename)
                pass