Beispiel #1
0
def FeatureImportance(X_train, X_test, y_train, y_test, n):
    """Reduce train/test to the features an Extra-Trees model deems important.

    Parameters
    ----------
    X_train, X_test : array-like feature matrices.
    y_train, y_test : training / test labels (y_test is accepted for
        signature compatibility but is no longer used — see below).
    n : int, number of trees (n_estimators).

    Returns
    -------
    (X_train_reduced, X_test_reduced, feature_importances_)
    """
    from sklearn.feature_selection import SelectFromModel

    # Fit the importance model on the TRAINING split only.  The original
    # called modeltrain.fit_transform(X_test, y_test), which refit the
    # selector on the test labels (data leakage) and also used an API that
    # was removed from scikit-learn (fit_transform on a classifier).
    modeltrain = ExtraTreesClassifier(n_estimators=n)
    modeltrain.fit(X_train, y_train)

    # SelectFromModel reproduces the old importance-threshold reduction.
    selector = SelectFromModel(modeltrain, prefit=True)
    return (selector.transform(X_train),
            selector.transform(X_test),
            modeltrain.feature_importances_)
 def getSelectedValues(self):
     """Scale the data, then keep only the features an Extra-Trees model
     considers important.

     Returns
     -------
     (train, trainLabels, test) — train and test feature-reduced with the
     same selector (fitted on the training data only).
     """
     (train, trainLabels, test) = self.getScaledValues()

     # NOTE(review): compute_importances was removed in scikit-learn 0.14+;
     # harmless only on the old version this snippet targets — confirm.
     selector = ExtraTreesClassifier(compute_importances=True, random_state=0)
     train = selector.fit_transform(train, trainLabels)
     # BUG FIX: the original returned BEFORE this line, so the test set was
     # never transformed (the transform was unreachable dead code).
     test = selector.transform(test)

     return (train, trainLabels, test)
Beispiel #3
0
def selecao_feature(X, y, resp1):
    """Dispatch one of three feature-selection strategies.

    Parameters
    ----------
    X, y : feature matrix and labels.
    resp1 : int
        1 -> Extra-Trees importance selection (SelectFromModel)
        2 -> L1-regularised LinearSVC selection (SelectFromModel)
        3 -> VarianceThreshold at 0.9 * (1 - 0.9)

    Returns
    -------
    (X_new, nomeFeature): the reduced matrix and the method's display name.

    Raises
    ------
    ValueError
        For any other resp1.  (The original fell through to the return and
        crashed with an UnboundLocalError on X_new instead.)
    """
    print(
        '\n********************************************************************'
    )
    print('Shape Entrada: ', X.shape)
    if resp1 == 1:
        clf = ExtraTreesClassifier(n_estimators=100).fit(X, y)
        model = SelectFromModel(clf, prefit=True)
        X_new = model.transform(X)
        print('Extra Trees - New Shape: ', X_new.shape)
        nomeFeature = 'Extra Trees'
    elif resp1 == 2:
        clf = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
        model = SelectFromModel(clf, prefit=True)
        X_new = model.transform(X)
        print('LinearSVC - New Shape: ', X_new.shape)
        nomeFeature = 'LinearSVC'
    elif resp1 == 3:
        # VarianceThreshold is unsupervised: y is not needed here.
        clf = VarianceThreshold(threshold=(.9 * (1 - .9)))
        X_new = clf.fit_transform(X)
        print('Variance Threshold - New Shape: ', X_new.shape)
        nomeFeature = 'Variance Threshold'
    else:
        raise ValueError('resp1 must be 1, 2 or 3, got %r' % (resp1,))
    return X_new, nomeFeature
Beispiel #4
0
def voting(peptide_predict_file,nucleotide_predict_file,effector_train,noneffector_train):
    """Majority-vote classification of candidate Type 6 effector sequences.

    Trains five classifiers (ANN, SVM, kNN, Naive Bayes, Random Forest) on
    the effector / non-effector training CSVs, then lets them vote on each
    sequence found in the prediction FASTA files.

    Parameters
    ----------
    peptide_predict_file : str
        FASTA file of peptide sequences to classify ('>' headers counted).
    nucleotide_predict_file : str
        FASTA file with the matching nucleotide sequences (passed through to
        the external featureextraction helper).
    effector_train, noneffector_train : str
        Header-less CSVs holding 1000-dimensional feature vectors for the
        positive (class 0) and negative (class 1) training examples.

    Side effects: prints a per-sequence verdict.  Returns None.
    """
    # Count the sequences to classify: one FASTA header ('>') per sequence.
    total = 0
    with open(peptide_predict_file) as f:
        for line in f:
            if line.find('>') == 0:
                total = total + 1

    print('Total number of sequences to be classified: ', total)

    import time
    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    start_time = time.perf_counter()
    import random
    import warnings
    import pandas
    import numpy as np
    from sklearn import svm
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
    from sklearn.model_selection import train_test_split, cross_val_score
    from sklearn.preprocessing import StandardScaler
    from sklearn.feature_selection import SelectFromModel
    from keras.models import Sequential
    from keras.layers import Dense
    from imblearn.over_sampling import BorderlineSMOTE

    warnings.filterwarnings("ignore")
    np.random.seed(123)
    # random.seed() returns None, so f is None; passing random_state=None
    # below preserves the original (non-deterministic) split behaviour.
    f = random.seed()

    # Feature vector(s) of the sequences to be predicted (external helper).
    featurevector = featureextraction(peptide_predict_file, nucleotide_predict_file, total)
    print(len(featurevector))

    # Training data: first 1000 columns of each CSV.
    dataframe = pandas.read_csv(effector_train, header=None, sep=',')
    eff = dataframe.values[:, 0:1000].astype(float)

    dataframe = pandas.read_csv(noneffector_train, header=None, sep=',')
    noneff = dataframe.values[:, 0:1000].astype(float)

    # Stack effectors (label 0) on top of non-effectors (label 1); this
    # replaces the original element-by-element copy loops.
    X = np.vstack((eff, noneff))
    Y = np.concatenate((np.zeros(eff.shape[0]), np.ones(noneff.shape[0])))

    print('Resampling the unbalanced data...')
    # SMOTE(kind='borderline1').fit_sample(...) is the pre-0.4 imblearn API;
    # BorderlineSMOTE().fit_resample(...) is the modern equivalent.
    X_resampled, Y_resampled = BorderlineSMOTE().fit_resample(X, Y)

    # Standardize features by removing the mean and scaling to unit variance.
    # NOTE(review): the original computed this scaling but kept training on
    # the UNSCALED X_resampled; that behaviour is preserved — confirm intent.
    scaler = StandardScaler().fit(X_resampled)
    X = scaler.transform(X_resampled)

    # Tree-based feature selection.  ExtraTreesClassifier.fit_transform()
    # was removed from scikit-learn; SelectFromModel gives the same
    # importance-threshold reduction, applied consistently to train data
    # and to the sequences being predicted.
    model = ExtraTreesClassifier()
    model.fit(X_resampled, Y_resampled)
    selector = SelectFromModel(model, prefit=True)
    X_resampled = selector.transform(X_resampled)
    featurevector = selector.transform(featurevector)
    newshape = X_resampled.shape

    print("Training Classifiers...")
    # Train/test split (random_state=None, as in the original).
    X_train, X_test, y_train, y_test = train_test_split(
        X_resampled, Y_resampled, test_size=0.15, random_state=f)

    # One-hot encode labels for the ANN: class 0 -> [1, 0], class 1 -> [0, 1].
    y_t = y_train
    y_te = y_test
    y_train = np.ones((len(y_t), 2))
    y_test = np.ones((len(y_te), 2))
    for i in range(len(y_t)):
        if y_t[i] == 0:
            y_train[i][1] = 0
        if y_t[i] == 1:
            y_train[i][0] = 0
    for i in range(len(y_te)):
        if y_te[i] == 0:
            y_test[i][1] = 0
        if y_te[i] == 1:
            y_test[i][0] = 0

    # ANN
    print("Training Artificial Neural Network...")
    model = Sequential()
    model.add(Dense(newshape[1] + 1, activation='relu', input_shape=(newshape[1],)))
    model.add(Dense(500, activation='relu'))
    model.add(Dense(250, activation='relu'))
    model.add(Dense(90, activation='relu'))
    # Output layer: one sigmoid unit per class.
    model.add(Dense(2, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['binary_accuracy'])
    model.fit(X_train, y_train, epochs=1000, batch_size=25, verbose=0)
    score = model.evaluate(X_test, y_test, verbose=0)
    # Only the prediction for the unknown sequences feeds the vote (the
    # original also predicted X_test and immediately discarded the result).
    ANN = model.predict(featurevector)

    # Back to integer labels for the scikit-learn classifiers.
    y_train = y_t
    y_test = y_te

    # SVM
    print("Training Support Vector Machine...")
    clf1 = svm.SVC(decision_function_shape='ovr', kernel='linear', max_iter=1000)
    clf1.fit(X_train, y_train)
    results = cross_val_score(clf1, X_test, y_test, cv=10)
    SVM = clf1.predict(featurevector)

    # KNN
    print("Training k-Nearest Neighbor ...")
    neigh = KNeighborsClassifier(n_neighbors=10)
    neigh.fit(X_train, y_train)
    results = cross_val_score(neigh, X_test, y_test, cv=10)
    KNN = neigh.predict(featurevector)

    # Naive Bayes (kept in the historically named variable DT).
    print("Training Naive Bayes...")
    clf = MultinomialNB()
    clf.fit(X_train, y_train)
    results = cross_val_score(clf, X_test, y_test, cv=10)
    DT = clf.predict(featurevector)

    # RandomForest
    print("Training Random Forest...")
    rf = RandomForestClassifier(random_state=0, min_samples_leaf=100)
    rf.fit(X_train, y_train)
    results = cross_val_score(rf, X_test, y_test, cv=10)
    # BUG FIX: the original called clf.predict() here, re-using the Naive
    # Bayes model, so the Random Forest vote duplicated the NB vote.
    RF = rf.predict(featurevector)

    # Majority vote: column 0 counts "effector" votes, column 1 counts
    # "non-effector" votes.
    vote_result = [[0, 0] for _ in range(len(SVM))]
    for i in range(len(ANN)):
        if round(ANN[i][0]) == 1.0:
            vote_result[i][0] = vote_result[i][0] + 1
        if round(ANN[i][1]) == 1.0:
            vote_result[i][1] = vote_result[i][1] + 1
        for pred in (SVM[i], KNN[i], DT[i], RF[i]):
            if pred == 0:
                vote_result[i][0] = vote_result[i][0] + 1
            if pred == 1:
                vote_result[i][1] = vote_result[i][1] + 1

    print('-----------------------Results-----------------------')
    for i in range(len(ANN)):
        if vote_result[i][0] >= vote_result[i][1]:
            print('Sequence ', i + 1, ' is a probable Type 6 Effector')
        else:
            print('Sequence ', i + 1, ' is not a Type 6 Effector')
    end_time = time.perf_counter()
    print('Execution time', (end_time - start_time))
import numpy as np
from sklearn import preprocessing as pp
# sklearn.cross_validation was removed in 0.20; model_selection is the
# drop-in replacement and keeps the `cv` alias used below.
from sklearn import model_selection as cv
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVC

workDir = r'C:\users\Akshay\Downloads\kaggle\\'

# Read data (genfromtxt accepts a path directly; the original opened file
# handles it never closed).
train = np.genfromtxt(workDir + 'train.csv', delimiter=',')
target = np.genfromtxt(workDir + 'trainLabels.csv', delimiter=',')
test = np.genfromtxt(workDir + 'test.csv', delimiter=',')

# Scale data
train = pp.scale(train)
test = pp.scale(test)

# Select features.  compute_importances=True and fit_transform() on a
# classifier were removed from scikit-learn; fit + SelectFromModel is the
# supported equivalent, fitted on the training data only.
selector = ExtraTreesClassifier(random_state=0)
selector.fit(train, target)
reducer = SelectFromModel(selector, prefit=True)
train = reducer.transform(train)
test = reducer.transform(test)

# Estimate score via 30-fold cross-validation of an RBF SVC.
classifier = SVC(C=8, gamma=0.17)
scores = cv.cross_val_score(classifier, train, target, cv=30)
print('Estimated score: %0.5f (+/- %0.5f)' % (scores.mean(), scores.std() / 2))

# Predict and save
result = classifier.fit(train, target).predict(test)
np.savetxt(workDir + 'a.csv', result, fmt='%d')
	return [ds, passengerIds]

if __name__ == '__main__':
	# Imported locally so this fix is self-contained.
	from sklearn.feature_selection import SelectFromModel

	[train, trainPassengerIds] = cleanUpData('Data/train.csv')
	[test, testPassengerIds] = cleanUpData('Data/test.csv')

	# Split the Survived labels off the training features.
	target = train.filter(['Survived'])
	target = np.array(target.values).ravel()
	train = train.drop(['Survived'], axis=1)

	# Scale data
	train = pp.scale(train)
	test = pp.scale(test)

	# Select features.  compute_importances=True and fit_transform() on a
	# classifier were removed from scikit-learn; fit + SelectFromModel is
	# the supported equivalent, fitted on the training data only.
	selector = ExtraTreesClassifier(random_state=0)
	selector.fit(train, target)
	reducer = SelectFromModel(selector, prefit=True)
	train = reducer.transform(train)
	test = reducer.transform(test)

	# Estimate score via 30-fold cross-validation of an RBF SVC.
	classifier = SVC(C=8, gamma=0.17)
	scores = cv.cross_val_score(classifier, train, target, cv=30)
	print('Estimated score: %0.5f (+/- %0.5f)' % (scores.mean(), scores.std() / 2))

	# Predict and save
	result = classifier.fit(train, target).predict(test)
	submissionData = {'PassengerId': testPassengerIds, 'Survived': result}
	submissionDF = pd.DataFrame(submissionData)
	submissionDF.to_csv('Data/Titanic_Preprocess_XtraTrees_SVC.csv', index=False)
import numpy as np
import pandas as pd
from sklearn import preprocessing as pp
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

print("Preparing the data")
# pd.read_csv is the public spelling of pd.io.parsers.read_csv.
train = pd.read_csv(r"D:\shared\datascience\phy_train_clean.csv", sep=',', header=0)
test = pd.read_csv(r"D:\shared\datascience\phy_test_clean.csv", sep=',', header=0)

# First two columns are Id plus one dropped column; features start at col 2.
test_index = test.Id
test = test.iloc[:, 2:]

target = train.kind
train_index = train.Id
train = train.iloc[:, 2:]

print("Preparing an Feature classifier")
# compute_importances=True and fit_transform() on a classifier were removed
# from scikit-learn; fit + SelectFromModel is the supported equivalent and
# reuses the same fitted selector for train and test.
selector = ExtraTreesClassifier(random_state=0)
selector.fit(train, target)
reducer = SelectFromModel(selector, prefit=True)

print("Transforming the original dataset")
train = pd.DataFrame(reducer.transform(train), index=train_index)
test = pd.DataFrame(reducer.transform(test), index=test_index)
train['kind'] = target

print("Storing the data...")
train.to_csv(r"D:\shared\datascience\phy_train.csv", sep=',')
test.to_csv(r"D:\shared\datascience\phy_test.csv", sep=',')
print("Job finished")
Beispiel #8
0
def main():
	"""Train an Extra-Trees aesthetic-score classifier from the features DB.

	Reads feature records from Databases\\features.db, binarises the score
	around its mean (1 = below mean, 2 = otherwise), scales the features,
	cross-validates an ExtraTreesClassifier, and pickles (scaler, clf) to
	AestheticModel\\aestheticModel.pkl.

	Print statements are written in the single-argument print(...) form,
	which parses identically under Python 2 and Python 3.
	"""
	X = []
	Y = []
	featuresDB = Base(os.getcwd() + "\\Databases\\features.db")
	featuresDB.open()
	print("features open")

	# Collect the feature columns; f2, f8 and f9 are deliberately skipped
	# (they were never appended in the original either).
	for rec in featuresDB:
		vec = [rec.f1, rec.f3, rec.f4, rec.f5, rec.f6, rec.f7,
		       rec.f10, rec.f11, rec.f12, rec.f13, rec.f14, rec.f15,
		       rec.f16, rec.f17, rec.f18, rec.f19, rec.f20, rec.f21,
		       rec.f22, rec.f23]
		X.append(vec)
		Y.append(rec.score)
	print("building classifier")

	# Binarise the target around the mean score.
	Y = np.array(Y)
	ybar = Y.mean()
	for i in range(len(Y)):
		if Y[i] < ybar:
			Y[i] = 1
		else:
			Y[i] = 2

	scaler = Scaler().fit(X)
	X = scaler.transform(X)

	X = np.array(X)
	Y = np.array(Y)

	# NOTE(review): only the split from the LAST fold survives this loop;
	# behaviour preserved, but a single train_test_split would be clearer —
	# confirm intent.
	skf = cross_validation.StratifiedKFold(Y, k=2)
	for train, test in skf:
		X_train, X_test = X[train], X[test]
		y_train, y_test = Y[train], Y[test]

	# NOTE(review): min_split / compute_importances / fit_transform are
	# pre-0.14 scikit-learn APIs; this block targets that old version.
	clf = ExtraTreesClassifier(n_estimators=8, max_depth=None, min_split=1,
	                           random_state=0, compute_importances=True)
	scores = cross_validation.cross_val_score(clf, X_train, y_train, cv=5)

	clf.fit_transform(X_train, y_train)
	print("Accuracy: %0.4f (+/- %0.2f)" % (scores.mean(), scores.std() / 2))
	print(clf.feature_importances_)

	y_pred = clf.predict(X_test)
	print(classification_report(y_test, y_pred))

	model = (scaler, clf)
	joblib.dump(model, 'AestheticModel\\aestheticModel.pkl')

	print("Done")
Beispiel #9
0
print test_data_array.shape
file_label.close()

# normalize the features in the train and test dataset
train_data_array_norm = preprocessing.scale(train_data_array)
test_data_array_norm = preprocessing.scale(test_data_array)

# run the module of PCA
#pca = PCA(n_components = 10)
#train_data_array_norm_pca = pca.fit_transform(train_data_array_norm, train_result_array)
#test_data_array_norm_pca = pca.transform(test_data_array_norm)
#print 'train data shape', train_data_array_norm_pca.shape

# tree-based feature selection
classifier = ExtraTreesClassifier()
train_data_array_norm_pca = classifier.fit_transform(train_data_array_norm, np.ravel(train_result_array))
test_data_array_norm_pca = classifier.transform(test_data_array_norm)
print 'train data shape', train_data_array_norm_pca.shape


## build SVM
# random shuffle
np.random.seed(0)
indices = np.random.permutation(len(train_result_array))

classifer = svm.SVC(C=20, gamma = 0.05)

# cross validation
scores = cv.cross_val_score(classifier, train_data_array_norm_pca, np.ravel(train_result_array), cv = 30)

Beispiel #10
0
#print(model.feature_importances_)
#(c) Fit a 10-component non-negative matrix factorisation on the training
# set; the score() helper below uses it to measure reconstruction quality.
nmf = decomposition.NMF(n_components=10).fit(xTrain)


def score(model, data, score=metrics.explained_variance_score):
    """Round-trip reconstruction quality of *model* on *data*.

    Encodes *data* with model.transform, decodes it back with
    model.inverse_transform, and compares the reconstruction against the
    original using *score* (explained variance by default).
    """
    encoded = model.transform(data)
    reconstructed = model.inverse_transform(encoded)
    return score(data, reconstructed)


#print(score(nmf, xTrain))
#(d)
# Build three representations of the data: normalised, PCA, and NMF.
xTrainnormal = (transformer.transform(xTrain))
xTrainpca = pca.fit_transform(xTrain)
model = NMF(n_components=10, init='random', random_state=0)
xTrainnmf = model.fit_transform(xTrain)
xTestnormal = (transformer.transform(xTest))
# BUG FIX: the original REFIT both PCA and NMF on the test set
# (fit_transform), leaking test data and producing components that do not
# match the training representation; transform() reuses the fitted models.
xTestpca = pca.transform(xTest)
xTestnmf = model.transform(xTest)
clfnormal = LogisticRegression(penalty='none',
                               random_state=0,
                               solver='lbfgs',
                               max_iter=1000,
                               multi_class='multinomial').fit(
                                   xTrainnormal, yTrain)
# BUG FIX: the original called predict/predict_proba on an undefined name
# 'clf'; the classifier just fitted on the normalised data is 'clfnormal'.
clfnormal.predict(xTestnormal)
ypredict1 = clfnormal.predict_proba(xTestnormal)
clfpca = LogisticRegression(penalty='none',
                            random_state=0,
                            solver='lbfgs',
                            max_iter=1000,