Exemple #1
0
# Stratified k-fold splitter over the labels, shuffled before splitting.
# NOTE(review): passing the labels and `n_folds=` to the constructor and then
# iterating `cv` directly matches the legacy `sklearn.cross_validation` API --
# confirm the installed scikit-learn version before upgrading.
cv = StratifiedKFold(y_all, n_folds=folds, shuffle=True)
#cv_shufflesplit=cross_validation.ShuffleSplit(len(y_all),1,test_size=0.2,train_size=None, random_state=0)

# Alternative estimators that were tried:
#classifier = svm.SVC(kernel='linear', probability=True)
#classifier = RandomForestClassifierWithCoef(RandomForestClassifier)
# NOTE(review): if `knn` is scikit-learn's KNeighborsClassifier it exposes
# neither `coef_` nor `feature_importances_`, which RFECV needs to rank
# features -- verify this estimator actually works with the RFECV call below.
classifier = knn(n_neighbors=3)

# Accumulators initialised here; they are not filled in this part of the script.
all_indexes = []
index_list = []

y_test_report = []
y_predicted_report = []
y_proba_report = []

for i, (train, test) in enumerate(cv):
    # Normalise the train and test rows of this fold separately.
    # NOTE(review): the test rows are normalised by their own call rather than
    # with statistics taken from the train rows -- presumably intentional,
    # but confirm against cl.normalise_mean_var.
    normalized_matrix_train = cl.normalise_mean_var(all_feature_matrix[train])
    normalised_matrix_test = cl.normalise_mean_var(all_feature_matrix[test])

    y_predicted2 = []

    # Select features with RFECV using the training rows only.
    #rfe = RFE(estimator=classifier, cv=5,n_features_to_select=10,step=2)
    rfe = RFECV(estimator=classifier, cv=5, step=2, scoring='f1')
    print("going to select optimal features")
    rfe.fit(normalized_matrix_train, y_all[train])
    ranked_features = rfe.ranking_.tolist()
    #print("shape of train matrix after rfe.fit is: " +str(normalized_matrix_train.shape))

    # Column indices of the features ranked 1 (the ones RFECV kept).
    # Compared with `==`, not `is`: integer identity is an implementation
    # detail.  A dedicated loop name keeps the fold counter `i` intact.
    index = [
        feature_idx
        for feature_idx, rank in enumerate(ranked_features)
        if rank == 1
    ]
Exemple #2
0
# Class labels: one entry per recording name, stored as a NumPy array.
_labels = cl.generate_labels(rec_name_array)
y = np.array(_labels)
_label_message = "label array is: " + str(y)
print(_label_message)

# Turn the list of per-sample feature lists into a single matrix.  The row
# count is the number of samples; the column count is the largest vocabulary
# value plus one.
_n_rows = len(all_features)
_n_cols = max(global_vocab.values()) + 1
all_feature_matrix = cl.covert_array_to_matrix(all_features, _n_rows, _n_cols)

#print all_feature_matrix
#print ("type of all feature matrix  is: " + str(type(all_feature_matrix)))

#################### SEPARATING EVALUATION DATA #########################
# Hold out 20% of the samples for evaluation; the seed is fixed so the split
# is reproducible.
_split = cross_validation.train_test_split(
    all_feature_matrix,
    y,
    test_size=0.2,
    random_state=0,
)
X_cv, X_eval, y_cv, y_eval = _split

# Each partition is normalised by its own call.
# NOTE(review): the evaluation rows are not scaled with statistics from the
# CV rows -- confirm this is what cl.normalise_mean_var is meant to do here.
X_cv_normalized_matrix = cl.normalise_mean_var(X_cv)
X_eval_normalized_matrix = cl.normalise_mean_var(X_eval)

############## with normalisation ######################
# Classification
normalised = "  "
# Normalised copy of the complete feature matrix (all samples).
normalized_matrix = cl.normalise_mean_var(all_feature_matrix)
#print ("type of normalised_matrix is: " + str(type(normalized_matrix)))
#print("normalized matrix is: ")
#print normalized_matrix
#rw.write_value(normalized_matrix,output_folder,"all_features_normalized.txt",'w')

# Switch for choosing which matrix feeds the classifier (both disabled):
#feature_matrix=normalized_matrix;
#feature_matrix=all_feature_matrix
# Generate the class labels for every recording and print them.
y = np.array(
    cl.generate_labels(rec_name_array)
)
print("label array is: " + str(y))


# Convert the list of feature lists to a matrix with len(all_features) rows
# and (largest vocabulary value + 1) columns.
_matrix_shape = (len(all_features), max(global_vocab.values()) + 1)
all_feature_matrix = cl.covert_array_to_matrix(
    all_features, _matrix_shape[0], _matrix_shape[1]
)

#print all_feature_matrix
#print ("type of all feature matrix  is: " + str(type(all_feature_matrix)))


#################### SEPARATING EVALUATION DATA #########################
# 80/20 split with a fixed random_state, so the same rows are held out on
# every run.
X_cv, X_eval, y_cv, y_eval = cross_validation.train_test_split(
    all_feature_matrix,
    y,
    test_size=0.2,
    random_state=0,
)

# Normalise the cross-validation part, then the evaluation part.
# NOTE(review): the two parts are normalised by separate calls -- verify that
# the evaluation data should not reuse the cross-validation statistics.
X_cv_normalized_matrix = cl.normalise_mean_var(X_cv)
X_eval_normalized_matrix = cl.normalise_mean_var(X_eval)

############## with normalisation ######################
# Classification
normalised = "  "
# Whole-dataset normalisation of the feature matrix.
normalized_matrix = cl.normalise_mean_var(all_feature_matrix)
#print ("type of normalised_matrix is: " + str(type(normalized_matrix)))
#print("normalized matrix is: ")
#print normalized_matrix
#rw.write_value(normalized_matrix,output_folder,"all_features_normalized.txt",'w')

# Disabled selection of the matrix used downstream:
#feature_matrix=normalized_matrix;
#feature_matrix=all_feature_matrix
# Shuffled, stratified k-fold iterator built from the label vector.
# NOTE(review): this constructor form (labels + `n_folds=`) looks like the
# legacy `sklearn.cross_validation` API -- confirm before porting to
# `sklearn.model_selection`.
cv = StratifiedKFold(y_all, n_folds=folds, shuffle=True)
#cv_shufflesplit=cross_validation.ShuffleSplit(len(y_all),1,test_size=0.2,train_size=None, random_state=0)

# Other estimators that were experimented with:
#classifier = svm.SVC(kernel='linear', probability=True)
#classifier = RandomForestClassifierWithCoef(RandomForestClassifier)
# NOTE(review): RFECV ranks features via the estimator's `coef_` or
# `feature_importances_`; presumably `knn` is KNeighborsClassifier, which has
# neither -- verify that the fit below does not fail.
classifier = knn(n_neighbors=3)

# Result containers; nothing in this section appends to them.
all_indexes = []
index_list = []

y_test_report = []
y_predicted_report = []
y_proba_report = []

for i, (train, test) in enumerate(cv):
    ## prepare and normalize test train matrices
    # NOTE(review): train and test rows are each normalised by an independent
    # call -- confirm the test rows should not use the train statistics.
    normalized_matrix_train = cl.normalise_mean_var(all_feature_matrix[train])
    normalised_matrix_test = cl.normalise_mean_var(all_feature_matrix[test])

    y_predicted2 = []

    # Feature selection with RFECV, fitted on the training rows only.
    #rfe = RFE(estimator=classifier, cv=5,n_features_to_select=10,step=2)
    rfe = RFECV(estimator=classifier, cv=5, step=2, scoring='f1')
    print("going to select optimal features")
    rfe.fit(normalized_matrix_train, y_all[train])
    ranked_features = rfe.ranking_.tolist()
    #print("shape of train matrix after rfe.fit is: " +str(normalized_matrix_train.shape))

    # Positions of all rank-1 (selected) features.  Uses value equality
    # instead of `is`, and its own loop variable so the fold index `i` from
    # the enclosing loop is not overwritten.
    index = []
    for position, rank in enumerate(ranked_features):
        if rank == 1:
            index.append(position)