# Stratified k-fold splitter over the labels in y_all, with shuffling enabled.
cv = StratifiedKFold(y_all, n_folds=folds, shuffle=True)
# Alternative splitter / estimators that were tried, kept for reference:
# cv_shufflesplit = cross_validation.ShuffleSplit(len(y_all), 1, test_size=0.2, train_size=None, random_state=0)
# classifier = svm.SVC(kernel='linear', probability=True)
# classifier = RandomForestClassifierWithCoef(RandomForestClassifier)
# NOTE(review): RFECV needs an estimator exposing coef_ or
# feature_importances_; confirm that `knn` (defined elsewhere) provides one.
classifier = knn(n_neighbors=3)

# Result containers, created empty before the fold loop.
# presumably filled per fold further down (not visible in this section).
all_indexes = []
index_list = []
y_test_report = []
y_predicted_report = []
y_proba_report = []

for i, (train, test) in enumerate(cv):
    # Normalise the train and test rows of this fold independently.
    # NOTE(review): the test rows are normalised with their own call to
    # cl.normalise_mean_var rather than with statistics from the training
    # rows -- confirm this is intended.
    normalized_matrix_train = cl.normalise_mean_var(all_feature_matrix[train])
    normalised_matrix_test = cl.normalise_mean_var(all_feature_matrix[test])
    y_predicted2 = []

    # Feature selection is fitted on the training rows only.
    # rfe = RFE(estimator=classifier, cv=5, n_features_to_select=10, step=2)
    rfe = RFECV(estimator=classifier, cv=5, step=2, scoring='f1')
    print("going to select optimal features")
    rfe.fit(normalized_matrix_train, y_all[train])
    ranked_features = (rfe.ranking_).tolist()
    # print("shape of train matrix after rfe.fit is: " + str(normalized_matrix_train.shape))

    # Column positions whose RFECV rank equals 1.
    # Compare by value (==): `is 1` tests object identity and only works by
    # accident of CPython's small-int caching.  A dedicated loop variable is
    # used so the fold counter `i` is no longer overwritten.
    index = [
        feature_pos
        for feature_pos, feature_rank in enumerate(ranked_features)
        if feature_rank == 1
    ]
# Class labels, produced by cl.generate_labels from rec_name_array.
class_labels = cl.generate_labels(rec_name_array)
y = np.array(class_labels)
print("label array is: " + str(y))

# Turn the list-of-lists feature representation into a matrix.
# presumably rows = number of entries in all_features and columns = largest
# vocabulary value + 1 -- confirm against cl.covert_array_to_matrix.
feature_row_count = len(all_features)
feature_col_count = max(global_vocab.values()) + 1
all_feature_matrix = cl.covert_array_to_matrix(
    all_features,
    feature_row_count,
    feature_col_count,
)
# Debug aids (disabled):
# print all_feature_matrix
# print ("type of all feature matrix is: " + str(type(all_feature_matrix)))

#################### SEPARATING EVALUATION DATA #########################
# Hold out 20% of the samples for evaluation; random_state=0 fixes the split.
X_cv, X_eval, y_cv, y_eval = cross_validation.train_test_split(
    all_feature_matrix,
    y,
    test_size=0.2,
    random_state=0,
)
# Each partition is normalised with its own call to cl.normalise_mean_var.
X_cv_normalized_matrix = cl.normalise_mean_var(X_cv)
X_eval_normalized_matrix = cl.normalise_mean_var(X_eval)

############## with normalisation ######################
# Classification
# NOTE(review): `normalised` is set to a single space; its use is not
# visible in this section.
normalised = " "
normalized_matrix = cl.normalise_mean_var(all_feature_matrix)
# Debug aids / alternative feature-matrix choices (disabled):
# print ("type of normalised_matrix is: " + str(type(normalized_matrix)))
# print("normalized matrix is: ")
# print normalized_matrix
# rw.write_value(normalized_matrix, output_folder, "all_features_normalized.txt", 'w')
# feature_matrix = normalized_matrix
# feature_matrix = all_feature_matrix
# Derive the label vector from rec_name_array via cl.generate_labels.
generated_labels = cl.generate_labels(rec_name_array)
y = np.array(generated_labels)
print("label array is: " + str(y))

# Build the feature matrix from the list-of-lists in all_features.
# presumably the two size arguments are (rows, columns) -- confirm against
# cl.covert_array_to_matrix.
matrix_rows = len(all_features)
matrix_cols = max(global_vocab.values()) + 1
all_feature_matrix = cl.covert_array_to_matrix(
    all_features, matrix_rows, matrix_cols
)
# Disabled debug output:
# print all_feature_matrix
# print ("type of all feature matrix is: " + str(type(all_feature_matrix)))

#################### SEPARATING EVALUATION DATA #########################
# 80/20 split with a fixed seed (random_state=0).
X_cv, X_eval, y_cv, y_eval = cross_validation.train_test_split(
    all_feature_matrix, y, test_size=0.2, random_state=0
)
# The two partitions are normalised by separate cl.normalise_mean_var calls.
X_cv_normalized_matrix = cl.normalise_mean_var(X_cv)
X_eval_normalized_matrix = cl.normalise_mean_var(X_eval)

############## with normalisation ######################
# Classification
# NOTE(review): the role of `normalised` (a one-space string) is not
# visible in this section.
normalised = " "
normalized_matrix = cl.normalise_mean_var(all_feature_matrix)
# Disabled debug output and alternative feature-matrix selections:
# print ("type of normalised_matrix is: " + str(type(normalized_matrix)))
# print("normalized matrix is: ")
# print normalized_matrix
# rw.write_value(normalized_matrix,output_folder,"all_features_normalized.txt",'w')
# feature_matrix=normalized_matrix
# feature_matrix=all_feature_matrix
# Stratified k-fold splitter over the labels in y_all, with shuffling enabled.
cv = StratifiedKFold(y_all, n_folds=folds, shuffle=True)
# Alternative splitter / estimators that were tried, kept for reference:
# cv_shufflesplit=cross_validation.ShuffleSplit(len(y_all),1,test_size=0.2,train_size=None, random_state=0)
# classifier = svm.SVC(kernel='linear', probability=True)
# classifier = RandomForestClassifierWithCoef(RandomForestClassifier)
# NOTE(review): RFECV needs an estimator exposing coef_ or
# feature_importances_; confirm that `knn` (defined elsewhere) provides one.
classifier = knn(n_neighbors=3)

# Result containers, created empty before the fold loop.
# presumably filled per fold further down (not visible in this section).
all_indexes = []
index_list = []
y_test_report = []
y_predicted_report = []
y_proba_report = []

for i, (train, test) in enumerate(cv):
    # Normalise the train and test rows of this fold independently.
    # NOTE(review): the test rows are normalised with their own call to
    # cl.normalise_mean_var rather than with statistics from the training
    # rows -- confirm this is intended.
    normalized_matrix_train = cl.normalise_mean_var(all_feature_matrix[train])
    normalised_matrix_test = cl.normalise_mean_var(all_feature_matrix[test])
    y_predicted2 = []

    # Feature selection is fitted on the training rows only.
    # rfe = RFE(estimator=classifier, cv=5,n_features_to_select=10,step=2)
    rfe = RFECV(estimator=classifier, cv=5, step=2, scoring='f1')
    print("going to select optimal features")
    rfe.fit(normalized_matrix_train, y_all[train])
    ranked_features = (rfe.ranking_).tolist()
    # print("shape of train matrix after rfe.fit is: " +str(normalized_matrix_train.shape))

    # Column positions whose RFECV rank equals 1.
    # Value comparison (==) replaces `is 1`, which checks object identity
    # and only works through CPython's small-int caching.  Using separate
    # loop names keeps the fold counter `i` from being overwritten.
    index = [
        feature_pos
        for feature_pos, feature_rank in enumerate(ranked_features)
        if feature_rank == 1
    ]