def getDataForML(df, features, feature_predict, sampling, sample_size=100000):
    """Build the response vector and design matrix used for model fitting.

    Parameters:
        df: pandas DataFrame holding the raw observations.
        features: feature names forwarded to ``cf.selectFeatures``.
        feature_predict: name of the predicted variable, forwarded to
            ``cf.selectFeatures``.
        sampling: when truthy, work on a random sample of the rows.
        sample_size: number of rows drawn when ``sampling`` is truthy.

    Returns:
        ``(y, X)`` where ``y`` is the response flattened to a 1-D array and
        ``X`` is the design matrix returned by ``dmatrices`` as a DataFrame.
    """
    if sampling:
        # np.random.choice draws WITH replacement by default, so the sample
        # may contain repeated rows (unchanged from the original behaviour).
        rows = np.random.choice(df.index.values, sample_size)
        # ``rows`` holds index labels, so select by label. ``.loc`` replaces
        # ``.ix``, which is deprecated and was removed in pandas 1.0.
        df = df.loc[rows]

    # Single-argument print calls emit the same text under Python 2 and 3.
    print('samples %s' % df.shape[0])
    print('feature dimension  %s' % df.shape[1])

    # presumably cf.selectFeatures builds a patsy formula -- confirm.
    y, X = dmatrices(cf.selectFeatures(features, feature_predict), df,
                     return_type="dataframe")
    print(y)

    # flatten y into a 1-D array
    y = np.ravel(y)

    # balance of the classes
    print('mean value of the predicted variable %s' % y.mean())
    return y, X
def doBestFeatureSelection(clf, numFeatures):
    """Select the best features for ``clf``, then fit and test it on them.

    Reads ``data/TrainData_Labeled.csv`` located next to this module, splits
    it with ``do.partionData`` (fraction ``.8``), asks
    ``fs.getBestFeaturesForHigherOrderTerms`` for ``numFeatures`` features
    scored by ``'accuracy'``, and finally fits and tests ``clf`` on frames
    restricted to those features plus the ``'label'`` column.
    """

    def _featuresWithLabel(frame, columns):
        # Keep only the chosen columns and append 'label' as the last column.
        subset = frame.loc[:, columns]
        labels = frame['label']
        subset.insert(loc=len(subset.columns), column='label', value=labels)
        return subset

    moduleDir = os.path.dirname(os.path.abspath(__file__))
    labeledDf = pd.read_csv(moduleDir + '/data/TrainData_Labeled.csv')
    trainPart, testPart = do.partionData(labeledDf, .8)

    selected = fs.getBestFeaturesForHigherOrderTerms(
        clf, trainPart, numFeatures, 'accuracy')
    print(selected)

    trainSubset = _featuresWithLabel(trainPart, selected)
    testSubset = _featuresWithLabel(testPart, selected)
    print(testSubset)

    do.fitTrainingData(clf, trainSubset)
    do.testClassifier(clf, testSubset, "Random Forests")
def copyDataset(dataset): return Orange.data.Table(dataset) # Compute S Threshold # ============================================================================ # boxmessage("Starting Phase 4: Feature Selection", warning) trainingSet = Orange.data.Table("step%s_trainingset.tab" % previousStep) validationSet = Orange.data.Table("step%s_validationset.tab" % previousStep) trainingSet.randomGenerator = Orange.orange.RandomGenerator(random.randint(0, 10)) logmessage("Discretized Working Dataset Loaded", success) # ============================================================================ # # Feature Selection fs = FeatureSelector() if progress: fs.load() else: fs.computeThreshold(trainingSet) fs.setThreshold(S) fs.save() selectedtrainingSet = fs.select(trainingSet) selectedtrainingSet.save("step4_trainingset.tab") logmessage("New training dataset is %s" % len(selectedtrainingSet), info) logmessage("New training dataset features are %s" % len(selectedtrainingSet.domain), info) selectedvalidationset = fs.select(validationSet) selectedvalidationset.save("step4_validationset.tab")
def copyDataset(dataset):
    """Return a new ``Orange.data.Table`` constructed from ``dataset``."""
    return Orange.data.Table(dataset)


# ============================================================================ #
# Phase 6 driver: loads the final test set and restores every artefact saved
# by the earlier phases (discretizer, feature selector, label encoder,
# classifier).
boxmessage("Starting Phase 6: Final", warning)
testSet = Orange.data.Table("finaltestset.tab")
logmessage("Final Test Set loaded", info)

# Discretizer
ds = Discretizer(testSet, K, logging)
ds.load()
logmessage("Discretizer Loaded", info)

# Feature Selector
fs = FeatureSelector()
fs.load()
fs.setThreshold(S)
logmessage("Feature Selector Loaded", info)

# LabelEncoder
le = None
# Pickle streams are binary: open with "rb" so the load also works on Windows
# and under Python 3 (text mode "r" can corrupt or reject the stream).
# NOTE: pickle.load / joblib.load execute arbitrary code from the file; only
# load artefacts produced by this pipeline.
with open("labelencoder", "rb") as in_file:
    le = pickle.load(in_file)
logmessage("Label Encoder Loaded", info)

# Model
clf = joblib.load("classifier.model")
logmessage("Classifier Loaded", info)

# discretizedSet = ds.discretizeDataset(trainingSet)
def tuning_analysis(fs, n_feats):
    """Pick, per feature selector, the hyper-parameter set with most votes.

    For every feature index ``j`` in ``range(n_feats)`` the combination(s)
    with the highest ``'ACC'[j]`` receive one vote (ties all get a vote).
    The combination with the most votes wins; on equal votes the first one
    in sorted order is kept.

    Parameters:
        fs: mapping ``selector name -> {combination -> {'ACC': sequence}}``.
            Each ``'ACC'`` sequence is indexed from 0 to ``n_feats - 1``.
        n_feats: number of leading accuracy entries to vote on.

    Returns:
        dict mapping every selector name in ``fs`` to its winning
        combination (empty dict when ``fs`` is empty).

    Raises:
        ValueError: if a selector has no combinations while ``n_feats > 0``.
    """
    # The minimum-variance combination is tracked across all selectors and is
    # only reported; it does not influence the vote.
    min_var = float('inf')
    min_hyp_par = {}

    # Accumulate results for every selector. The original re-created this
    # dict inside the loop, so only the last selector survived.
    BS = {}

    for curr_fs_name, curr_fs in fs.items():
        combs = sorted(curr_fs)

        # one [1, n_feats] row of votes per combination
        voting_matrix = {}
        for comb in combs:
            voting_matrix[comb] = np.zeros([1, n_feats])
            curr_var = np.var(curr_fs[comb]['ACC'])
            if curr_var < min_var:
                min_var = curr_var
                min_hyp_par = comb

        print('Hyper-params. comb=%s has minimum variance of %s' %
              (min_hyp_par, min_var))

        print('\nApplying majority voting...')
        for j in range(n_feats):
            # winner accuracy among all combinations for the j-th entry
            top = max(curr_fs[comb]['ACC'][j] for comb in combs)
            for comb in combs:
                if curr_fs[comb]['ACC'][j] == top:
                    voting_matrix[comb][0][j] = 1

        # getting the parameter set with the largest number of votes
        best_votes = float('-inf')
        best_comb = {}
        for comb in combs:
            votes = np.sum(voting_matrix[comb][0])
            if votes > best_votes:
                best_votes = votes
                best_comb = comb
            print('Parameters set: ' + str(comb) + ' got votes: ' +
                  str(votes))

        # The original indexed with the undefined name ``fs_name`` here.
        BS[curr_fs_name] = best_comb
        print('\nBest parameters set found on development set for: ' +
              str(curr_fs_name) + ' is: ' + str(best_comb))

    return BS
'MI': { 'n_neighbors': 0 }, # the bigger is alpha the sparser is the C matrix (fewer representatives) 'EN': { 'alpha': 1, # default value is 1 }, # the bigger is alpha the sparser is the C matrix (fewer representatives) 'LASSO': { 'alpha': 1 # default value is 1 } } slb_fs = { 'LASSO': fs.FeatureSelector(name='LASSO', tp='SLB', params=params['LASSO']), 'EN': fs.FeatureSelector(name='EN', tp='SLB', params=params['EN']), 'SMBA': fs.FeatureSelector(name='SMBA', tp='SLB', params=params['SMBA']), 'RFS': fs.FeatureSelector(name='RFS', tp='SLB', params=params['RFS']), 'll_l21': fs.FeatureSelector(name='ll_l21', tp='SLB', params=params['ll_l21']), #injection not working 'ls_l21': fs.FeatureSelector(name='ls_l21', tp='SLB', params=params['ls_l21']), 'Relief': fs.FeatureSelector(name='Relief', tp='filter', params=params['Relief']), 'MRMR':
# NOTE(review): the three plotting calls below look like the tail of a routine
# whose header is outside this view -- confirm their original indentation.
# NOTE(review): savefig() runs after show(); with some matplotlib backends the
# figure is already gone at that point -- verify the saved files are not blank.
plt.show()
plt.savefig("pca.pdf", format='pdf')
plt.savefig("pca.png", format='png')

###############################################################################
# Driver: configure input/output locations and run one clustering experiment.
#x_train = pd.read_csv(DIR + "train.csv", index_col=0, sep=',')
#principal_component_analysis(x_train)
DIR = '/mnt/nb254_data/exp/exp_askubuntu/'
dir_c = '/mnt/nb254_data/exp/exp_askubuntu/clustering/'
# Every path is built from DIR / dir_c except 'out', which is a bare file name.
filenames = {'input': DIR + "dataMLClust.csv", 'clustering': dir_c + 'data_file_for_clustering.csv', 'stats': dir_c + 'stats.csv', 'clusters': dir_c + 'clustering.csv', 'pca': dir_c + 'pca.csv', 'out': 'questions.csv'}
#clusteringA(dir_c, filenames)
clustering_types = ['kmeans', 'spectral', 'birch', 'dbscan', 'affinity_propagation', 'ward', 'average_linkage']
# clustering_types[0] selects 'kmeans'; the feature list is everything from
# ftrs.setFeaturesToUseAll() plus 'PostId' and 'SecondsToAcceptedAnswer'.
clust = initClust(exp=1319, n_clusters=50, sample_size=1929906, features_to_use=ftrs.setFeaturesToUseAll() + ['PostId'] + ['SecondsToAcceptedAnswer'], clustering_type=clustering_types[0])
#data, results = dp.getDataForClustering(filenames, clust)
clusteringA(clust, dir_c, filenames)
# Script fragment: discretise the training set, then run feature selection.
# When ``progress`` is truthy, previously saved tables are reused.
# NOTE(review): this chunk arrived with its line structure collapsed; the
# nesting of the ``with``/``except`` bodies below is a reconstruction --
# confirm against the original file.
ds.findThresholds()
if progress:
    try:
        # open() is only used to check that the file exists and is readable;
        # the table itself is loaded by Orange.
        with open("discretized.tab"):
            trainingSet = Orange.data.Table("discretized.tab")
            print info("Discretized Dataset Loaded")
    except IOError:
        # The failure is only logged; trainingSet keeps its previous value.
        logmessage("IOError in loading discretized training dataset", error)
else:
    trainingSet = ds.discretizeDataset(trainingSet)
    trainingSet.save("discretized.tab")

# ============================================================================ #
# Feature Selection
fs = FeatureSelector()
# NOTE(review): as laid out here, no feature selection happens when
# ``progress`` is falsy (there is no ``else`` branch) -- confirm that this
# matches the original indentation.
if progress:
    try:
        with open("featureselected.tab"):
            trainingSet = Orange.data.Table("featureselected.tab")
            print info("Features Selected Dataset Loaded")
    except IOError:
        # No cached table: compute the selection and cache the result.
        fs.computeThreshold(trainingSet)
        fs.save()
        trainingSet = fs.select(trainingSet)
        trainingSet.save("featureselected.tab")

print info("New training dataset is %s" % len(trainingSet))
print info("New training dataset features are %s" % len(trainingSet[0]))
# Model Training
def main():
    """Run k-fold class-specific feature selection on one dataset.

    Steps, as performed below:
      1. load the ``X``/``Y`` variables of the configured dataset and
         standardize it;
      2. optionally balance the classes with SMOTE, then shuffle;
      3. for each of the ``k_fold`` folds, fit the feature selector on the
         training samples of every class, then for every feature-subset size
         ``1..max_num_feat`` train the classifiers on each class' top-ranked
         features and combine them with ``classificationDecisionRule``;
      4. average the scores over the folds and pickle them to
         ``clf_results.pickle`` in the results folder.
    """
    # LOADING ANY DATASET
    dataset_dir = '/dataset'
    dataset_type = '/BIOLOGICAL'
    dataset_name = '/WISCONSIN'

    # this variable decides whether to balance the dataset or not
    resample = True
    p_step = 1

    # defining directory paths for saving partial and complete results
    path_data_folder = dataset_dir + dataset_type + dataset_name
    path_data_file = path_data_folder + dataset_name
    variables = ['X', 'Y']

    print('%d.Loading and pre-processing the data...\n' % p_step)
    p_step += 1
    # NB: If you get an error such as: 'Please use HDF reader for matlab v7.3
    # files', please change the 'format' argument to 'matlab_v73'
    D = lr.Loader(file_path=path_data_file,
                  format='matlab',
                  variables=variables,
                  name=dataset_name[1:]).getVariables(variables=variables)

    dataset = ds.Dataset(D['X'], D['Y'])

    n_classes = dataset.classes.shape[0]
    cls = np.unique(dataset.classes)

    # check if the data are already standardized, if not standardize it
    dataset.standardizeDataset()

    # size of the smallest class, used to bound SMOTE's neighbourhood
    num_min_cls = 9999999

    print('%d.Class-sample separation...\n' % p_step)
    p_step += 1
    if resample:
        print(
            '\tDataset %s before resampling w/ size: %s and number of classes: %s---> %s'
            % (dataset_name[1:], dataset.data.shape, n_classes, cls))

        # discriminating classes of the whole dataset
        dataset_train = ds.Dataset(dataset.data, dataset.target)
        dataset_train.separateSampleClass()
        data, _ = dataset_train.getSampleClass()

        for i in range(n_classes):
            print('\t\t#sample for class C%s: %s' % (i + 1, data[i].shape))
            if data[i].shape[0] < num_min_cls:
                num_min_cls = data[i].shape[0]

        # The flag and the output-folder suffix are kept in separate
        # variables (the original overwrote the bool with the string).
        resample_tag = '/BALANCED'
        print('%d.Class balancing...' % p_step)
        # NOTE(review): ``kind=`` and ``fit_sample`` look like names from
        # older imbalanced-learn releases -- confirm the pinned version.
        dataset.data, dataset.target = SMOTE(
            kind='regular',
            k_neighbors=num_min_cls - 1).fit_sample(dataset.data,
                                                    dataset.target)
        p_step += 1
    else:
        resample_tag = '/UNBALANCED'

    # shuffling data
    print('\tShuffling data...')
    dataset.shufflingDataset()

    print('\tDataset %s w/ size: %s and number of classes: %s---> %s' %
          (dataset_name[1:], dataset.data.shape, n_classes, cls))

    # discriminating classes of the whole dataset
    dataset_train = ds.Dataset(dataset.data, dataset.target)
    dataset_train.separateSampleClass()
    data, _ = dataset_train.getSampleClass()

    for i in range(n_classes):
        print('\t\t#sample for class C%s: %s' % (i + 1, data[i].shape))

    # Max number of features to use, capped by the dataset dimensionality
    max_num_feat = min(300, dataset.data.shape[1])

    alpha = 10  # regularization parameter (typically alpha in [2,50])

    params = {
        'SMBA':
        # the smaller alpha is, the sparser the C matrix (fewer representatives)
        {
            'alpha': alpha,
            'norm_type': 1,
            'max_iter': 3000,
            'thr': [10**-8],
            'type_indices': 'nrmInd',
            'normalize': False,
            'GPU': False,
            'device': 0,
            'PCA': False,
            'verbose': False,
            'step': 1,
            'affine': False,
        }
        # it's possible to add other FS methods by modifying the correct file
    }

    fs_model = fs.FeatureSelector(name='SMBA', tp='SLB', params=params['SMBA'])
    fs_name = 'SMBA'

    # CLASSIFIERS (it's possible to add other classifier methods by adding
    # entries into these two parallel lists)
    clf_name = [
        "SVM"
        # "Decision Tree",
        # "KNN"
    ]
    model = [
        SVC(kernel="linear")
        # DecisionTreeClassifier(max_depth=5),
        # KNeighborsClassifier(n_neighbors=1)
    ]

    # Perform K-fold Cross Validation...
    k_fold = 10

    # defining result folders
    fs_path_output = '/CSFS/FS/K_FOLD'
    checkFolder(path_data_folder, fs_path_output)

    res_path_output = '/CSFS/RESULTS/K_FOLD'
    # The original passed fs_path_output a second time here, so the results
    # folder itself was never checked.
    checkFolder(path_data_folder, res_path_output)

    all_scores = {fs_name: []}

    cc_fold = 0

    X = dataset.data
    y = dataset.target
    kf = KFold(n_splits=k_fold)

    print(
        '%d.Running the Intra-Class-Specific Feature Selection and building the ensemble classifier...\n'
        % p_step)
    p_step += 1
    for train_index, test_index in kf.split(X):
        X_train_kth, X_test_kth = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        print('\tDOING %s-CROSS VALIDATION W/ TRAINING SET SIZE: %s' %
              (cc_fold + 1, X_train_kth.shape))

        # For the training data in each class we find the representative
        # features and use them as a best subset feature (in representing
        # each class sample) to perform classification.
        csfs_res = {'C' + str(cls[i]): {} for i in range(n_classes)}
        kth_scores = {name: [] for name in clf_name}

        # make sure the per-method FS output directory exists
        checkFolder(path_data_folder,
                    fs_path_output + '/' + fs_name + resample_tag)

        # discriminating classes for the k-th fold of the training set
        data_train = ds.Dataset(X_train_kth, y_train)
        data_train.separateSampleClass()
        ktrain_data, ktrain_target = data_train.getSampleClass()

        for i in range(n_classes):
            print('\tPerforming feature selection on class %d with shape %s' %
                  (cls[i] + 1, ktrain_data[i].shape))

            start_time = time.time()
            idx = fs_model.fit(ktrain_data[i], ktrain_target[i])
            print('\tTotal Time = %s seconds\n' % (time.time() - start_time))

            csfs_res['C' + str(cls[i])]['idx'] = idx
            csfs_res['C' + str(cls[i])]['params'] = params[fs_name]

        # learning a classifier (ccn) for each subset of 'n_rep' features
        for j in range(max_num_feat):
            n_rep = j + 1  # first n_rep indices

            for i in range(n_classes):
                # get the subset of features from the i-th class
                idx = csfs_res['C' + str(cls[i])]['idx']

                X_train_fs = X_train_kth[:, idx[0:n_rep]]

                _clf = i_clf.Classifier(names=clf_name, classifiers=model)
                _clf.train(X_train_fs, y_train)

                csfs_res['C' + str(cls[i])]['accuracy'] = _clf.classify(
                    X_test_kth[:, idx[0:n_rep]], y_test)

            DTS = classificationDecisionRule(csfs_res, cls, clf_name, y_test)

            # one score per classifier for this subset size
            for name in clf_name:
                kth_scores[name].append(DTS[name])

        kth_results = {
            'clf_name': clf_name,
            'x': np.arange(1, max_num_feat + 1),
            'scores': kth_scores,
        }

        all_scores[fs_name].append(kth_results)

        cc_fold += 1

    print('%s.Averaging results...\n' % p_step)
    p_step += 1

    # Averaging results on k-fold
    # check whether the results directory exists, otherwise create it
    curr_res_output_fold = (path_data_folder + '/' + res_path_output + '/' +
                            fs_name + resample_tag)
    checkFolder(path_data_folder,
                res_path_output + '/' + fs_name + resample_tag)

    # M[clf] is a [k_fold, max_num_feat] matrix of per-fold scores
    M = {name: np.zeros([k_fold, max_num_feat]) for name in clf_name}

    # k-fold results for each classifier
    for k in range(k_fold):
        for clf in clf_name:
            M[clf][k, :] = all_scores[fs_name][k]['scores'][clf][:max_num_feat]

    avg_scores = {clf: np.mean(M[clf], axis=0) for clf in clf_name}
    std_scores = {clf: np.std(M[clf], axis=0) for clf in clf_name}

    results = {
        'clf_name': clf_name,
        'x': np.arange(1, max_num_feat + 1),
        'M': M,
        'scores': avg_scores,
        'std': std_scores
    }

    with open(curr_res_output_fold + '/clf_results.pickle', 'wb') as handle:
        pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)

    print('Done with %s, [%d-cross validation] ' % (dataset_name[1:], k_fold))
# Driver script: describe where the experiment files live, pick a clustering
# algorithm and launch the clustering run.
#x_train = pd.read_csv(DIR + "train.csv", index_col=0, sep=',')
#principal_component_analysis(x_train)
DIR = '/mnt/nb254_data/exp/exp_askubuntu/'
dir_c = '/mnt/nb254_data/exp/exp_askubuntu/clustering/'

# Logical file role -> path; only 'out' is a bare file name.
filenames = dict(
    input=DIR + "dataMLClust.csv",
    clustering=dir_c + 'data_file_for_clustering.csv',
    stats=dir_c + 'stats.csv',
    clusters=dir_c + 'clustering.csv',
    pca=dir_c + 'pca.csv',
    out='questions.csv',
)
#clusteringA(dir_c, filenames)

# Names of the available algorithms; the first one is the one used below.
clustering_types = [
    'kmeans',
    'spectral',
    'birch',
    'dbscan',
    'affinity_propagation',
    'ward',
    'average_linkage',
]

clust = initClust(
    clustering_type=clustering_types[0],
    features_to_use=ftrs.setFeaturesToUseAll() + ['PostId'] +
    ['SecondsToAcceptedAnswer'],
    sample_size=1929906,
    n_clusters=50,
    exp=1319,
)
#data, results = dp.getDataForClustering(filenames, clust)
clusteringA(clust, dir_c, filenames)