Example #1
def getDataForML(df, features, feature_predict, sampling, sample_size=100000):
    if sampling:
        rows = np.random.choice(df.index.values, sample_size)
        df = df.ix[rows]
    # create dataframes
    print 'samples', df.shape[0]
    print 'feature dimension ', df.shape[1]
    y, X = dmatrices(cf.selectFeatures(features, feature_predict), df, return_type="dataframe")
    print y
    # flatten y into a 1-D array
    y = np.ravel(y)
    # balance of the classes
    print 'mean value of the predicted variable', y.mean()
    return y, X
Example #2
def getDataForML(df, features, feature_predict, sampling, sample_size=100000):
    if sampling:
        rows = np.random.choice(df.index.values, sample_size)
        df = df.ix[rows]
    # create dataframes
    print 'samples', df.shape[0]
    print 'feature dimension ', df.shape[1]
    y, X = dmatrices(cf.selectFeatures(features, feature_predict),
                     df,
                     return_type="dataframe")
    print y
    # flatten y into a 1-D array
    y = np.ravel(y)
    # balance of the classes
    print 'mean value of the predicted variable', y.mean()
    return y, X
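
Both versions of getDataForML above target Python 2, rely on numpy and patsy's dmatrices plus a project-specific cf.selectFeatures helper that builds the formula string, and use the df.ix indexer that has since been removed from pandas. A rough, self-contained sketch of the same sampling-plus-design-matrix flow on toy data (an inline formula standing in for cf.selectFeatures, df.sample in place of df.ix):

import numpy as np
import pandas as pd
from patsy import dmatrices

# toy data standing in for the real feature table
df = pd.DataFrame({
    "label": np.random.randint(0, 2, 500),
    "f1": np.random.randn(500),
    "f2": np.random.randn(500),
})

# optional subsampling with replacement (df.sample replaces the removed df.ix pattern)
df = df.sample(n=200, replace=True, random_state=0)

# "label ~ f1 + f2" stands in for cf.selectFeatures(features, feature_predict)
y, X = dmatrices("label ~ f1 + f2", df, return_type="dataframe")
y = np.ravel(y)  # flatten y into a 1-D array
print("samples:", X.shape[0], "mean of the predicted variable:", y.mean())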
Example #3
def doBestFeatureSelection(clf, numFeatures):
    multDf = pd.read_csv(os.path.dirname(os.path.abspath(__file__))+'/data/TrainData_Labeled.csv')
    multTraining, multTesting = do.partionData(multDf, .8)
    bestFeatures = fs.getBestFeaturesForHigherOrderTerms(clf, multTraining, numFeatures, 'accuracy')
    #bestFeatures = list(['alcohol', 'volatile acidity*total sulfur dioxide*density*', 'volatile acidity*chlorides*free sulfur dioxide*pH*', 'fixed acidity*volatile acidity*free sulfur dioxide*pH*sulphates*'])
    print(bestFeatures)

    trainingData = multTraining.loc[:, bestFeatures]
    trainingY = multTraining['label']
    trainingData.insert(loc=len(trainingData.columns), column='label', value=trainingY)

    testingData = multTesting.loc[:, bestFeatures]
    testingY = multTesting['label']
    testingData.insert(loc=len(testingData.columns), column='label', value=testingY)
    print(testingData)
    do.fitTrainingData(clf, trainingData)
    do.testClassifier(clf, testingData, "Random Forests")
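
The snippet above never shows how it is driven; clf can be any estimator that do.fitTrainingData and do.testClassifier accept. Given the "Random Forests" label passed to do.testClassifier, a plausible (hypothetical) invocation would be:

from sklearn.ensemble import RandomForestClassifier

# hypothetical driver: select the best 5 higher-order features and evaluate them
clf = RandomForestClassifier(n_estimators=100, random_state=0)
doBestFeatureSelection(clf, numFeatures=5)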
Example #4
def copyDataset(dataset):
	return Orange.data.Table(dataset)


# Compute S Threshold

# ============================================================================ #
boxmessage("Starting Phase 4: Feature Selection", warning)
trainingSet = Orange.data.Table("step%s_trainingset.tab" % previousStep)
validationSet = Orange.data.Table("step%s_validationset.tab" % previousStep)
trainingSet.randomGenerator = Orange.orange.RandomGenerator(random.randint(0, 10))
logmessage("Discretized Working Dataset Loaded", success)

# ============================================================================ #
# Feature Selection
fs = FeatureSelector()
if progress:
	fs.load()
else:
	fs.computeThreshold(trainingSet)
fs.setThreshold(S)

fs.save()
selectedtrainingSet = fs.select(trainingSet)
selectedtrainingSet.save("step4_trainingset.tab")

logmessage("New training dataset is %s" % len(selectedtrainingSet), info)
logmessage("New training dataset features are %s" % len(selectedtrainingSet.domain), info)

selectedvalidationset = fs.select(validationSet)
selectedvalidationset.save("step4_validationset.tab")
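
FeatureSelector here (and in Examples #5 and #9 below) is a project-specific class, not an Orange or scikit-learn API; only its load/save/computeThreshold/setThreshold/select interface is visible from the call sites. A rough, hypothetical stand-in with that interface, written against plain NumPy arrays rather than Orange tables and scoring columns by variance purely for illustration:

import pickle
import numpy as np

class ThresholdFeatureSelector(object):
    """Hypothetical stand-in: keep the columns whose score exceeds a threshold."""

    def __init__(self, path="featureselector.pickle"):
        self.path = path
        self.scores = None
        self.threshold = 0.0

    def computeThreshold(self, X):
        # score each column; variance is just one simple choice of relevance score
        self.scores = np.var(np.asarray(X, dtype=float), axis=0)

    def setThreshold(self, s):
        self.threshold = s

    def select(self, X):
        keep = self.scores > self.threshold
        return np.asarray(X)[:, keep]

    def save(self):
        with open(self.path, "wb") as f:
            pickle.dump((self.scores, self.threshold), f)

    def load(self):
        with open(self.path, "rb") as f:
            self.scores, self.threshold = pickle.load(f)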
Example #5
def copyDataset(dataset):
    return Orange.data.Table(dataset)


# ============================================================================ #
boxmessage("Starting Phase 6: Final", warning)
testSet = Orange.data.Table("finaltestset.tab")
logmessage("Final Test Set loaded", info)

# Discretizer
ds = Discretizer(testSet, K, logging)
ds.load()
logmessage("Discretizer Loaded", info)

# Feature Selector
fs = FeatureSelector()
fs.load()
fs.setThreshold(S)
logmessage("Feature Selector Loaded", info)

# LabelEncoder
le = None
with open("labelencoder", "r") as in_file:
    le = pickle.load(in_file)
logmessage("Label Encoder Loaded", info)

# Model
clf = joblib.load("classifier.model")
logmessage("Classifier Loaded", info)

# discretizedSet = ds.discretizeDataset(trainingSet)
Example #6
def tuning_analysis(fs, n_feats):

    min_var = 99999999
    min_hyp_par = {}

    for curr_fs_name, curr_fs in fs.iteritems():

        voting_matrix = {}
        _res_voting = {}

        combs = curr_fs.keys()
        combs.sort()

        for comb in combs:
            voting_matrix[comb] = np.zeros([1, n_feats])
            value = curr_fs[comb]
            # print ('hyper-params. comb. is %s'%comb)
            curr_var = np.var(value['ACC'])
            if curr_var < min_var:
                min_var = curr_var
                min_hyp_par = comb

        print 'Hyper-params. comb=%s has minimum variance of %s' % (
            min_hyp_par, min_var)

        combs = curr_fs.keys()
        combs.sort()

        # voting matrix dim: [num_comb, n_feats]
        # voting_matrix = np.zeros([len(combs), n_feats])
        print '\nApplying majority voting...'
        for j in xrange(0, n_feats):
            _competitors = {}
            for comb in combs:
                _competitors[comb] = curr_fs[comb]['ACC'][j]

            #getting the winner accuracy for all the combinations computed
            winners = [
                comb for m in [max(_competitors.values())]
                for comb, val in _competitors.iteritems() if val == m
            ]
            for winner in winners:
                voting_matrix[winner][0][j] = 1

        #getting the parameter with largest voting
        for comb in combs:
            _res_voting[comb] = np.sum(voting_matrix[comb][0])

        _max = -9999999
        best_comb = {}
        BS = {}
        for comb in combs:
            if _res_voting[comb] > _max:
                _max = _res_voting[comb]
                best_comb = comb
            print('Parameters set: ' + comb.__str__() + ' got votes: ' +
                  _res_voting[comb].__str__())

        BS[curr_fs_name] = best_comb

        print('\nBest parameters set found on development set for: ' +
              curr_fs_name.__str__() + ' is: ' + best_comb.__str__())

    return BS
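
The voting logic above is easy to lose in the bookkeeping; a self-contained toy restatement of the same majority-voting idea (two hyper-parameter combinations, four feature counts, made-up accuracies):

import numpy as np

# made-up per-feature-count accuracies for two hyper-parameter combinations
acc = {
    ('alpha', 1): np.array([0.70, 0.75, 0.80, 0.78]),
    ('alpha', 5): np.array([0.72, 0.74, 0.79, 0.77]),
}

votes = {comb: 0 for comb in acc}
n_feats = len(next(iter(acc.values())))

# each feature count j casts one vote for every combination reaching the best accuracy at j
for j in range(n_feats):
    best = max(a[j] for a in acc.values())
    for comb, a in acc.items():
        if a[j] == best:
            votes[comb] += 1

best_comb = max(votes, key=votes.get)
print(votes, '->', best_comb)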
Example #7
        'MI': {
            'n_neighbors': 0
        },
        # the bigger alpha is, the sparser the C matrix (fewer representatives)
        'EN': {
            'alpha': 1,  # default value is 1
        },
        # the bigger alpha is, the sparser the C matrix (fewer representatives)
        'LASSO': {
            'alpha': 1  # default value is 1
        }
    }

    slb_fs = {
        'LASSO':
        fs.FeatureSelector(name='LASSO', tp='SLB', params=params['LASSO']),
        'EN':
        fs.FeatureSelector(name='EN', tp='SLB', params=params['EN']),
        'SMBA':
        fs.FeatureSelector(name='SMBA', tp='SLB', params=params['SMBA']),
        'RFS':
        fs.FeatureSelector(name='RFS', tp='SLB', params=params['RFS']),
        'll_l21':
        fs.FeatureSelector(name='ll_l21', tp='SLB',
                           params=params['ll_l21']),  #injection not working
        'ls_l21':
        fs.FeatureSelector(name='ls_l21', tp='SLB', params=params['ls_l21']),
        'Relief':
        fs.FeatureSelector(name='Relief', tp='filter',
                           params=params['Relief']),
        'MRMR':
Example #8
    plt.show()

    plt.savefig("pca.pdf", format='pdf')
    plt.savefig("pca.png", format='png')
###############################################################################

#x_train = pd.read_csv(DIR + "train.csv", index_col=0, sep=',')
#principal_component_analysis(x_train)

DIR = '/mnt/nb254_data/exp/exp_askubuntu/'
dir_c = '/mnt/nb254_data/exp/exp_askubuntu/clustering/'

filenames = {'input': DIR + "dataMLClust.csv",
            'clustering': dir_c + 'data_file_for_clustering.csv',
            'stats': dir_c + 'stats.csv',
            'clusters': dir_c + 'clustering.csv',
            'pca': dir_c + 'pca.csv',
            'out': 'questions.csv'}

#clusteringA(dir_c, filenames)
clustering_types = ['kmeans', 'spectral', 'birch', 'dbscan', 'affinity_propagation', 'ward', 'average_linkage']
clust = initClust(exp=1319,
                  n_clusters=50,
                  sample_size=1929906,
                  features_to_use=ftrs.setFeaturesToUseAll() + ['PostId'] + ['SecondsToAcceptedAnswer'],
                  clustering_type=clustering_types[0])

#data, results = dp.getDataForClustering(filenames, clust)

clusteringA(clust, dir_c, filenames)
Example #9
    ds.findThresholds()

if progress:
    try:
        with open("discretized.tab"):
            trainingSet = Orange.data.Table("discretized.tab")
            print info("Discretized Dataset Loaded")
    except IOError:
        logmessage("IOError in loading discretized training dataset", error)
else:
    trainingSet = ds.discretizeDataset(trainingSet)
    trainingSet.save("discretized.tab")

# ============================================================================ #
# Feature Selection
fs = FeatureSelector()
if progress:
    try:
        with open("featureselected.tab"):
            trainingSet = Orange.data.Table("featureselected.tab")
            print info("Features Selected Dataset Loaded")
    except IOError:
        fs.computeThreshold(trainingSet)
        fs.save()
        trainingSet = fs.select(trainingSet)
        trainingSet.save("featureselected.tab")

print info("New training dataset is %s" % len(trainingSet))
print info("New training dataset features are %s" % len(trainingSet[0]))

# Model Training
Example #10
def main():
    ''' LOADING ANY DATASET '''
    dataset_dir = '/dataset'
    dataset_type = '/BIOLOGICAL'
    dataset_name = '/WISCONSIN'

    # this variable decides whether or not to balance the dataset
    resample = True
    p_step = 1

    # defining directory paths for saving partial and complete results
    path_data_folder = dataset_dir + dataset_type + dataset_name
    path_data_file = path_data_folder + dataset_name
    variables = ['X', 'Y']

    print('%d.Loading and pre-processing the data...\n' % p_step)
    p_step += 1
    # NB: if you get an error such as 'Please use HDF reader for matlab v7.3 files', please change the 'format' argument to 'matlab_v73'
    D = lr.Loader(file_path=path_data_file,
                  format='matlab',
                  variables=variables,
                  name=dataset_name[1:]).getVariables(variables=variables)

    dataset = ds.Dataset(D['X'], D['Y'])

    n_classes = dataset.classes.shape[0]
    cls = np.unique(dataset.classes)

    # check whether the data are already standardized; if not, standardize them
    dataset.standardizeDataset()

    # re-sampling dataset
    num_min_cls = 9999999
    print('%d.Class-sample separation...\n' % p_step)
    p_step += 1
    if resample == True:

        print(
            '\tDataset %s before resampling w/ size: %s and number of classes: %s---> %s'
            % (dataset_name[1:], dataset.data.shape, n_classes, cls))

        # discriminating classes of the whole dataset
        dataset_train = ds.Dataset(dataset.data, dataset.target)
        dataset_train.separateSampleClass()
        data, target = dataset_train.getSampleClass()

        for i in xrange(0, n_classes):
            print('\t\t#sample for class C%s: %s' % (i + 1, data[i].shape))
            if data[i].shape[0] < num_min_cls:
                num_min_cls = data[i].shape[0]

        resample = '/BALANCED'
        print('%d.Class balancing...' % p_step)
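        # SMOTE requires k_neighbors to be smaller than the smallest class size, hence num_min_cls - 1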
        dataset.data, dataset.target = SMOTE(
            kind='regular',
            k_neighbors=num_min_cls - 1).fit_sample(dataset.data,
                                                    dataset.target)
        p_step += 1
    else:
        resample = '/UNBALANCED'

    # shuffling data
    print('\tShuffling data...')
    dataset.shufflingDataset()

    print('\tDataset %s w/ size: %s and number of classes: %s---> %s' %
          (dataset_name[1:], dataset.data.shape, n_classes, cls))

    # discriminating classes of the whole dataset
    dataset_train = ds.Dataset(dataset.data, dataset.target)
    dataset_train.separateSampleClass()
    data, target = dataset_train.getSampleClass()

    for i in xrange(0, n_classes):
        print('\t\t#sample for class C%s: %s' % (i + 1, data[i].shape))

    # Max number of features to use
    max_num_feat = 300
    step = 1
    # max_num_feat = dataset.data.shape[1]

    if max_num_feat > dataset.data.shape[1]:
        max_num_feat = dataset.data.shape[1]

    alpha = 10  # regularization parameter (typically alpha in [2, 50])

    params = {
        'SMBA':
        # the smaller alpha is, the sparser the C matrix (fewer representatives)
        {
            'alpha': alpha,
            'norm_type': 1,
            'max_iter': 3000,
            'thr': [10**-8],
            'type_indices': 'nrmInd',
            'normalize': False,
            'GPU': False,
            'device': 0,
            'PCA': False,
            'verbose': False,
            'step': 1,
            'affine': False,
        }
        # it's possible to add other FS methods by modifying the correct file
    }

    fs_model = fs.FeatureSelector(name='SMBA', tp='SLB', params=params['SMBA'])
    fs_name = 'SMBA'

    # CLASSIFIERS (it's possible to add other classifier methods by adding entries into this list)
    clf_name = [
        "SVM"
        # "Decision Tree",
        # "KNN"
    ]
    model = [
        SVC(kernel="linear")
        # DecisionTreeClassifier(max_depth=5),
        # KNeighborsClassifier(n_neighbors=1)
    ]
    '''Perform K-fold Cross Validation...'''
    k_fold = 10

    #defining result folders
    fs_path_output = '/CSFS/FS/K_FOLD'
    checkFolder(path_data_folder, fs_path_output)

    res_path_output = '/CSFS/RESULTS/K_FOLD'
    checkFolder(path_data_folder, res_path_output)

    all_scores = {}
    all_scores.update({fs_name: []})

    cc_fold = 0
    conf_dataset = {}

    X = dataset.data
    y = dataset.target
    kf = KFold(n_splits=k_fold)

    print(
        '%d.Running the Intra-Class-Specific Feature Selection and building the ensemble classifier...\n'
        % p_step)
    p_step += 1
    for train_index, test_index in kf.split(X):

        X_train_kth, X_test_kth = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        print('\tDOING %s-CROSS VALIDATION W/ TRAINING SET SIZE: %s' %
              (cc_fold + 1, X_train_kth.shape))
        ''' For the training data in each class we find the representative features and use them as the best
            feature subset (representing each class's samples) to perform classification.
        '''

        csfs_res = {}

        for i in xrange(0, n_classes):
            cls_res = {'C' + str(cls[i]): {}}
            csfs_res.update(cls_res)

        kth_scores = {}
        for i in xrange(0, len(clf_name)):
            kth_scores.update({clf_name[i]: []})

        # check whether the 'curr_res_fs_fold' directory exists and create it if it does not
        curr_res_fs_fold = path_data_folder + '/' + fs_path_output + '/' + fs_name + resample
        checkFolder(path_data_folder,
                    fs_path_output + '/' + fs_name + resample)

        # discriminating classes for the k-th fold of the training set
        data_train = ds.Dataset(X_train_kth, y_train)
        data_train.separateSampleClass()
        ktrain_data, ktrain_target = data_train.getSampleClass()
        K_cls_ind_train = data_train.ind_class

        for i in xrange(0, n_classes):
            # print ('Train set size C' + str(i + 1) + ':', ktrain_data[i].shape)

            print('\tPerforming feature selection on class %d with shape %s' %
                  (cls[i] + 1, ktrain_data[i].shape))

            start_time = time.time()
            idx = fs_model.fit(ktrain_data[i], ktrain_target[i])

            # print idx

            print('\tTotal Time = %s seconds\n' % (time.time() - start_time))

            csfs_res['C' + str(cls[i])]['idx'] = idx
            csfs_res['C' + str(cls[i])]['params'] = params[fs_name]

            # with open(curr_res_fs_fold + '/' + str(cc_fold + 1) + '-fold' + '.pickle', 'wb') as handle:
            #     pickle.dump(csfs_res, handle, protocol=pickle.HIGHEST_PROTOCOL)

        ens_class = {}
        # learning a classifier (ccn) for each subset of 'n_rep' features
        for j in xrange(0, max_num_feat):
            n_rep = j + 1  # first n_rep indices

            for i in xrange(0, n_classes):
                # get subset of feature from the i-th class
                idx = csfs_res['C' + str(cls[i])]['idx']

                # print idx[0:n_rep]

                X_train_fs = X_train_kth[:, idx[0:n_rep]]

                _clf = i_clf.Classifier(names=clf_name, classifiers=model)
                _clf.train(X_train_fs, y_train)

                csfs_res['C' + str(cls[i])]['accuracy'] = _clf.classify(
                    X_test_kth[:, idx[0:n_rep]], y_test)

            DTS = classificationDecisionRule(csfs_res, cls, clf_name, y_test)

            for i in xrange(0, len(clf_name)):
                _score = DTS[clf_name[i]]
                # print ('Accuracy w/ %d feature: %f' % (n_rep, _score))
                kth_scores[clf_name[i]].append(_score)

        x = np.arange(1, max_num_feat + 1)

        kth_results = {
            'clf_name': clf_name,
            'x': x,
            'scores': kth_scores,
        }

        all_scores[fs_name].append(kth_results)

        # saving k-th dataset configuration
        # with open(path_data_folder + fs_path_output + '/' + str(cc_fold + 1) + '-fold_conf_dataset.pickle',
        #           'wb') as handle:  # TODO: customize output name for recognizing FS parameters' method
        #     pickle.dump(conf_dataset, handle, protocol=pickle.HIGHEST_PROTOCOL)

        cc_fold += 1

    # print all_scores

    print('%s.Averaging results...\n' % p_step)
    p_step += 1
    # Averaging results on k-fold

    # check whether the 'curr_res_output_fold' directory exists and create it if it does not
    curr_res_output_fold = path_data_folder + '/' + res_path_output + '/' + fs_name + resample
    checkFolder(path_data_folder, res_path_output + '/' + fs_name + resample)

    M = {}
    for i in xrange(0, len(clf_name)):
        M.update({clf_name[i]: np.ones([k_fold, max_num_feat]) * 0})

    avg_scores = {}
    std_scores = {}
    for i in xrange(0, len(clf_name)):
        avg_scores.update({clf_name[i]: []})
        std_scores.update({clf_name[i]: []})

    # k-fold results for each classifier
    for k in xrange(0, k_fold):
        for clf in clf_name:
            M[clf][k, :] = all_scores[fs_name][k]['scores'][clf][:max_num_feat]

    for clf in clf_name:
        avg_scores[clf] = np.mean(M[clf], axis=0)
        std_scores[clf] = np.std(M[clf], axis=0)

    x = np.arange(1, max_num_feat + 1)
    results = {
        'clf_name': clf_name,
        'x': x,
        'M': M,
        'scores': avg_scores,
        'std': std_scores
    }

    # print avg_scores

    with open(curr_res_output_fold + '/clf_results.pickle', 'wb') as handle:
        pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print('Done with %s, [%d-cross validation] ' % (dataset_name[1:], k_fold))
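
The averaging step above stacks one accuracy curve per fold into M and reduces along the fold axis; stripped of the bookkeeping, the computation is essentially this (toy numbers, single classifier):

import numpy as np

k_fold, max_num_feat = 3, 4
# toy accuracy curves: one row per fold, one column per number of selected features
M = np.array([[0.70, 0.74, 0.78, 0.77],
              [0.68, 0.75, 0.79, 0.80],
              [0.71, 0.73, 0.77, 0.78]])

avg_scores = np.mean(M, axis=0)  # mean accuracy per feature count, across folds
std_scores = np.std(M, axis=0)   # fold-to-fold variability
print(avg_scores, std_scores)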
Example #11
#x_train = pd.read_csv(DIR + "train.csv", index_col=0, sep=',')
#principal_component_analysis(x_train)

DIR = '/mnt/nb254_data/exp/exp_askubuntu/'
dir_c = '/mnt/nb254_data/exp/exp_askubuntu/clustering/'

filenames = {
    'input': DIR + "dataMLClust.csv",
    'clustering': dir_c + 'data_file_for_clustering.csv',
    'stats': dir_c + 'stats.csv',
    'clusters': dir_c + 'clustering.csv',
    'pca': dir_c + 'pca.csv',
    'out': 'questions.csv'
}

#clusteringA(dir_c, filenames)
clustering_types = [
    'kmeans', 'spectral', 'birch', 'dbscan', 'affinity_propagation', 'ward',
    'average_linkage'
]
clust = initClust(exp=1319,
                  n_clusters=50,
                  sample_size=1929906,
                  features_to_use=ftrs.setFeaturesToUseAll() + ['PostId'] +
                  ['SecondsToAcceptedAnswer'],
                  clustering_type=clustering_types[0])

#data, results = dp.getDataForClustering(filenames, clust)

clusteringA(clust, dir_c, filenames)
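
clusteringA itself is not shown in any of these examples; for the 'kmeans' entry selected above, its core would presumably boil down to something like the following scikit-learn sketch (hypothetical: the real routine also covers the other algorithms in clustering_types and writes the CSV outputs listed in filenames):

import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

def run_kmeans(input_csv, output_csv, n_clusters=50):
    # hypothetical helper: cluster the prepared feature table and save the labels
    data = pd.read_csv(input_csv, index_col=0)
    X = StandardScaler().fit_transform(data.values)
    labels = KMeans(n_clusters=n_clusters, random_state=0).fit_predict(X)
    data.assign(cluster=labels).to_csv(output_csv)

# run_kmeans(filenames['input'], filenames['clusters'], n_clusters=50)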