Example #1
import pandas as pd
from sklearn.feature_selection import SelectFwe


def SelectFwe_selector(data, target, sf):
    # Keep the features whose p-values pass the family-wise error rate test.
    selector = SelectFwe(score_func=sf)
    data_new = selector.fit_transform(data.values, target.values.ravel())
    outcome = selector.get_support(indices=True)
    new_features = []  # The names of the selected features
    for ind in outcome:
        new_features.append(data.columns.values[ind])
    return pd.DataFrame(data_new, columns=new_features)
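A minimal usage sketch (the iris data and the f_classif scorer are illustrative assumptions, not part of the original snippet):

from sklearn.datasets import load_iris
from sklearn.feature_selection import f_classif

# Hypothetical example input: any DataFrame/Series pair with matching rows works.
iris = load_iris(as_frame=True)
selected = SelectFwe_selector(iris.data, iris.target.to_frame(), f_classif)
print(selected.columns.tolist())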
Example #2
import pandas as pd

dataset = pd.read_csv('regressionDataSet.csv')
x = dataset.iloc[:, 1:].values
y = dataset.iloc[:, 0].values

# The default score_func (f_classif) assumes a categorical target;
# for a regression target f_regression is the appropriate scorer.
from sklearn.feature_selection import f_regression

#feature selector 1: keep the 5 highest-scoring features
from sklearn.feature_selection import SelectKBest
fs1 = SelectKBest(f_regression, k=5)
x_new1 = fs1.fit_transform(x, y)

#feature selector 2: control the false discovery rate
from sklearn.feature_selection import SelectFdr
fs2 = SelectFdr(f_regression)
x_new2 = fs2.fit_transform(x, y)

#feature selector 3: recursive feature elimination with a linear model
from sklearn.linear_model import LinearRegression
estimator = LinearRegression()
from sklearn.feature_selection import RFE
fs3 = RFE(estimator, n_features_to_select=5)
x_new3 = fs3.fit_transform(x, y)

#feature selector 4: keep features whose model coefficients pass a threshold
from sklearn.feature_selection import SelectFromModel
fs4 = SelectFromModel(estimator)
x_new4 = fs4.fit_transform(x, y)

#feature selector 5: control the family-wise error rate
from sklearn.feature_selection import SelectFwe
fs5 = SelectFwe(f_regression)
x_new5 = fs5.fit_transform(x, y)
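All five selectors expose the same get_support API, so the columns each one kept can be compared by name. A short sketch reusing the variables above:

# Map each selector's boolean support mask back to the original column names.
feature_names = dataset.columns[1:]
for name, fs in [('KBest', fs1), ('FDR', fs2), ('RFE', fs3),
                 ('FromModel', fs4), ('FWE', fs5)]:
    print(name, list(feature_names[fs.get_support()]))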
Example #3
from sklearn import svm
from sklearn.feature_selection import SelectPercentile, SelectFwe, chi2
from sklearn.model_selection import train_test_split

#splitting training and test set
x_train, x_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.20,
                                                    random_state=0)

#Chi-Squared Analysis
sel = SelectPercentile(chi2, percentile=80)
sel.fit(x_train, y_train)
x_train = sel.transform(x_train)
x_test = sel.transform(x_test)

#Univariate Feature Selection
# alpha is a p-value threshold; 150.0 keeps every feature (newer
# scikit-learn releases may reject values outside [0, 1])
fs = SelectFwe(alpha=150.0)
x_train = fs.fit_transform(x_train, y_train)
x_test = fs.transform(x_test)

#Classifier Fitting
# loss='hinge' and class_weight='balanced' are the current spellings of
# the old loss='l1' and class_weight='auto' options
clf = svm.LinearSVC(C=10,
                    penalty='l2',
                    loss='hinge',
                    dual=True,
                    fit_intercept=False,
                    class_weight='balanced')
clf.fit(x_train, y_train)

###############################################
'''Printed Data Analysis'''
###############################################
Example #4
def preprocess_dataset(X, y, features, exploration_results, fs_example=False):
    """ Preprocess the data according to the issues found during an earlier
    exploration step. These issues concern:
     - feature types,
     - feature dimensionality,
     - missing values,
     - output imbalance,
     - irrelevant features,
     - normalisation,
     - multicollinearity

    Since feature selection can be very dataset-specific, it can also be
    omitted from the preprocessing steps.

    :param X: A numpy matrix of the data. First axis corresponding to instances, second axis corresponding to
    features
    :param y: A numpy array of the output. The length of the array should correspond to the size of the first
    axis of X
    :param features: A numpy array of the feature names. The length of the array should correspond to the size of the
    second axis of X
    :param exploration_results: A dict with the results of the earlier exploration, corresponding to the aforementioned
    issues
    :param fs_example: Whether an example of feature selection should also be performed. Default: False
    :return: The preprocessed X, y and features
    """

    # Test the input to be according to the standards
    robustness_methods.check_input_arrays(X, y, features)

    # First, handle missing values
    if exploration_results['mv']:
        print("\nStarting missing value handling...")
        old_features = np.copy(features)
        if exploration_results['cca']:
            X, y = LDM.cca(X, y, missing_values='')
        elif exploration_results['aca']:
            X, features = LDM.aca(X, features, missing_values='')
        else:
            X, features = LDM.aca(X,
                                  features,
                                  missing_values='',
                                  removal_fraction=0.15)

            X = impute.mean_imputation(X, missing_values='')

        removed_features = _return_removed_features(features, old_features)

        print(
            "These features are removed due to having too many missing values: %s"
            % removed_features)

    if exploration_results['irrelevance'] > 0:
        print("\nRemoving irrelevant features...")
        # Remove irrelevant
        irr_feat_loc = exploration_results['irrelevant_features']
        X = np.delete(X, irr_feat_loc, axis=1)
        old_features = np.copy(features)
        features = np.delete(features, irr_feat_loc)
        removed_features = _return_removed_features(features, old_features)

        print("These features are removed due to having no information: %s" %
              removed_features)

    if exploration_results['norm_means'] or exploration_results['norm_stdev']:
        print("\nNormalising numeric features...")
        # Normalise or standardise values
        NS.normalise_numeric_features(X, exploration_results['stand'],
                                      exploration_results['norm_means'],
                                      exploration_results['norm_stdev'])

    # Then convert categorical features to numeric values
    if exploration_results['cat']:
        print("\nHot encoding categorical values...")
        X, features = HE.hot_encode_categorical_features(X, features)

    if exploration_results['fs'] and fs_example:
        print("\nDoing an example of feature selection...")
        # Feature selection if multicollinearity
        if exploration_results['mc']:
            # Remove multicollinearity
            feature_selector = WM.ForwardSelector(threshold=0.0001)

            # Order to have more relevant features first
            feature_orderer = OM.FeatureOrderer(f_classif)
            X = feature_orderer.fit_transform(X, y)
            features = features[np.argsort(-feature_orderer.scores_)]
        else:
            feature_selector = SF(f_classif, alpha=0.05)

        # Transform data to feature_selection
        X = feature_selector.fit_transform(X, y)
        old_features = np.copy(features)
        features = features[feature_selector.get_support()]

        # Remove extra features as only 200 are needed.
        if features.shape[0] > 200:
            print(
                "Extra feature selection is done to reduce the number of features to 200..."
            )
            extra_feature_selector = SelectKBest(f_classif, k=200)
            X = extra_feature_selector.fit_transform(X, y)
            features = features[extra_feature_selector.get_support()]

        removed_features = _return_removed_features(features, old_features)

        print("These features are removed due to feature selection: %s" %
              removed_features)

    return X, y, features
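The helper _return_removed_features is called throughout but not shown in this snippet; a plausible reconstruction, under the assumption that it simply diffs the two name arrays:

import numpy as np

def _return_removed_features(features, old_features):
    # Hypothetical sketch: the names present before a preprocessing step
    # but absent afterwards.
    return list(np.setdiff1d(old_features, features))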
Example #5
from sklearn import svm
from sklearn.feature_selection import SelectFwe
from sklearn.metrics import classification_report

# alpha is a p-value threshold; 700.0 keeps every feature (newer
# scikit-learn releases may reject values outside [0, 1])
fs = SelectFwe(alpha=700.0)
print("Before", x_train.shape)

clf = svm.LinearSVC(C=100, penalty="l2", dual=False)
clf.fit(x_train, y_train)

print("NO FEATURE SELECTION")
print("Training Accuracy")
print(clf.decision_function(x_train))  # raw decision values, not an accuracy score
print(classification_report(y_train, clf.predict(x_train), target_names=target_names))

print("Testing Accuracy")
print(classification_report(y_test, clf.predict(x_test), target_names=target_names))


x_train = fs.fit_transform(x_train, y_train)

print("After", x_train.shape)

clf.fit(x_train, y_train)

# Disabled decision-boundary plotting, kept for reference:
# w = clf.coef_
# print(w)
# a = np.array(w[0].todense(), dtype=np.float)
# b = np.array(w[1].todense(), dtype=np.float)
# c = -100*a/b
# print(a, b, c)
# xx = np.linspace(-5, 5)
# yy = c * xx - clf.intercept_[0] / b
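One pitfall in the snippet above: x_test is never passed through the fitted selector, so a later clf.predict(x_test) would fail with a shape mismatch. A minimal sketch of the missing step, reusing the same variables:

# Apply the already-fitted selector to the test set so its
# dimensionality matches the retrained classifier.
x_test = fs.transform(x_test)
print("Test set after selection:", x_test.shape)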

Example #6
import csv
import time

from sklearn import svm
from sklearn.feature_selection import SelectPercentile, SelectFwe, chi2

# parse_labeled_data, get_x_y, the graph_* helpers, filename and testdata
# are module-level definitions not shown in this snippet.


def run():

    target_names = ["Self", "Another Person", "General Statement"]
    tweets_and_labels = parse_labeled_data(filename)

    #splitting training and test set
    y_train, x_test, x_train = get_x_y(tweets_and_labels, testdata)

    #Chi-Squared Analysis
    sel = SelectPercentile(chi2, percentile=80)
    sel.fit(x_train, y_train)
    x_train = sel.transform(x_train)
    x_test = sel.transform(x_test)

    #Univariate Feature Selection
    # alpha is a p-value threshold; 150.0 keeps every feature (newer
    # scikit-learn releases may reject values outside [0, 1])
    fs = SelectFwe(alpha=150.0)
    x_train = fs.fit_transform(x_train, y_train)
    x_test = fs.transform(x_test)

    #Classifier Fitting
    # loss='hinge' and class_weight='balanced' are the current spellings of
    # the old loss='l1' and class_weight='auto' options
    clf = svm.LinearSVC(C=10,
                        penalty='l2',
                        loss='hinge',
                        dual=True,
                        fit_intercept=False,
                        class_weight='balanced')
    clf.fit(x_train, y_train)

    returned = clf.predict(x_test)
    print(returned)
    #Print relevant usernames & tweets to .csv file
    t = time.strftime("%d_%m_%Y")
    output1 = 'classifications/' + t + '_self.csv'
    output2 = 'classifications/' + t + '_another_person.csv'
    with open(output1, 'w+') as o1:
        wr = csv.writer(o1, quoting=csv.QUOTE_ALL)
        for i, val in enumerate(returned):
            if val == 0:
                row = [testdata[i][1], testdata[i][0]]
                wr.writerow(row)

    with open(output2, 'w+') as o2:
        wr = csv.writer(o2, quoting=csv.QUOTE_ALL)
        for i, val in enumerate(returned):
            if val == 1:
                row = [testdata[i][1], testdata[i][0]]
                wr.writerow(row)
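    # Note: the two loops above differ only in the predicted class and the
    # output file; a hypothetical single-pass variant (same variables):
    # with open(output1, 'w+') as o1, open(output2, 'w+') as o2:
    #     writers = {0: csv.writer(o1, quoting=csv.QUOTE_ALL),
    #                1: csv.writer(o2, quoting=csv.QUOTE_ALL)}
    #     for i, val in enumerate(returned):
    #         if val in writers:
    #             writers[val].writerow([testdata[i][1], testdata[i][0]])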

    ########################################################################
    '''Graphing of Data'''
    '''Note, since there is no annotation for test data'''
    '''This is a visual representation of output data, not model accuracy'''
    ########################################################################

    graph = True
    if (graph):
        #Graph setup
        X, Y, Z, new_y = graph_setup(clf, x_test, returned)
        #graph Scatter Plot of training data
        graph_scatter(x_train, y_train)
        #Graph 3D Plot of test data
        graph_3d(X, Y, Z, new_y)
        #Graph 2-D Plot of test data
        graph_2d(X, Y, new_y)