Example #1
print ""
print "Selected Feature list - before Feature_Selection", features1

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features1, sort_keys=True)
labels, features = targetFeatureSplit(data)

### We do not know yet whether feature scaling and feature filtering with SelectKBest will benefit our model.
### But let's try them anyway.

# Scale features
scaler = MinMaxScaler()
features = scaler.fit_transform(features)

# K-best features - choosing 6 features for a trial
k_best = SelectKBest(k=6)
k_best.fit(features, labels)

result_list = zip(k_best.get_support(), features1[1:], k_best.scores_)
result_list = sorted(result_list, key=lambda x: x[2], reverse=True)
#print "K-best features - i.e. top 6 features selected:", result_list
"""
OUTPUT:
K-best features - i.e. top 6 features selected: 
[(True, 'exercised_stock_options', 25.097541528735491), 
(True, 'total_stock_value', 24.467654047526391), 
(True, 'bonus', 21.060001707536578), 
(True, 'wealth', 19.457343207083316), 
(True, 'salary', 18.575703268041778), 
(True, 'fraction_to_poi', 16.641707070468989), 
(False, 'long_term_incentive', 10.072454529369448), 
Example #2
def select_feat(data, labels, n_components):

    data = SelectKBest(chi2, k=n_components).fit_transform(data, labels)
    return data
Example #3
def test_bagging_with_pipeline():
    estimator = BaggingClassifier(make_pipeline(SelectKBest(k=1),
                                                DecisionTreeClassifier()),
                                  max_features=2)
    estimator.fit(iris.data, iris.target)
Example #4
    elif FeatSelection_SVM == True:
        X = LinearSVC(C=1, penalty="l1", dual=False, class_weight='auto').fit_transform(X, y)
        # X = LogisticRegression(C=0.01, class_weight='auto').fit_transform(X, y)
        featureNames = featureNames[LogRegFeats.get_support()]  # NOTE: LogRegFeats is not defined in this excerpt
        print("SVC Transformed X:", X.shape)

    '''
    print("Plot #Feats vs Classification performance:")
    PlotPerfPercentFeatures(X_LR,y,est=SVC(C=100))
    '''

    KFilt=None
    # KFilt=200

    if KFilt is not None:
        k = SelectKBest(k=KFilt).fit(X,y)
        X=k.transform(X)
        featureNames=featureNames[k.get_support()]
        print("X reduced to K best features: ",X.shape)

    print("Performance as a function of percent of features used:")
    PlotPerfPercentFeatures(X,y,est=LinearSVC())

    #varFilt = VarianceThreshold(threshold=0.05)
    #X = varFilt.fit_transform(X)
    #print(X.shape,"X post low variance feature filtering")

    'EG - graph best features; feature selection using RF, ensemble classifiers..'
    'http://nbviewer.ipython.org/github/herrfz/dataanalysis/blob/master/assignment2/samsung_data_prediction_submitted.ipynb'

    RFE_FeatsToKeep = 15
Example #5
                                   max_df=0.75,
                                   stop_words='english',
                                   tokenizer=tokenize)
#count_vectorizer = count_vectorizer.fit(twenty_train.data)
# tf-idf transformer
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
##########

##########
print("SGDClassifier")
#SGDClassifier
from sklearn.linear_model import SGDClassifier
pipeline = Pipeline([
    ('vect', count_vectorizer),
    ('chi2', SelectKBest(chi2, k=1500)),
    ('tfidf', tfidf_transformer),
    ('clf',
     SGDClassifier(loss='hinge',
                   penalty='l2',
                   alpha=1e-3,
                   n_iter=5,
                   random_state=42)),
])

#Fit
print("Fit")
text_clf_sgd = pipeline.fit(twenty_train.data, twenty_train.target)

#Evaluation of the performance on the test set
print("Evaluation")
Example #6
m = len(temparray)

tn = 0
fn = 0
for i in y:
    if i == 1:
        tn = tn + 1
    else:
        fn = fn + 1

print(4, tn, fn)

maxFeature = 16
timestampMaxFeature = 6
numMaxFeature = 12
model2 = SelectKBest(chi2, k=maxFeature)  # select the k best features
model2.fit_transform(temparray, y)
featureScore = model2.scores_.tolist()

fig = plt.figure(figsize=(9.6, 5.4))
# not drawing a line plot because there are a large number of zeros
plt.bar(list(range(31)), featureScore)

ax = fig.add_subplot(1, 1, 1)

ax.xaxis.grid(True, which='major')  # the x-axis grid uses major ticks
ax.set_title("Cate " + str(cate) +
             " Feature Score Distribution Of Chi-square Test")
ax.set_xlabel("Feature")
ax.set_ylabel("Score")
# plt.show()
Example #7
def get_kBest_mutual(X, y):
    return SelectKBest(score_func=mutual_info_regression, k="all").fit(X, y)
Example #8
#drawing scatterplot on numerical data
sns.pairplot(cardial_3)


# 1st method: univariate feature selection

from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

array = cardial_3.values
X = array[:,0:120]
Y = array[:,120]
# feature extraction
test = SelectKBest(score_func=chi2, k=10) # k tells how many top features we need
fit = test.fit(X, Y)
# summarize scores
set_printoptions(precision=3)
scores=fit.scores_
scores_df=pd.DataFrame(scores)

top_index = scores_df.sort_values(0, ascending=False).head(10).index
colname_uni = cardial_3.columns[top_index]
# features = fit.transform(X)

	
# 0	RAZRIV
# 1	S_AD_KBRIG
# 2	ROE
# 3	ZSN_A
Example #9
target = data['PSS_Stress']
data = data.drop('PSS_Stress', axis=1)

# Missing Data Filtering

print(data.isnull().any(
    axis=1).sum())  # number of records that contain at least one 'NaN' value

data = data.fillna(data.median())  # replace NaN with the median value
# data = data.fillna(data.mean())  # replace NaN with the mean value
# data = data.dropna()  # discard records that contain NaN

# Feature selection

selector = SelectKBest(f_classif, k=5)  # test for k = 1..10
selector.fit(data, target)
cols = selector.get_support(indices=True)
cols_names = list(data.columns[cols])

for idx, (ci, cn) in enumerate(zip(cols, cols_names)):
    print("*" * (len(cols) - idx) + " " * idx, ci, cn)

data = data[cols_names]

# Compare results:

scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
values_standardized = scaler.fit_transform(data.values)
data = pd.DataFrame(values_standardized, columns=data.columns)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
print

print "Extracting features from the test dataset using the same vectorizer"
t0 = time()
X_test = vectorizer.transform(data_test.data)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_test.shape
print

if opts.select_chi2:
    print("Extracting %d best features by a chi-squared test" %
          opts.select_chi2)
    t0 = time()
    ch2 = SelectKBest(chi2, k=opts.select_chi2)
    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)
    print "done in %fs" % (time() - t0)
    print


def trim(s):
    """Trim string to fit on terminal (assuming 80-column display)"""
    return s if len(s) <= 80 else s[:77] + "..."


# mapping from integer feature name to original token string
feature_names = np.asarray(vectorizer.get_feature_names())

Example #11
from nltk.corpus import words

df = pd.read_csv('fake_real_tweets.csv')
print('loaded data')
y = df.label
df = df.drop('label', axis=1)
indices = df.index.values
x_train, x_main_test, y_train, y_main_test, train_indices, test_indices = train_test_split(
    df['text'], y, indices, test_size=0.33, shuffle=True)

stop_words = ['http', 'https', 'twitter', 'com', 'www']

print('learning tfidf')
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words, max_df=0.7)
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
clf = SelectKBest(score_func=mutual_info_classif, k=1000)
tfidf_fit = clf.fit(tfidf_train, y_train)
tfidf_x_train_ft = tfidf_fit.transform(tfidf_train)
print('done tfidf')

print('learning count vectorizer')
count_vectorizer = CountVectorizer(stop_words=stop_words)
count_train = count_vectorizer.fit_transform(x_train)
clf = SelectKBest(score_func=mutual_info_classif, k=1000)
count_fit = clf.fit(count_train, y_train)
count_x_train_ft = count_fit.transform(count_train)
print('done count vectorizer')

print('learning mn count')
mn_count_clf = MultinomialNB()
mn_count_clf.fit(count_x_train_ft, y_train)
Example #12
for rs in random_seeds:
    # choose a random sample of zeros (Legit Class)
    credit_data_df_legit_random = credit_data_df_legit.sample(numberOfZeros,
                                                              random_state=rs)

    # merge the above with the ones (Fraud Class) and do the rest of the pipeline with it
    result = pd.concat([credit_data_df_legit_random, credit_data_df_fraud])

    # create dataframe X, which includes variables time, amount, V1, V2, V3, V4 etc
    X = result[features]

    # create array y, which includes the classification only
    y = result['Class']

    #Select the best features Using the SelectKBest Method from sklearn
    select_kbest = SelectKBest(f_classif, k=24)
    #Fit the method onto the data and then return a transformed array
    X_new = select_kbest.fit_transform(X, y)

    # use sklearn to split the X and y, into X_train, X_test, y_train y_test with 80/20 split
    X_train, X_test, y_train, y_test = train_test_split(X_new,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=rs,
                                                        stratify=y)

    # ------------------------------------------------------------------------------------------------------------------------------------------------------------------#
    #                                                    TRAINING ON THE TRAINING SET
    # ------------------------------------------------------------------------------------------------------------------------------------------------------------------#
    # use sklearns Logistic Regression to fit a model to train data
    clf = LogisticRegression(random_state=rs,
Example #13
newdf_test.drop('service', axis=1, inplace=True)

print(newdf_test['label'].value_counts())

X_DOS = newdf.drop('label', axis=1)
Y_DOS = newdf.label
X_DOS_test = newdf_test.drop('label', axis=1)
Y_DOS_test = newdf_test.label

colNames = list(X_DOS)

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
np.seterr(divide='ignore', invalid='ignore')
fclass = SelectKBest(
    f_classif,
    k=111)  # k was chosen by iterating k from 1 to 120; maximum accuracy came at k=111 (see the sketch below)
fclass.fit(X_DOS, Y_DOS)
true = fclass.get_support()
fclasscolindex_DOS = [i for i, x in enumerate(true) if x]
fclasscolname_DOS = list(colNames[i] for i in fclasscolindex_DOS)
print('Features selected :', fclasscolname_DOS)
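# An illustrative sketch (not part of the original script) of the k sweep mentioned in
# the comment above, scoring each candidate k with cross-validation; the classifier
# and cv value used here are assumptions:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

sweep_scores = {}
for k_try in range(1, X_DOS.shape[1] + 1):
    X_k = SelectKBest(f_classif, k=k_try).fit_transform(X_DOS, Y_DOS)
    sweep_scores[k_try] = cross_val_score(DecisionTreeClassifier(), X_k, Y_DOS, cv=3).mean()
best_k = max(sweep_scores, key=sweep_scores.get)  # the k with the highest mean accuracy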

features = newdf[fclasscolname_DOS].astype(float)
features1 = newdf_test[fclasscolname_DOS].astype(float)
lab = newdf['label']
lab1 = newdf_test['label']

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=0)
t0 = time()
Example #14
                                names_hashingfile,
                                order=-1)
    """
	Random forest consists of a number of decision trees. Every node in the decision trees is a condition on a single feature, 
	designed to split the dataset into two so that similar response values end up in the same set. 	Random forest’s impurity based 
	ranking is typically aggressive in the sense that there is a sharp drop-off of scores after the first few top ones. 
	Tree based methods can model non-linear relations well and don’t require much tuning. 
	For a forest, the impurity decrease from each feature can be averaged and the features are ranked according to this measure.
	"""
    rf = RandomForestRegressor()
    rf.fit(X, Y)
    ranks["RF"] = rank_to_dict(rf.feature_importances_, names_hashingfile)
    """
	Selecting the k-best features
	"""
    kbest = SelectKBest(chi2,
                        k=total_features)  # using all the features for analysis
    kbest.fit(X, Y)
    #print(np.abs(kbest.scores_))
    ranks["KBest"] = rank_to_dict(np.nan_to_num(np.abs(kbest.scores_)),
                                  names_hashingfile)
    """
	Another tree based classifier
	"""
    treec = ExtraTreesClassifier()
    treec.fit(X, Y)
    ranks["ExtraTrees"] = rank_to_dict(treec.feature_importances_,
                                       names_hashingfile)
    """
	With linear correlation (Lin. corr.), each feature is evaluated independently, and we measure the linear relationship 
	between each feature and the response variable.
	"""
Example #15
print("%d samples, %d features" % (n_subjects, n_features))

### Prediction with SVR #######################################################
print("ANOVA + SVR")
### Define the prediction function to be used.
# Here we use Support Vector Regression (SVR) with a linear kernel
from sklearn.svm import SVR

svr = SVR(kernel='linear')

### Dimension reduction
from sklearn.feature_selection import SelectKBest, f_regression

# Here we use a classical univariate feature selection based on F-test,
# namely Anova.
feature_selection = SelectKBest(f_regression, k=2000)

# We have our predictor (SVR), our feature selection (SelectKBest), and now,
# we can plug them together in a *pipeline* that performs the two operations
# successively:
from sklearn.pipeline import Pipeline

anova_svr = Pipeline([('anova', feature_selection), ('svr', svr)])

### Fit and predict
anova_svr.fit(gm_maps_masked, age)
age_pred = anova_svr.predict(gm_maps_masked)

### Visualization
### Look at the SVR's discriminating weights
coef = svr.coef_
Example #16
        data_dict[key]['from_poi_to_this_person'],
        data_dict[key]['to_messages'])
    data_dict[key]['ratio_to_poi_email'] = np.true_divide(
        data_dict[key]['from_this_person_to_poi'],
        data_dict[key]['from_messages'])

my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

#Feature Selection
trimmed_features_list = ['poi']
k = 5
select = SelectKBest(k=k)
select = select.fit(features, labels)
features = select.transform(features)
top_scores = np.sort(select.scores_)[-k:]

for i in range(len(features_list[1:])):
    if select.scores_[i] in top_scores:
        trimmed_features_list.append(features_list[1:][i])
features_list = trimmed_features_list

### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html
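# A minimal illustration (not part of the original script) of the Pipeline idea
# mentioned in the note above, chaining a multi-stage transform (here PCA) with a
# classifier; GaussianNB is just a placeholder choice:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB

clf = Pipeline([('pca', PCA(n_components=3)), ('nb', GaussianNB())])
# clf.fit(features, labels) would then run both stages in sequence.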
Example #17
t0 = time()
mlp.fit(X_train_s, y_train)
print("done in %0.3fs" % (time() - t0))

y_pred = mlp.predict(X_test_s)
print(classification_report(y_test, y_pred, target_names=target_names))


# %%
# Univariate feature filtering (Anova) with Logistic-L2
# -----------------------------------------------------

anova_l2lr = Pipeline([
    ('standardscaler', preprocessing.StandardScaler()),
    ('anova', SelectKBest(f_classif)),
    ('l2lr', lm.LogisticRegression(max_iter=1000, class_weight='balanced',
                                   fit_intercept=False))
])

param_grid = {'anova__k': [50, 100, 500, 1000, 1500, X_train.shape[1]],
              'l2lr__C': 10. ** np.arange(-3, 3)}
anova_l2lr_cv = GridSearchCV(anova_l2lr, cv=5,  param_grid=param_grid,
                             n_jobs=5)

t0 = time()
anova_l2lr_cv.fit(X=X_train, y=y_train)
print("done in %0.3fs" % (time() - t0))

print("Best params found by grid search:")
print(anova_l2lr_cv.best_params_)
Example #18
#
# One downside of this is that we are using knowledge from the dataset to select features, and thus introducing some overfitting. We could get around the overfitting in the "real world" by using a subset of the data for feature selection, and using a different subset for training the algorithm. We'll make things a bit simpler for now and skip that step.
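# A minimal sketch (not from the original notebook) of that held-out approach:
# select features on one split and train on the other, so the selected columns are
# not chosen using the training rows. The data below is synthetic, for illustration only.
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2

X_demo = np.random.randint(0, 5, size=(200, 50))
y_demo = np.random.randint(0, 2, size=200)
X_select, X_fit, y_select, y_fit = train_test_split(X_demo, y_demo,
                                                    test_size=0.5, random_state=0)
demo_selector = SelectKBest(chi2, k=10).fit(X_select, y_select)
X_fit_reduced = demo_selector.transform(X_fit)  # the model would then be trained on these columns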

# In[7]:

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Convert the upvotes variable to binary so it works with a chi-squared test.
col = submissions["upvotes"].copy(deep=True)
col_mean = col.mean()
col[col < col_mean] = 0
col[(col > 0) & (col > col_mean)] = 1

# Find the 1000 most informative columns
selector = SelectKBest(chi2, k=1000)
selector.fit(full_matrix, col)
top_words = selector.get_support().nonzero()

# Pick only the most informative columns in the data.
chi_matrix = full_matrix[:, top_words[0]]

# ###7: Adding meta features

# If we ignore the "meta" features of the headlines we're missing out on a lot of good information. These features are things like length, amount of punctuation, average word length, and other sentence specific features.
#
# Adding these in can greatly increase prediction accuracy.
#
# To add them in, we'll loop over our headlines, and apply a function to each one. Some functions will count the length of the headline in characters, and others will do more advanced things, like counting the number of digits.

# In[8]:
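# (The meta-feature cell itself is not included in this excerpt.) An illustrative
# sketch of the idea described above; `headlines` stands for any iterable of
# headline strings, and the particular features below are assumptions:
import re

def headline_meta_features(headlines):
    """Return [char length, word count, digit count, punctuation count] per headline."""
    rows = []
    for h in headlines:
        rows.append([
            len(h),                          # length in characters
            len(h.split()),                  # number of words
            sum(c.isdigit() for c in h),     # number of digits
            len(re.findall(r"[^\w\s]", h)),  # amount of punctuation
        ])
    return rows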
Example #19
def get_kBest_f_regr(X, y):
    return SelectKBest(score_func=f_regression, k=X.shape[1]).fit(X, y)
Example #20
# In[20]:

from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, VarianceThreshold, f_regression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

imputer = SimpleImputer(strategy='median')
constant = VarianceThreshold(
    threshold=0.0)  # feature selector that removes constant (zero-variance) features
min_max_scaler = MinMaxScaler()
selector = SelectKBest(f_regression)
knn = KNeighborsRegressor()

selectkbest = Pipeline([('impute', imputer), ('constant', constant),
                        ('scaler', min_max_scaler), ('select', selector),
                        ('knn_regression', knn)])

selectkbest = selectkbest.fit(X_train, y_train)

y_test_pred = selectkbest.predict(X_test)
print("\n The MSE is:\n", metrics.mean_squared_error(y_test_pred, y_test))

# Hyper-parameter tuning of the number of features k (a sketch follows below)

# In[21]:
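# (The tuning cell is cut off in this excerpt.) A hypothetical sketch of tuning k with
# GridSearchCV over the pipeline defined above; the candidate k values are assumptions:
param_grid = {'select__k': [5, 10, 15, 20]}
grid = GridSearchCV(selectkbest, param_grid=param_grid,
                    scoring='neg_mean_squared_error', cv=5)
# grid.fit(X_train, y_train) would then expose the best k via grid.best_params_.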
Example #21
import pandas
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

df = pandas.read_csv('../Data/customer_behavior.csv')
x = df[['bachelor', 'gender', 'age', 'salary']]
y = df['purchased'].values
sel = VarianceThreshold()
x_val = sel.fit_transform(x)
print(x_val)
print(sel.get_support())

clf = SelectKBest(chi2, k=2)
x_new = clf.fit_transform(x, y)  # fit and transform in one step
print(clf.scores_)
print(x_new)
Example #22
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
glass = pd.read_csv("glass.csv")

#####correlation
plt.figure(figsize=(10, 5))
corr = glass.corr()
sns.heatmap(corr, annot=True, linewidths=.2)

# feature selection
x = glass.iloc[:, 0:9]
y = glass.iloc[:, 9]
bestfeature = SelectKBest(score_func=chi2, k='all')
fit = bestfeature.fit(x, y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(x.columns)
featurescore = pd.concat([dfcolumns, dfscores], axis=1)
featurescore.columns = ['Feature', 'Score']
print(featurescore.nlargest(9, "Score"))

###feature importance
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
model = ExtraTreesClassifier()
model.fit(x, y)
print(model.feature_importances_)  # use the built-in feature_importances_ attribute of tree-based classifiers
#plot graph of feature importances for better visualization
Example #23
    cor_feature = X.iloc[:, np.argsort(np.abs(cor_list)
                                       )[-num_feats:]].columns.tolist()
    # feature selection flag: True if the feature is selected, False otherwise
    cor_support = [True if i in cor_feature else False for i in feature_name]
    return cor_support, cor_feature


cor_support, cor_feature = cor_selector(X, y, num_feats)
print(str(len(cor_feature)), 'selected features')

## 2) Chi-squared - between features and target, keep features with higher chi-square
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
X_norm = MinMaxScaler().fit_transform(X)
chi_selector = SelectKBest(chi2, k=num_feats)
chi_selector.fit(X_norm, y)
chi_support = chi_selector.get_support()
chi_feature = X.loc[:, chi_support].columns.tolist()
print(str(len(chi_feature)), 'selected features')

## 3) Recursive elimination - recursively reducing/eliminating features
# feature importance is recalculated at each iteration, and the lowest-ranked features get dropped

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
rfe_selector = RFE(estimator=LogisticRegression(),
                   n_features_to_select=num_feats,
                   step=10,
                   verbose=5)
rfe_selector.fit(X_norm, y)
Example #24
# In[12]:

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list_n, sort_keys=True)
labels, features = targetFeatureSplit(data)

# In[18]:


def skipOne(elem):
    return elem[1]


from sklearn.feature_selection import SelectKBest, f_classif
selector = SelectKBest(f_classif, k=5)
selector.fit(features, labels)
scores = zip(features_list_n[1:], selector.scores_)
sorted_scores = sorted(scores, key=skipOne, reverse=True)
pprint.pprint('SelectKBest scores: ')
pprint.pprint(sorted_scores)
all_features = features_list + [(i[0]) for i in sorted_scores[0:20]]
pprint.pprint(all_features)
kBest_features = features_list + [(i[0]) for i in sorted_scores[0:10]]
pprint.pprint('KBest')
pprint.pprint(kBest_features)

# In[14]:

from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(
Example #25
def select_top_k_features(data, labels, n_components=1700):
    data = SelectKBest(chi2, k=n_components).fit_transform(data, labels)
    return data
Example #26
labels = np.array(labels)
cv = StratifiedShuffleSplit(n_splits=1000, random_state=42)
for train_idx, test_idx in cv.split(features, labels):
    features_train, features_test = features[train_idx], features[test_idx]
    labels_train, labels_test = labels[train_idx], labels[test_idx]

### Import modules

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Make pipeline:
pipe = make_pipeline(MinMaxScaler(), SelectKBest(), LogisticRegression(random_state=42))

print "Pipe steps: \n{}".format(pipe.steps)

# parameter grid for SelectKBest:
param_grid = {'selectkbest__k': range(5,16), 'logisticregression__C': (1, 10, 100, 1000), \
              'logisticregression__solver': ('newton-cg', 'lbfgs', 'sag')}

# gridsearch and cross-validation:
grid = GridSearchCV(pipe, param_grid=param_grid, cv=15)

# fitting:
grid.fit(features_train, labels_train)

# evaluation metrics:
from sklearn.metrics import confusion_matrix, recall_score, precision_score, classification_report
Example #27
X_test = vectorizer.transform(data_test.data)
duration = time() - t0
print("done in %fs" % (duration))  #, data_test_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_test.shape)
print()

# mapping from integer feature name to original token string
feature_names = vectorizer.get_feature_names()

select_chi2 = 100000
print("chi-2 ? (True/False)")
Bchi = input("chi-2 ? (True/False)")
if Bchi:  #opts.select_chi2:
    print("Extracting %d best features by a chi-squared test" % select_chi2)
    t0 = time()
    ch2 = SelectKBest(chi2, k=select_chi2)
    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)
    if feature_names:
        # keep selected feature names
        feature_names = [
            feature_names[i] for i in ch2.get_support(indices=True)
        ]
    print("done in %fs" % (time() - t0))
    print()

###############################################################################
# Benchmark classifiers:
# build n models for each classifier type. A test sample is given to all n models;
# each model classifies it, and each model's accuracy is calculated separately and
# then averaged to get the accuracy of the whole scheme.
# NOTE: when a test sample is given, it does NOT receive the list of probabilities
# from all classifiers; each model only reports whether the sample belongs to its
# class or not, and all test samples are given to a model in bulk when it is tested.
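# A hypothetical sketch (not part of the original script) of the per-class
# benchmarking idea described above: one binary model per class, each scored
# separately on the test set, and the per-model accuracies averaged
# (y_tr / y_te are assumed to be NumPy arrays of class labels):
def benchmark_per_class(make_clf, X_tr, y_tr, X_te, y_te, classes):
    accuracies = []
    for c in classes:
        clf_c = make_clf()
        clf_c.fit(X_tr, y_tr == c)                # binary target: class c vs. rest
        accuracies.append(clf_c.score(X_te, y_te == c))
    return sum(accuracies) / len(accuracies)      # average accuracy over the n models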
Example #28
    pickle.dump(clf, open(filename, 'wb'))


train_path = os.path.abspath(join(cwd, args.train))
serialization_dir = os.path.abspath(join(cwd, args.serialization_dir))
print("Load data")
X_train, y_train = load_dataset(train_path)
target_names = list(set([i[0] for i in y_train]))

print("%d documents" % len(X_train))
print("%d categories" % len(target_names))

print("Training model")
t0 = time()
transformer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.8)
ch2 = SelectKBest(chi2, k=20000)
X_train = transformer.fit_transform(X_train)
X_train = ch2.fit_transform(X_train, y_train)

y_transformer = LabelEncoder()
y_train = [item for sublist in y_train for item in sublist]
y_train = y_transformer.fit_transform(y_train)

model = LinearSVC(C=1)
estimator = model.fit(X_train, y_train)
t1 = time() - t0
print("Train time: %0.3fs" % t1)

t0 = time()
save_model(serialization_dir + "/x_transformer.pkl", transformer)
save_model(serialization_dir + "/y_transformer.pkl", y_transformer)
Example #29
def main(footprint_path, training_path, od):
    """

    :param footprint_path: input footprints csv
    :param training_path: input training data csv
    :param od: output directory
    :return:
    """
    plt.rcParams['figure.figsize'] = 10.5, 4

    features = pd.read_csv(footprint_path, index_col=0)
    training_label_df = pd.read_csv(training_path, index_col=0)

    training_data = features.loc[training_label_df.index]
    training_labels = training_label_df['induced']
    """ UNSUPERVISED ROUTE """

    pca = PCA(n_components=None)
    pca.fit(training_data)

    plt.title("PCA Parameter Optimization")
    plt.xlabel("Number Components Used")
    plt.ylabel("Cumulative Explained Variance Ratio")

    plt.xticks(np.arange(pca.n_components_), np.arange(1, pca.n_components_ + 1))
    plt.grid(b=True, which='major', color='0.65', linestyle='--')
    plt.axhline(0.8, color='y', linestyle='--', linewidth=2)
    plt.axhline(0.9, color='g', linestyle='--', linewidth=2)
    plt.plot(np.cumsum(pca.explained_variance_ratio_), 'bo-')

    plt.savefig(os.path.join(od, 'pca_param_optimization.png'))
    plt.clf()

    fig = plt.figure()
    plt.suptitle("Comparison of # PCA Components Kept", fontsize=18)

    ax = plt.subplot(122, projection='3d')
    ax.set_title('PCA Reduction to 3 Components')
    ax.set_xlabel("PCA Component 1")
    ax.set_ylabel("PCA Component 2")
    ax.set_zlabel("PCA Component 3")

    pca = PCA(n_components=3)
    reduced_data = pca.fit_transform(training_data)
    ax.scatter(reduced_data[:, 0],
               reduced_data[:, 1],
               reduced_data[:, 2],
               c=['r' if x else 'b' for x in training_labels],
               s=120,
               alpha=0.5)

    ax = plt.subplot(121)
    ax.set_title('PCA Reduction to 2 Components')
    ax.set_xlabel("PCA Component 1")
    ax.set_ylabel("PCA Component 2")

    pca = PCA(n_components=2)
    reduced_data = pca.fit_transform(training_data)
    ax.scatter(reduced_data[:, 0],
               reduced_data[:, 1],
               c=['r' if x else 'b' for x in training_labels],
               s=120,
               alpha=0.5)

    kmeans = KMeans(init='k-means++', n_clusters=2, n_init=1)
    kmeans.fit(reduced_data)
    h = 0.01
    x_min, x_max = reduced_data[:, 0].min() - 0.1, reduced_data[:, 0].max() + 0.1
    y_min, y_max = reduced_data[:, 1].min() - 0.1, reduced_data[:, 1].max() + 0.1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    ax.imshow(Z,
              interpolation='nearest',
              extent=(xx.min(), xx.max(), yy.min(), yy.max()),
              cmap=plt.cm.Pastel1,
              aspect='auto',
              origin='lower')

    centroids = kmeans.cluster_centers_
    ax.scatter(centroids[:, 0],
               centroids[:, 1],
               marker='x',
               s=169,
               linewidths=3,
               color='black',
               zorder=10)
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)

    fig.set_tight_layout(dict(pad=5.0))
    plt.savefig(os.path.join(od, 'pca_scatter.png'))
    plt.clf()
    """ SUPERVISED ROUTE """

    kb = SelectKBest(chi2, k='all')
    kb.fit(training_data, training_labels)
    kb_df = pd.DataFrame(kb.scores_,
                         columns=['kb_score'],
                         index=training_data.columns)

    clf = ExtraTreesClassifier()
    clf = clf.fit(training_data, training_labels)
    clf_df = pd.DataFrame(clf.feature_importances_,
                          columns=['clf_score'],
                          index=training_data.columns)

    fdf = pd.concat(
        [clf_df.rank().astype(int), clf_df,
         kb_df.rank().astype(int), kb_df],
        axis=1,
        join='inner')
    fdf.columns = ['clf_rank', 'clf_score', 'kb_rank', 'kb_score']
    fdf = fdf.sort_values('clf_rank', ascending=False)

    fdf['combine_rank'] = (fdf['clf_rank'] + fdf['kb_rank']).astype(int)
    fdf = fdf.sort_values('combine_rank', ascending=False)
    fdf['path_length'] = [1 + x.count('&') for x in fdf.index]
    fdf.to_csv(os.path.join(od, 'feature_scores.csv'), float_format='%.3f')

    # In[6]:
    fig = plt.figure()
    fig.suptitle('Feature Stratification', fontsize=18)

    ax = plt.subplot(121)
    ax.set_title("Feature Score Scatter")
    ax.set_xlabel("K-Best Score")
    ax.set_ylabel("Random Forest Score")
    ax.scatter(fdf['kb_score'], fdf['clf_score'], s=120, alpha=0.6)

    ax = plt.subplot(122)
    ax.set_title("Feature Rank Scatter")
    ax.set_xlabel("K-Best Rank")
    ax.set_ylabel("Random Forest Rank")
    ax.scatter(fdf['kb_rank'],
               fdf['clf_rank'],
               s=120,
               alpha=0.6,
               c=(fdf['kb_rank'] + fdf['clf_rank']))

    fig.set_tight_layout(dict(pad=5.0))
    fig.savefig(os.path.join(od, 'feature_strat.png'))
    plt.clf()

    # train LR
    selected_features = fdf['combine_rank'].sort_values(
        ascending=False).head(4).index
    features_cut = training_data[selected_features]

    plt.suptitle("Before and After Feature Selection", fontsize=16)

    plt.subplot(1, 2, 1)
    plt.title('All Features Logistic Regression Confusion Matrix')
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')

    loo = cross_validation.LeaveOneOut(len(training_labels))
    lr = linear_model.LogisticRegression(C=1e5)
    predicted_labels = cross_val_predict(lr,
                                         training_data,
                                         training_labels,
                                         cv=loo)
    cm = confusion_matrix(training_labels, predicted_labels)
    cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm_norm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.colorbar()
    tick_marks = np.arange(2)
    plt.xticks(tick_marks, ['uninteresting', 'interesting'], rotation=45)
    plt.yticks(tick_marks, ['uninteresting', 'interesting'])

    for i, cas in enumerate(cm):
        for j, c in enumerate(cas):
            plt.annotate(c,
                         xy=(j, i),
                         horizontalalignment='center',
                         verticalalignment='center',
                         bbox=dict(fc='w', boxstyle='round,pad=1'))

    plt.subplot(1, 2, 2)
    plt.title('Top Features Logistic Regression Confusion Matrix')
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')

    loo = cross_validation.LeaveOneOut(len(training_labels))
    lr = linear_model.LogisticRegression(C=1e5)
    predicted_labels = cross_val_predict(lr,
                                         features_cut,
                                         training_labels,
                                         cv=loo)
    cm = confusion_matrix(training_labels, predicted_labels)
    cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm_norm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.colorbar()
    tick_marks = np.arange(2)
    plt.xticks(tick_marks, ['uninteresting', 'interesting'], rotation=45)
    plt.yticks(tick_marks, ['uninteresting', 'interesting'])

    for i, cas in enumerate(cm):
        for j, c in enumerate(cas):
            plt.annotate(c,
                         xy=(j, i),
                         horizontalalignment='center',
                         verticalalignment='center',
                         bbox=dict(fc='w', boxstyle='round,pad=1'))

    plt.tight_layout(pad=5.0)
    plt.savefig(
        os.path.join(od, 'feature_selection_classification_results.png'))
    plt.clf()
Example #30
if DO_SVD:
    print("dimension reduction svd with d=%d" % Reduction_D)
    svd = TruncatedSVD(n_components=Reduction_D,
                       algorithm="randomized",
                       n_iter=5,
                       random_state=None,
                       tol=0)
    data = svd.fit_transform(data)
if DO_NMF:
    print("dimension reduction nmf with d=%d" % Reduction_D)
    nmf = NMF(n_components=Reduction_D)
    data = nmf.fit_transform(data)

print("Extracting best features by a chi-squared test")
ch2NumFeatures = 1000
ch2 = SelectKBest(chi2, k=ch2NumFeatures)
# print vectorizer.get_stop_words()
data = ch2.fit_transform(data, target)
# print data

KNN = 5
nn = NearestNeighbors(n_neighbors=KNN + 1,
                      algorithm='ball_tree').fit(data.todense())
# query and data are the same so every node is counted as its most similar here
distances, indices = nn.kneighbors(data.todense())

nodeIndex = -1
nodeHomophilies = []
for neighbors in indices:
    nodeHomophily = 0
    nodeIndex += 1
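    # (The snippet is truncated here.) A hypothetical completion of the loop, assuming
    # "node homophily" means the fraction of a node's KNN nearest neighbours (excluding
    # the node itself, which is always its own closest match) that share its label:
    for neighborIndex in neighbors[1:]:
        if target[neighborIndex] == target[nodeIndex]:
            nodeHomophily += 1
    nodeHomophilies.append(nodeHomophily / float(KNN))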