Example #1
def featureSelection(reduced_features,
                     labels,
                     clnd_features,
                     percentile,
                     n_components,
                     results=False):
    """
        Parameters:
            reduced_features = unique feature names in a Python list, after dropping
            non-numeric features.
            labels = ground-truth labels for the data points.
            clnd_features = data point features, as a numpy array, corresponding
            to the labels.
            percentile = the percentile parameter for the SelectPercentile method;
            an integer between 0 and 100.
            n_components = the n_components for the PCA.
            results = if False, returns a Python list of the selected features. If True,
            returns the metrics of the feature selector (F-statistics and p-values from
            f_classif) and the explained variance of the top 'n' PCA components.

        Output:
           Resulting list of features from the SelectPercentile function. If results = True,
           the statistics of the SelectPercentile method using f_classif are returned
           instead, along with the explained variance of the top 'n' principal components.
    """
    from sklearn.feature_selection import SelectPercentile, f_classif
    from sklearn.decomposition import PCA
    from itertools import compress

    selector = SelectPercentile(f_classif, percentile=percentile)
    selector.fit_transform(clnd_features, labels)

    pca = PCA(n_components=n_components)
    pca.fit_transform(clnd_features, labels)

    if results == True:

        f_stat = sorted(zip(reduced_features[1:],f_classif(clnd_features,labels)[0]),\
                         key = lambda x: x[1], reverse=True)

        p_vals = sorted(zip(reduced_features[1:],f_classif(clnd_features,labels)[1]),\
                        key = lambda x: x[1])

        expl_var = pca.explained_variance_ratio_

        return f_stat, p_vals, expl_var
    else:
        ## return a boolean index of the retained features
        retained_features = selector.get_support()

        ## index the original features by the boolean index of top x% features
        ## return a python list of the features to be used for training
        features_list = list(compress(reduced_features[1:], retained_features))

        ## add back in the 'poi' to the first position in the final features list
        features_list.insert(0, 'poi')

        return features_list
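A minimal usage sketch for the function above, on small synthetic inputs; the feature names, array shapes, and percentile value are illustrative only and not taken from the original project.

import numpy as np

# Hypothetical inputs: 'poi' is the label name expected at position 0 of the
# feature-name list, followed by three numeric feature names.
reduced_features = ['poi', 'salary', 'bonus', 'expenses']
clnd_features = np.random.RandomState(0).rand(20, 3)
labels = np.array([0, 1] * 10)

# results=False (default) returns ['poi', <top-50%-scoring feature names>].
selected = featureSelection(reduced_features, labels, clnd_features,
                            percentile=50, n_components=2)
print(selected)

# results=True returns the F-statistics, p-values, and explained variance instead.
f_stat, p_vals, expl_var = featureSelection(reduced_features, labels, clnd_features,
                                            percentile=50, n_components=2, results=True)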
def percentile_k_features(df, k=20):
    y = df.pop('SalePrice')
    X = df
    sel_feat = SelectPercentile(score_func=f_regression, percentile=k)
    sel_feat.fit_transform(X, y)
    n = sel_feat.get_support(indices=True).size
    return list(X.columns[np.argsort(sel_feat.scores_)[::-1]][:n])
Example #3
    def reduce_features(self, X, y, percentile=10):
        reduce_features = SelectPercentile(chi2, percentile=percentile)
        reduce_features.fit_transform(X, y)
        mask = list(reduce_features.get_support())
        X_new = X.loc[:, mask]

        return X_new
Example #4
class multiple_classifiers1(abstract_classifier):
    def __init__(self, data, labels):
        self.ada = AdaBoostClassifier()
        self.knn = KNeighborsClassifier(n_neighbors=1)
        self.perceptron = Perceptron(tol=1e-3)

        self.sp_knn = SelectPercentile(percentile=24)
        self.sp_ada = SelectPercentile(percentile=85)
        self.sp_percep = SelectPercentile(percentile=35)

        data_knn = self.sp_knn.fit_transform(data, labels)
        data_ada = self.sp_ada.fit_transform(data, labels)
        data_percep = self.sp_percep.fit_transform(data, labels)

        self.knn.fit(data_knn, labels)
        self.ada.fit(data_ada, labels)
        self.perceptron.fit(data_percep, labels)

    def classify(self, features):
        features_mat = features.reshape((1, -1))
        features_knn = self.sp_knn.transform(features_mat)
        features_ada = self.sp_ada.transform(features_mat)
        features_percep = self.sp_percep.transform(features_mat)

        p1 = int(self.knn.predict(features_knn)[0])
        p2 = int(self.ada.predict(features_ada)[0])
        p3 = int(self.perceptron.predict(features_percep)[0])

        avg = (p1 + p2 + p3)/3
        return bool(np.round(avg))
def percentile_k_features(df, k=20):
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    features = X.columns
    sp = SelectPercentile(f_regression, percentile=k)
    sp.fit_transform(X, y)
    imp_features = [features[i] for i in np.argsort(sp.scores_)[::-1]]
    return imp_features[:7]
def percentile_k_features(X, y, k=50):
    sp = SelectPercentile(f_regression, percentile=k)
    sp.fit_transform(X, y)
    features = X.columns.values[sp.get_support()]
    scores = sp.scores_[sp.get_support()]
    fs_score = list(zip(features, scores))
    df = pd.DataFrame(fs_score, columns=['Name', 'Score'])
    return df.sort_values(['Score', 'Name'],
                          ascending=[False, True])['Name'].tolist()
Example #7
def percentile_k_features(x_train, y_train, k=50):
    selector = SelectPercentile(f_regression,percentile=k)
    selector.fit_transform(x_train, y_train)
    scores = selector.scores_[selector.get_support()]
    features = x_train.columns.values[selector.get_support()]
    features_scores_list = list(zip(features,scores))
    df = pd.DataFrame(features_scores_list, columns=['Features','Scores'])
    sorted_list = df.sort_values('Scores',ascending=False)
    top_k_predictors = list(sorted_list['Features'])
    return top_k_predictors
    def fit(self):
        selector = SelectPercentile(f_classif, self.percent)  # keep the top self.percent% of features
        selector.fit_transform(self.X, self.Y)
        self.pvalues = selector.pvalues_
        self.indx = np.argwhere(selector.get_support())[:, 0]
        scores = -np.log10(self.pvalues)  # negative log10 of each feature's p-value as an importance score
        scores /= scores.max()
        self.scores = scores

        return self.pvalues, self.indx
    def select_features_from_model(self, x, y, percentile=10):
        score_func = chi2

        selector = SelectPercentile(score_func=score_func, percentile=percentile)
        selector.fit_transform(x, y)
        features = selector.get_support(indices=True)
        self.best_features = [column for column in x.columns[features]]
        x_select = self.select_features_in_test_set(x)

        return x_select
Example #11
def percentile_k_features(X, y, k=50):
    selector = SelectPercentile(f_regression, percentile=k)
    selector.fit_transform(X, y)

    names = X.columns.values[selector.get_support()]
    scores = selector.scores_[selector.get_support()]
    names_scores = list(zip(names, scores))
    ns_df = pd.DataFrame(data=names_scores, columns=['Feat_names', 'F_Scores'])
    #Sort the dataframe for better visualization
    ns_df_sorted = ns_df.sort_values(['F_Scores', 'Feat_names'],
                                     ascending=[False, True])
    return ns_df_sorted['Feat_names'].tolist()
def percentile_k_features(df, k=20):
    X, y = df.iloc[:, :-1], df.iloc[:, -1]
    fs = SelectPercentile(f_regression, percentile=k)
    fs.fit_transform(X, y)
    support = fs.get_support()
    h = fs.scores_
    temp1 = h[support]
    full_set = X.columns.values
    temp2 = full_set[support]
    finallist1 = temp1.tolist()
    finallist2 = temp2.tolist()
    z = [x for _, x in sorted(zip(finallist1, finallist2), reverse=True)]
    return z
def percentile_k_features(df, k=20):
    predictors = df.drop(['SalePrice'], axis=1)
    target_variable = df['SalePrice']
    selector = SelectPercentile(f_regression, percentile=k)
    selector.fit_transform(predictors, target_variable)

    names = predictors.columns.values[selector.get_support()]
    scores = selector.scores_[selector.get_support()]
    names_scores = list(zip(names, scores))
    ns_df = pd.DataFrame(data=names_scores, columns=['Feat_names', 'F_Scores'])
    #Sort the dataframe for better visualization
    ns_df_sorted = ns_df.sort_values(['F_Scores', 'Feat_names'],
                                     ascending=[False, True])
    return ns_df_sorted['Feat_names'].tolist()
Example #14
def univariate_feature_selection(mode, predictors, target):

    if mode == 'f_regression':
        fselect = SelectPercentile(f_regression, 100)

    if mode == 'f_classif':
        fselect = SelectPercentile(f_classif, 100)

    if mode == 'chi2':
        fselect = SelectPercentile(chi2, 100)

    fselect.fit_transform(predictors, target)

    return fselect.pvalues_
def get_semmed_features(semmed_X, features, y):
    print("Extracting features '{}'".format(features))
    if "all" in features:
        X = semmed_X
    else:
        # Use DataFrame to preserve feature names.
        X = pd.DataFrame(index=range(semmed_X.shape[0]))
        if "cui_feature" in features:
            cui_X = semmed_X.filter(regex=(r'(SU|O)BJECT_CUI=.*'))
            X = pd.concat([X, cui_X], axis=1)
        if "cui2_feature" in features:
            raise NotImplementedError("cui2_feature")
            cui2_feature = pd.DataFrame(semmed_data["SUBJECT_CUI"].str.cat(
                semmed_data["OBJECT_CUI"], sep='_')).to_dict('records')
            cui2_feature = DictVectorizer(
                sparse=False).fit_transform(cui2_feature)
            feature_finder = SelectPercentile(chi2, percentile=10)
            cui2_feature = feature_finder.fit_transform(cui2_feature, y)
            X = np.hstack((X, cui2_feature))
        if "dist_feature" in features:
            dist_X = semmed_X.filter(regex=(r'(SU|O)BJECT_DIST'))
            X = pd.concat([X, dist_X], axis=1)
        if "dist2_feature" in features:
            raise NotImplementedError("dist2_feature")
            dist2_feature = np.abs(
                semmed_data["SUBJECT_START_INDEX"] -
                semmed_data["OBJECT_START_INDEX"]).values.reshape(-1, 1)
            dist2_feature = OneHotEncoder(
                sparse=False).fit_transform(dist2_feature)
            feature_finder = SelectPercentile(chi2, percentile=10)
            dist2_feature = feature_finder.fit_transform(dist2_feature, y)
            X = np.hstack((X, dist2_feature))
        if "pred_feature" in features:
            pred_X = semmed_X.filter(regex=(r'PREDICATE=.*'))
            X = pd.concat([X, pred_X], axis=1)
        if "ind_feature" in features:
            ind_X = semmed_X.filter(regex=(r'INDICATOR_TYPE=.*'))
            X = pd.concat([X, ind_X], axis=1)
        if "novelty_feature" in features:
            nov_X = semmed_X.filter(regex=(r'(SU|O)BJECT_NOVELTY'))
            X = pd.concat([X, nov_X], axis=1)
        if "novelty2_feature" in features:
            raise NotImplementedError("novelty2_feature")
            nov2_feature = (semmed_data["SUBJECT_NOVELTY"] +
                            semmed_data["OBJECT_NOVELTY"]).values.reshape(
                                -1, 1)
            X = np.hstack((X, nov2_feature))
    print(X.shape)
    return X
 def feature_selection_percentile(self):
     feature_names = [
         'teff', 'logg', 'feh', 'alpha', 'teff**2', 'logg**2', 'feh**2',
         'alpha**2', 'teff*logg', 'teff*feh', 'logg*feh', 'teff*alpha',
         'alpha*feh', 'logg*alpha'
     ]
     selector = SelectPercentile(f_regression, percentile=20)
     y = self.y.values
     totalscore = []
     for i, yy in enumerate(y):
         selector.fit_transform(self.X, y[:, i])
         names = [
             feature_names[i] for i in np.argsort(selector.scores_)[::-1]
         ]
         totalscore.append(selector.scores_)
Example #17
def percentile_k_features(df, K=20):
    x = df.iloc[:,:-1]
    y = df.iloc[:,-1]
    #select the top K percent of features based on their f_regression scores
    best_feature = SelectPercentile(f_regression, percentile=K)
    #selecting best features from X
    best_feature.fit_transform(x,y)
    #creating dataframe from score, get_support, result
    d =  {'support': best_feature.get_support(),'values':best_feature.scores_}
    df1 = pd.DataFrame(d,index = x.columns)
    #sorting values according get_support
    df1 = df1.sort_values('values', ascending=False)
    #selecting only rows whose value of support is True
    col = df1[df1.support].index
    return list(col) # returning list of features 
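A hedged usage sketch for the function above; the DataFrame, its column names, and the percentile are synthetic and chosen only for illustration, and the imports listed are the ones the snippet itself relies on.

import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_regression

# Synthetic frame: percentile_k_features treats the last column as the target.
rng = np.random.RandomState(0)
demo = pd.DataFrame(rng.rand(50, 5), columns=['f1', 'f2', 'f3', 'f4', 'target'])

# Keep the top 40% of features by f_regression score.
print(percentile_k_features(demo, K=40))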
def percentile_k_features(X, y, k=50):

    lst = []

    fs = SelectPercentile(f_regression, percentile=k)
    fs.fit_transform(X, y)
    col_nam = X.columns.values[fs.get_support()]
    col_scr = fs.scores_[fs.get_support()]
    nam_scr = list(zip(col_nam, col_scr))
    #print nam_scr

    srt_nam_scr = sorted(nam_scr, key=lambda x: x[1], reverse=True)
    for i in srt_nam_scr:
        lst.append(i[0])

    return lst
    def test_select_percentile_chi2(self):

        X, y = load_digits(return_X_y=True)
        selector = SelectPercentile(chi2, percentile=15)
        selector.fit_transform(X, y)
        data_tensor = torch.from_numpy(X)

        torch_model = hummingbird.ml.convert(selector, "torch")

        self.assertIsNotNone(torch_model)
        np.testing.assert_allclose(
            selector.transform(X),
            torch_model.transform(data_tensor),
            rtol=1e-06,
            atol=1e-06,
        )
Example #20
 def feature_select(self):
     b = SelectPercentile(f_classif, percentile=self.task.percentile)
     y = np.array(self.results[self.task.label].data)
     X = np.array(self.results[self.task.features].data)
     data = pd.DataFrame(b.fit_transform(X, y))
     result = TransformResult(self.task, 1.0, data)
     self.results[self.task.uuid] = result
Example #21
def build_data(percentile_of_features):
    with open("data/email_authors.pkl",
              'rb') as authors_file, open("data/word_data.pkl",
                                          'rb') as word_file:
        email_authors = pickle.load(authors_file)
        word_data = pickle.load(word_file)

    # split into training and test
    features_train, features_test, labels_train, labels_test = train_test_split(
        word_data, email_authors, test_size=0.1, random_state=42)

    # tokenize emails
    vectorizer = TfidfVectorizer(sublinear_tf=True,
                                 max_df=0.5,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed = vectorizer.transform(features_test)

    # only use top 10% of features
    selector = SelectPercentile(percentile=percentile_of_features)
    features_train_transformed = selector.fit_transform(
        features_train_transformed, labels_train).toarray()
    features_test_transformed = selector.transform(
        features_test_transformed).toarray()

    return [
        features_train_transformed, features_test_transformed, labels_train,
        labels_test
    ]
Example #22
def percentile_k_features(data, k=20):
    X = data.drop(['SalePrice'], axis=1)
    y = data['SalePrice']

    SP = SelectPercentile(f_regression, percentile=k)
    model = SP.fit_transform(X, y)
    return model
Example #24
 def get_features_by_score(self):
     selector = SelectPercentile()
     features = selector.fit_transform(X=self.X_train, y=self.Y_train)
     feature_names = self.X_train.columns.tolist()
     self.allFeatureByScore = [
         feature_names[i] for i in np.argsort(selector.scores_)[::-1]
     ]
Example #25
def percentile_k_features(df, k=20):
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    feature_sel = SelectPercentile(f_regression, percentile=k)
    a = feature_sel.fit_transform(X, y)
    #print a
    column = df.columns
    feature_ind = feature_sel.get_support(indices=True)
    feature_score = feature_sel.scores_[feature_ind]
    #feature_ind = [column[x] for x in feature_sel.get_support(indices=True) if x]
    #print feature_ind
    #print feature_score
    feature_ind_score = list(zip(feature_score, feature_ind))
    feature_ind_score.sort(key=itemgetter(0), reverse=True)
    #print feature_ind_score
    k_score, k_ind = zip(*feature_ind_score)
    #print k_ind
    k_features = []
    for x in k_ind:
        k_features.append(column[x])

    #fis_sort = sorted(feature_ind_score.values(), key=operator.itemgetter(0),reverse=True)
    #k_ind = [feature_ind for feature_ind in feature_ind_score]
    #print fis_sort
    return k_features
    def selectTrainEvaluate(self, trainData, trainTarget, testData,
                            classifier):
        feature_grid = [10, 20, 30, 40, 50, 60, 70]
        scoreMax = 0
        bestSelector = ' '
        bestPca = ' '
        bestSvc = ' '
        bestPerc = 0
        for item in feature_grid:
            self.logger.log("INFO", "TRYING FEATURE PERCENTILE " + str(item))
            selector = SelectPercentile(mutual_info_classif, percentile=item)
            trainDataSel = selector.fit_transform(trainData, trainTarget)
            pca = PCA(n_components=0.99, svd_solver='full')
            trainDataSel = pca.fit_transform(trainDataSel)
            result = self.trainClassifier(trainDataSel, trainTarget,
                                          classifier)
            self.logger.log(
                "INFO", "DONE FEATURE PERCENTILE " + str(item) + " SCORE: " +
                str(result["score"]))
            if (result["score"] > scoreMax):
                bestPerc = item
                scoreMax = result["score"]
                bestSelector = selector
                bestPca = pca
                bestSvc = result["svc"]

        self.logger.log("INFO", "DONE FEATURE CROSS VALIDATION")
        self.logger.log("INFO",
                        "BEST RESULT WITH " + str(bestPerc) + " PERCENTILE")
        testDataSel = bestSelector.transform(testData)
        testData = bestPca.transform(testDataSel)
        pred = bestSvc.predict(testData)
        return (pred, bestSvc, bestSelector, bestPca)
Example #27
def ridge_make_submission():
    train, test = read_csv()
    x_train, y_train = make_train_set(train)
    x_test, y_test = make_train_set(test)
    y_train = y_train['Y']
    # feature selection
    sel = SelectPercentile(f_regression, 70)
    x_train = sel.fit_transform(x_train, y_train)
    x_test = sel.transform(x_test)

    model = linear_model.Ridge(normalize=True)
    model.fit(x_train, y_train)
    preds = model.predict(x_test)
    # preds[preds < 0] = 0
    y_test['Y'] = preds
    print(y_test['Y'].var())
    y_test.columns = ['TERMINALNO', 'Pred']
    y_test.set_index('TERMINALNO', inplace=True)
    # x_test = pd.merge(x_test, y_test, left_index=True, right_index=True)
    # x_test.set_index('TERMINALNO', inplace=True)
    # print(x_test.head())
    y_test.to_csv(path_test_out,
                  columns=['Pred'],
                  index=True,
                  index_label=['Id'])
def preprocess(word_data, targets):
    print("\n### PREPROCESSING DATA ###")

    # vectorize
    print("-- Vectorization")
    vectorizer = TfidfVectorizer(sublinear_tf=True)  # , stop_words='english'
    data_transformed = vectorizer.fit_transform(word_data)

    # feature selection
    print("-- Feature Selection")
    selector = SelectPercentile(percentile=5)
    data_selected = selector.fit_transform(data_transformed, targets)
    if data_selected.shape[1] == 0:
        data_selected = data_transformed
    else:
        print("Top {} features were selected".format(data_selected.shape[1]))

        # print top features
        nr_features = 30
        i = selector.scores_.argsort()[::-1][:nr_features]
        top_features = np.column_stack((np.asarray(vectorizer.get_feature_names())[i],
                                        selector.scores_[i],
                                        selector.pvalues_[i]))
        print("\nTop %i Features:" % nr_features)
        print(pd.DataFrame(top_features, columns=["token", "score", "p-val"]), "\n")

    features_train, features_test, labels_train, labels_test = \
        train_test_split(data_selected, targets, test_size=0.2, stratify=targets)

    return features_train, features_test, labels_train, labels_test
Example #29
def main():

    main_data = pd.read_csv('../data/train.csv', index_col='ID')

    output = []
    for x in main_data.columns:
        output.append({
            'variable': x,
            'variance': main_data.ix[:, x].var(),
            'corr_w_target': round(main_data.ix[:, x].corr(main_data.TARGET), 4),
            'abs_corr': abs(round(main_data.ix[:, x].corr(main_data.TARGET), 4))}
        )

    # print csv for later in the presentation docs
    variable_selector = pd.DataFrame(output)
    variable_selector = variable_selector.set_index('variable')
    variable_selector = variable_selector.drop('TARGET')
    variable_selector.sort_values('abs_corr', ascending=False).to_csv('../presentationDocs/corrs.csv')

    selector = SelectPercentile(f_classif, percentile=25)
    subset = pd.DataFrame(selector.fit_transform(main_data.drop('TARGET', axis=1), main_data['TARGET']))

    subset.to_csv('../data/main_data.csv', index=False)
    main_data[['TARGET']].to_csv('../data/target.csv', cols=['TARGET'], index=False)

    # print transformed test data to csv
    test_data = pd.read_csv('../data/test.csv', index_col='ID')
    test_data = pd.DataFrame(selector.transform(test_data), index=test_data.index)
    test_data.to_csv('../data/test_transform.csv', index=True, index_label='ID')
Example #30
def buildVectorizer(classes, examples, parameters):
	featureChoice = None
	doFeatureSelection = False
	tfidf = False
	featureSelectPerc = 10
		
	if "featureChoice" in parameters:
		featureChoice = parameters["featureChoice"]
	if "doFeatureSelection" in parameters and parameters["doFeatureSelection"] == "True":
		doFeatureSelection = True
	if "featureSelectPerc" in parameters:
		featureSelectPerc = int(parameters["featureSelectPerc"])
	if "tfidf" in parameters and parameters["tfidf"] == "True":
		tfidf = True
		
	print "Starting vectorizer..."
	vectorizer = Vectorizer(classes,examples,featureChoice,tfidf)
	vectors = vectorizer.getTrainingVectors()
	print "Vectors of size:", vectors.shape

	if doFeatureSelection:
		print "Trimming training vectors..."
		from sklearn.feature_selection import SelectKBest,SelectPercentile,chi2
		#featureSelector = SelectKBest(chi2, k=100)
		featureSelector = SelectPercentile(chi2,featureSelectPerc)
		vectorsTrimmed = featureSelector.fit_transform(vectors, classes)
		vectorsTrimmed = coo_matrix(vectorsTrimmed)
		print "Trimmed training vectors of size:", vectorsTrimmed.shape
	else:
		vectorsTrimmed = vectors
		featureSelector = None

	return vectorsTrimmed,vectorizer,featureSelector
Example #31
def select_features(filename, column, percentile, features_outfile=None):
    '''
    Selects the top <percentile> features from the dataset.
    :param str filename: Training data file.
    :param str column: Column in the CSV to use. If 'all' use all columns.
    :param int percentile: Percentile top features to choose.
    :returns: Training data with top percentile features. Labels. Names of selected features.
    :rtype: 3-tuple
    '''
    train = pd.read_csv(filename, sep=',', compression="infer")
    targets = LabelBinarizer().fit_transform(train["T/F"])
    targets = np.ravel(targets)
    train.drop(["T/F"], axis=1, inplace=True)
    # Encase the feature names in quotes to ensure proper parsing later.
    feature_names = np.array(
        ['"{}"'.format(f) for f in np.array(train.columns)])
    train = train.values.astype('double')
    if percentile < 100:
        feature_finder = SelectPercentile(f_classif, percentile=percentile)
        train = feature_finder.fit_transform(train, targets)
        support = feature_finder.get_support()
        scores = feature_finder.scores_
        pvals = feature_finder.pvalues_
        feature_names = feature_names[support]
        if features_outfile is not None:
            feature_scores = scores[support]
            feature_pvals = pvals[support]
            features = zip(feature_names, feature_scores, feature_pvals)
            rank = sorted(features, key=lambda x: x[1], reverse=True)
            with open(features_outfile, 'w') as outF:
                for feat in rank:
                    outF.write("{} :: {:g} :: {:g}\n".format(*feat))
    return train, targets, feature_names
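A hedged usage sketch for select_features above, built around a tiny synthetic CSV; the file name, column names, and percentile are illustrative, and the imports are the ones the snippet itself assumes.

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelBinarizer
from sklearn.feature_selection import SelectPercentile, f_classif

# Write a tiny hypothetical training file with a binary "T/F" label column.
demo = pd.DataFrame({
    "feat_a": np.random.RandomState(0).rand(30),
    "feat_b": np.random.RandomState(1).rand(30),
    "T/F": ["T", "F"] * 15,
})
demo.to_csv("demo_train.csv", index=False)

# Keep the top 50% of features ('column' is accepted but unused in the snippet as shown).
train, targets, feature_names = select_features("demo_train.csv", column="all", percentile=50)
print(train.shape, feature_names)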
def final_feature_set_reduction(reduced_feature_file_name, final_file_name, train_label_file):
    sorted_train_data = pd.read_csv('data/' + reduced_feature_file_name)
    y = []
    X = sorted_train_data.iloc[:,1:]
    fip = open('data/' + train_label_file)
    lines = fip.readlines()
    for line in lines:
        line = line.rstrip()
        y.append(int(line))

    print("Final feature reduction: {:s}".format(reduced_feature_file_name))
    print("Training labels length: {:d}".format(len(y)))
    print("X Feature set dimensionality: {:d} {:d}".format(X.shape[0], X.shape[1]))
    print("In Feature set dimensionality: {:d} {:d}".format(sorted_train_data.shape[0], sorted_train_data.shape[1]))

    # find the top 10 percent variance features, from ~1000 -> ~100 features
    fsp = SelectPercentile(chi2, 10)
    X_new_10 = fsp.fit_transform(X,y)
    print("Final 10 Percent Dimensions: {:d} {:d}".format(X_new_10.shape[0], X_new_10.shape[1]))
    
    selected_names = fsp.get_support(indices=True)
    selected_names = selected_names + 1

    #data_reduced = sorted_train_data.iloc[:,[0] + selected_names]
    #Does not put the file_name as the first column.
    data_trimmed = sorted_train_data.iloc[:,selected_names]
    data_fnames = pd.DataFrame(sorted_train_data['file_name'])
    data_reduced = data_fnames.join(data_trimmed)
    
    data_reduced.to_csv('data/' + final_file_name, index=False)
    print("Completed reduction in {:s}".format(final_file_name))
    
    return
def selectionPercentile(X, y, paramlist):
    percentile = paramlist['percentile']
    spc = SelectPercentile(chi2, percentile=percentile)
    Xnew = spc.fit_transform(X, y)
    indexarr = spc.get_support(indices=True)
    scores_arr = spc.scores_
    return [Xnew, indexarr, scores_arr]
def final_feature_set_reduction(reduced_feature_file_name, final_file_name, train_label_file):
    sorted_train_data = pd.read_csv('data/' + reduced_feature_file_name)
    y = get_training_labels('data/' + reduced_feature_file_name, train_label_file)
    X = sorted_train_data.iloc[:,1:]

    print("Final feature reduction: {:s}".format(reduced_feature_file_name))
    print("Training labels length: {:d}".format(len(y)))
    print("X Feature set dimensionality: {:d} {:d}".format(X.shape[0], X.shape[1]))
    print("In Feature set dimensionality: {:d} {:d}".format(sorted_train_data.shape[0], sorted_train_data.shape[1]))

    # find the top 10 percent variance features, from ~1000 -> ~100 features
    fsp = SelectPercentile(chi2, 10)
    X_new_10 = fsp.fit_transform(X,y)
    print("Final 10 Percent Dimensions: {:d} {:d}".format(X_new_10.shape[0], X_new_10.shape[1]))
    
    selected_names = fsp.get_support(indices=True)
    selected_names = selected_names + 1

    #data_reduced = sorted_train_data.iloc[:,[0] + selected_names]
    #Does not put the file_name as the first column.
    data_trimmed = sorted_train_data.iloc[:,selected_names]
    data_fnames = pd.DataFrame(sorted_train_data['filename'])
    data_reduced = data_fnames.join(data_trimmed)
    
    data_reduced.to_csv('data/' + final_file_name, index=False)
    print("Completed reduction in {:s}".format(final_file_name))
    
    return
Example #35
def main(path):
    datatrain = get_data(path)
    vectorizer = TfidfVectorizer(sublinear_tf=True,
                                 max_df=0.5,
                                 stop_words='english',
                                 max_features=6000,
                                 strip_accents='unicode')
    # Calculating weights
    data_weighted = vectorizer.fit_transform(datatrain.data)

    # Build feature selection
    feature_selection = SelectPercentile(f_classif, percentile=20)
    data_weighted = feature_selection.fit_transform(data_weighted,
                                                    datatrain['values'])

    # Train with known data
    clf = LinearSVC(loss='l2', penalty='l2', dual=False, tol=1e-3)
    clf.fit(data_weighted, datatrain['values'])

    # Save training model
    if not os.path.exists('training'):
        os.mkdir('training')

    filename = 'training/{0}.pkl'.format(int(time.time()))
    joblib.dump(
        {
            'clf': clf,
            'vectorizer': vectorizer,
            'feature_selection': feature_selection
        },
        filename,
        compress=9)
Example #36
def main():
    parser = argparse.ArgumentParser(description='Feature Selection') 
    required = parser.add_argument_group('required options') 
    
    required.add_argument('-x', '--scaledfeaturelist', required=True, help='File containing feature values') 
    required.add_argument('-y', '--targetdata', required=True, help='File containiing target data')
    required.add_argument('-z', '--fetpercentile', required=True, type=int, help='Percentile to select highest scoring percentage of features')
    
    args = parser.parse_args()

    X = np.loadtxt(args.scaledfeaturelist) 
    Y = np.genfromtxt(args.targetdata,dtype='str')
   
    #result = SelectPercentile(f_classif, percentile=args.fetpercentile).fit_transform(X,Y)
    sel = SelectPercentile(f_classif, percentile=args.fetpercentile)
    result = sel.fit_transform(X,Y)
    
    #selecting features for test programs
    if os.path.isfile('variancefeatures.txt'):
        varianceFeature = np.genfromtxt("variancefeatures.txt", dtype='str')
        featureFromSelectPercentile = sel.get_support(indices=True)
        featureFileforSelectPercentile = open("featuresToTestPrograms","w")
        for i in featureFromSelectPercentile:
            featureFileforSelectPercentile.write(varianceFeature[i])
            featureFileforSelectPercentile.write("\n")
        featureFileforSelectPercentile.close()   
    #remove the variancefeatures as we don't need it anymore
    os.remove('variancefeatures.txt')

    np.savetxt('featurelist', result, fmt='%.2f', delimiter='\t')
Example #37
def build_linear_model(X, y, analyzerType):
	tfv = vectorizer(analyzerType)
	select = SelectPercentile(score_func=chi2, percentile=15)
	clf = SVC(C=12.0, kernel='linear')

	X = tfv.fit_transform(X)
	X = select.fit_transform(X, y)
	return (clf.fit(X, y), tfv, select)
def test_selectpercentile_tiebreaking():
    """Test if SelectPercentile selects the right n_features in case of ties.
    """
    Xs = [[0, 1, 1], [0, 0, 1], [1, 0, 0], [1, 1, 0]]
    y = [1]
    dummy_score = lambda X, y: (X[0], X[0])
    for X in Xs:
        with warnings.catch_warnings(record=True):
            sel = SelectPercentile(dummy_score, percentile=34)
            X1 = sel.fit_transform([X], y)
            assert_equal(X1.shape[1], 1)
            assert_best_scores_kept(sel)

            sel = SelectPercentile(dummy_score, percentile=67)
            X2 = sel.fit_transform([X], y)
            assert_equal(X2.shape[1], 2)
            assert_best_scores_kept(sel)
Example #39
 def feature_selection(self,mode='F'):
     
     print 'Feature Selection...'
     print 'Start:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
     
     X=self.train.copy()
     y=self.train_label['label'].values.copy()
     
     test=self.test.copy()
     
     if mode.upper()=='M':
         mi=mutual_info_classif(X.values,y)
     elif mode.upper()=='F':
         F,pval=f_classif(X.values,y)
     elif mode.upper()=='C':
         chi,pval=chi2(X.values,y)
     
     features=self.train.columns.copy()
     
     fs_features=features.copy().tolist()
     
     if mode.upper()=='M':
         fs_V=mi.copy().tolist()
     elif mode.upper()=='F':
         fs_V=F.copy().tolist()
     elif mode.upper()=='C':
         fs_V=chi.copy().tolist()
     
     if mode.upper()=='M':
         selector=SelectPercentile(mutual_info_classif,percentile=80)
     elif mode.upper()=='F':
         selector=SelectPercentile(f_classif,percentile=80)
     elif mode.upper()=='C':
         selector=SelectPercentile(chi2,percentile=80)
         
     X_new=selector.fit_transform(X,y)
     
     selected=selector.get_support()
     
     for i in xrange(len(features)):
         if selected[i]==False:
             t=features[i]
             fs_features.remove(t)
             
     fs_V=np.array(fs_V)
     fs_features=np.array(fs_features)
     
     self.train=pd.DataFrame(X_new,columns=fs_features.tolist())
     self.test=test[fs_features]
     
     self.fs_features=fs_features
     
     feas=pd.DataFrame()
     feas['feature']=fs_features
     
     print 'End:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
     
     return X_new,feas
def main(path,filename):

	#batchs = ['histogramaByN','histogramaColor','patrones2x2ByN','patrones3x3ByN','patronesCirculaesByN_2_5','patronesCirculaesByN_2_9','patronesCirculaesByN_3_9','patronesCirculaesByN_5_9','patronesCirculaesByN_3_5','patronesCirculaesByN_6_12','patronesCirculaesByN_8_12','patronesCirculaesByN_10_12']
	batchs = ['patronesCirculaesByN_2_5','patronesCirculaesByN_2_9','patronesCirculaesByN_3_9','patronesCirculaesByN_5_9','patronesCirculaesByN_3_5','patronesCirculaesByN_6_12','patronesCirculaesByN_8_12','patronesCirculaesByN_10_12']
	#batchs = ['histogramaByN','histogramaColor','patrones2x2ByN','patronesCircularesByN_2_5','patronesCircularesByN_2_9','patronesCircularesByN_3_9','patronesCircularesByN_5_9','patronesCircularesByN_3_5']
	#batchs = ['patrones2x2ByN','patrones3x3ByN','patronesCirculaesByN_2_5','patronesCirculaesByN_3_5']
	percentil = 20
	X = []
	y = []
	lens = []
	load_batch(y,path,'clases',filename) 
	y = [j for i in y for j in i]
	for batch in batchs:
		load_batch(X,path,batch,filename)
		lens.append(len(X[0]))
	
	total = [lens[0]]
	for i in xrange(1,len(lens)):
		total.append(lens[i]-lens[i-1])
	print 'Number of attributes per batch'
	print total
	sp = SelectPercentile(chi2,percentil)
	X_new = sp.fit_transform(X, y)
	sup = sp.get_support(True)
	#print sup
	res = [0]* len(batchs)
	for i in sup:
		for j in xrange(0,len(lens)):
			if i <= lens[j]:
				res[j] +=1
				break
	porcentajes = []
	for i in xrange(0,len(lens)):
		porcentajes.append((1.0*res[i])/total[i])
	print 'Number of variables selected in the '+str(percentil)+' univariate percentile'
	print res

	print 'Percentage of variables selected in the '+str(percentil)+' univariate percentile'
	print porcentajes
	
	clf = ExtraTreesClassifier()
	clf = clf.fit(X, y)
	fi = clf.feature_importances_

	res2 = [0]* len(batchs)
	for i in xrange(0,len(fi)):
		for j in xrange(0,len(lens)):
			if i <= lens[j]:
				res2[j] += fi[i]
				break
	print 'Cumulative percentage importance of the multivariate selection'
	print res2
	porcentajes2 = []
	for i in xrange(0,len(lens)):
		porcentajes2.append((1.0*res2[i])/total[i])

	print 'Average percentage importance per variable of the multivariate selection'
	print porcentajes2
Example #41
def fTestFeatureSelection(train_files, train_labels, test_files, test_labels):
    design_matrix, features, _ = vectorizeTrain(train_files, None, 0, False, 0, None)
    classifier = LogisticRegression()
    for p in range(10):
        percentile = 100-p*10
        print 'Selecting {0}% of features'.format(percentile)
        feat_sel = SelectPercentile(f_regression, percentile)
        X_sel = feat_sel.fit_transform(design_matrix, train_labels)
        f_inds = feat_sel.get_support(indices=True)
        print 'Using {0} features'.format(len(f_inds))
        classifier.fit(X_sel, train_labels)
        test(test_files, test_labels, classifier, [features[d] for d in f_inds], None, 0, False, 0, None)
Example #42
def cross_val_score(clf, data, target, k):
	shuffle_arr = []
	size = len(data)
	for i in range(size):
		shuffle_arr.append(i)
	scores = []
	for i in range(0, k):
		#generate shuffled train and test dataset
		data_train_raw = []
		data_test_raw = []
		target_train = []
		target_test = []
		# separate shuffled train and test dataset
		random.shuffle(shuffle_arr)
		shuffle_train = shuffle_arr[:size - size/k]
		shuffle_test = shuffle_arr[size - size/k:]
		for j in shuffle_train:
			data_train_raw.append(data[j])
			target_train.append(target[j])
		for r in shuffle_test:
			data_test_raw.append(data[r])
			target_test.append(target[r])

		data_train = data_process(data_train_raw)
		data_test = data_process(data_test_raw)

		# transform array of string to counts
		count_vect = CountVectorizer()
		X_train_counts = count_vect.fit_transform(data_train)
		# transform counts to frequencies
		tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
		X_train_tf = tf_transformer.transform(X_train_counts)
		
		# feature selection
		select = SelectPercentile(chi2, percentile = 10)
		X_train_fs = select.fit_transform(X_train_tf, target_train)
							
		# train the model
		clf_train = clf.fit(X_train_fs, target_train)

		# test the model
		X_new_counts = count_vect.transform(data_test)
		X_new_tfidf = tf_transformer.transform(X_new_counts)
		X_new_fs = select.transform(X_new_tfidf)
		test_result = clf_train.predict(X_new_fs)
		scores.append(GetPrecisionRecallF1(test_result, target_test))
		#clf_score =  clf_train.score(X_new_fs, target_test)
		#scores.append(clf_score)
	return scores
    def featureSelection(self,X,y):
        '''
         Feature selection using a univariate chi-squared test (SelectPercentile).
        :param:
         a. X the training matrix.
         b. y the labels column corresponding to X.
        :return:
            a. The transformed training matrix containing the top 10% of features.
            b. The boolean mask of the selected features.
        '''

        print np.shape(X)

        selector = SelectPercentile(chi2, percentile=10)
        X_new = selector.fit_transform(X, y)
        
        return X_new, selector.get_support()
def feature_reduction_percent(percentage, train_data_df, train_labels_df):
    # TODO: everything
    X = train_data_df.iloc[:,1:]
    y = np.array(train_labels_df.iloc[:,1])

    # find the top percent variance features.
    fsp = SelectPercentile(chi2, percentage)
    
    X_reduced = fsp.fit_transform(X,y)
    selected_names = fsp.get_support(indices=True)
    selected_names = selected_names + 1
    data_trimmed = train_data_df.iloc[:,selected_names]
    data_fnames = pd.DataFrame(train_data_df['filename'])
    data_reduced = data_fnames.join(data_trimmed)
    data_reduced.to_csv('data/sorted-train-malware-features-asm-50percent.csv', index=False)

    return
def do_feature_selection(train_instances, test_instances, folds, clf, param_grid, dense, outfile):
    groups = set(train_instances[0].feature_groups.keys()).intersection(test_instances[0].feature_groups.keys())

    X_train, y_train, feature_space = pipe.instances_to_matrix(train_instances, dense = dense, groups = groups)
    X_test, y_test = test_instances_to_matrix(feature_space, test_instances, dense = dense)

    all_tpr = []

    (chi2values, pval) =  chi2(X_train, y_train)
    feature_indices = [i[0] for i in sorted(enumerate(pval), key=lambda x:x[1])]
    index_to_name = {v:k for k, v in feature_space.items()}
    feature_names = [index_to_name[i] for i in feature_indices]

    print feature_indices[0:200]
    print feature_names[0:200]

    for percentile in range(1, 10, 2):
            t0 = time()
            ch2 = SelectPercentile(chi2, percentile=percentile)
            X_train_trans = ch2.fit_transform(X_train, y_train)
            print("done in %fs" % (time() - t0))

            model = get_optimal_model (X_train_trans, y_train, folds, clf, param_grid, 'roc_auc')

            X_test_trans = ch2.transform(X_test)

            scores = get_scores(model, X_test_trans)
            fpr, tpr, thresholds = roc_curve(y_test, scores)
            roc_auc = auc(fpr, tpr)
            plt.plot(fpr, tpr, lw=1, label='%d  (area = %0.4f)' % (percentile, roc_auc))
            print "\n"*4


    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.savefig('feature_selection.png')
    
    print()
Example #46
def fit_predict(trn, dev, args, emb):
    rng = np.random.RandomState(7)
    Xtrn = np.array([emb.get_context(i,sent, args['embs'], args['e_context']) for sent in trn for m,(i,w) in zip(sent['ii'],enumerate(sent['ws'])) if m])
    Xdev = np.array([emb.get_context(i,sent, args['embs'], args['e_context']) for sent in dev for m,(i,w) in zip(sent['ii'],enumerate(sent['ws'])) if m])
    ytrn = np.array([lbl for sent in trn for m,lbl in zip(sent['ii'],sent['ls']) if m])
    ydev = np.array([lbl for sent in dev for m,lbl in zip(sent['ii'],sent['ls']) if m])
    logging.debug('embs: Xtrn_emb:{} Xdev_emb: {}'.format(Xtrn.shape, Xdev.shape))

    if len(args['feats']):
        feat = Feat(trn, args)
        dvec = DictVectorizer(sparse=False)
        # dvec.fit(feat.get_features(i, sent) for sent in trn for i,w in enumerate(sent['ws']))

        Xtrn_feat = dvec.fit_transform(feat.get_features(i, sent) for sent in trn for m,(i,w) in zip(sent['ii'],enumerate(sent['ws'])) if m)
        Xdev_feat = dvec.transform(feat.get_features(i, sent) for sent in dev for m,(i,w) in zip(sent['ii'],enumerate(sent['ws'])) if m)
        logging.debug('Xtrn_feat shape:{}'.format(Xtrn_feat.shape))
        logging.debug('Xdev_feat shape:{}'.format(Xdev_feat.shape))


        assert (Xtrn.std(axis=0)==0).sum() == 0

        if args['percentile'] < 100:
            from sklearn.feature_selection import chi2, f_classif, SelectPercentile
            sel = SelectPercentile(chi2, percentile=args['percentile'])
            Xtrn_feat, Xdev_feat = sel.fit_transform(Xtrn_feat, ytrn), sel.transform(Xdev_feat)
            logging.debug('after sel: Xtrnf:{} Xdevf: {}'.format(Xtrn_feat.shape, Xdev_feat.shape))
            logging.debug([(fea, score) for score, fea in islice(reversed(sorted(zip(sel.scores_, dvec.feature_names_))), 100)])

        Xtrn = np.hstack((Xtrn,Xtrn_feat))
        Xdev = np.hstack((Xdev,Xdev_feat))
        logging.debug('after feats: Xtrn:{} Xdev: {}'.format(Xtrn.shape, Xdev.shape))


    cweights = 'balanced' if args['cweights'] else {0:1,1:1}
    if args['kerntype'] == 'lin':
        clf = LinearSVC(C=args['C'], class_weight=cweights, random_state=rng)
    else:
        clf = SVC(class_weight=cweights, C=args['C'], kernel=args['kerntype'], gamma=args['kerngamma'], degree=args['kerndegree'], random_state=rng)

    clf.fit(Xtrn, ytrn)

    return clf.predict(Xtrn), clf.predict(Xdev)
def call_GridParamSearch_featfilt(X, y) :
    '''
        (This def is currently just a cut & paste from "main".)
        Calls def GridParamSearch (which uses randomized CV to find model params).
    Used to try different ML models, then get their optimal parameters.
    '''
    print("SPARSE (L1) EXT gridparam scores:")
    #   clf = Pipeline([
    #       ('feature_selection', LinearSVC(penalty="l1", loss='l1',dual=False, class_weight='auto')),
    # ('classification', ExtraTreesClassifier(n_jobs=3)
    #   )])
    'Sparse; L1 penalized features selection prior to RF fitting/prediction'
    clf_svm = LinearSVC(penalty="l1", loss='l2', dual=False, class_weight='auto')
    clf_logit = LogisticRegression(penalty="l1", dual=False, class_weight='auto')

    'http://scikit-learn.org/0.13/auto_examples/plot_feature_selection.html'
    print('Original features matrix:')
    print(X.shape)
    # Univariate feature selection with F-test for feature scoring
    # We use the default selection function: the 20% most significant features
    # selector = SelectPercentile(f_classif, percentile=20)
    selector = SelectPercentile(chi2, percentile=20)
    X_anova = selector.fit_transform(X, y)
    print(
        'New (2 f_classif) Using statistical feature selection: features matrix is:')
    print(X_anova.shape)

    # lda = LDA(n_components=10)
    # X_lda = lda.fit_transform(X, y)
    # print('New LDA filtered features matrix:')
    # print(X_lda.shape)

    X_svm = clf_svm.fit_transform(X, y)  #Get Sparse feature selections..
    # print(clf.feature_importances_ )
    print('New sparse (SVM filtered) features matrix:')
    print(X_svm.shape)

    print("Res of SVM fitting of (F scores filtered =2) for more feature selection:")
    X_doubleFilt_svm_f = clf_svm.fit_transform(X_anova, y)
    print(X_doubleFilt_svm_f.shape)
    print("param search on sparse features matrix")
    GridParamSearch(param_dist=Tree_param_dist, clf=clf_EXT, X=X_svm, y=y)
def featureSelectionProcess(X,Y,featureSelection):
    print "feature selection process: "+str(featureSelection)
    print "before feature selection. shape of X"+str(X[0].shape)
    if featureSelection == "linearSVM":
        X_new = LinearSVC(C=0.01, penalty="l1", dual=False).fit_transform(X, Y)
        print  "after feature selection. shape of X"+str(X_new[0].shape)
    elif featureSelection == "SelectKBest":
        X_new = SelectKBest(chi2, k=6).fit_transform(X, Y)
        print  "after feature selection. shape of X"+str(X_new[0].shape)
    elif featureSelection == "SelectKPercentile":
        selector = SelectPercentile(f_classif, percentile=30)
        X_new = selector.fit_transform(X, Y)
        print  "after feature selection. shape of X"+str(X_new[0].shape)
    elif featureSelection == "TreeBased":
        clf = ExtraTreesClassifier()
        X_new = clf.fit(X, Y).transform(X)
        print  "after feature selection. shape of X"+str(X_new[0].shape)
    elif featureSelection == "Recursive":
        svc = SVC(kernel="linear", C=1)
        rfe = RFE(estimator=svc, n_features_to_select=5, step=1)
        X_new = rfe.fit(X, Y).transform(X)
        print  "after feature selection. shape of X"+str(X_new[0].shape)
Example #49
class AnovaPercentileStep(SklearnStep):
    def __init__(self, percentile):
        super(AnovaPercentileStep, self).__init__()
        self._percentile = percentile

    def fit_transform(self):
        self._model = SelectPercentile(f_classif, self._percentile)
        x, y = load_svmlight(self.input_path)
        x = self._model.fit_transform(x, y)
        save_svmlight(x, y, self._output_path)

    def transform(self, x=None):
        if x is None:
            x, y = load_svmlight(self._test_input_path)
            x = self._model.transform(x)
            save_svmlight(x, y, self._test_output_path)
        else:
            transformed_x = self._model.transform(x)
            return transformed_x

    def get_param(self):
        return {'percentile': self._percentile}
Example #50
 def load_data(self):
   # preprocessing train data
   df = pd.read_csv(BASE_DIR + INPUT_TRAIN)
   X = df.values.copy()
   np.random.shuffle(X)
   self.ids, X, labels = X[:, 0], X[:, 1:-1].astype(np.float32), X[:, -1]
   self.encoder = LabelEncoder()
   self.y_true = self.encoder.fit_transform(labels).astype(np.int32)
   scaler = MinMaxScaler()
   self.train = scaler.fit_transform(X)
   selector = SelectPercentile(f_classif, percentile=20)
   self.train = selector.fit_transform(self.train,self.y_true)
   # preprocessing test data
   df = pd.read_csv(BASE_DIR + INPUT_TEST)
   X = df.values.copy()
   X, self.idx = X[:, 1:].astype(np.float32), X[:, 0].astype(str)
   self.test = scaler.transform(X)
   self.test = selector.transform(self.test)
   
   self.num_class = len(self.encoder.classes_)
   self.n_samples = len(self.y_true)
   self.y_pred = np.zeros( (self.n_samples, self.num_class) )
   return None
Example #51
def main(path):
    datatrain = get_data(path)
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english', max_features=6000,
                                 strip_accents='unicode')
    # Calculating weights
    data_weighted = vectorizer.fit_transform(datatrain.data)

    # Build feature selection
    feature_selection = SelectPercentile(f_classif, percentile=20)
    data_weighted = feature_selection.fit_transform(data_weighted, datatrain['values'])

    # Train with known data
    clf = LinearSVC(loss='l2', penalty='l2', dual=False, tol=1e-3)
    clf.fit(data_weighted, datatrain['values'])

    # Save training model
    if not os.path.exists('training'):
        os.mkdir('training')

    filename = 'training/{0}.pkl'.format(int(time.time()))
    joblib.dump({'clf': clf,
                 'vectorizer': vectorizer,
                 'feature_selection': feature_selection}, filename, compress=9)
Example #52
print "    % | Precision | Recall | Features"
print "="*38;

features_score = { } # feature: score
best_results = { 
        'precision' : 0.0,
        'recall'    : 0.0,
        'features'  : [],
        'percent'   : 0.0
}
for percent in range(0, 101, 5):
    if percent == 0:
        continue

    fs = SelectPercentile(f_classif, percentile=percent)
    features_transformed = fs.fit_transform(features, labels)

    best_features = []
    counter = 0
    for idx, score in sorted(enumerate(fs.scores_), key=lambda score: score[1], reverse=True):
        if len(features_score.keys()) < len(features_list):
            features_score[features_list[idx+1]] = score

        counter = counter + 1
        if counter > len(features_transformed[0]):
            continue
        best_features.append(features_list[idx+1])

    ## DECISION TREE
    if len(best_features) < 2:
        continue; # can't have less than 2 features for a decision tree
testfeaturevectors = varthresh.transform(testfeaturevectors)
"""

#classifiers = [BernoulliNB(), DecisionTreeClassifier(), LogisticRegression(), OneVsRestClassifier(LinearSVC(random_state=0)) ]
#classifier_names = ["BernoulliNB", "DecisionTreeClassifier", "MaxEnt", "SVM"]

#classifiers = [LogisticRegression(), OneVsRestClassifier(LinearSVC(random_state=0)) ]
#classifier_names = ["MaxEnt", "SVM"]

classifiers = [LogisticRegression(), OneVsRestClassifier(LinearSVC(random_state=0)) ]
classifier_names = ["MaxEnt", "SVM"]

allscores = list()
for keep in range(3, 30, 3):
    selector = SelectPercentile(chi2, keep)
    selected_trainfeaturevectors = selector.fit_transform(trainfeaturevectors, trainLabels)
    selected_testfeaturevectors = selector.transform(testfeaturevectors)

    scores = list()
    for c in range(len(classifiers)):
        classifier = classifiers[c]

        classifier.fit(selected_trainfeaturevectors, trainLabels)
        scores.append((classifier.score(selected_testfeaturevectors, testLabels), keep, classifier_names[c], c))

    print(str(keep)+"\t"+"\t".join(str(s[0]) for s in scores))
    allscores += scores

print("saving the best classifier:")

bestsettings = max(allscores)
def reduce_feature_set(feature_set_file, train_label_file, token_file, reduced_set_file, temp_train_labels):
    # Use chi2 tests to determine the 10% best features see (mmmc/feature-reduction-call-graphs.ipynb).
    # Ok, so we still have 100000+ features even after severely reducing the function name lengths.
    # This is a problem. Having to process such a huge sparse matrix requires a lot of memory.
    # Solution 1: rent an AWS server with plenty-o-ram. (costs money and requires high bandwidth for file transfer)
    # Solution 2: buy more RAM for my linux box. (costs money)
    # Solution 3: break the sparse matrix into smaller chunks and process individually. (Ok)
    # Solution 4: try the pandas sparse matrix data structure. (too slow)
    
    # -> Solution 3: slice the matrix into smaller chunks for processing.
    # the pandas spare matrix still takes too long, break up into 10 different feature sets and try again.
    
    # Procedure:
    # 1. Open the PE header feature file.
    # 2. Open the PE header token file and get the number of column names.
    # 3. Divide the number of columns by 10 to get the column subset length.
    # 4. Load the malware label set.
    # 5. Use pandas to load and sort each column subset.
    # 6. Do the chi2 tests to reduce each column subset to 10 percent best features.
    # 7. Recombine the column subsets.
    # 8. Perform the chi2 test again on the combined reduced feature set.
    # 9. Write out the final reduced feature set to a csv file.
    
    # Open PE header token file and get a list of token names.
    hdr_pd = pd.read_csv('data/' + token_file, na_filter=False) # Do not do NaN filtering or we will get floats instead of text.
    token_list = list(hdr_pd['token_name'])
    token_list_len = len(token_list)
    for idx, token in enumerate(token_list): # Clamp the token name length and demangle C++ names, they are annoying.
        #token = token.replace('@','').replace('$','').replace('?','')
        if len(token) > 32:
            token_list[idx] = token[:32]
        else:
            token_list[idx] = token
    
    # Load training labels
    sorted_train_labels = pd.read_csv("data/" + train_label_file)
    #sorted_train_labels.head()
        
    # Load and sort the malware sample names.
    sample_names = pd.read_csv(feature_set_file, usecols = [0], na_filter=False)
    sorted_sample_names = sample_names.sort('file_name')
    
    # Now get the labels of the PE malware samples from the label set.
    counter = 0
    y = []
    #train_names = sorted_train_labels['family_label']
    for fname in sorted_sample_names['file_name']:
        counter += 1
        if counter % 10000 == 1:
            print("Appending {:d} -> {:s}".format(counter, fname))
        for idx, fname2 in enumerate(sorted_train_labels['file_name']):
            if (fname2 == fname):
                y.append(sorted_train_labels.iloc[idx, 4]) # Append the family class label.
                break
    
    ###############################
    # Write out the PE/COFF sample train labels for later use and validation.
    fop = open('data/' + temp_train_labels, 'w')
    fop.writelines("\n".join(str(x) for x in y))
    fop.close()
    ###############################
    
    # Load column subset and sort, then 
    # Perform chi2 test to get 10% best features.
    
    onetenth = int(token_list_len / 10)
    startidx = 1 # skip the filename column
    endidx = onetenth

    for idx in range(0,10):
        if idx == 9:
            endidx = token_list_len + 1 # make sure the final chunk also covers the remainder columns
        print("Processing column set {:d} -> {:d}".format(startidx, endidx))
        column_numbers = [ 0 ] + list(range(startidx, endidx, 1))
        feature_subset = pd.read_csv(feature_set_file, usecols = column_numbers)
        
        # Sort the feature subset on file_name column.
        sorted_feature_subset = feature_subset.sort_values(by='file_name')
        
        X = sorted_feature_subset.iloc[:,1:] # skip the filename column; keep only the feature columns for this subset.

        # Find the top 10 percent variance features.
        print("Sorted feature subset - slice {:d} of 10.".format(idx))
        print("Subset shape: {:d} {:d}".format(X.shape[0], X.shape[1]))
        print("Length of y: {:d}".format(len(y)))
        #sorted_feature_subset.head()
        
        # Now select the 10% best features for this feature subset.
        # Try to make the subset file sizes smaller.
        fsp = SelectPercentile(chi2, percentile=10)
        X_new_10 = fsp.fit_transform(X,y)
        selected_names = fsp.get_support(indices=True)
        selected_names = selected_names + 1 # shift the 0-based support indices past the file_name column.
        
        data_trimmed = sorted_feature_subset.iloc[:,selected_names]
        data_fnames = pd.DataFrame(sorted_feature_subset['file_name'])
        data_reduced = data_fnames.join(data_trimmed)
        
        # Write to file as we do not have enough memory.
        filename = "data/pe-header-temp-" + str(idx) + "-10perc.csv"
        data_reduced.to_csv(filename, index=False)
        
        # TEST AND VALIDATION ONLY.
        ############################################
        #out_subset = sorted_feature_subset.iloc[:,0:2]
        #out_subset.to_csv(filename, index=False)
        print("Writing reduced feature file: {:s}".format(filename))
        ############################################
        
        startidx = endidx
        endidx += onetenth


    return
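# Steps 7-9 of the procedure (recombine the per-chunk files, rerun the chi2
# test on the combined set, and write the final csv) are not implemented above.
# A rough sketch of that second pass, assuming the temp file naming used in
# reduce_feature_set() and the labels written to temp_train_labels; the final
# output filename is hypothetical:
def recombine_reduced_chunks(temp_train_labels, n_chunks=10, percentile=10):
    # Reload the family labels saved by reduce_feature_set().
    with open('data/' + temp_train_labels) as fip:
        y = [line.strip() for line in fip if line.strip()]

    # 7. Recombine the per-chunk reduced feature files on file_name.
    combined = pd.read_csv("data/pe-header-temp-0-10perc.csv")
    for idx in range(1, n_chunks):
        chunk = pd.read_csv("data/pe-header-temp-" + str(idx) + "-10perc.csv")
        combined = combined.merge(chunk, on='file_name')

    # 8. Perform the chi2 test again on the combined reduced feature set.
    X = combined.iloc[:, 1:]
    fsp = SelectPercentile(chi2, percentile=percentile)
    fsp.fit_transform(X, y)
    keep = fsp.get_support(indices=True) + 1  # shift past the file_name column

    # 9. Write out the final reduced feature set to a csv file.
    final = pd.DataFrame(combined['file_name']).join(combined.iloc[:, keep])
    final.to_csv("data/pe-header-final-10perc.csv", index=False)
    return final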
def main(): 
    print 'loading data'
    
    alltext = []
    traindata = p.read_table('train.tsv').replace('?',0)
    traindata['alchemy_category'] = traindata.groupby('alchemy_category').grouper.group_info[0]
    traindata['alchemy_category_score'] = traindata['alchemy_category_score'].astype(float)
    traindata = outlier(np.array(traindata),24)
    print 'no of rows after outlier removal =',len(traindata)
    # traindata = list(np.array(p.read_table('train.tsv'))[:,2])
    testlabels = list(np.array(p.read_table('test.tsv'))[:,1])
    testdata = list(np.array(p.read_table('test.tsv'))[:,2])
    trainlabels = traindata[:,-1]
    traindata = list(traindata[:,2])   
    alltext.extend(traindata)
    alltext.extend(testdata) 
    # print len(alltext)
    trainlabels = np.array(trainlabels).astype(int)           
    testlabels = np.array(testlabels) 
    alltext = np.array(alltext)    
     
    print 'fitting pipeline and transforming data'
    vect = TfidfVectorizer(stop_words='english',min_df=3,max_df=1.0,
                strip_accents='unicode',analyzer='word',ngram_range=(1,2),
                use_idf=1,smooth_idf=1,sublinear_tf=1,tokenizer=LemmaTokenizer()) 
    alltextMatrix = vect.fit_transform(alltext)
    traintext = alltextMatrix[:len(trainlabels)]  
    testtext = alltextMatrix[len(trainlabels):]
 
    print 'applying chi test'
    kf = StratifiedKFold(trainlabels, n_folds=5, indices=True)
    kToTest = [1,3,5,8,10,15,20]
    alphaToTest = [0.0001,0.001,0.01,0.1,0.5,1.0]
    results = np.zeros((len(kToTest),len(alphaToTest)))
    for train,test in kf:
        X_train, X_cv, y_train, y_cv = traintext[train],traintext[test],trainlabels[train],trainlabels[test]
        for i in range(len(kToTest)):
            FS=SelectPercentile(score_func=chi2,percentile=kToTest[i])
            X_FS_train = FS.fit_transform(X_train,y_train)
            X_FS_cv = FS.transform(X_cv)
            for j in range(len(alphaToTest)):
                model = lm.LogisticRegression(penalty='l2', dual=True, tol=alphaToTest[j], 
                             C=1, fit_intercept=True, intercept_scaling=1.0, 
                             class_weight=None, random_state=None)
                model.fit(X_FS_train,y_train)
                results[i][j] += metrics.roc_auc_score(y_cv,model.predict_proba(X_FS_cv)[:,1])
 
    k,alpha = np.nonzero(results == results.max())
    # print 'k = %d alpha = %d'%(k[0],alpha[0]) 
    FS=SelectPercentile(score_func=chi2,percentile=kToTest[k[0]])
    X_train = FS.fit_transform(traintext,trainlabels)
    X_test = FS.transform(testtext)
     
    model = lm.LogisticRegression(penalty='l2', dual=True, tol=alphaToTest[alpha[0]], 
                             C=1, fit_intercept=True, intercept_scaling=1.0, 
                             class_weight=None, random_state=None)
    print "20 Fold CV Score: ", np.mean(cross_validation.cross_val_score(model, X_train, trainlabels, cv=20, scoring='roc_auc'))
    model.fit(X_train,trainlabels)
    outputs = model.predict_proba(X_test)[:,1]
        
    final = np.vstack((testlabels.T.astype(int), outputs.T.astype(float))).T
    file_object = csv.writer(open('Solution.csv', "wb"))
    file_object.writerow(['urlid','label'])
    for i in final:
        file_object.writerow(i)
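# For reference, the nested percentile/tolerance grid in main() could also be
# expressed as a Pipeline searched with GridSearchCV, so the chi2 selection is
# refit inside every fold. A rough sketch against a current scikit-learn API,
# taking the TF-IDF matrix and labels that main() builds as arguments (the
# function name and argument names are assumptions):
def grid_search_percentile(traintext, trainlabels):
    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import GridSearchCV
    from sklearn.feature_selection import SelectPercentile, chi2
    from sklearn.linear_model import LogisticRegression

    pipe = Pipeline([
        ('chi2', SelectPercentile(score_func=chi2)),
        ('logreg', LogisticRegression(penalty='l2', C=1.0, max_iter=1000)),
    ])
    param_grid = {
        'chi2__percentile': [1, 3, 5, 8, 10, 15, 20],
        'logreg__tol': [0.0001, 0.001, 0.01, 0.1, 0.5, 1.0],
    }
    search = GridSearchCV(pipe, param_grid, scoring='roc_auc', cv=5)
    search.fit(traintext, trainlabels)
    return search.best_estimator_, search.best_params_, search.best_score_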
Beispiel #56
0
def buildClassifierWithSplit(classes, examples, featureChoice=None):
	#triggerClassifier = buildTriggerClassifier(positiveTriggerExamples, negativeTriggerExamples)
	
	print "Generating training and test set"
	random.seed(10)
	exampleIDsGroupedByClass = defaultdict(list)
	for i,c in enumerate(classes):
		exampleIDsGroupedByClass[c].append(i)
	
	allClasses = sorted(list(set(classes)))
	trainingIndices = []
	for c in allClasses:
		trainingSetSize = int(0.7 * len(exampleIDsGroupedByClass[c]))
		tmpTrainingIndices = random.sample(exampleIDsGroupedByClass[c], trainingSetSize)
		trainingIndices = trainingIndices + tmpTrainingIndices
	
	testingIndices = [ i for i in range(len(classes)) if not i in trainingIndices ]
	
	trainingClasses = [ classes[i] for i in trainingIndices ]
	trainingExamples = [ examples[i] for i in trainingIndices ]
	testingClasses = [ classes[i] for i in testingIndices ]
	testingExamples = [ examples[i] for i in testingIndices ]
			
	assert len(trainingClasses) == len(trainingExamples)
	assert len(testingClasses) == len(testingExamples)
	
	print "-"*30
	print len(trainingClasses),len(testingClasses)
	for c in allClasses:
		trainingCount = sum ( [ i==c for i in trainingClasses ] )
		testingCount = sum ( [ i==c for i in testingClasses ] )
		print c, trainingCount,testingCount
	for c in allClasses:
		assert c in trainingClasses, 'Class %d should be represented in training set (but is not). Need more data!' % c
		assert c in testingClasses, 'Class %d should be represented in testing set (but is not). Need more data!' % c
			
	print "Training set: %d examples (%d positive and %d negative)" % (len(trainingClasses),sum(trainingClasses),len(trainingClasses)-sum(trainingClasses))
	print "Testing set: %d examples (%d positive and %d negative)" % (len(testingClasses),sum(testingClasses),len(testingClasses)-sum(testingClasses))
	
	print "Starting vectorizer..."
	vectorizer = Vectorizer(trainingClasses,trainingExamples, featureChoice)
	trainingVectors = vectorizer.getTrainingVectors()
	print "Training vectors of size:", trainingVectors.shape
	
	#print trainingClasses
	#print trainingVectors
	#for c,example in zip(trainingClasses,trainingExamples):
	#	s = [ example.sentences[sentenceid].tokens[loc] for sentenceid,locs in example.arguments for loc in locs ]
	#	print ",".join(s)

	perc = 10
	weight = 10
	doTrim = False

	if doTrim:
		print "Trimming training vectors..."
		from sklearn.feature_selection import SelectKBest,SelectPercentile,chi2
		#featureSelector = SelectKBest(chi2, k=100)
		featureSelector = SelectPercentile(chi2, percentile=perc)
		trainingVectorsTrimmed = featureSelector.fit_transform(trainingVectors, trainingClasses)
		print "Trimmed training vectors of size:", trainingVectorsTrimmed.shape
	else:
		trainingVectorsTrimmed = trainingVectors
		featureSelector = None

	print "Creating SVM..."
	clf = svm.SVC(kernel='linear', class_weight={1: weight})
	#clf = svm.SVC()

	#print "Creating MultinomialNB..."
	#from sklearn.naive_bayes import MultinomialNB
	#clf = MultinomialNB(fit_prior=True)
	
	print "Fitting classifier to training data..."
	clf.fit(trainingVectorsTrimmed, trainingClasses)
	
	print "Vectorizing test data..."
	testingVectors = vectorizer.vectorize(testingExamples)

	if doTrim:
		print "Trimming test data..."
		testingVectorsTrimmed = featureSelector.transform(testingVectors)
	else:
		testingVectorsTrimmed = testingVectors
	
	print "Making predictions on test data..."
	predictions = clf.predict(testingVectorsTrimmed)
	
	print "perc=%d, weight=%d, doTrim=%s" % (perc,weight,doTrim)

	from sklearn.metrics import confusion_matrix
	print confusion_matrix(testingClasses, predictions)
	
	from sklearn.metrics import f1_score,precision_score,recall_score,classification_report
	print "f1_score: ", f1_score(testingClasses, predictions)
	print "precision_score: ", precision_score(testingClasses, predictions)
	print "recall_score: ", recall_score(testingClasses, predictions)
	#print classification_report(testingClasses,predictions)
	
	print "Fitting to entire training dataset"
	clf = svm.SVC(kernel='linear', class_weight={1: weight})
	clf.fit(vstack([trainingVectorsTrimmed,testingVectorsTrimmed]), trainingClasses+testingClasses)
	
	return vectorizer,featureSelector,clf
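# A sketch of how a caller might apply the returned (vectorizer, featureSelector, clf)
# triple to unseen data; classes/examples are the training inputs built elsewhere
# and newExamples is a hypothetical list constructed the same way as examples:
vectorizer, featureSelector, clf = buildClassifierWithSplit(classes, examples)
newVectors = vectorizer.vectorize(newExamples)
if featureSelector is not None:
	newVectors = featureSelector.transform(newVectors)
newPredictions = clf.predict(newVectors)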
def build_linear_model(X, y):
	select = SelectPercentile(score_func=chi2, percentile=20)
	clf = SVC(C=10.0, kernel='linear', probability=True)

	X = select.fit_transform(X, y)
	return (clf.fit(X, y), select)
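# A possible usage sketch for build_linear_model(): the returned selector must be
# applied to any held-out matrix before predicting (X_test is a hypothetical
# matrix with the same columns as X):
model, selector = build_linear_model(X, y)
probabilities = model.predict_proba(selector.transform(X_test))[:, 1]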
Beispiel #58
0
from sklearn.feature_selection import SelectPercentile, f_classif

####################################
# load data from from kaggle files - csv
np_train = np.genfromtxt('data/train.csv', delimiter=',', skip_header= True, dtype='uint8')
#np_test = np.genfromtxt('data/test.csv', delimiter=',', skip_header= True, dtype='uint8')


n=6
skf = StratifiedKFold(np_train[:,0].ravel(), n_folds=3, random_state=3476)
predictions=np.zeros_like(np_train[:,0])
for train_index, test_index in skf:
    print(time.ctime())

    # a bit of feature selection
    fscore = SelectPercentile(f_classif, percentile = 50)
    Xtrain = np.copy(fscore.fit_transform(np_train[train_index, 1:], np_train[train_index, 0]))
    Xtest = np.copy(fscore.transform(np_train[test_index, 1:]))

    # define knn
    knn_clf = KNeighborsClassifier(n_neighbors=6, weights='distance', metric='cosine', algorithm='brute')
    knn_clf.fit(Xtrain, np_train[train_index, 0])

    print('here')
    # fitting - this takes long - cca 15min
    predictions[test_index] = knn_clf.predict(Xtest)
    accu = accuracy_score(np_train[test_index, 0], predictions[test_index])
    print("the accuracy of kNN is : %f" % accu)

accu = accuracy_score(np_train[:, 0], predictions)
print("the total valudation accuracy of kNN is : %f" % accu)
Beispiel #59
0
    fileName = r'\trainingSetFeatures.csv'

    # filePath = r'E:\Dropbox\Dropbox\BioInformatics Lab\AA_Information\CODE\Feature_Extract\test_seq\Chap'
    filePath = str(input('Input directory containing TrainingData csv '))

    ## features, labels, lb_encoder,featureNames = load_data(filename, 'file')
    features, labels, lb_encoder,featureNames = load_data(filePath+fileName, 'file')

    X, y = features, labels
    print('len(set(y))', len(set(y)))
    print(X.shape,"X = samples, features")
    scale = StandardScaler(copy=False)
    X = scale.fit_transform(X)

    FD = SelectFdr(alpha=0.0005)
    FD_K = SelectPercentile(percentile=70)
    X = FD.fit_transform(X,y)
    print(X.shape,"X post FDR alpha filter")
    X_FD = FD_K.fit_transform(X,y)
    print(X_FD.shape,"X post FDR + SelectPercentile filter")

    print("\n BASE X models: \n")
    ModelParam_GridSearch(X,y,cv=Kcv)
    '''
    pca = PCA(n_components='mle')
    X_PCA = pca.fit_transform(X)
    print(X_PCA.shape,"X - PCA,mle")
    ModelParam_GridSearch(X_PCA,y,cv=Kcv)
    '''