Example #1
def get_user_feature(feature_type,behavior,num_feature=800):
    X_train = get_features(feature_type,behavior)
    index = X_train.index
    # reduce the dimensionality of X
    Y = pd.read_csv('data/train_Y_%d.csv'%behavior, index_col='user_id')['type']
    print('start SelectKBest...')
    # select = SelectKBest(chi2,k=min(num_feature,X_train.shape[1]))
    percent = 0
    if feature_type == 'cat_id':
        percent = 60
    elif feature_type == 'brand_id':
        percent = 15
    elif feature_type == 'seller_id':
        percent = 20
    select = SelectPercentile(f_classif, percentile=percent)
    select.fit(X_train,Y)
    X_train = select.transform(X_train)

    print('end select...')
    print('write %s features to train file' % feature_type)
    train_feature_file_name = 'data/train_feature_%s_%d.csv' % (feature_type,behavior)
    DataFrame(X_train,index=index).to_csv(train_feature_file_name)

    # apply the same column selection to the corresponding test set
    X_test = get_features(feature_type,behavior,is_train=False)
    index = X_test.index
    X_test = select.transform(X_test)
    # write to file
    print('write %s features to test file' % feature_type)
    test_feature_file_name = 'data/test_feature_%s_%d.csv' % (feature_type,behavior)
    DataFrame(X_test,index=index).to_csv(test_feature_file_name)
    print('end....')
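The snippet above is excerpted from a larger script, so its import block is not shown. A minimal sketch of the imports it appears to rely on (an assumption, not part of the original file):

import pandas as pd
from pandas import DataFrame
from sklearn.feature_selection import SelectKBest, SelectPercentile, chi2, f_classif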
Example #2
def test_boundary_case_ch2():
    # Test boundary case, and always aim to select 1 feature.
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])
    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4.0, 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    filter_fdr = SelectFdr(chi2, alpha=0.1)
    filter_fdr.fit(X, y)
    support_fdr = filter_fdr.get_support()
    assert_array_equal(support_fdr, np.array([True, False]))

    filter_kbest = SelectKBest(chi2, k=1)
    filter_kbest.fit(X, y)
    support_kbest = filter_kbest.get_support()
    assert_array_equal(support_kbest, np.array([True, False]))

    filter_percentile = SelectPercentile(chi2, percentile=50)
    filter_percentile.fit(X, y)
    support_percentile = filter_percentile.get_support()
    assert_array_equal(support_percentile, np.array([True, False]))

    filter_fpr = SelectFpr(chi2, alpha=0.1)
    filter_fpr.fit(X, y)
    support_fpr = filter_fpr.get_support()
    assert_array_equal(support_fpr, np.array([True, False]))

    filter_fwe = SelectFwe(chi2, alpha=0.1)
    filter_fwe.fit(X, y)
    support_fwe = filter_fwe.get_support()
    assert_array_equal(support_fwe, np.array([True, False]))
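The boundary-case test above exercises chi2 together with several univariate selectors on the same 3x2 toy matrix. A sketch of the imports needed to run it standalone (assumed; the original test module's header is not shown):

import numpy as np
from numpy.testing import assert_array_almost_equal, assert_array_equal
from sklearn.feature_selection import (
    SelectFdr, SelectFpr, SelectFwe, SelectKBest, SelectPercentile, chi2)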
Example #3
def preprocess(words_file = "word_data.pkl", authors_file="email_authors.pkl"):
    ### the words (features) and authors (labels), already largely preprocessed
    ### this preprocessing will be repeated in the text learning mini-project
    authors_file_handler = open(authors_file, "r")
    authors = pickle.load(authors_file_handler)
    authors_file_handler.close()

    words_file_handler = open(words_file, "r")
    word_data = cPickle.load(words_file_handler)
    words_file_handler.close()

    ### test_size is the percentage of events assigned to the test set
    ### (remainder go into training)
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)

    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed  = vectorizer.transform(features_test)



    ### feature selection, because text is super high dimensional and 
    ### can be really computationally chewy as a result
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed  = selector.transform(features_test_transformed).toarray()

    ### info on the data
    print "no. of Enrique training emails:", sum(labels_train)
    print "no. of Juan training emails:", len(labels_train)-sum(labels_train)
    
    return features_train_transformed, features_test_transformed, labels_train, labels_test
Example #4
def preprocess(article_file, lable_file, k):

    features = pickle.load(open(article_file))
    features = np.array(features)

    # transform non-numerical labels (as long as they are hashable and comparable) to numerical labels
    lables = pickle.load(open(lable_file))
    le = preprocessing.LabelEncoder()
    le.fit(lables)
    lables = le.transform(lables)
    # print le.inverse_transform([0])

    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, min_df=1,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features)

    # selector : SelectPercentile
    selector = SelectPercentile(f_classif, percentile=k)
    selector.fit(features_train_transformed, lables)

    # selector : chi2
    # selector = SelectPercentile(score_func=chi2)
    # selector.fit(features_train_transformed, lables)

    features_train_transformed = selector.transform(features_train_transformed).toarray()

    return features_train_transformed, lables, vectorizer, selector, le, features
Example #5
def selectFeatures(features, labels, features_list):
    '''
    Select the features in the top 20th percentile of scores.
    Return a list of the selected features and a dataframe ranking each
    feature by its p-value-based score.
    features: numpy array with the features to be used to test sklearn models
    labels: numpy array with the real output 
    features_list: a list of names of each feature
    '''
    #feature selection
    selector = SelectPercentile(f_classif, percentile=20)
    selector.fit(features, labels)
    features_transformed = selector.transform(features)
    #filter names to be returned
    l_rtn = [x for x, t in zip(features_list, 
        list(selector.get_support())) if t]
    # pd.DataFrame(features_transformed, columns = l_labels2).head()
    #calculate scores
    scores = -np.log10(selector.pvalues_)
    scores /= scores.max()
    df_rtn = pd.DataFrame(pd.Series(dict(zip(features_list,scores))))
    df_rtn.columns = ["pValue_Max"]
    df_rtn = df_rtn.sort_values("pValue_Max", ascending=False)
    # df_rtn["different_from_zero"]=((df!=0).sum()*1./df.shape[0])


    return l_rtn, df_rtn
Example #6
def select_features(X,y):
    selector = SelectPercentile(f_classif, percentile=10)
    print "fit selector"
    selector.fit(X, y)
    print "transform features"
    X = selector.transform(X)
    return X,selector
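A quick way to exercise select_features on synthetic data (a sketch; make_classification and the names below are illustrative, not part of the original example):

from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=200, n_features=50, random_state=0)
X_reduced, fitted_selector = select_features(X_demo, y_demo)
print(X_reduced.shape)  # percentile=10 keeps roughly the top 10% of 50 columns, i.e. 5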
Example #7
def test(X, y):
       
    ###############################################################################
    # Univariate feature selection with F-test for feature scoring
    # Here we keep the 20% most significant features
    selector = SelectPercentile(f_regression, percentile=20)
    selector.fit(X, y)
    print([zero_based_index for zero_based_index in list(selector.get_support(indices=True))])
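A sketch of calling test on synthetic regression data (make_regression and the names below are assumptions for illustration):

from sklearn.datasets import make_regression

X_demo, y_demo = make_regression(n_samples=200, n_features=20, n_informative=4,
                                 random_state=0)
test(X_demo, y_demo)  # prints the zero-based indices of the ~20% of columns kept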
Example #8
def preprocess(words_file = "../tools/word_data.pkl", authors_file="../tools/email_authors.pkl"):
    """ 
        this function takes a pre-made list of email texts (by default word_data.pkl)
        and the corresponding authors (by default email_authors.pkl) and performs
        a number of preprocessing steps:
            -- splits into training/testing sets (10% testing)
            -- vectorizes into tfidf matrix
            -- selects/keeps most helpful features

        after this, the features and labels are put into numpy arrays, which play nice with sklearn functions

        4 objects are returned:
            -- training/testing features
            -- training/testing labels

    """

    ### the words (features) and authors (labels), already largely preprocessed
    ### this preprocessing will be repeated in the text learning mini-project
    authors_file_handler = open(authors_file, "r")
    authors = pickle.load(authors_file_handler)
    authors_file_handler.close()

    words_file_handler = open(words_file, "r")
    word_data = cPickle.load(words_file_handler)
    words_file_handler.close()

    ### test_size is the percentage of events assigned to the test set
    ### (remainder go into training)
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)



    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed  = vectorizer.transform(features_test)



    ### feature selection, because text is super high dimensional and 
    ### can be really computationally chewy as a result

    # selector = SelectPercentile(f_classif, percentile=10)

    ## <Temporary hack for Lesson 3>
    selector = SelectPercentile(f_classif, percentile=1)

    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed  = selector.transform(features_test_transformed).toarray()

    ### info on the data
    print "no. of Chris training emails:", sum(labels_train)
    print "no. of Sara training emails:", len(labels_train)-sum(labels_train)
    
    return features_train_transformed, features_test_transformed, labels_train, labels_test
Example #9
def selectFeatures(Model, X, y):
    model = Model()
    fsel = SelectPercentile(score_func=f_classif, percentile=5)
    fsel.fit(X, y)
    arr = fsel.get_support()
    print "features: ", np.where(arr == True)
    plt.hist(model.predict(X))
    plt.hist(y)
    plt.show()
Example #10
    def getWeights(self):
        # Univariate feature selection with F-test for feature scoring
        # We use the default selection function: the 10% most significant features
        selector = SelectPercentile(f_classif, percentile=10)
        selector.fit(self.X, self.y)
        scores = -np.log10(selector.pvalues_)
        scores /= float(scores.max())

        return scores
Example #11
def selectFeatures(X, y):
    # feature selection with F-test for feature scoring
    # 10% most significant features
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(X, y)

    scores = -np.log10(selector.pvalues_)
    scores /= scores.max()

    return selector, scores
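The scores returned above are normalized -log10 p-values, so values close to 1.0 mark the most significant features. A minimal sketch on synthetic data (names are illustrative):

import numpy as np
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=300, n_features=20,
                                     n_informative=4, random_state=0)
selector, scores = selectFeatures(X_demo, y_demo)
print(np.argsort(scores)[::-1][:4])  # indices of the four highest-scoring features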
Example #12
def eval(ds, testNum, p, splitProportion=0.2):
    #testNum=1
    #splitProportion=0.2
    
    allFeaturesF1=[]
    allFeaturesRecall=[]
    allFeaturesPrecision=[]
    
    featureSelctedF1=[]
    featureSelctedRecall = []
    featureSelctedPrecision = []
    
    for _ in range(testNum):
        tstdata, trndata = ds.splitWithProportion( splitProportion )
        X, Y = labanUtil.fromDStoXY(trndata)
        X_test, Y_test = labanUtil.fromDStoXY(tstdata)
        #localF1s = []
        #localRecalls = []
        #localPercisions = []
        for y, y_test in zip(Y, Y_test):
            if all(v == 0 for v in y):
                continue
            #clf = LinearSVC()#fit_intercept=True, C=p)
            #clf.sparsify()
            
            #clf = RandomForestClassifier()#criterion='entropy')
            #clf = tree.DecisionTreeClassifier()#max_depth=p)
            clf = AdaBoostClassifier()
            #clf = GradientBoostingClassifier()#, learning_rate=lr)
            #clf = ExtraTreesClassifier(n_estimators=p)
                        
            #svc = LinearSVC()
            #selector = RFE(estimator=svc, n_features_to_select=p*19, step=0.2)
            selector = SelectPercentile(chooser, percentile=p)
            
            selector.fit(X, y)
            name = str(clf).split()[0].split('(')[0]
            clf.fit(selector.transform(X), y)
            pred = clf.predict(selector.transform(X_test))
            
            featureSelctedF1.append(metrics.f1_score(y_test, pred))
            featureSelctedRecall.append(metrics.recall_score(y_test, pred))
            featureSelctedPrecision.append(metrics.precision_score(y_test, pred)) 
            
            clf.fit(X, y)
            pred = clf.predict(X_test)
            
            allFeaturesF1.append(metrics.f1_score(y_test, pred))
            allFeaturesRecall.append(metrics.recall_score(y_test, pred))
            allFeaturesPrecision.append(metrics.precision_score(y_test, pred))

    return np.mean(allFeaturesF1), np.mean(featureSelctedF1), \
        np.mean(allFeaturesRecall), np.mean(featureSelctedRecall), \
        np.mean(allFeaturesPrecision), np.mean(featureSelctedPrecision), \
        name
Example #13
def predict(classifier_type="tree",selection="Univariate", f="1"):

	if (f=="1"):
		kc_fn = "GS_pickles\kmeans_Genes_87_1x_v3.pkl"
		p = 1
		BIG_C = 0.001
	if (f=="2"):
		kc_fn = "GS_pickles\kmeans_Genes_433_50x_v2.pkl"
		p = 5
		BIG_C = 0.1
	if (f=="3"):
		kc_fn = "GS_pickles\kmeans_Genes_2163_20x_v1.pkl"
		p = 25
		BIG_C = 2
	dump_data = False
	kernel_type = "linear"
	(data_matrix, features, samples) = readData()
	x = data_matrix.data
	y = data_matrix.target
	target_names = data_matrix.target_names
	x_indices = np.arange(x.shape[-1])
	(m,n) = x.shape

	test = joblib.load("GS_pickles\imputed_test_data.pkl")
	test_x = np.array(test)
	(i,j) = test_x.shape
	print "Training matrix shape: %s,%s" %(m,n)
	print "Test matrix shape: %s,%s" %(i,j)

	trimmed_x = []
	trimmed_test_x = []

	if (selection=="Univariate"):
		selector = SelectPercentile(f_classif, percentile=p)
		selector.fit(x, y)
		# Trimming the matrix, now should contain x% of the 8650 features
		trimmed_x = selector.transform(x)
		trimmed_test_x = selector.transform(test_x)

	if (selection=="kclusters"):
		kcluster_flist = joblib.load(kc_fn)
		trimmed_x = np.take(x, kcluster_flist, axis=1)
		trimmed_test_x = np.take(test_x, kcluster_flist, axis=1)

	n_samples, n_features = trimmed_x.shape
	# Linear SVM classifier
	if (classifier_type=="SVM"):
		clf = svm.SVC(kernel=kernel_type, degree=3, probability=True)
	# Gaussian Naive Bayes classifier
	if (classifier_type=="NB"):
		clf = GaussianNB()
	clf.fit(trimmed_x,y)

	result = clf.predict(trimmed_test_x)
	return result
Example #14
def univariate_feature_selection(dataset, features):
	# load the dataset
	spreadsheet = Spreadsheet('../../Downloads/ip/project data.xlsx')
	data = Data(spreadsheet)
	targets = data.targets


	X = dataset
	y = data.targets


	###############################################################################
	plt.figure(1)
	plt.clf()

	X_indices = np.arange(X.shape[-1])

	###############################################################################
	# Univariate feature selection with F-test for feature scoring
	# We use the default selection function: the 10% most significant features
	selector = SelectPercentile(f_classif, percentile=10)
	selector.fit(X, y)
	scores = -np.log10(selector.pvalues_)
	scores /= scores.max()
	plt.bar(X_indices - .45, scores, width=.2,
	        label=r'Univariate score ($-Log(p_{value})$)', color='g')

	###############################################################################
	# Compare to the weights of an SVM
	clf = svm.SVC(kernel='linear')
	clf.fit(X, y)

	svm_weights = (clf.coef_ ** 2).sum(axis=0)
	svm_weights /= svm_weights.max()

	plt.bar(X_indices - .25, svm_weights, width=.2, label='SVM weight', color='r')

	clf_selected = svm.SVC(kernel='linear')
	clf_selected.fit(selector.transform(X), y)

	svm_weights_selected = (clf_selected.coef_ ** 2).sum(axis=0)
	svm_weights_selected /= svm_weights_selected.max()

	plt.bar(X_indices[selector.get_support()] - .05, svm_weights_selected,
	        width=.2, label='SVM weights after selection', color='b')


	x = np.arange(0, len(features))
	plt.title("Comparing feature selection")
	plt.xlabel('Feature number')
	plt.xticks(x, features, rotation=45)
	plt.yticks(())
	#plt.axis('tight')
	plt.legend(loc='upper right')
	plt.show()
Example #15
def preprocess(words_file = "../tools/word_data.pkl", authors_file="../tools/email_authors.pkl"):
    """ 
        this function takes a pre-made list of email texts (by default word_data.pkl)
        and the corresponding authors (by default email_authors.pkl) and performs
        a number of preprocessing steps:
            -- splits into training/testing sets (10% testing)
            -- vectorizes into tfidf matrix
            -- selects/keeps most helpful features

        after this, the features and labels are put into numpy arrays, which play nice with sklearn functions

        4 objects are returned:
            -- training/testing features
            -- training/testing labels

    """

    ### the words (features) and authors (labels), already largely preprocessed
    ### this preprocessing will be repeated in the text learning mini-project
    # read a vector of documents from file(decoded) 
    authors_file_handler = open(authors_file, "r")
    authors = pickle.load(authors_file_handler)
    authors_file_handler.close()

    # read a vector of labels/authors from file(decoded) 
    words_file_handler = open(words_file, "r")
    word_data = cPickle.load(words_file_handler)
    words_file_handler.close()


    ### test_size is the percentage of events assigned to the test set
    ### (remainder go into training)
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)
    # features_train,features_test  is a vector of sentences

    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed  = vectorizer.transform(features_test) # no fitting here. So the idf is the one calculated initially
    # returns sparse matrix(N*M) where N = each document/sample, M gives tf*invdf weightage of current feature word in document. 
    ### feature selection, because text is super high dimensional and 
    ### can be really computationally chewy as a result
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(features_train_transformed, labels_train) # select top k% best features using univariate statistical tests
    features_train_transformed = selector.transform(features_train_transformed).toarray() # select the columns based on the stats test
    features_test_transformed  = selector.transform(features_test_transformed).toarray() # do as above

    ### info on the data
    #print "no. of Chris training emails:", sum(labels_train)
    #print "no. of Sara training emails:", len(labels_train)-sum(labels_train)
    
    return features_train_transformed, features_test_transformed, labels_train, labels_test
Example #16
def preprocess(words_file="../tools/word_data.pkl",
               authors_file="../tools/email_authors.pkl"):
    """ 
        this function takes a pre-made list of email texts (by default word_data.pkl)
        and the corresponding authors (by default email_authors.pkl) and performs
        a number of preprocessing steps:
            -- splits into training/testing sets (10% testing)
            -- vectorizes into tfidf matrix
            -- selects/keeps most helpful features
        after this, the features and labels are put into numpy arrays, which play nice with sklearn functions
        4 objects are returned:
            -- training/testing features
            -- training/testing labels
    """

    ### the words (features) and authors (labels), already largely preprocessed
    ### this preprocessing will be repeated in the text learning mini-project
    authors_file_handler = open(authors_file, "rb")
    authors = pickle.load(authors_file_handler)
    authors_file_handler.close()

    words_file_handler = open(words_file, "rb")
    word_data = pickle.load(words_file_handler)
    words_file_handler.close()

    ### test_size is the percentage of events assigned to the test set
    ### (remainder go into training)
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
        word_data, authors, test_size=0.1, random_state=42)

    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True,
                                 max_df=0.5,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed = vectorizer.transform(features_test)

    ### feature selection, because text is super high dimensional and
    ### can be really computationally chewy as a result
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(
        features_train_transformed).toarray()
    features_test_transformed = selector.transform(
        features_test_transformed).toarray()

    ### info on the data
    print("no. of Chris training emails:", sum(labels_train))
    print("no. of Sara training emails:",
          len(labels_train) - sum(labels_train))

    return features_train_transformed, features_test_transformed, labels_train, labels_test
Example #17
def features_weight(X, y, ascending=False):
    '''
    Parameters
    ----------
        X: pd.DataFrame
        y: pd.Series
    '''
    selector = SelectPercentile(f_classif)
    selector.fit(X.values, y.values)
    scores = -np.log10(selector.pvalues_)
    scores /= scores.max()
    ans = pd.Series(scores, index=X.columns)
    return ans.sort_values(ascending=ascending)
Example #18
    def select_percentile(self, feature_train, label_train, feature_test):
        """
        parameter:
            feature_train: array of shape [n_samples, n_features]
            feature_test: array of shape [n_samples, n_features]
        return:
            array of shape [n_samples, n_selected_features], array of shape [n_samples, n_selected_features]
        """
        selector = SelectPercentile(percentile=self.__context_manager.percentile)
        selector.fit(feature_train, label_train)
        feature_train, feature_test = selector.transform(feature_train).toarray(), selector.transform(feature_test).toarray()

        return feature_train, feature_test
Example #19
def select(p, x_train, x_test, y_train, y_test):
    #copy dataframes
    x_train_selected = x_train.copy()
    x_test_selected = x_test.copy()
    #p: percentage of remaining columns
    select = SelectPercentile(percentile=p)
    select.fit(x_train_selected, y_train)
    x_train_selected = select.transform(x_train_selected)
    x_test_selected = select.transform(x_test_selected)
    #train & test
    lr_selected = skl_lm.LogisticRegression()
    lr_selected.fit(x_train_selected, y_train)
    return (lr_selected.score(x_test_selected, y_test), select.get_support())
Example #20
def selectFeatures(features_train,
                   labels_train,
                   features_test,
                   percentile=10,
                   runInfo=None):
    selector = SelectPercentile(f_classif, percentile=percentile)
    selector.fit(features_train, labels_train)
    features_train_transformed = selector.transform(features_train)
    features_test_transformed = selector.transform(features_test)
    if runInfo is not None:
        runInfo["Selected Features:"] = "Perc = {}, Num = {}".format(
            percentile, len(features_train_transformed[0]))
    return features_train_transformed, features_test_transformed
Example #21
def make_train_test(df_train, df_test):
    vectorizer = CountVectorizer()
    
    X_train = vectorizer.fit_transform(df_train['Phrase'].values)
    Y_train = df_train['Sentiment'].values
    X_test = vectorizer.transform(df_test['Phrase'].values)
    
    selector = SelectPercentile(f_classif, percentile=50)
    selector.fit(X_train, Y_train)
    features_train_transformed = selector.transform(X_train)
    features_test_transformed  = selector.transform(X_test)
    
    return features_train_transformed, Y_train, features_test_transformed
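A hedged usage sketch for make_train_test with tiny hand-made DataFrames (the 'Phrase' and 'Sentiment' column names match the function above; the data itself is made up):

import pandas as pd

df_train_demo = pd.DataFrame({
    'Phrase': ['a great movie', 'a dull movie', 'great acting', 'a dull plot'],
    'Sentiment': [1, 0, 1, 0],
})
df_test_demo = pd.DataFrame({'Phrase': ['great plot', 'dull acting']})

X_tr, y_tr, X_te = make_train_test(df_train_demo, df_test_demo)
print(X_tr.shape, X_te.shape)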
Example #22
 def fselect_unstat(self, prec=20):
     """ use p value to exclude variables. prec is precentige of features selceted"""
     from sklearn.feature_selection import SelectPercentile
     select = SelectPercentile(percentile=prec)
     select.fit(self.X1, self.y1)
     self.X1 = select.transform(self.X1)
     self.X2 = select.transform(self.X2)
     if self.X3:
         self.X3 = select.transform(self.X3)
     print("Selected feat using unsata model:",
           self.head[select.get_support()],
           len(self.head[select.get_support()]))
     self.head = self.head[select.get_support()]
Example #23
def select_best_perc(selected_percentile, features_train, labels_train, features_test):
    ''' Select features with SelectPercentile '''
    select = SelectPercentile(percentile=selected_percentile)
    # Fit data
    select.fit(features_train, labels_train)
    # Get features score
    feature_scores = np.array(select.scores_)
    mask = select.get_support()
    # Transform features and labels
    features_train_selected = select.transform(features_train)
    features_test_selected = select.transform(features_test)

    return mask, features_train_selected, features_test_selected, feature_scores
Example #24
def select_percentile():
    x = [[0.6, 2, 3], [2.5, 4, 6], [3.4, 6.2, 9.4]]
    y = [1, 2, 3]
    print(x)

    selector = SelectPercentile(score_func=f_regression, percentile=100)
    selector.fit(x, y)

    print(selector.scores_)
    print(selector.pvalues_)
    print(selector.get_support(True))
    print(selector.transform(x))
    pass
Example #25
def preprocess(
        main_file="underwriter.csv",
        plan_file="plan_name.csv"):  #include pkl file (if csv not working)
    """ 
        this function takes a pre-made list of plan names (by default underwriter.csv)
        and the corresponding authors (by default plan_name.csv) and performs
        a number of preprocessing steps:
            -- splits into training/testing sets (10% testing)
            -- vectorizes into tfidf matrix
            -- selects/keeps most helpful features

        after this, the features and labels are put into numpy arrays, which play nice with sklearn functions

        4 objects are returned:
            -- training/testing features
            -- training/testing labels

    """

    ### the words (features) and authors (labels), already largely preprocessed
    ### this preprocessing will be repeated in the text learning mini-project
    plan_file_handler = open(plan_file, "r")
    plans = pickle.load(plan_file_handler)
    plan_file_handler.close()

    main_file_handler = open(main_file, "r")
    main_data = cPickle.load(main_file_handler)
    main_file_handler.close()

    ### test_size is the percentage of events assigned to the test set
    ### (remainder go into training)
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
        main_data, plans, test_size=0.1, random_state=42)

    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True,
                                 max_df=0.5,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed = vectorizer.transform(features_test)

    ### feature selection, because text is super high dimensional and
    ### can be really computationally chewy as a result
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(
        features_train_transformed).toarray()
    features_test_transformed = selector.transform(
        features_test_transformed).toarray()

    return features_train_transformed, features_test_transformed, labels_train, labels_test
Example #26
def sel_per(X_train, y_train, X_test, y_test):
    sel_per = SelectPercentile(percentile=50)   # use f_classif and 50% percentile
    sel_per.fit(X_train, y_train)
    X_train_selected = sel_per.transform(X_train)  # select X train
    print('X_train_shape: {}'.format(X_train.shape))
    print('X_train_selected.shape: {}'.format(X_train_selected.shape))
    mask = sel_per.get_support()
    X_test_selected = sel_per.transform(X_test)
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    print('LR score with all features: {:.3f}'.format(lr.score(X_test, y_test)))
    lr.fit(X_train_selected, y_train)
    print('LR score with selected features: {:.3f}'.format(lr.score(X_test_selected, y_test)))
    return mask
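One way to run sel_per end to end (a sketch on sklearn's breast-cancer data; it assumes LogisticRegression and SelectPercentile are already imported as in the snippet above):

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

cancer = load_breast_cancer()
X_tr, X_te, y_tr, y_te = train_test_split(cancer.data, cancer.target, random_state=0)
mask = sel_per(X_tr, y_tr, X_te, y_te)
print(mask.sum(), 'of', mask.size, 'features kept')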
Example #27
 def getFeatures(self, number_of_features=10):
  # X = self.training.iloc[:,:,-1]
  y = self.training['TARGET']
  X = self.training.drop(['TARGET'], axis=1)
  #Select features according to the k highest scores.
  #selectFeatures = SelectKBest(chi2, k=number_of_features)
  #Select the best number_of_features percentile
  # We could use another score function for selection as well, e.g. chi2
  selectFeatures = SelectPercentile(f_classif, percentile= number_of_features)
  selectFeatures.fit(X, y)
  # X_select = selectFeatures.transform(X)
  features = selectFeatures.get_support(indices=True)
  # print("Best feature: "+ features[0])
  return(features) 
Example #28
def compute_feature_statistics(train_X, train_Y):
    ''' 
    Univariate Feature Selection - see sklearn:
    http://scikit-learn.org/dev/auto_examples/plot_feature_selection.html#example-plot-feature-selection-py
    
    Features are not removed, only statistics computed
    '''

    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(train_X, train_Y)
    scores = -np.log10(selector.pvalues_)
    scores /= scores.max()

    return scores, selector
Example #29
def prepareTrainingData():
    # Read the training data and cast it to numpy arrays
    labels = np.array(getLabels())
    features = np.array(featureLigands(getLigands(), setAllFeatures()))
    # Scale the training data with MinMaxScaler
    scaler = preprocessing.MinMaxScaler()
    scaled_features = scaler.fit_transform(features)
    # Feature selection keeping the top 10th percentile
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(scaled_features, labels)
    # Save the selector so it can be reused on new input later
    joblib.dump(selector, '../selector/selector.pkl')
    selected_features = selector.transform(scaled_features)
    return selected_features, labels
Example #30
class FeatureSelection:
    """
    Feature selection

    percentile: the percentage of features to keep
    
    """
    def __init__(self,percentile=70):
        self.percentile=percentile
    def fit(self,x,y):
        self.sepChi=SelectPercentile(score_func=chi2,percentile=self.percentile)  # use chi-square
        self.sepChi.fit(x,y)
    def transform(self,x,y):
        return (self.sepChi.transform(x),y)    
Example #31
def preprocess_4(article_file, lable_file):
    # article_file = "pkl/2013_article.pkl"
    # lable_file = "pkl/2013_lable.pkl"

    features = pickle.load(open(article_file))
    features = np.array(features)

    # transform non-numerical labels (as long as they are hashable and comparable) to numerical labels
    lables = pickle.load(open(lable_file))
    le = preprocessing.LabelEncoder()
    le.fit(lables)
    lables = le.transform(lables)

    ### test_size is the percentage of events assigned to the test set (remainder go into training)
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(features, lables, test_size=0.1, random_state=42)

    # print features_train.shape
    # print features_test[0]
    # print features_test.shape


    ### text vectorization--go from strings to lists of numbers
    t0 = time()
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, min_df=1,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed  = vectorizer.transform(features_test)

    # print "features_train_transformed is {}".format(features_train_transformed.shape)
    # print "features_test_transformed is {}".format(features_test_transformed.shape)
    # print "vectorizer time:", round(time()-t0, 3), "s"
    # print len(vectorizer.get_feature_names())

    ### feature selection, because text is super high dimensional and
    ### can be really computationally chewy as a result
    t0 = time()
    selector = SelectPercentile(f_classif, percentile=30)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed  = selector.transform(features_test_transformed).toarray()

    # print "features_train_transformed is {}".format(features_train_transformed.shape)
    # print "features_test_transformed is {}".format(features_test_transformed.shape)
    # print "selector time:", round(time()-t0, 3), "s"

    # print len(vectorizer.get_feature_names())
    # print vectorizer.get_feature_names()[0:-10]
    # print len(selector.scores_)

    return features_train_transformed, features_test_transformed, labels_train, labels_test
Example #32
def preprocesa(features, labels):
    features_train, features_test,labels_train,labels_test=cross_validation.train_test_split(features,labels,test_size=0.1,random_state=42)
    vectorizer = TfidfVectorizer(sublinear_tf = True,  max_df = 0.5, stop_words = "english")
    features_train = vectorizer.fit_transform(features_train)
    features_test = vectorizer.transform(features_test)
    joblib.dump(vectorizer, 'vectorizer.pkl')

    selector = SelectPercentile(f_classif, percentile = 10)
    selector.fit(features_train, labels_train)
    joblib.dump(selector, 'selector.pkl')
    features_test =selector.transform(features_test).toarray()
    features_train = selector.transform(features_train).toarray()

    return features_train, features_test, labels_test, labels_train
Example #33
def Preprocess(
        words_file="/home/mohamed/python/sherlok-tools/helpers/word_data.pkl",
        labels_file="/home/mohamed/python/sherlok-tools/helpers/label_data.pkl"
):
    """ 
        this function takes a pre-made list of data texts (by default word_data.pkl)
        and the corresponding labels (by default label_data.pkl) and performs
        a number of preprocessing steps:
            -- splits into training/testing sets (10% testing)
            -- vectorizes into tfidf matrix
            -- selects/keeps most helpful features

        after this, the features and labels are put into numpy arrays, which play nice with sklearn functions

        4 objects are returned:
            -- training/testing features
            -- training/testing labels

    """

    ### the words (features) and labels (positive or negative)
    word_data = pickle.load(open(words_file, "r"))
    labels = pickle.load(open(labels_file, "r"))

    ### test_size is the percentage of events assigned to the test set (remainder go into training)
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
        word_data, labels, test_size=0.1, random_state=42)

    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True,
                                 max_df=0.5,
                                 encoding='windows-1256')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed = vectorizer.transform(features_test)

    ### feature selection, because text is super high dimensional and
    ### can be really computationally chewy as a result
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(
        features_train_transformed).toarray()
    features_test_transformed = selector.transform(
        features_test_transformed).toarray()

    ### info on the data
    print "no. of positive training files:", sum(labels_train)
    print "no. of negative training files:", len(labels_train) - sum(
        labels_train)

    return features_train_transformed, features_test_transformed, labels_train, labels_test
Example #34
def compute_feature_statistics(train_X, train_Y):
    """ 
    Univariate Feature Selection - see sklearn:
    http://scikit-learn.org/dev/auto_examples/plot_feature_selection.html#example-plot-feature-selection-py
    
    Features are not removed, only statistics computed
    """

    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(train_X, train_Y)
    scores = -np.log10(selector.pvalues_)
    scores /= scores.max()

    return scores, selector
Example #35
def construct_features(train_X, train_Y, test_X, have_poly=True):
    
    # find the most important features
    sel = SelectPercentile(f_regression, percentile=20)
    sel.fit(train_X, train_Y)
    sup = sel.get_support()

    sel_idx = np.where(sup == True)[0]
    sel_names = [names[i] for i in sel_idx]
    sel_names = [n for i,n in enumerate(sel_names) if floats[sel_idx[i]] == True]
    
    # feature construction
    d = {}  # training
    d_t = {}  # testing

    # construct features by combining 2 different features
    for i, n1 in enumerate(sel_names):
        for j, n2 in enumerate(sel_names):
            if i != j:
                new_col = train_X[n1] * train_X[n2]
                new_col_t = test_X[n1] * test_X[n2]
                new_name = n1 + '*' + n2
                d[new_name] = new_col
                d_t[new_name] = new_col_t
    comb_X = pandas.DataFrame(data=d)
    comb_X_t = pandas.DataFrame(data=d_t)

    if have_poly is False:
        new_X = train_X.join(comb_X)
        new_test = test_X.join(comb_X_t)
        return new_X, new_test

    # construct features by making polynomial terms
    float_names = [n for i,n in enumerate(names) if floats[i] == True]
    quad_X = train_X[float_names] ** 2
    quad_X_t = test_X[float_names] ** 2
    quad_X.columns = [n + '^2' for n in float_names]
    quad_X_t.columns = [n + '^2' for n in float_names]
    tri_X = train_X[float_names] ** 3
    tri_X_t = test_X[float_names] ** 3
    tri_X.columns = [n + '^3' for n in float_names]
    tri_X_t.columns = [n + '^3' for n in float_names]
    poly_X = quad_X.join(tri_X)
    poly_X_t = quad_X_t.join(tri_X_t)
    comb_X = comb_X.join(poly_X)
    comb_X_t = comb_X_t.join(poly_X_t)
    new_X = train_X.join(comb_X)
    new_test = test_X.join(comb_X_t)
    
    return new_X, new_test
Example #36
def test_select_percentile_4():
    """Ensure that the TPOT select percentile outputs the same result as sklearn select percentile when 0 < percentile < 100"""
    tpot_obj = TPOT()
    non_feature_columns = ['class', 'group', 'guess']
    training_features = training_testing_data.loc[training_testing_data['group'] == 'training'].drop(non_feature_columns, axis=1)
    training_class_vals = training_testing_data.loc[training_testing_data['group'] == 'training', 'class'].values

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=UserWarning)
        selector = SelectPercentile(f_classif, percentile=42)
        selector.fit(training_features, training_class_vals)
        mask = selector.get_support(True)
    mask_cols = list(training_features.iloc[:, mask].columns) + non_feature_columns

    assert np.array_equal(tpot_obj._select_percentile(training_testing_data, 42), training_testing_data[mask_cols])
Example #37
 def test_select_percentile_float(self):
     model = SelectPercentile()
     X = np.array(
         [[1, 2, 3, 1], [0, 3, 1, 4], [3, 5, 6, 1], [1, 2, 1, 5]],
         dtype=np.float32)
     y = np.array([0, 1, 0, 1])
     model.fit(X, y)
     model_onnx = convert_sklearn(
         model, "select percentile",
         [("input", FloatTensorType([None, X.shape[1]]))],
         target_opset=TARGET_OPSET)
     self.assertTrue(model_onnx is not None)
     dump_data_and_model(
         X, model, model_onnx,
         basename="SklearnSelectPercentile")
Example #38
def test_select_percentile_4():
    """Ensure that the TPOT select percentile outputs the same result as sklearn select percentile when 0 < percentile < 100"""
    tpot_obj = TPOT()
    non_feature_columns = ['class', 'group', 'guess']
    training_features = training_testing_data.loc[training_testing_data['group'] == 'training'].drop(non_feature_columns, axis=1)
    training_class_vals = training_testing_data.loc[training_testing_data['group'] == 'training', 'class'].values

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=UserWarning)
        selector = SelectPercentile(f_classif, percentile=42)
        selector.fit(training_features, training_class_vals)
        mask = selector.get_support(True)
    mask_cols = list(training_features.iloc[:, mask].columns) + non_feature_columns

    assert np.array_equal(tpot_obj._select_percentile(training_testing_data, 42), training_testing_data[mask_cols])
Example #39
def percentile_k_features(df, k=20):
    X = df.drop(['SalePrice'], axis=1)
    y = df['SalePrice']
    select_p = SelectPercentile(f_regression, percentile=k)
    select_p.fit(X, y)
    d = dict(zip(X.columns, select_p.scores_))
    sorted_data = sorted(d.items(), key=lambda kv: kv[1], reverse=True)
    # keep the seven highest-scoring feature names
    features = [f for (f, v) in sorted_data[:7]]
    print(features)
    return features
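A toy sketch of calling percentile_k_features (column names here are hypothetical; the real target is a housing dataset with a SalePrice column):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
toy = pd.DataFrame(rng.rand(100, 10), columns=['f%d' % i for i in range(10)])
toy['SalePrice'] = 3 * toy['f0'] + toy['f1'] + 0.1 * rng.rand(100)
print(percentile_k_features(toy))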
Example #40
def feature_selection_regression(data):
  """This function finds the important features using mutual information regression under a percentile value.

  Input:
      data: The dataframe.
  
  Output:
      Returns the features in the top 30th percentile of importance.
  """
  X=data.drop('How are you feeling right now?',axis=1)
  y=data['How are you feeling right now?']
  select=SelectPercentile(mutual_info_regression,percentile=30)
  select.fit(X,y)

  return X.columns[select.get_support()]
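A small sketch of feature_selection_regression on a made-up DataFrame (only the long question-style column name is taken from the function above; everything else is illustrative):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
demo = pd.DataFrame(rng.rand(60, 5), columns=list('abcde'))
demo['How are you feeling right now?'] = 2 * demo['a'] + 0.1 * rng.rand(60)
print(feature_selection_regression(demo))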
Example #41
def feature_selection_linear(data):
  """This function finds the important features using mutual information regression under a percentile value. It is only for Linear_Regression.ipynb.

  Input:
      data: The dataframe.
  
  Output:
      Returns the features in the top 30th percentile of importance.
  """
  X=data.drop('On a scale of 1-100, how would you express this feeling?',axis=1)
  y=data['On a scale of 1-100, how would you express this feeling?']
  select=SelectPercentile(mutual_info_regression,percentile=30)
  select.fit(X,y)

  return X.columns[select.get_support()]
Example #42
def trainingPreprocess(words_file, authors_file):
    """
        this function takes a pre-made list of email texts (by default word_data.pkl)
        and the corresponding authors (by default email_authors.pkl) and performs
        a number of preprocessing steps:
            -- splits into training/testing sets (10% testing)
            -- vectorizes into tfidf matrix
            -- selects/keeps most helpful features

        after this, the features and labels are put into numpy arrays, which play nice with sklearn functions

        6 objects are returned:
            -- training/testing features
            -- training/testing labels
            -- a fitted vectorizer
            -- a fitted selector

    """

    ### the words (features) and authors (labels), already largely preprocessed
    ### this preprocessing will be repeated in the text learning mini-project
    authors_file_handler = open(authors_file, "r")
    authors = pickle.load(authors_file_handler)
    authors_file_handler.close()

    words_file_handler = open(words_file, "r")
    word_data = cPickle.load(words_file_handler)
    words_file_handler.close()

    ### test_size is the percentage of events assigned to the test set
    ### (remainder go into training)
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)



    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed  = vectorizer.transform(features_test)
    
    ### feature selection
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed  = selector.transform(features_test_transformed).toarray()

    return features_train_transformed, features_test_transformed, labels_train, labels_test, vectorizer, selector
Example #43
def main():

	#set the timer
	start = time.time()

	#load the data
	trainX = np.load('trainX.npy')
	testX = np.load('testX.npy')
	trainY = np.load('trainY.npy')
	testY = np.load('testY.npy')
	print('\n!!! Data Loading Completed !!!\n')

	#get the 1st digit zero and plot it
	zero = trainX[14].reshape(28, 28)
	plt.imshow(zero, cmap=cm.Greys_r)
	plt.savefig("original"+str(trainY[14])+".png")
	#plt.show()

	#apply kpca
	kpca = KernelPCA(kernel='rbf', gamma=1, fit_inverse_transform=True)
	kpca.fit(trainX[0:3000])
	trainX_kpca = kpca.transform(trainX)
	testX_kpca = kpca.transform(testX)

	#do inverse transform and plot the result
	orig = kpca.inverse_transform(trainX_kpca)
	img = orig[14].reshape(28, 28)
	plt.imshow(img, cmap=cm.Greys_r)
	plt.savefig("reconstructed"+str(trainY[14])+".png")
	#plt.show()

	selector = SelectPercentile(f_classif, percentile=5)
	selector.fit(trainX_kpca, trainY)
	trainX = selector.transform(trainX_kpca)
	testX = selector.transform(testX_kpca)

	#fit a classifier
	parameters = {'n_neighbors' : list(np.arange(15)+1)}
	clf = GridSearchCV(KNeighborsClassifier(weights='distance', n_jobs=-1), parameters)
	clf.fit(trainX, trainY)

	pred = clf.predict(testX)
	print(accuracy_score(testY, pred))
	print(confusion_matrix(testY, pred))
	#print(clf.best_params_)
	print('total : %d, correct : %d, incorrect : %d\n' %(len(pred), np.sum(pred == testY), np.sum(pred != testY)))

	print('Test Time : %f Minutes\n' %((time.time()-start)/60))
Example #44
def make_dataset(list_of_vocabs, emp_name_abs, df_without_outliers,
				 employees_w_email_dir):
    
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
    								list_of_vocabs, emp_name_abs, test_size = 0.1)

    vectorizer = TfidfVectorizer(sublinear_tf=True,
                                 stop_words='english',
                                max_df = 0.5)
    
    features_train_transformed = vectorizer.fit_transform(X_train)
    features_test_transformed = vectorizer.transform(X_test)
    
    features_names = np.array(vectorizer.get_feature_names())
    
    selector = SelectPercentile(f_classif, percentile = 0.01)
    selector.fit(features_train_transformed, y_train)

    important_features = selector.get_support(indices=False)
    
    scores = selector.scores_

    scores = scores[important_features]
    
    features_train_transformed =\
    	selector.transform(features_train_transformed)

    features_test_transformed  =\
    	selector.transform(features_test_transformed)

    features_train_transformed = features_train_transformed.toarray()
    features_test_transformed = features_test_transformed.toarray()

    features = np.concatenate((features_train_transformed,
    						   features_test_transformed))
    labels = np.concatenate((y_train, y_test))
    
    scaler = preprocessing.MinMaxScaler()
    rescaled_weight = scaler.fit_transform(features)
    
    features_of_interest = features_names[important_features]

    f_length = len(features_of_interest)

    scores_report = \
        {features_of_interest[i]: scores[i] for i in range(f_length)}
    
    return features, labels, features_of_interest, scores_report
Example #45
 def get_feature_args(self, x, y, percentile=80, k=40):
     if self.feature_selection == 'info':
         info_score = mutual_info_classif(x, y)
         self.features_to_use = np.argwhere(info_score > 0).ravel()
         if len(self.features_to_use) <= 1:
             self.features_to_use = np.argwhere(x.std(axis=0) > 0).ravel()
     elif self.feature_selection == 'percentile':
         selector = SelectPercentile(percentile=percentile)
         selector.fit(x, y)
         self.features_to_use = np.argwhere(selector.get_support()).ravel()
     elif self.feature_selection == 'kbest':
         k = np.min([int(np.ceil(percentile * x.shape[1] / 100)), k])
         selector = SelectKBest(k=k).fit(x, y)
         self.features_to_use = np.argwhere(selector.get_support()).ravel()
     else:
         self.features_to_use = np.argwhere(x.std(axis=0) > 0).ravel()
Example #46
def test_select_percentile_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the percentile heuristic
    X, y = make_regression(n_samples=200,
                           n_features=20,
                           n_informative=5,
                           shuffle=False,
                           random_state=0)

    univariate_filter = SelectPercentile(f_regression, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)
    X_r2 = GenericUnivariateSelect(f_regression, mode='percentile',
                                   param=25).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
    X_2 = X.copy()
    X_2[:, np.logical_not(support)] = 0
    assert_array_equal(X_2, univariate_filter.inverse_transform(X_r))
    # Check inverse_transform respects dtype
    assert_array_equal(X_2.astype(bool),
                       univariate_filter.inverse_transform(X_r.astype(bool)))
Example #47
 def test_select_percentile_float(self):
     model = SelectPercentile()
     X = np.array([[1, 2, 3, 1], [0, 3, 1, 4], [3, 5, 6, 1], [1, 2, 1, 5]])
     y = np.array([0, 1, 0, 1])
     model.fit(X, y)
     model_onnx = convert_sklearn(
         model, 'select percentile',
         [('input', FloatTensorType([1, X.shape[1]]))])
     self.assertTrue(model_onnx is not None)
     dump_data_and_model(
         X,
         model,
         model_onnx,
         basename="SklearnSelectPercentile",
         allow_failure=
         "StrictVersion(onnxruntime.__version__) <= StrictVersion('0.1.4')")
Example #48
def test_select_percentile_classif():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the percentile heuristic
    X, y = make_classification(n_samples=200,
                               n_features=20,
                               n_informative=3,
                               n_redundant=2,
                               n_repeated=0,
                               n_classes=8,
                               n_clusters_per_class=1,
                               flip_y=0.0,
                               class_sep=10,
                               shuffle=False,
                               random_state=0)

    univariate_filter = SelectPercentile(f_classif, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(f_classif, mode='percentile',
                                   param=25).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
Example #49
def test_select_percentile_classif_sparse():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the percentile heuristic
    X, y = make_classification(n_samples=200,
                               n_features=20,
                               n_informative=3,
                               n_redundant=2,
                               n_repeated=0,
                               n_classes=8,
                               n_clusters_per_class=1,
                               flip_y=0.0,
                               class_sep=10,
                               shuffle=False,
                               random_state=0)
    X = sparse.csr_matrix(X)
    univariate_filter = SelectPercentile(f_classif, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(f_classif, mode='percentile',
                                   param=25).fit(X, y).transform(X)
    assert_array_equal(X_r.toarray(), X_r2.toarray())
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)

    X_r2inv = univariate_filter.inverse_transform(X_r2)
    assert_true(sparse.issparse(X_r2inv))
    support_mask = safe_mask(X_r2inv, support)
    assert_equal(X_r2inv.shape, X.shape)
    assert_array_equal(X_r2inv[:, support_mask].toarray(), X_r.toarray())
    # Check other columns are empty
    assert_equal(X_r2inv.getnnz(), X_r.getnnz())
Example #50
def automatic_feature_selection_select_percentile(data):
    #print data
    x_train, x_test, y_train, y_test = train_test_split(data.drop(
        ['target', 'bi_gram_score', 'type', 'source', 'curID', 'rawClue'],
        axis=1),
                                                        data.target_label,
                                                        random_state=0,
                                                        test_size=0.5)
    select = SelectPercentile(percentile=50)
    select.fit(x_train, y_train)
    x_train_selected = select.transform(x_train)
    for k, v in enumerate(select.get_support()):
        if v:
            print(x_train.columns[k])

    print(x_train_selected.shape)
Example #51
def get_top_chi2_candidate_ngrams(queries, f_extractor, percentile):
    """Get top ngrams features according to chi2.
    """
    ngrams_dict = dict()
    features, labels = construct_examples(queries, f_extractor)
    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(labels)
    vec = DictVectorizer(sparse=True)
    X = vec.fit_transform(features)
    # ch2 = SelectKBest(chi2, k=n_features)
    ch2 = SelectPercentile(chi2, percentile=percentile)
    ch2.fit(X, labels)
    indices = ch2.get_support(indices=True)
    for i in indices:
        ngrams_dict[vec.feature_names_[i]] = 1
    return ngrams_dict
Example #52
def percentile_k_features(data,k=20):


    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]

    selector = SelectPercentile(f_regression, percentile=k)
    selector.fit(X, y)
    # names of the columns kept by the selector
    selected = X.columns[selector.get_support()].tolist()
    return selected
Example #53
def _feature_selection(matrix, method="PCA", target=None):
    print("--Selecting features with {} ".format(method))
    target_components = 300
    if method == "PCA":
        from sklearn.decomposition import PCA
        pca = PCA(n_components=target_components)
        reduced_matrix = pca.fit_transform(matrix)
    if method == "SVD":
        from sklearn.decomposition import TruncatedSVD
        lsa = TruncatedSVD(n_components=target_components)
        reduced_matrix = lsa.fit_transform(matrix)
    if method == "SelectKBest":
        if target is None:
            raise Exception("No target found on supervised _feature_selection")
        from sklearn.feature_selection import SelectKBest, chi2
        X, y = matrix, target
        reduced_matrix = SelectKBest(chi2,
                                     k=target_components).fit_transform(X, y)
    if method == "LinearSVC":
        if target is None:
            raise Exception("No target found on supervised _feature_selection")
        from sklearn.svm import LinearSVC
        from sklearn.feature_selection import SelectFromModel
        X, y = matrix, target
        lsvc = LinearSVC(C=0.5, penalty="l1", dual=False).fit(X, y)
        model = SelectFromModel(lsvc, prefit=True)
        reduced_matrix = model.transform(X)
    if method == "SelectPercentile":
        from sklearn.feature_selection import SelectPercentile, f_classif
        X, y = matrix, target
        selector = SelectPercentile(f_classif, percentile=10)
        reduced_matrix = selector.fit(X, y)
    return reduced_matrix
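# Hedged usage sketch for _feature_selection; the toy matrix and labels are
# synthetic, and target_components=300 only matters for the PCA/SVD/SelectKBest
# branches, which would need a much wider matrix than this one.
import numpy as np

toy_X = np.abs(np.random.RandomState(0).randn(40, 12))  # non-negative, so the chi2 branch would also work
toy_y = np.random.RandomState(1).randint(0, 2, size=40)
reduced = _feature_selection(toy_X, method="SelectPercentile", target=toy_y)
print(reduced.shape)  # (40, 1): percentile=10 keeps one of the twelve columns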
Example #54
0
class Feature_Selection:
    def __init__(self, n_features, problem_type, scoring):
        valid_scoring = dict()
        if problem_type == 'classification':
            valid_scoring['f_classif'] = f_classif
            valid_scoring['chi2'] = chi2
            valid_scoring['mutual_info_classif'] = mutual_info_classif
        else:
            valid_scoring['f_regression'] = f_regression
            valid_scoring['mutual_info_regression'] = mutual_info_regression

        if scoring not in valid_scoring:
            raise Exception('Invalid Scoring Type')

        if isinstance(n_features, int):
            self.selection = SelectKBest(valid_scoring[scoring], k=n_features)
        elif isinstance(n_features, float):
            self.selection = SelectPercentile(valid_scoring[scoring],
                                              percentile=int(100 * n_features))
        else:
            raise Exception('Invalid Type of Feature')

    def fit(self, X, y):
        return self.selection.fit(X, y)

    def transform(self, X):
        return self.selection.transform(X)

    def fit_transform(self, X, y):
        return self.selection.fit_transform(X, y)
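# Hedged usage sketch for the class above; it assumes the sklearn scorers and
# selectors referenced in __init__ are importable, so they are pulled in here.
# An int n_features picks SelectKBest, a float in (0, 1] picks SelectPercentile.
from sklearn.datasets import make_classification
from sklearn.feature_selection import (SelectKBest, SelectPercentile, chi2,
                                       f_classif, f_regression,
                                       mutual_info_classif, mutual_info_regression)

X_demo, y_demo = make_classification(n_samples=100, n_features=10, random_state=0)

fs_kbest = Feature_Selection(n_features=5, problem_type='classification', scoring='f_classif')
print(fs_kbest.fit_transform(X_demo, y_demo).shape)  # (100, 5)

fs_pct = Feature_Selection(n_features=0.3, problem_type='classification', scoring='f_classif')
print(fs_pct.fit_transform(X_demo, y_demo).shape)    # (100, 3)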
def test_select_percentile_classif_sparse():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the percentile heuristic
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )
    X = sparse.csr_matrix(X)
    univariate_filter = SelectPercentile(f_classif, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(f_classif, mode="percentile", param=25).fit(X, y).transform(X)
    assert_array_equal(X_r.toarray(), X_r2.toarray())
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)

    X_r2inv = univariate_filter.inverse_transform(X_r2)
    assert_true(sparse.issparse(X_r2inv))
    support_mask = safe_mask(X_r2inv, support)
    assert_equal(X_r2inv.shape, X.shape)
    assert_array_equal(X_r2inv[:, support_mask].toarray(), X_r.toarray())
    # Check other columns are empty
    assert_equal(X_r2inv.getnnz(), X_r.getnnz())
def test_select_percentile_classif():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the percentile heuristic
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    univariate_filter = SelectPercentile(f_classif, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(f_classif, mode="percentile", param=25).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
def test_select_percentile_regression():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple regression problem
    with the percentile heuristic
    """
    X, y = make_regression(n_samples=200, n_features=20,
                           n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectPercentile(f_regression, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)
    X_r2 = GenericUnivariateSelect(
        f_regression, mode='percentile', param=25).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
    X_2 = X.copy()
    X_2[:, np.logical_not(support)] = 0
    assert_array_equal(X_2, univariate_filter.inverse_transform(X_r))
    # Check inverse_transform respects dtype
    assert_array_equal(X_2.astype(bool),
                       univariate_filter.inverse_transform(X_r.astype(bool)))
def get_combined_separate_fsets(feature_sets, fs_fn='pct', ptile=10, nFeatures=5, score_fn=f_classif):
    df_lst = []
    for fset_name, df in feature_sets.items():
        X_train = df[df.partition == 'train'].drop(['partition', 'fatality_ind'], axis=1)
        y_train = df[df.partition == 'train'].fatality_ind
        df_X = df.drop(['partition', 'fatality_ind'], axis=1)
        if fs_fn == 'pct':
            featureSelector = SelectPercentile(score_func=score_fn, percentile=ptile)
        else:
            featureSelector = SelectKBest(score_func=score_fn, k=nFeatures)
        featureSelector.fit(X_train, y_train)
        fs = featureSelector.transform(df.drop(['partition', 'fatality_ind'], axis=1))
        cols_fs = df_X.columns[list(featureSelector.get_support(indices=True))]
        cols_fs_ref = [fset_name + ' ' + c for c in cols_fs]
        df_fs = pd.DataFrame(fs, index=df_X.index, columns=cols_fs_ref)
        df_lst.append(df_fs)
    df_comb = df[['partition', 'fatality_ind']].join(pd.concat(df_lst, axis=1))
    return df_comb
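# Hedged usage sketch: get_combined_separate_fsets expects a dict of DataFrames
# that each carry a 'partition' column ('train'/'test') and a 'fatality_ind'
# label; the frames below are synthetic stand-ins for the real feature sets.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)

def _toy_fset(n=50, p=6):
    frame = pd.DataFrame(rng.randn(n, p), columns=['f%d' % i for i in range(p)])
    frame['partition'] = ['train'] * (n - 10) + ['test'] * 10
    frame['fatality_ind'] = rng.randint(0, 2, size=n)
    return frame

toy_sets = {'setA': _toy_fset(), 'setB': _toy_fset()}
combined = get_combined_separate_fsets(toy_sets, fs_fn='pct', ptile=50)
print(combined.shape)  # two label columns plus the 50% of columns kept from each set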
Example #60
0
def feature_transform(features_train, features_test, labels_train, top_percent=1):
    """ Apply a Bag of Words feature creator with TfIdf statistic
        normalisation. The inputs are the train and test text, the training
        labels (needed to fit the feature selector), and an optional
        'top_percent' parameter giving what percentage of the super
        high-dimensional text feature space to keep (default is 1%).
        The output is the transformed train and test feature vectors,
        suitable for use with sklearn classifiers.
    """
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed  = vectorizer.transform(features_test)
    
    ### Feature selection, because text is super high dimensional
    selector = SelectPercentile(f_classif, percentile=top_percent)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed  = selector.transform(features_test_transformed).toarray()
    return features_train_transformed, features_test_transformed
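# Hedged usage sketch with made-up documents and labels; in the original
# project these would come from the email preprocessing step shown elsewhere.
docs_train = ["free money offer now", "meeting agenda attached",
              "win a big prize today", "project status update"]
docs_test = ["claim your free prize", "agenda for the status meeting"]
toy_labels = [1, 0, 1, 0]
train_vecs, test_vecs = feature_transform(docs_train, docs_test, toy_labels, top_percent=50)
print(train_vecs.shape)
print(test_vecs.shape)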