Example #1
0
def RFE_selector(estimator, n_features_to_select, X_data, Y_data):
    """Reduce ``X_data`` to the columns kept by recursive feature elimination.

    Args:
        estimator: estimator handed to ``RFE`` to rank the features.
        n_features_to_select: number of columns ``RFE`` should keep.
        X_data: DataFrame of candidate features (its ``columns`` are used
            to label the result).
        Y_data: target passed to ``RFE.fit_transform``.

    Returns:
        A new DataFrame holding only the selected columns, labelled with
        their original column names.
    """
    selector = RFE(estimator=estimator,
                   n_features_to_select=n_features_to_select)
    # Fit exactly once: the original fitted the selector a second time to
    # build the frame, which doubled the cost and, for a non-deterministic
    # estimator, could pair the labels of one fit with the data of another.
    reduced = selector.fit_transform(X_data, Y_data)
    columns = X_data.columns
    labels = [columns[i] for i in selector.get_support(indices=True)]
    return pd.DataFrame(reduced, columns=labels)
Example #2
0
def ref1(X, y, features_name):
    """Order feature names by their RFE rank using a linear SVM.

    A ``LinearSVC(random_state=1)`` is wrapped in an ``RFE`` that
    eliminates features until one is left; the resulting ``ranking_`` is
    paired positionally with ``features_name``.

    Args:
        X: feature matrix given to ``RFE.fit_transform``.
        y: target labels.
        features_name: sliceable sequence of names, one per column of ``X``.

    Returns:
        List of names sorted by ascending rank (rank 1 first).
    """
    svm = LinearSVC(random_state=1)
    eliminator = RFE(estimator=svm, n_features_to_select=1)
    eliminator.fit_transform(X, y)
    rounded_ranks = (round(rank, 4) for rank in eliminator.ranking_)
    ordered = sorted(zip(rounded_ranks, features_name[:]))
    return [name for _, name in ordered]
Example #3
0
    def rfe(self, frame):
        """Pick the feature subset whose linear model scores best on a hold-out.

        Every subset size from 1 to the number of feature columns is tried:
        RFE (with ``LinearRegression``) selects that many features on a
        70/30 train/test split (``random_state=0``), a linear model is fitted
        on the reduced training data and scored on the reduced test data.
        The best-scoring size is then used for a final RFE on all rows.

        Note: when ``self.class_col`` has dtype ``object`` it is replaced
        in ``frame`` (in place) by its category codes.

        Args:
            frame: DataFrame containing the feature columns and
                ``self.class_col``.

        Returns:
            List of the selected feature column names.
        """
        if frame[self.class_col].dtype == "object":
            frame[self.class_col] = frame[self.class_col].astype('category')
            frame[self.class_col] = frame[self.class_col].cat.codes

        Y = frame[[self.class_col]].fillna(0)
        X = frame.drop(columns=[self.class_col])

        # The split is seeded, so it is identical for every candidate size;
        # compute it once instead of once per iteration.
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            Y,
                                                            test_size=0.3,
                                                            random_state=0)
        y_train_flat = y_train.values.ravel()

        # Start from -inf rather than 0: R^2 can be negative, and a zero
        # baseline would leave nof == 0, which RFE cannot select.
        high_score = float("-inf")
        nof = 0
        for n_features in range(1, len(X.columns) + 1):
            model = LinearRegression()
            selector = RFE(model, n_features_to_select=n_features)
            X_train_rfe = selector.fit_transform(X_train, y_train_flat)
            X_test_rfe = selector.transform(X_test)
            model.fit(X_train_rfe, y_train)
            score = model.score(X_test_rfe, y_test)
            if score > high_score:
                high_score = score
                nof = n_features

        selector = RFE(LinearRegression(), n_features_to_select=nof)
        selector.fit(X, Y.values.ravel())
        return [col for col, kept in zip(X.columns, selector.support_) if kept]
Example #4
0
    def Recursive_Feature_Elimination(self, X_train, X_test, y_train, y_test, x, y, file_name = 'model.sav'):
        """Search for the best RFE subset size, then persist model and features.

        For each size from 1 to ``len(x.columns) - 1`` an RFE with
        ``LinearRegression`` is fitted on the training data and the reduced
        model is scored on the test data. The best size is used for a final
        RFE + linear model on ``x``/``y``; that model is pickled to
        ``file_name`` and the selected column names are written, one per
        line, to ``parameters_selection.txt``.

        Args:
            X_train, X_test, y_train, y_test: data used for the size search.
            x: DataFrame of all features (its columns name the result).
            y: target for the final fit.
            file_name: path the fitted linear model is pickled to.

        Returns:
            pandas Index with the names of the selected columns.
        """
        # -inf instead of 0 so that a best score <= 0 still yields a valid
        # (non-zero) feature count for the final RFE.
        high_score = float("-inf")
        nof = 0
        for n_features in range(1, len(x.columns)):
            model = LinearRegression()
            # n_features_to_select is keyword-only in scikit-learn >= 1.0.
            rfe = RFE(model, n_features_to_select=n_features)
            X_train_rfe = rfe.fit_transform(X_train, y_train)
            X_test_rfe = rfe.transform(X_test)
            model.fit(X_train_rfe, y_train)
            score = model.score(X_test_rfe, y_test)
            if score > high_score:
                high_score = score
                nof = n_features

        print("Optimum number of features: %d with score: %f" % (nof, high_score))

        model = LinearRegression()
        rfe = RFE(model, n_features_to_select=nof)
        X_rfe = rfe.fit_transform(x, y)
        model.fit(X_rfe, y)
        support = pd.Series(rfe.support_, index=list(x.columns))
        selected_features_rfe = support[support].index

        # Context manager so the pickle file handle is always closed.
        with open(file_name, 'wb') as model_file:
            pickle.dump(model, model_file)

        with open('parameters_selection.txt', 'w') as f:
            for item in selected_features_rfe:
                f.write("%s\n" % item)

        return selected_features_rfe
Example #5
0
def clustering_rfp(cluster_range, RFE_component_diabetes, dataset, dir):
    """Cluster RFE-reduced data with k-means and EM, then plot time/accuracy.

    The last column of ``dataset.data`` is used as the integer label and
    the remaining columns, standard-scaled, as features. For every
    feature count in ``RFE_component_diabetes`` the features are reduced
    with RFE (linear ``SVR``), and for every cluster count in
    ``cluster_range`` both ``KMeans`` and ``GaussianMixture`` are run and
    scored with ``common_utils.get_cluster_accuracy``. ``nn_experiment`` is
    also invoked once per feature count on the reduced dataset.

    Side effects: rebinds the module globals ``diabetes_rp``, ``x_rp`` and
    ``diabetes_dataset_rp``; overwrites ``dataset.x`` / ``dataset.y``
    (``diabetes_dataset_rp`` is the same object as ``dataset``); produces
    four plots through ``common_utils``.

    Args:
        cluster_range: iterable of cluster counts.
        RFE_component_diabetes: iterable of feature counts for RFE.
        dataset: object exposing a ``data`` DataFrame.
        dir: output location forwarded to the plotting helpers.
    """
    global diabetes_rp, x_rp, diabetes_dataset_rp
    df = dataset.data
    x = StandardScaler().fit_transform(df.iloc[:, 0:-1])
    y = df.iloc[:, -1].astype('int')

    NN_RFE_accuracy = defaultdict(dict)
    estimator = SVR(kernel="linear")
    kmeans_accuracy_RFE = defaultdict(dict)
    kmeans_time_RFE = defaultdict(dict)
    em_accuracy_RFE = defaultdict(dict)
    em_time_RFE = defaultdict(dict)
    for RFE_comp in RFE_component_diabetes:
        # The original fitted two identical RFE selectors on the same
        # data; one fit now feeds both the clustering and the globals.
        diabetes_rp = RFE(estimator, n_features_to_select=RFE_comp)
        x_rp = diabetes_rp.fit_transform(x, y)
        diabetes_data_RFE_df = pd.DataFrame(data=x_rp)

        diabetes_dataset_rp = dataset
        diabetes_dataset_rp.x = x_rp
        diabetes_dataset_rp.y = y

        for cluster in cluster_range:
            # K-means (the timed span includes the accuracy computation).
            start = datetime.now()
            myk_mean_RFE_prediction = KMeans(
                n_clusters=cluster,
                random_state=0).fit_predict(diabetes_data_RFE_df)
            kmeans_accuracy_for_k = common_utils.get_cluster_accuracy(
                y, myk_mean_RFE_prediction)
            end = datetime.now()

            kmeans_accuracy_RFE[RFE_comp][cluster] = kmeans_accuracy_for_k
            kmeans_time_RFE[RFE_comp][cluster] = (end - start).total_seconds()

            # Expectation-maximisation (Gaussian mixture).
            start = datetime.now()
            em_pca_prediction_y = GaussianMixture(n_components=cluster).fit(
                diabetes_data_RFE_df).predict(diabetes_data_RFE_df)
            em_pca_accuracy_for_k = common_utils.get_cluster_accuracy(
                y, em_pca_prediction_y)
            end = datetime.now()

            em_accuracy_RFE[RFE_comp][cluster] = em_pca_accuracy_for_k
            em_time_RFE[RFE_comp][cluster] = (end - start).total_seconds()

        # The result is collected but not used further in this function.
        NN_RFE_accuracy[RFE_comp] = nn_experiment(diabetes_dataset_rp)
    common_utils.plot_feature_transformation_time(
        kmeans_time_RFE, "k-means RFE clusters vs time", dir)
    common_utils.plot_feature_transformation_accuracy(
        kmeans_accuracy_RFE, "k-means RFE clusters vs accuracy", dir)
    common_utils.plot_feature_transformation_time(em_time_RFE,
                                                  "EM RFE clusters vs time",
                                                  dir)
    common_utils.plot_feature_transformation_accuracy(
        em_accuracy_RFE, "EM RFE clusters vs accuracy", dir)
Example #6
0
def RFE():
    """Print RFE diagnostics for the module-level ``X`` / ``y`` data.

    First prints the support mask and ranking of a 5-feature RFE with
    ``LinearRegression``. Then, on a fixed 75/25 split
    (``random_state=100``), tries subset sizes 1..12 and prints the size
    whose reduced linear model scores best on the test part, together with
    that score.

    Reads the globals ``X`` and ``y``; returns nothing.
    """
    # Local import: inside this function ``RFE`` refers to the sklearn
    # class, not to this function.
    from sklearn.feature_selection import RFE

    # LinearRegression takes no data in its constructor; the original
    # ``LinearRegression(X, y)`` passed the data as hyper-parameters.
    model = LinearRegression()
    # n_features_to_select is keyword-only in scikit-learn >= 1.0.
    rfe = RFE(model, n_features_to_select=5)
    rfe.fit(X, y)
    print(rfe.support_)
    print(rfe.ranking_)

    # Seeded split is the same for every candidate size; do it once.
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=100)
    # -inf so that the best size is reported even if every score is <= 0.
    high_score = float("-inf")
    nof = 0
    for n_features in range(1, 13):
        model = LinearRegression()
        rfe = RFE(model, n_features_to_select=n_features)
        X_train_rfe = rfe.fit_transform(X_train, y_train)
        X_test_rfe = rfe.transform(X_test)
        model.fit(X_train_rfe, y_train)
        score = model.score(X_test_rfe, y_test)
        if score > high_score:
            high_score = score
            nof = n_features
    print("Optimum number of features: %d" % nof)
    print("Score with %d features: %f" % (nof, high_score))
def rfe(predictors, target, number_of_features):
    '''
    Return the names of the columns kept by recursive feature elimination.

    An RFE selector driven by ``LinearRegression`` is fitted on
    ``predictors`` / ``target`` and asked to keep ``number_of_features``
    columns.

    Args:
        predictors: DataFrame of candidate features.
        target: target variable passed to ``RFE.fit``.
        number_of_features: how many columns RFE should keep.

    Returns:
        List of the selected column names, in their original order.
    '''
    # n_features_to_select must be passed by keyword on scikit-learn >= 1.0.
    selector = RFE(LinearRegression(), n_features_to_select=number_of_features)
    selector.fit(predictors, target)

    # support_ is a boolean mask over the columns; positional (iloc)
    # indexing applies it regardless of the column labels.
    return predictors.iloc[:, selector.support_].columns.tolist()
Example #8
0
def FeatureSelectTune():
    """Scan RFE subset sizes with AdaBoost and print the evaluation results.

    For every size from ``len(features_list) - 1`` down to 2, an RFE with
    an ``AdaBoostClassifier`` is fitted on the module-level
    ``features_train`` / ``labels_train``. The names of the rank-1
    features (taken from ``features_list[1:]``) are prefixed with
    ``'poi'`` and passed, with the fitted inner estimator and
    ``my_dataset``, to ``test_classifier``. When that returns a truthy
    value, ``(size, result[2])`` is recorded.

    Prints the size, each feature list, the last (name, rank) pairing and
    the collected results; returns nothing.
    """
    f_size = len(features_list)
    results = []
    # Bound up-front so the final report is safe when the loop never runs.
    new_feature_zip = []

    for i in range(f_size-1, 1, -1):
        print(i)
        clf = AdaBoostClassifier(learning_rate=0.3,
                                 n_estimators=100,
                                 random_state=22)
        scan = RFE(estimator=clf, n_features_to_select=i)
        scan.fit(features_train, labels_train)

        # Materialise the pairs: on Python 3 a bare zip is a one-shot
        # iterator, so it would be empty (yet truthy) when printed below.
        new_feature_zip = list(zip(features_list[1:], scan.ranking_))
        new_features_list = ['poi'] + \
                            [name for name, rank in new_feature_zip if rank == 1]

        pprint.pprint(new_features_list)
        clf = scan.estimator_
        pred = test_classifier(clf, my_dataset, new_features_list)
        if pred:
            results.append((i, pred[2]))

    if new_feature_zip:
        pprint.pprint(new_feature_zip)
    pprint.pprint(results)
    def rfe_function(self, df):
        """Return ``df`` restricted to the ``self.k`` features chosen by RFE.

        All columns but the last are treated as features and the last
        column as the target. RFE is driven by a heavily regularised
        ``Ridge`` model.

        Args:
            df: DataFrame whose last column is the target.

        Returns:
            DataFrame with only the selected feature columns (original
            column order).
        """
        k = self.k

        X = df.iloc[:, :-1]
        y = df.iloc[:, -1]

        # NOTE(review): the ``normalize`` argument was removed from Ridge in
        # scikit-learn 1.2 - confirm the installed version supports it.
        lr = Ridge(alpha=100000,
                   fit_intercept=True,
                   normalize=True,
                   copy_X=True,
                   max_iter=1500,
                   tol=1e-4,
                   solver='auto')
        rfe = RFE(estimator=lr, n_features_to_select=k)
        rfe.fit(X, y)

        # Selected features carry rank 1. The original sorted ranking_ in
        # descending order and took the first k, i.e. the features RFE
        # eliminated earliest; use the support mask instead.
        co_list = X.columns[rfe.support_].to_list()

        print('Columns after selections are', co_list)

        return df[co_list]
Example #10
0
def select_rfe(X, y, k):
    """Return the names of the ``k`` columns kept by RFE with linear regression.

    Args:
        X: DataFrame of candidate features.
        y: target passed to ``RFE.fit``.
        k: number of features to keep.

    Returns:
        List of selected column names, in their original order.
    """
    # n_features_to_select is keyword-only on scikit-learn >= 1.0; the
    # positional form raises TypeError there.
    selector = RFE(LinearRegression(), n_features_to_select=k)
    selector.fit(X, y)
    return X.loc[:, selector.support_].columns.tolist()
Example #11
0
def ref5(X, y, features_name):
    """Order feature names by RFE rank using Complement Naive Bayes.

    ``X`` is min-max scaled first, then an ``RFE`` around ``ComplementNB``
    eliminates features until one is left. The resulting ``ranking_`` is
    paired positionally with ``features_name``.

    Args:
        X: feature matrix.
        y: target labels.
        features_name: sliceable sequence of names, one per column of ``X``.

    Returns:
        List of names sorted by ascending rank (rank 1 first).
    """
    scaled = MinMaxScaler().fit_transform(X)
    eliminator = RFE(estimator=ComplementNB(), n_features_to_select=1)
    eliminator.fit_transform(scaled, y)
    rounded_ranks = (round(rank, 4) for rank in eliminator.ranking_)
    ordered = sorted(zip(rounded_ranks, features_name[:]))
    return [name for _, name in ordered]
def feature_selection(df, target):
    """Return the share of numeric columns in the best RFE subset.

    Columns whose type, as reported by ``Ptype().predicted_types``, is
    ``'int'`` or ``'float'`` are standard-scaled. With ``target`` as the
    response and the remaining numeric columns as predictors, every subset
    size is tried on a fixed 70/30 split (``random_state=0``): RFE with
    ``LinearRegression`` picks the features and the reduced linear model
    is scored on the test part.

    Args:
        df: input DataFrame.
        target: name of the response column (must be among the numeric
            columns).

    Returns:
        ``best_size / number_of_numeric_columns`` (the denominator
        includes the target column).
    """
    ptype = Ptype()
    ptype.run_inference(df)
    predicted = ptype.predicted_types
    features = [key for key in predicted
                if predicted[key] in ('int', 'float')]

    scaled = StandardScaler().fit_transform(df.loc[:, features].values)
    x = pd.DataFrame(scaled, columns=features)

    # ``drop(target, 1)`` relied on a positional axis that pandas >= 2.0
    # no longer accepts.
    X = x.drop(columns=target)  # Feature Matrix
    y = x[target]  # Target Variable

    # The seeded split does not depend on the subset size; compute once.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)

    # -inf baseline so the best size is found even when no score is > 0.
    high_score = float("-inf")
    nof = 0
    for n_features in range(1, len(features)):
        model = LinearRegression()
        # n_features_to_select is keyword-only on scikit-learn >= 1.0.
        rfe = RFE(model, n_features_to_select=n_features)
        X_train_rfe = rfe.fit_transform(X_train, y_train)
        X_test_rfe = rfe.transform(X_test)
        model.fit(X_train_rfe, y_train)
        score = model.score(X_test_rfe, y_test)
        if score > high_score:
            high_score = score
            nof = n_features

    # The original refitted RFE on all rows here but never used the
    # result; only the best size feeds the returned measure.
    quality_measure = nof / len(features)
    return quality_measure
Example #13
0
def extract_feature():
    """Print the RFE support mask and ranking for a fixed Excel sheet.

    Reads the workbook, uses the hard-coded column labels as predictors
    and column ``1`` as the response, fits a 2-feature RFE driven by
    ``LinearRegression`` and prints ``support_`` and ``ranking_``.
    """
    table = pd.read_excel('fuck1')
    predictor_labels = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 18]
    predictors = table[predictor_labels]
    response = table[[1]]
    selector = RFE(estimator=LinearRegression(), n_features_to_select=2)
    selector.fit_transform(predictors, response)
    for attribute in (selector.support_, selector.ranking_):
        print(attribute)
def lin_svc_dir(finC_x, finC_y, finT_x, finT_y):
    """Cross-validate a LinearSVC on RFE-reduced, concatenated C/T features.

    For each feature count in the module-level ``features_num``: the C and
    T data sets are each split 90/10, RFE (``LinearSVC``, ``step=0.1``)
    reduces each training part, the two reduced matrices are concatenated
    column-wise, and a 10-fold CV is run in which the best accuracy over
    ``cost_range`` (``num_costs`` values of ``C``) is taken per fold.

    Args:
        finC_x, finC_y: features and labels of the C data set.
        finT_x, finT_y: features and labels of the T data set.

    Returns:
        List with the mean CV accuracy for each entry of ``features_num``.
    """
    K = 10
    kf = KFold(n_splits=K)

    best_acc = []

    for num in features_num:

        print('selected num of features: ', num)

        dataC_x_train, _, dataC_y_train, _ = train_test_split(
            finC_x, finC_y, test_size=0.1)
        dataT_x_train, _, dataT_y_train, _ = train_test_split(
            finT_x, finT_y, test_size=0.1)

        # n_features_to_select is keyword-only on scikit-learn >= 1.0.
        selector_c = RFE(LinearSVC(), n_features_to_select=num, step=0.1)
        new_x_c = selector_c.fit_transform(dataC_x_train,
                                           np.ravel(dataC_y_train))

        selector_t = RFE(LinearSVC(), n_features_to_select=num, step=0.1)
        new_x_t = selector_t.fit_transform(dataT_x_train,
                                           np.ravel(dataT_y_train))

        # NOTE(review): the C and T splits are shuffled independently, so
        # row i of the two halves need not be the same sample - confirm.
        new_x = pd.concat([pd.DataFrame(new_x_c), pd.DataFrame(new_x_t)],
                          axis=1)
        features = new_x.values
        # Labels must come from the same (shuffled) training split as the
        # rows of new_x; the original indexed the unsplit finC_y instead.
        labels = np.ravel(dataC_y_train)

        accur_list = []
        for train_index, test_index in kf.split(features):
            data_x_train, data_x_test = features[train_index], features[test_index]
            data_y_train, data_y_test = labels[train_index], labels[test_index]

            accur = np.zeros(num_costs)
            for i in range(num_costs):
                model = LinearSVC(C=cost_range[i])
                model.fit(data_x_train, data_y_train)
                pred = model.predict(data_x_test)
                accur[i] = accuracy_score(data_y_test, pred)

            accur_list.append(np.max(accur))

        cv_accur = sum(accur_list) / K
        cv_sd = np.std(accur_list)

        print('Accuracy = ', cv_accur, 'std = ', cv_sd)

        # Record one accuracy per feature count; the original appended
        # after the loop and so kept only the last one.
        best_acc.append(cv_accur)

    return best_acc
 def rfe_select(self):
     """Return the names of the features selected by RFE with a linear SVC.

     RFE is fitted on ``self.X_std`` / ``self.y`` and keeps
     ``self.select_feature_num`` features. Names from
     ``self.feature_names`` are ordered by ascending rank and the first
     ``self.select_feature_num`` of them are returned.
     """
     # A linear-kernel SVC is used here; other linear classifiers work as
     # well, and a regressor should be used for regression problems.
     keep = self.select_feature_num
     selector = RFE(estimator=LinearSVC(), n_features_to_select=keep)
     selector.fit_transform(self.X_std, self.y)
     # Alternatively, selector.get_support() gives the selection directly.
     rank_by_name = dict(zip(self.feature_names, selector.ranking_))
     names_by_rank = sorted(rank_by_name, key=rank_by_name.get)
     return names_by_rank[:keep]
Example #16
0
def ref_(file):
    """Rank the features of a CSV file with RFE over Complement Naive Bayes.

    Columns containing missing values are dropped. The first remaining
    column is the label and the others are min-max scaled features.
    Earlier variants of this routine also ranked with LinearSVC,
    LogisticRegression, RandomForest and GradientBoosting; those are
    disabled, and their result slots are returned as empty lists.

    Args:
        file: path (or buffer) accepted by ``pandas.read_csv``.

    Returns:
        5-tuple of lists; the first four are empty, the fifth holds the
        feature names sorted by ascending RFE rank.
    """
    start1 = time.time()  # timestamp retained from the original; unused
    frame = pd.read_csv(file, engine='python').dropna(axis=1)
    features_name = frame.columns.values.tolist()
    values = np.array(frame)
    y = values[:, 0]
    X = MinMaxScaler().fit_transform(values[:, 1:])

    eliminator = RFE(estimator=ComplementNB(), n_features_to_select=1)
    eliminator.fit_transform(X, y)
    rounded_ranks = (round(rank, 4) for rank in eliminator.ranking_)
    ordered = sorted(zip(rounded_ranks, features_name[1:]))
    result5 = [name for _, name in ordered]

    return ([], [], [], [], result5,)
Example #17
0
def choice_feature_nums(data_x, data_y, col_name):
    """Plot and return the RFE score for each candidate feature count.

    For every count from 3 to ``len(col_name)`` an RFE with
    ``LinearRegression`` is fitted on the data and scored on that same
    data (so the scores are training scores).

    Args:
        data_x: feature matrix.
        data_y: target.
        col_name: sequence whose length gives the maximum feature count.

    Returns:
        Dict mapping feature count to score.
    """
    dic = {}
    for i in range(3, len(col_name) + 1):
        # RFE needs an estimator *instance*; the original passed the class.
        rfe = RFE(estimator=LinearRegression(), n_features_to_select=i)
        rfe.fit(data_x, data_y)
        # RFE.score requires the data to score on.
        dic[i] = rfe.score(data_x, data_y)
    plt.xlabel('feature_num')
    plt.ylabel('score')
    # Lists rather than dict views, which matplotlib may not accept.
    plt.plot(list(dic.keys()), list(dic.values()))
    plt.show()
    return dic
    def fit(self, X, y):
        '''
        Find the best RFE subset and store its column names.

        Subset sizes from 1 to ``X.shape[1] - 1`` are compared on a single
        train/test split (``test_size=self.test_size``): RFE with
        ``LinearRegression`` picks the features and the reduced linear
        model is scored on the test part. A final RFE with the best size
        is fitted on all rows and the names of the kept columns are stored
        in ``self.relevant_features`` (numpy array).

        Inputs:
        -------
        X: a dataframe
        y: a series
        '''
        # One split shared by all candidates: the original drew a new
        # unseeded split per size, so the scores were not comparable.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.test_size)

        # -inf baseline so a best score <= 0 still yields a valid size.
        high_score = float("-inf")
        nof = 0
        for n_features in range(1, X.shape[1]):
            model = LinearRegression()
            # n_features_to_select is keyword-only on scikit-learn >= 1.0.
            rfe = RFE(model, n_features_to_select=n_features)
            X_train_rfe = rfe.fit_transform(X_train, y_train)
            X_test_rfe = rfe.transform(X_test)
            model.fit(X_train_rfe, y_train)
            score = model.score(X_test_rfe, y_test)
            if score > high_score:
                high_score = score
                nof = n_features

        print("Optimum number of features: %d" %nof)
        print("Score with %d features: %f" % (nof, high_score))

        rfe = RFE(LinearRegression(), n_features_to_select=nof)
        rfe.fit(X, y)
        support = pd.Series(rfe.support_, index=list(X.columns))
        selected_features_rfe = support[support].index

        self.relevant_features = selected_features_rfe.values
def rfe_function(data,y_col,k):
    """Return the names of the ``k`` features selected by RFE with Ridge.

    Args:
        data: DataFrame holding the features and the target column.
        y_col: name of the target column (cast to float).
        k: number of features to keep.

    Returns:
        List of selected column names, in the order of
        ``data.columns.difference([y_col])``.
    """
    X = data[data.columns.difference([y_col])]
    y = data[y_col].astype('float')
    # NOTE(review): the ``normalize`` argument was removed from Ridge in
    # scikit-learn 1.2 - confirm the installed version supports it.
    lr = Ridge(alpha=100000, fit_intercept=True, normalize=True, copy_X=True, max_iter=1500, tol=1e-4, solver='auto')
    rfe = RFE(estimator=lr, n_features_to_select=k)
    rfe.fit(X, y)

    # Rank 1 marks the selected features. The original sorted ranking_
    # in descending order and kept the first k, which returns the
    # features eliminated first; use the support mask instead.
    return X.columns[rfe.support_].to_list()
def get_selected_features(data, target, method="rfe",
                          n_components=5, threshold=0.1):
    """Reduce ``data`` with one of three feature-selection strategies.

    Args:
        data: feature matrix.
        target: labels (ignored for ``method="vt"``).
        method: ``"rfe"`` for RFE with a decision tree, ``"vt"`` for
            ``VarianceThreshold``; any other value uses ``SelectKBest``
            with ``chi2``.
        n_components: number of features kept by RFE / SelectKBest.
        threshold: variance threshold for ``method="vt"``.

    Returns:
        DataFrame of the reduced data (default integer column labels).
    """
    if method == "rfe":
        estimator = DecisionTreeClassifier()
        # n_features_to_select is keyword-only on scikit-learn >= 1.0.
        selector = RFE(estimator, n_features_to_select=n_components, step=1)
        result = selector.fit_transform(data, target)
    elif method == "vt":
        selector = VarianceThreshold(threshold)
        result = selector.fit_transform(data)
    else:
        result = SelectKBest(chi2, k=n_components).fit_transform(data, target)

    return pd.DataFrame(result)
Example #21
0
def mlp_dir2(finC_x, finC_y, finT_x, finT_y):
    """Evaluate an MLP on RFE-reduced, concatenated C/T features.

    For each feature count in the module-level ``features_num``: the C and
    T data sets are each split 90/10, RFE (``LinearSVC``, ``step=0.1``)
    reduces each training part, the reduced matrices are concatenated
    column-wise and split 90/10 again (with the C training labels), and an
    ``MLPClassifier`` is trained and evaluated. A classification report is
    printed per feature count.

    Args:
        finC_x, finC_y: features and labels of the C data set.
        finT_x, finT_y: features and labels of the T data set.

    Returns:
        List of test accuracies, one per entry of ``features_num``.
    """
    accur = []

    for num in features_num:

        dataC_x_train, _, dataC_y_train, _ = train_test_split(
            finC_x, finC_y, test_size=0.1)

        dataT_x_train, _, dataT_y_train, _ = train_test_split(
            finT_x, finT_y, test_size=0.1)

        # n_features_to_select is keyword-only on scikit-learn >= 1.0.
        selector_c = RFE(LinearSVC(), n_features_to_select=num, step=0.1)
        new_x_c = selector_c.fit_transform(dataC_x_train,
                                           np.ravel(dataC_y_train))

        selector_t = RFE(LinearSVC(), n_features_to_select=num, step=0.1)
        new_x_t = selector_t.fit_transform(dataT_x_train,
                                           np.ravel(dataT_y_train))

        # NOTE(review): the C and T splits are shuffled independently, so
        # row i of the two halves need not be the same sample - confirm.
        new_x = pd.concat([pd.DataFrame(new_x_c),
                           pd.DataFrame(new_x_t)],
                          axis=1)

        print('selected num of features: ', num)

        data_x_train, data_x_test, data_y_train, data_y_test = train_test_split(
            new_x, dataC_y_train, test_size=0.1)

        mlp = MLPClassifier(hidden_layer_sizes=(60, ),
                            activation='logistic',
                            solver='lbfgs',
                            learning_rate_init=0.0001,
                            max_iter=1500,
                            alpha=0.001)
        mlp.fit(data_x_train, np.ravel(data_y_train))
        y_pred = mlp.predict(data_x_test)
        accur.append(accuracy_score(data_y_test, y_pred))
        print(classification_report(data_y_test, y_pred))

    return accur
Example #22
0
def recursiveFeatureSelector(classifier_model,train_data,train_labels,test_data,number_of_features):
    """Reduce train and test data to the features RFE selects on the train set.

    Args:
        classifier_model: estimator used by RFE to rank features.
        train_data: training feature matrix (RFE is fitted on it).
        train_labels: training labels.
        test_data: test feature matrix, reduced with the same selection.
        number_of_features: number of features to keep.

    Returns:
        Tuple ``(transformed_train_data, transformed_test_data)``.
    """
    # n_features_to_select is keyword-only on scikit-learn >= 1.0; the
    # positional form raises TypeError there.
    rfe = RFE(classifier_model, n_features_to_select=number_of_features)
    transformed_train_data = rfe.fit_transform(train_data, train_labels)
    transformed_test_data = rfe.transform(test_data)

    return transformed_train_data, transformed_test_data
Example #23
0
def run_once(df_feature, df_label):
    """Train, evaluate and dump a logistic regression on RFE-selected features.

    The features are expanded with ``dynamic_get_dummies``, split 70/30
    (``random_state=99``) and the training part is over-sampled. RFE keeps
    the module-level ``best_nof_feature`` columns; the logistic regression
    is then fitted on the reduced data, class-1 probabilities are
    thresholded at ``lg_threshold`` and handed to ``get_accuracy``. The
    model and the selected column names are dumped with ``joblib`` under
    ``root_folder``.

    Args:
        df_feature: DataFrame of raw features.
        df_label: Series of labels.
    """
    df_ohe = dynamic_get_dummies(df_feature)
    train_x, test_x, train_y, test_y = train_test_split(df_ohe,
                                                        df_label,
                                                        test_size=0.3,
                                                        random_state=99)
    # over sample
    train_x, train_y = over_sample(train_x, train_y)
    # build model
    lg_regression = linear_model.LogisticRegression(solver='lbfgs')
    # n_features_to_select is keyword-only on scikit-learn >= 1.0.
    rfe = RFE(lg_regression, n_features_to_select=best_nof_feature)
    rfe_train_x = rfe.fit_transform(train_x, train_y)
    rfe_test_x = rfe.transform(test_x)
    # RFE works on clones of the estimator, so the instance itself still
    # has to be fitted on the reduced data.
    lg_regression.fit(rfe_train_x, train_y)
    labels = df_label.unique()
    # predict probs
    test_y_predict_probs = lg_regression.predict_proba(rfe_test_x)
    prob_df = pd.DataFrame(test_y_predict_probs[:, 1])
    prob_df['predict'] = np.where(prob_df[0] >= lg_threshold, 1, 0)
    get_accuracy("logistic regression predict_probs", test_y,
                 prob_df['predict'], labels)
    # print features
    support = pd.Series(rfe.support_, index=list(df_ohe.columns))
    selected_features_rfe = support[support].index
    save_print("Top " + str(best_nof_feature) + " features are: ")
    save_print(selected_features_rfe)
    # dump model
    joblib.dump(lg_regression, root_folder + "lg_regression.pkl")
    save_print("lg_regression Model dumped!")
    joblib.dump(selected_features_rfe, root_folder + "lg_regression_cols.pkl")
    save_print("lg_regression models columns dumped!")
def run_once(df_feature, df_label):
    """Train, evaluate and dump a decision tree on RFE-selected features.

    The features are expanded with ``dynamic_get_dummies``, split 70/30
    (``random_state=99``) and the training part is over-sampled. RFE keeps
    the module-level ``best_nof_feature`` columns; the tree is then fitted
    on the reduced data and its test predictions are handed to
    ``get_accuracy``. The model and the selected column names are dumped
    with ``joblib`` under ``root_folder``.

    Args:
        df_feature: DataFrame of raw features.
        df_label: Series of labels.
    """
    df_ohe = dynamic_get_dummies(df_feature)
    train_x, test_x, train_y, test_y = train_test_split(df_ohe,
                                                        df_label,
                                                        test_size=0.3,
                                                        random_state=99)
    # over sample
    train_x, train_y = over_sample(train_x, train_y)
    # build model
    dc_tree = DecisionTreeClassifier(criterion='entropy',
                                     min_samples_split=20,
                                     random_state=99)
    # n_features_to_select is keyword-only on scikit-learn >= 1.0.
    rfe = RFE(dc_tree, n_features_to_select=best_nof_feature)
    rfe_train_x = rfe.fit_transform(train_x, train_y)
    rfe_test_x = rfe.transform(test_x)
    # RFE works on clones of the estimator, so the instance itself still
    # has to be fitted on the reduced data.
    dc_tree.fit(rfe_train_x, train_y)
    labels = df_label.unique()
    # predict
    test_y_predict = dc_tree.predict(rfe_test_x)
    get_accuracy("decision tree", test_y, test_y_predict, labels)
    # print features
    support = pd.Series(rfe.support_, index=list(df_ohe.columns))
    selected_features_rfe = support[support].index
    save_print("Top " + str(best_nof_feature) + " features are: ")
    save_print(selected_features_rfe)
    # dump model
    joblib.dump(dc_tree, root_folder + "dc_tree.pkl")
    save_print("dc_tree Model dumped!")
    joblib.dump(selected_features_rfe, root_folder + "dc_tree_cols.pkl")
    save_print("dc_tree models columns dumped!")
Example #25
0
def optimal_number_of_features(X_train, y_train, X_test, y_test):
    '''
    optimal_number_of_features(X_train, y_train, X_test, y_test)
    RETURNS: number_of_features

    Find the subset size for which an RFE-reduced linear regression
    scores best on the test data. Sizes from 1 to
    ``X_train.shape[1] - 1`` are tried; for each, RFE is fitted on the
    training data and a linear model trained on the reduced training data
    is scored on the reduced test data.

    The result is meant to be passed on to ``optimal_features``, which
    runs RFE to find that many features.

    Credit: David Espinola.
    '''
    # -inf baseline: with the original 0 baseline a best score <= 0 made
    # the function return 0, which is not a usable feature count.
    high_score = float("-inf")
    number_of_features = 0

    for n_features in range(1, X_train.shape[1]):
        model = LinearRegression()
        # n_features_to_select is keyword-only on scikit-learn >= 1.0.
        rfe = RFE(model, n_features_to_select=n_features)
        X_train_rfe = rfe.fit_transform(X_train, y_train)
        X_test_rfe = rfe.transform(X_test)
        model.fit(X_train_rfe, y_train)
        score = model.score(X_test_rfe, y_test)
        if score > high_score:
            high_score = score
            number_of_features = n_features
    return number_of_features
def run_rfe(df_feature, df_label):
    """Plot class-1 precision/recall of a decision tree versus RFE subset size.

    The features are expanded with ``dynamic_get_dummies``, split 70/30
    (``random_state=99``) and the training part is over-sampled. For each
    size from 1 to the module-level ``max_feature_try_numbers``, RFE
    selects the features, a decision tree is fitted on the reduced data
    and ``get_accuracy`` supplies the class-1 precision and recall, which
    are finally plotted with ``plot_pre_recall``.

    Args:
        df_feature: DataFrame of raw features.
        df_label: Series of labels.
    """
    df_ohe = dynamic_get_dummies(df_feature)
    train_x, test_x, train_y, test_y = train_test_split(df_ohe,
                                                        df_label,
                                                        test_size=0.3,
                                                        random_state=99)
    # over sample
    train_x, train_y = over_sample(train_x, train_y)

    # build model
    nof_list = np.arange(1, (max_feature_try_numbers + 1))
    labels = df_label.unique()  # does not change between iterations
    class_1_precision_list = []
    class_1_recall_list = []
    for nof in nof_list:
        save_print("********Current nof features are: " + str(nof))
        dc_tree = DecisionTreeClassifier(criterion='entropy',
                                         min_samples_split=20,
                                         random_state=99)
        # n_features_to_select is keyword-only on scikit-learn >= 1.0.
        rfe = RFE(dc_tree, n_features_to_select=nof)
        rfe_train_x = rfe.fit_transform(train_x, train_y)
        rfe_test_x = rfe.transform(test_x)
        # RFE works on clones, so fit the tree itself on the reduced data.
        dc_tree.fit(rfe_train_x, train_y)
        # predict
        test_y_predict = dc_tree.predict(rfe_test_x)
        class_1_precision, class_1_recall = get_accuracy(
            "decision tree", test_y, test_y_predict, labels)
        class_1_precision_list.append(class_1_precision)
        class_1_recall_list.append(class_1_recall)
    plot_pre_recall(nof_list, class_1_precision_list, class_1_recall_list,
                    'decision tree')
Example #27
0
def optimal_features(X_train, y_train, number_of_features):
    '''
    optimal_features(X_train, y_train, number_of_features)
    RETURNS: selected_features_rfe

    Run recursive feature elimination with ``LinearRegression`` and
    return the names of the ``number_of_features`` columns it keeps.
    ``number_of_features`` is typically the output of
    ``optimal_number_of_features``.

    The result is a pandas Index of column names from ``X_train``.

    Credit: David Espinola.
    '''
    # n_features_to_select is keyword-only on scikit-learn >= 1.0; the
    # positional form raises TypeError there.
    rfe = RFE(LinearRegression(), n_features_to_select=number_of_features)
    rfe.fit(X_train, y_train)

    support = pd.Series(rfe.support_, index=list(X_train.columns))
    selected_features_rfe = support[support].index

    return selected_features_rfe
Example #28
0
def run_rfe(df_feature, df_label):
    """Plot class-1 precision/recall of logistic regression versus RFE size.

    The features are expanded with ``dynamic_get_dummies``, split 70/30
    (``random_state=99``) and the training part is over-sampled. For each
    size from 1 to the module-level ``max_feature_try_numbers``, RFE
    selects the features, a logistic regression is fitted on the reduced
    data, class-1 probabilities are thresholded at ``lg_threshold`` and
    ``get_accuracy`` supplies the class-1 precision and recall, which are
    finally plotted with ``plot_pre_recall``.

    Args:
        df_feature: DataFrame of raw features.
        df_label: Series of labels.
    """
    df_ohe = dynamic_get_dummies(df_feature)
    train_x, test_x, train_y, test_y = train_test_split(df_ohe,
                                                        df_label,
                                                        test_size=0.3,
                                                        random_state=99)
    # over sample
    train_x, train_y = over_sample(train_x, train_y)

    # build model
    nof_list = np.arange(1, (max_feature_try_numbers + 1))
    labels = df_label.unique()  # does not change between iterations
    class_1_precision_list = []
    class_1_recall_list = []
    for nof in nof_list:
        save_print("********Current nof features are: " + str(nof))
        lg_regression = linear_model.LogisticRegression(solver='lbfgs')
        # n_features_to_select is keyword-only on scikit-learn >= 1.0.
        rfe = RFE(lg_regression, n_features_to_select=nof)
        rfe_train_x = rfe.fit_transform(train_x, train_y)
        rfe_test_x = rfe.transform(test_x)
        # RFE works on clones, so fit the model itself on the reduced data.
        lg_regression.fit(rfe_train_x, train_y)
        # predict probs
        test_y_predict_probs = lg_regression.predict_proba(rfe_test_x)
        prob_df = pd.DataFrame(test_y_predict_probs[:, 1])
        prob_df['predict'] = np.where(prob_df[0] >= lg_threshold, 1, 0)
        class_1_precision, class_1_recall = get_accuracy(
            "logistic regression predict_probs", test_y, prob_df['predict'],
            labels)
        class_1_precision_list.append(class_1_precision)
        class_1_recall_list.append(class_1_recall)
    plot_pre_recall(nof_list, class_1_precision_list, class_1_recall_list,
                    'logistic regression')
Example #29
0
    def selectKBestFeatures( num_of_features, features_array, class_assignment):
        """Reduce ``features_array`` to the columns kept by RFE with a linear SVR.

        Both inputs are converted to float numpy arrays before fitting.
        The selector, the number of kept columns and the feature ranking
        are printed for inspection.

        Args:
            num_of_features: number of features to keep.
            features_array: 2-D array-like of features.
            class_assignment: array-like of targets.

        Returns:
            Array holding only the selected feature columns.
        """
        features_array = np.array(features_array).astype(float)
        class_assignment = np.array(class_assignment).astype(float)

        estimator = SVR(kernel="linear")
        # n_features_to_select is keyword-only on scikit-learn >= 1.0.
        selectorRFE = RFE(estimator, n_features_to_select=num_of_features, step=1)
        selected_features = selectorRFE.fit_transform(features_array, class_assignment)

        print("first sel", selectorRFE)
        print("RFE selector", len(selected_features[0]))
        print("selected features", selectorRFE.ranking_)

        return selected_features
Example #30
0
def optimal_number_of_features(X, y):
    '''Return the feature count whose RFE-reduced linear regression scores best.

    For every candidate count from 1 up to (but not including) the number of
    columns of ``X_train``, recursive feature elimination is fitted on the
    training split, a fresh linear regression is trained on the reduced
    columns and scored on the reduced test split.  The count with the highest
    score above 0 is returned; 0 is returned when no candidate scores above 0.

    The result is meant to be fed to the follow-up ``optimal_features`` step,
    which runs RFE again to find the n best features.

    NOTE(review): the ``X`` and ``y`` parameters are not used; the function
    reads ``X_train``, ``X_test``, ``y_train`` and ``y_test`` from the
    enclosing module scope -- confirm this is intended.
    '''
    number_of_attributes = X_train.shape[1]
    candidate_counts = np.arange(1, number_of_attributes)

    # Scores at or below this baseline never replace the default answer of 0.
    high_score = 0
    number_of_features = 0

    for candidate in candidate_counts:
        model = LinearRegression()
        # Keyword form: current scikit-learn only accepts the estimator
        # positionally.
        rfe = RFE(model, n_features_to_select=candidate)
        X_train_rfe = rfe.fit_transform(X_train, y_train)
        X_test_rfe = rfe.transform(X_test)
        model.fit(X_train_rfe, y_train)
        score = model.score(X_test_rfe, y_test)
        if score > high_score:
            high_score = score
            number_of_features = candidate
    return number_of_features
Example #31
0
def RFE_nof(df, target, normalize):
    """Return the number of features whose RFE-reduced regression scores best.

    ``df`` is split into the ``target`` column and the remaining feature
    columns, then into a 70/30 train/test split seeded with
    ``random_state=0``.  For every candidate count from 1 up to (but not
    including) the number of feature columns, RFE is fitted on the training
    part, a linear regression is trained on the reduced columns and scored on
    the reduced test part.

    :param df: DataFrame holding the features and the target column.
    :param target: label of the target column in ``df``.
    :param normalize: forwarded to ``LinearRegression(normalize=...)``.
    :return: the candidate count with the highest score above 0, or 0 when
        there is no candidate or none scores above 0.
    """
    y = df[target]
    # axis is passed by keyword; the positional form is rejected by recent
    # pandas releases.
    X = df.drop(target, axis=1)

    nof_list = np.arange(1, len(X.columns))
    high_score = 0
    # Variable to store the optimum number of features
    nof = 0
    if nof_list.size == 0:
        return nof

    # The split is seeded, so it is identical on every iteration: do it once.
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=0)

    for candidate in nof_list:
        model = LinearRegression(copy_X=True,
                                 fit_intercept=True,
                                 n_jobs=None,
                                 normalize=normalize)
        # Keyword form: current scikit-learn only accepts the estimator
        # positionally.
        rfe = RFE(model, n_features_to_select=candidate)
        X_train_rfe = rfe.fit_transform(X_train, y_train)
        X_test_rfe = rfe.transform(X_test)
        model.fit(X_train_rfe, y_train)
        score = model.score(X_test_rfe, y_test)
        if score > high_score:
            high_score = score
            nof = candidate
    return nof
Example #32
0
class LogReg:
  """Text classifier: TF-IDF vectorizer, RFE feature selection, then a model.

  Construction trains everything on the given reviews; ``classify_all``
  applies the fitted vectorizer, selector and model to new reviews.
  """

  def __init__(
      self,
      reviews,
      vectorizer = None,
      model = None
      ):
    """Fit the vectorizer, the RFE selector and the model on ``reviews``.

    Each review is indexed as ``review[0]`` (label) and
    ``review[1]["text"]`` (document text).

    ``vectorizer`` defaults to an English-stop-word TF-IDF vectorizer over
    uni- and bigrams with ``max_df = 1``; ``model`` defaults to
    ``LogisticRegression()``.  Every vocabulary entry is printed
    (ASCII-encoded) as a side effect.
    """
    # Build the defaults per instance.  Objects created in the signature
    # would be shared by every LogReg and refitted by each new one.
    if vectorizer is None:
      vectorizer = TfidfVectorizer(stop_words = 'english', max_df = 1,
        ngram_range = (1, 2))
    if model is None:
      model = LogisticRegression()

    self.model = model
    self.vectorizer = vectorizer
    self.selector = RFE(self.model, step = 100, verbose = 100)

    corpus = []
    labels = []
    for review in reviews:
      corpus.append(review[1]["text"])
      labels.append(review[0])

    #setting variables for the object
    self.corpus = corpus
    self.labels = labels
    self.reviews = reviews

    X = self.vectorizer.fit_transform(self.corpus)
    # get_feature_names was replaced by get_feature_names_out in newer
    # scikit-learn; use whichever the vectorizer provides.
    if hasattr(self.vectorizer, "get_feature_names_out"):
      self.feature_names = list(self.vectorizer.get_feature_names_out())
    else:
      self.feature_names = self.vectorizer.get_feature_names()
    for string in self.feature_names:
      print(string.encode("ascii", 'ignore'))

    #Training the model
    X_new = self.selector.fit_transform(X, self.labels)
    # The model only sees the columns RFE kept, so its coefficient indices
    # refer to this reduced list, not to the full vocabulary.
    self.selected_feature_names = [
      self.feature_names[k]
      for k in self.selector.get_support(indices=True)]
    self.model.fit(X_new, self.labels)

  def classify_all(self, all_test_data):
    """Predict a label for every review in ``all_test_data``.

    Reviews have the same layout as in ``__init__``; only the text is used.
    As a side effect, the names of the 20 highest-weighted selected features
    are printed for each of four hard-coded categories.

    Returns the array produced by ``self.model.predict``.
    """
    test_corpus = [review[1]['text'] for review in all_test_data]

    #Used transform instead of fit_transform
    #for test data so number of features will match
    X = self.vectorizer.transform(test_corpus)
    X_new = self.selector.transform(X)
    results = self.model.predict(X_new)
    # NOTE(review): this assumes coefficient row i belongs to categories[i];
    # confirm against the order of self.model.classes_.
    categories = ["spring", "summer", "fall", "winter"]
    for i, category in enumerate(categories):
      strongest = np.argsort(self.model.coef_[i])[-20:]
      for j in strongest:
        print("%s: %s" % (category, self.selected_feature_names[j]))
    return results
Example #33
0
def feat3(matrix):
	"""Return the column indices of the 5 features kept by RFE with a linear SVC.

	The last column of ``matrix`` is treated as the class column and is
	converted with ``transform_to_int``, which also receives the last value
	of the first row.  All other columns are cast to float and used as
	features.

	:param matrix: non-empty sequence of equally long rows.
	:return: list of the selected feature column indices.
	"""
	n_features = len(matrix[0]) - 1
	last_column = [row[n_features] for row in matrix]
	data_class = transform_to_int(last_column, matrix[0][n_features])
	# Build a real list of rows: on Python 3 ``map`` is lazy and numpy would
	# wrap the iterator in a 0-d object array.  ``np.float`` has also been
	# removed from NumPy, so the builtin float is used for the cast.
	data = np.asarray([row[:n_features] for row in matrix], dtype=float)
	svc = SVC(kernel="linear", C=1)
	rfe = RFE(estimator=svc, n_features_to_select=5, step=1)
	rfe.fit(data, data_class)
	# indices=True returns integer positions instead of a boolean mask.
	return list(rfe.get_support(indices=True))
def train_logistic_regression(
		feats = None, labels = [],
		feature_selector=SelectFpr(chi2, alpha=0.05), # NOTE: ignored, see docstring
		cv=5, # Number of folds used in cross-validation
		priorlims=np.arange(.1, 3.1, .1), feature_elim = True): # regularization priors to explore (we expect something around 1)
	"""Vectorize, optionally select features, grid-search and fit a logistic regression.

	Steps: map the feature dictionaries to a dense matrix with
	``DictVectorizer``; when ``feature_elim == True`` reduce it with RFE
	around a default ``LogisticRegression``; grid-search ``C``, ``penalty``
	and ``multi_class``; print a cross-validated macro-F1 summary; fit the
	chosen model on all the data.

	NOTE(review): the ``feature_selector`` argument is discarded.  The
	returned selector is the fitted RFE object, or ``None`` when
	``feature_elim`` is not True -- confirm this is intended.

	:param feats: feature dictionaries accepted by ``DictVectorizer``.
	:param labels: one label per entry of ``feats``.
	:param cv: number of cross-validation folds.
	:param priorlims: candidate values for the ``C`` parameter.
	:param feature_elim: run RFE before the search when equal to True.
	:return: tuple ``(mod, vectorizer, feature_selector)``.
	"""
	# Map the count dictionaries to a dense feature matrix (sparse=False):
	vectorizer = DictVectorizer(sparse=False)
	feats = vectorizer.fit_transform(feats)

	##### FEATURE SELECTION
	feat_matrix = feats
	feature_selector = None
	if feature_elim == True:
		feature_selector = RFE(estimator=LogisticRegression(), n_features_to_select=None, step=1, verbose=0)
		feat_matrix = feature_selector.fit_transform(feats, labels)

	##### HYPER-PARAMETER SEARCH
	# Define the basic model to use for parameter search:
	searchmod = LogisticRegression(fit_intercept=True, intercept_scaling=1, verbose=1, solver='lbfgs', max_iter=2000)
	# Parameters to grid-search over.
	# NOTE(review): scikit-learn documents the lbfgs solver as supporting
	# only l2/none penalties; confirm the 'l1' candidates fit on the
	# installed version.
	parameters = {'C':priorlims, 'penalty':['l1', 'l2'], 'multi_class':['multinomial', 'ovr']}
	# Cross-validation grid search to find the best hyper-parameters:
	clf = GridSearchCV(searchmod, parameters, cv=cv)
	clf.fit(feat_matrix, labels)
	params = clf.best_params_

	# Establish the model we want using the parameters obtained from the search:
	mod = LogisticRegression(fit_intercept=True, intercept_scaling=1, C=params['C'], penalty=params['penalty'], multi_class=params['multi_class'], solver='lbfgs', verbose=1, max_iter=200)

	##### ASSESSMENT
	scores = cross_val_score(mod, feat_matrix, labels, cv=cv, scoring="f1_macro")
	# Single pre-formatted argument: prints the same text on Python 2 and 3.
	print('Best model %s' % (mod,))
	print('%s features selected out of %s total' % (feat_matrix.shape[1], feats.shape[1]))
	print('F1 mean: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std()*2))

	# TRAIN OUR MODEL:
	mod.fit(feat_matrix, labels)
	# Return the trained model along with the objects we need to
	# featurize test data in a way that aligns with our training
	# matrix:
	return (mod, vectorizer, feature_selector)
def train_NB(
		feats = None, labels = [],
		feature_selector=SelectFpr(chi2, alpha=0.05), # NOTE: ignored, see docstring
		cv=5, # Number of folds used in cross-validation
		priorlims=np.arange(.1, 2.0, .5)): # alphas to explore (we expect something around 1)
	"""Vectorize, select features with RFE, grid-search and fit a multinomial naive Bayes.

	Steps: map the feature dictionaries to a dense matrix with
	``DictVectorizer``; reduce it with RFE around a default
	``MultinomialNB``; grid-search ``alpha``; print a cross-validated
	macro-F1 summary; fit the chosen model on all the data.

	NOTE(review): the ``feature_selector`` argument is discarded and always
	replaced by the RFE selector -- confirm this is intended.

	:param feats: feature dictionaries accepted by ``DictVectorizer``.
	:param labels: one label per entry of ``feats``.
	:param cv: number of cross-validation folds.
	:param priorlims: candidate values for ``alpha``.
	:return: tuple ``(mod, vectorizer, feature_selector)``.
	"""
	# Map the count dictionaries to a dense feature matrix (sparse=False):
	vectorizer = DictVectorizer(sparse=False)
	feats = vectorizer.fit_transform(feats)

	##### FEATURE SELECTION
	feature_selector = RFE(estimator=MultinomialNB(), n_features_to_select=None, step=1, verbose=0)
	feat_matrix = feature_selector.fit_transform(feats, labels)

	##### HYPER-PARAMETER SEARCH
	# Define the basic model to use for parameter search:
	searchmod = MultinomialNB()
	# Parameters to grid-search over:
	parameters = {'alpha':priorlims}
	# Cross-validation grid search to find the best hyper-parameters:
	clf = GridSearchCV(searchmod, parameters, cv=cv, n_jobs=-1)
	clf.fit(feat_matrix, labels)
	params = clf.best_params_

	# Establish the model we want using the parameters obtained from the search:
	mod = MultinomialNB(alpha=params['alpha'])

	##### ASSESSMENT
	scores = cross_val_score(mod, feat_matrix, labels, cv=cv, scoring="f1_macro")
	# Single pre-formatted argument: prints the same text on Python 2 and 3.
	print('Best model %s' % (mod,))
	print('%s features selected out of %s total' % (feat_matrix.shape[1], feats.shape[1]))
	print('F1 mean: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std()*2))

	# TRAIN OUR MODEL:
	mod.fit(feat_matrix, labels)
	# Return the trained model along with the objects we need to
	# featurize test data in a way that aligns with our training
	# matrix:
	return (mod, vectorizer, feature_selector)
def train_DT(
		feats = None, labels = [],
		feature_selector=SelectFpr(chi2, alpha=0.05), # NOTE: ignored, see docstring
		cv=5): # Number of folds used in cross-validation
	"""Vectorize, select features with RFE, grid-search and fit a decision tree.

	Steps: map the feature dictionaries to a dense matrix with
	``DictVectorizer``; reduce it with RFE; grid-search ``splitter``,
	``max_features`` and ``min_samples_split``; print a cross-validated
	macro-F1 summary; fit the chosen tree on all the data.

	NOTE(review): the ``feature_selector`` argument is discarded and always
	replaced by the RFE selector.  That selector ranks features with a
	``MultinomialNB`` estimator rather than a decision tree -- confirm both
	are intended.

	:param feats: feature dictionaries accepted by ``DictVectorizer``.
	:param labels: one label per entry of ``feats``.
	:param cv: number of cross-validation folds.
	:return: tuple ``(mod, vectorizer, feature_selector)``.
	"""
	# Map the count dictionaries to a dense feature matrix (sparse=False):
	vectorizer = DictVectorizer(sparse=False)
	feats = vectorizer.fit_transform(feats)

	##### FEATURE SELECTION
	feature_selector = RFE(estimator=MultinomialNB(), n_features_to_select=None, step=1, verbose=0)
	feat_matrix = feature_selector.fit_transform(feats, labels)

	##### HYPER-PARAMETER SEARCH
	# Define the basic model to use for parameter search:
	searchmod = DecisionTreeClassifier()
	# Parameters to grid-search over:
	parameters = {'splitter':['best','random'],'max_features':['sqrt',0.25,'log2'],'min_samples_split':[2,5,10]}
	# Cross-validation grid search to find the best hyper-parameters:
	clf = GridSearchCV(searchmod, parameters, cv=cv, n_jobs=-1)
	clf.fit(feat_matrix, labels)
	params = clf.best_params_

	# Establish the model we want using the parameters obtained from the search:
	mod = DecisionTreeClassifier(splitter=params['splitter'],max_features=params['max_features'],min_samples_split=params['min_samples_split'])

	##### ASSESSMENT
	scores = cross_val_score(mod, feat_matrix, labels, cv=cv, scoring="f1_macro")
	# Single pre-formatted argument: prints the same text on Python 2 and 3.
	print('Best model %s' % (mod,))
	print('%s features selected out of %s total' % (feat_matrix.shape[1], feats.shape[1]))
	print('F1 mean: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std()*2))

	# TRAIN OUR MODEL:
	mod.fit(feat_matrix, labels)
	# Return the trained model along with the objects we need to
	# featurize test data in a way that aligns with our training
	# matrix:
	return (mod, vectorizer, feature_selector)
Example #37
0
# -*- coding: utf-8 -*-

import pandas
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# Load the data set from a fixed local CSV path.
data = pandas.read_csv('D:\\PDM\\6.2\\data2.csv')

# Candidate predictor columns (month, quarter, advertising cost,
# customer traffic).
feature = data[['月份', '季度', '广告费用', '客流量']]

# Recursive feature elimination around a linear regression, keeping the
# two highest-ranked predictors.
rfe = RFE(estimator=LinearRegression(), n_features_to_select=2)

# Fit against the sales-amount column; sFeature holds only the kept columns.
sFeature = rfe.fit_transform(feature, data['销售额'])

# Mask of the kept columns.  The value is discarded here, so this line only
# shows a result in an interactive session.
rfe.get_support()

def featureSelection(parameter,numberOfFeatures):
	"""Select features with RFE and write the reduced train/test sets to disk.

	``parameter`` picks the input pair: 'ASM' (case-insensitive) uses
	``trainFileASM``/``testFileASM``, anything else ``trainFileBYTE``/
	``testFileBYTE``.  Both files are read as comma-separated text with a
	header line.  In a training row the first field is dropped and the last
	field is the label; rows with more fields than the header are skipped.
	In a test row the first field is recorded in ``testFileList`` and the
	rest are features.

	RFE around a default ``LogisticRegression`` keeps ``numberOfFeatures``
	columns, and the same columns are then picked from every test row.
	Results go to ``RFE_TRAIN``, ``RFE_TEST``, ``RFE_LABEL`` and
	``RFE_TESTFILELIST`` with an ``.ASM`` or ``.BYTE`` suffix.

	The module-level lists ``train``, ``test``, ``label``, ``testFileList``
	and ``newTest`` are appended to, so they accumulate across calls;
	``newTrain`` and ``l`` are rebound.

	:param parameter: 'ASM' for the ASM inputs, any other string for BYTE.
	:param numberOfFeatures: number of features to keep (passed to ``int``).
	"""
	global l,newTrain,newTest,explained_train_var_ratio,explained_test_var_ratio,label,testFileList
	useAsm = parameter.upper() == 'ASM'
	if useAsm:
		trainFile = trainFileASM
		testFile = testFileASM
	else:
		trainFile = trainFileBYTE
		testFile = testFileBYTE

	# ``with`` closes the files even if a row fails to parse.
	with open(trainFile) as f:
		length = len(f.readline().split(','))
		for line in f:
			token = [w.replace('\n','') for w in line.split(',')]
			l = len(token)
			if l > length:
				continue
			train.append(token[1:-1])
			label.append(token[-1])

	with open(testFile) as f:
		f.readline()
		for line in f:
			token = [w.replace('\n','') for w in line.split(',')]
			testFileList.append(token[0])
			test.append(token[1:])

	# Keyword form: current scikit-learn only accepts the estimator
	# positionally.
	rfe = RFE(LogisticRegression(), n_features_to_select=int(numberOfFeatures))
	X = np.array(train)
	Y = np.array(label)
	newTrain = rfe.fit_transform(X,Y)

	print("New Train : ")
	print(newTrain)
	print("get_support")
	support = rfe.support_
	print(support)

	# Positions of the kept columns, reused to project the test rows.
	indexList = [i for i, kept in enumerate(np.array(support)) if kept]
	for sublist in test:
		newTest.append([sublist[index] for index in indexList])

	print("New Test")
	print(newTest)

	suffix = 'ASM' if useAsm else 'BYTE'
	writeTrainFile = 'RFE_TRAIN.' + suffix
	writeTestFile = 'RFE_TEST.' + suffix
	writeTrainLabel = 'RFE_LABEL.' + suffix
	writeTestFileList = 'RFE_TESTFILELIST.' + suffix

	print("***** Transormed train data *****")
	with open(writeTrainFile, 'w') as fp:
		csv.writer(fp, delimiter=',').writerows(newTrain)

	print("***** Transormed test data *****")
	with open(writeTestFile, 'w') as fp:
		csv.writer(fp, delimiter=',').writerows(newTest)

	print("******* Writting Train Labels *********")
	with open(writeTrainLabel, 'w') as fp:
		# One single-cell row per label.  Passing the bare strings to
		# writerows would split a multi-character label into one cell per
		# character.
		csv.writer(fp, delimiter=',').writerows([[entry] for entry in label])

	print("****** Writting Test File Names ******")
	with open(writeTestFileList, 'w') as fp:
		for entry in testFileList:
			fp.write(entry+"\n")
Example #39
0
 def RFE(self,estimator,k):
     """Run recursive feature elimination on the instance's X and Y.

     Builds a module-level ``RFE`` selector around ``estimator`` that keeps
     ``k`` features and fits it on ``self.X`` / ``self.Y``.

     Returns the fitted selector and the reduced feature matrix.
     """
     features, targets = self.X, self.Y
     selector = RFE(estimator, n_features_to_select=k)
     reduced = selector.fit_transform(features, targets)
     return selector, reduced
def featureSelection(numberOfFeatures=1000):
	"""Select features with RFE and project the test rows onto them.

	Reads the module-level ``trainFile`` and ``testFile`` as comma-separated
	text with a header line.  In a training row the first field is dropped
	and the last field is the label; rows with more fields than the header
	are skipped.  In a test row the first field is recorded in
	``testFileList`` and the rest are features.

	RFE around a default ``LogisticRegression`` keeps ``numberOfFeatures``
	columns, and the same columns are then picked from every test row.

	The module-level lists ``train``, ``test``, ``label``, ``testFileList``
	and ``newTest`` are appended to, so they accumulate across calls;
	``newTrain`` and ``l`` are rebound.

	:param numberOfFeatures: number of features to keep (default 1000, the
		value that used to be hard-coded).
	"""
	global l,newTrain,newTest,explained_train_var_ratio,explained_test_var_ratio,label,testFileList

	# ``with`` closes the files even if a row fails to parse.
	with open(trainFile) as f:
		length = len(f.readline().split(','))
		for line in f:
			token = [w.replace('\n','') for w in line.split(',')]
			l = len(token)
			if l > length:
				continue
			train.append(token[1:-1])
			label.append(token[-1])

	with open(testFile) as f:
		f.readline()
		for line in f:
			token = [w.replace('\n','') for w in line.split(',')]
			testFileList.append(token[0])
			test.append(token[1:])

	# Keyword form: current scikit-learn only accepts the estimator
	# positionally.
	rfe = RFE(LogisticRegression(), n_features_to_select=numberOfFeatures)
	X = np.array(train)
	Y = np.array(label)
	newTrain = rfe.fit_transform(X,Y)

	print("New Train : ")
	print(newTrain)
	print("get_support")
	support = rfe.support_
	print(support)

	# Positions of the kept columns, reused to project the test rows.
	indexList = [i for i, kept in enumerate(np.array(support)) if kept]
	for sublist in test:
		newTest.append([sublist[index] for index in indexList])

	print("New Test")
	print(newTest)
Example #41
0
    def train_classifier(self,
            src_filename,
            feature_function=None,
            feature_selector=None,#SelectFpr(chi2, alpha=0.05), # Use None to stop feature selection
            cv=8, # Number of folds used in cross-validation
            priorlims=np.arange(.1, 4.0, .3), #TODO: these are arbitrary numbers!
            use_rfe = False,
            param_search = False,
            print_model = False,
            aux_data = None,
            retrain = True): # regularization priors to explore (we expect something around 1)
        """Train a logistic-regression classifier and store it on ``self.model``.

        Note: this differs from the class implementation in that you pass in
        the filename to read, not the reader itself.  That is more convenient
        but less general if you want to use different file types.

        Nothing is returned.  ``self.model`` is set to a tuple of
            (
                mod - a trained model capable of prediction,
                vectorizer - the DictVectorizer that converts feature
                    dictionaries to a numeric matrix,
                feature_selector - the selector fitted on the training data
                    (may be None), to reuse on test data,
                feature_function - the function used to featurize the
                    training data, to reuse on test data
            )

        :param src_filename: passed to ``util.binarized_transcript_reader``.
        :param feature_function: featurizer handed to ``self._featurizer``;
            defaults to ``self._unigram_ft_fn``.
        :param feature_selector: optional selector with ``fit_transform``.
        :param cv: number of cross-validation folds.
        :param priorlims: candidate ``C`` values for the grid search.
        :param use_rfe: replace ``feature_selector`` with an RFE selector
            built around the search model.
        :param param_search: grid-search the hyper-parameters instead of
            using the recorded ones.
        :param print_model: print a cross-validated macro-F1 summary.
        :param aux_data: not used by this method.
        :param retrain: clear ``self.model`` first; training runs either way.

        TODO:
        The following error arises from using too many cv folds:
        ValueError: zero-size array to reduction operation maximum which has no identity
        """
        if feature_function is None:
            feature_function = self._unigram_ft_fn

        if retrain:
            self.model = None

        reader = util.binarized_transcript_reader(src_filename)
        # Featurize the data:
        feats, labels = self._featurizer(reader=reader, feature_function=feature_function)

        # Map the count dictionaries to a sparse feature matrix:
        vectorizer = DictVectorizer(sparse=True) #TODO this was false in the 224u code. No idea why.
        X = vectorizer.fit_transform(feats)

        # Define the basic model to use for parameter search:
        searchmod = LogisticRegression(fit_intercept=True, intercept_scaling=1, solver = 'lbfgs')

        ##### FEATURE SELECTION
        # Optional step.  With use_rfe the selector is model-based (RFE
        # around the search model); otherwise whatever selector the caller
        # passed is used, and no selection happens when there is none.
        if use_rfe:
            feature_selector = RFE(estimator = searchmod, n_features_to_select=None, step=1, verbose=0)
        if feature_selector:
            feat_matrix = feature_selector.fit_transform(X, labels)
        else:
            feat_matrix = X

        if param_search:
            ##### HYPER-PARAMETER SEARCH
            # Parameters to grid-search over:
            parameters = {'C':priorlims, 'penalty':['l1','l2'], 'multi_class': ['ovr', 'multinomial']}
            # Cross-validation grid search to find the best hyper-parameters:
            clf = GridSearchCV(searchmod, parameters, cv=cv)
            print("searching for optimal hyperparameters...")
            clf.fit(feat_matrix, labels)
            print("whew, done with that grid search")
            params = clf.best_params_
        else:
            # Recorded result of an earlier search:
            # LogisticRegression(C=3.7000000000000006, class_weight=None, dual=False,
            #   fit_intercept=True, intercept_scaling=1, max_iter=100,
            #   multi_class='ovr', penalty='l1', random_state=None,
            #   solver='liblinear', tol=0.0001, verbose=0)
            params = {'C':3.7, 'penalty':'l1', 'multi_class': 'ovr'}

        # lbfgs does not support the l1 penalty, so the recorded default
        # parameters could not be fitted with it.  Use liblinear for l1, the
        # solver of the recorded best model.
        solver = 'liblinear' if params['penalty'] == 'l1' else 'lbfgs'

        # Establish the model we want using the parameters obtained from the search:
        mod = LogisticRegression(fit_intercept=True,
            intercept_scaling=1,
            C=params['C'],
            penalty=params['penalty'],
            multi_class = params['multi_class'],
            solver = solver)

        ##### ASSESSMENT
        # Cross-validation of our favored model; for other summaries, use different
        # values for scoring: http://scikit-learn.org/dev/modules/model_evaluation.html
        if print_model:
            scores = cross_val_score(mod, feat_matrix, labels, cv=cv, scoring="f1_macro")
            # Single pre-formatted argument: prints the same text on Python 2 and 3.
            print('Best model %s' % (mod,))
            print('%s features selected out of %s total' % (feat_matrix.shape[1], X.shape[1]))
            print('F1 mean: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std()*2))

        # TRAIN OUR MODEL:
        print("training model...")
        mod.fit(feat_matrix, labels)
        print("done with training, yeah")

        # Keep the trained model along with the objects we need to
        # featurize test data in a way that aligns with our training
        # matrix:
        self.model = (mod, vectorizer, feature_selector, feature_function)
Example #42
0
def _progress_tick(done, total):
    """Write a progress dot about every tenth of ``total`` completed items."""
    import sys
    # Integer step of at least 1.  A plain ``total / 10`` is 0 under Python 2
    # for fewer than 10 items (modulo by zero) and a float under Python 3.
    step = max(1, total // 10)
    if done % step == 0:
        sys.stdout.write('. ')


def select_train_predict(X, Y, Z, feature_list, selection_method, estimator_method, n_features, selection_args, estimator_args):
    """Fit one estimator per target in ``Y`` and predict on ``Z``.

    ``selection_method`` controls how features are chosen before fitting:

    * ``'cluster'`` - feature agglomeration; one representative column per
      cluster is kept in ``X`` and ``Z``, then handled like ``None``.
    * ``None``      - no selection; the estimator is fitted on ``X`` as is.
    * ``'rfe'``     - scikit-learn RFE around the estimator (an 'svm'
      estimator is forced to a linear kernel).
    * ``'myrfe'``   - the project's ``MyRFE`` selector.
    * ``'kbest'``   - ``SelectKBest`` with ``f_regression``.

    Any other value (including ``'2step_kbest'``) trains nothing and returns
    two empty lists.

    :param X: training feature matrix.
    :param Y: iterable of target vectors, one model is fitted per entry.
    :param Z: feature matrix to predict on.
    :param feature_list: feature names, indexed with the selector's support
        (presumably a NumPy array -- verify against callers).
    :param selection_method: one of the values listed above.
    :param estimator_method: key into the module-level ``ESTIMATORS`` table.
    :param n_features: number of features or clusters to keep; capped at
        ``len(feature_list)`` unless the method is ``'2step_kbest'``.
    :param selection_args: extra keyword arguments for the selector.
    :param estimator_args: keyword arguments for the estimator.
    :return: ``(W, features)`` - the predictions per target, and the selected
        feature names per target (empty for ``None``/``'cluster'``).
    """
    import sys

    W = []
    features = []

    if selection_method != '2step_kbest':
        n_features = min(n_features, len(feature_list))

    if estimator_method == 'svm' and selection_method == 'rfe':
        # Work on a copy so the caller's dict is not modified.
        estimator_args = dict(estimator_args, kernel='linear')

    estimator = ESTIMATORS[estimator_method](**estimator_args)

    if selection_method == 'cluster':
        agglom = FeatureAgglomeration(n_clusters=n_features, affinity='cosine', linkage='average')
        clusters = agglom.fit_predict(X).tolist()
        # First column that falls in each cluster acts as its representative.
        sample = [clusters.index(i) for i in range(n_features)]
        X = X[:,sample]
        Z = Z[:,sample]
        selection_method = None

    if selection_method is None:
        for i, y in enumerate(Y):
            estimator.fit(X, y)
            W.append(estimator.predict(Z))
            _progress_tick(i + 1, len(Y))

    elif selection_method == 'rfe':
        selector = RFE(estimator=estimator, n_features_to_select=n_features, **selection_args)
        for i, y in enumerate(Y):
            selector = selector.fit(X, y)
            features.append(feature_list[selector.support_])
            W.append(selector.predict(Z))
            _progress_tick(i + 1, len(Y))

    elif selection_method == 'myrfe':
        selector = MyRFE(estimator=estimator, n_features=n_features, **selection_args)
        for i, y in enumerate(Y):
            selector.fit(X, y)
            features.append(feature_list[selector.support])
            W.append(selector.predict(Z))
            _progress_tick(i + 1, len(Y))

    elif selection_method == 'kbest':
        selector = SelectKBest(f_regression, k=n_features, **selection_args)
        for i, y in enumerate(Y):
            X2 = selector.fit_transform(X, y)
            Z2 = selector.transform(Z)
            features.append(feature_list[selector.get_support()])
            estimator.fit(X2, y)
            W.append(estimator.predict(Z2))
            _progress_tick(i + 1, len(Y))

    # Terminate the line of progress dots.
    sys.stdout.write('\n')

    return W, features
#    scores_f1 = cross_val_score(rf,X,y,n_jobs=-1,cv=StratifiedShuffleSplit(y,n_iter=10,test_size=0.22),scoring='f1')
#    print("X RF f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))

     # In[ ]:
    svc = LinearSVC(C=20, penalty='l1', dual=False)
    svc.fit(X, y)
    selected_feature_names = feature_cols[[list(set(np.where(svc.coef_ != 0)[-1]))]]
    X_svm = svc.transform(X)
    print("X_svm L1 transformed:", X_svm.shape)
    X=X_svm


     # In[ ]:

    rfeSelect = RFE(estimator=rf,n_features_to_select=10, step=0.15)
    X_RFE = rfeSelect.fit_transform(X,y)
    print(X_RFE.shape)

    # In[ ]:

    RFE_FeatureNames = feature_cols[rfeSelect.get_support()]
    print("RFE_FeatureNames: \n",RFE_FeatureNames)


    # In[ ]:

    "http://stackoverflow.com/questions/21548750/plotting-histograms-against-classes-in-pandas-matplotlib"
    for featName in RFE_FeatureNames:
        df.groupby("class").feature.hist(alpha=0.4)
        df.groupby("classname")[featName].plot(kind='kde')