def plot_stable_features(X_train, y_train, featnames, **kwargs):
    from sklearn.linear_model import LassoLarsCV, RandomizedLasso

    n_resampling = kwargs.pop('n_resampling', 200)
    n_jobs = kwargs.pop('n_jobs', -1)

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', UserWarning)
        # estimate alphas via cross-validation
        lars_cv = LassoLarsCV(cv=6, n_jobs=n_jobs).fit(X_train, y_train)
        alphas = np.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0], 6)

        clf = RandomizedLasso(alpha=alphas,
                              random_state=42,
                              n_jobs=n_jobs,
                              n_resampling=n_resampling)
        clf.fit(X_train, y_train)
        importances = clf.scores_
        indices = np.argsort(importances)[::-1]

        pl.bar(range(len(featnames)),
               importances[indices],
               color="r",
               align="center")
        pl.xticks(np.arange(len(featnames)) + 0.5,
                  featnames[indices],
                  rotation=45,
                  horizontalalignment='right')
        pl.xlim(-0.5, len(featnames) - 0.5)
        pl.subplots_adjust(bottom=0.2)

        pl.ylim(0, np.max(importances) * 1.01)
        pl.ylabel('Selection frequency (%) for %d resamplings ' % n_resampling)
        pl.title("Stability Selection: Selection Frequencies")
Example #3
def compute_randomizedlasso(F_train, X_train, config, out_dir, feat_names):
    """
    Compute RandomizedLasso feat selection.

    Do RandomizedLasso to select features over each cluster. Return selected
    features.
    """
    scores = []
    clusters = []
    for i in X_train.clusters.unique():
        selected = X_train[X_train.clusters == i]
        selected_rid = selected[["RID", "DX_bl"]]
        selected_rid = selected_rid[((selected_rid.DX_bl == 'CN') |
                                     (selected_rid.DX_bl == 'AD'))]
        selected_rid = selected_rid.merge(F_train, 'inner', on='RID')
        X = np.array(selected_rid[feat_names])
        Y = np.array(selected_rid["DX_bl"])
        Y = np.array([1.0 if x == 'AD' else -1.0 for x in Y])
        rl = RandomizedLasso(alpha='bic',
                             n_resampling=500,
                             fit_intercept=False,
                             sample_fraction=0.85,
                             scaling=0.1,
                             random_state=1714)
        rl.fit(X, Y)
        scores.append(rl.scores_)
        clusters.append(i)
    return normalize(scores), clusters
Example #4
def run_rndlasso(X,
                 y,
                 alpha,
                 n_resampling=500,
                 sample_fraction=0.1,
                 n_threads=1):
    """  Implement Randomized Lasso in sklearn

    Args:
        X (np.array): scaled X. 
        y (pd.df): four columns response table. 
        alpha (float): parameter trained from lassoCV 
        n_resampling (int): number of times for resampling 
        sample_fraction (float): fraction of data to use at each resampling

    Returns:
        np.array: feature importance scores

    """
    logger.info(
        'Implementing Randomized Lasso with alpha={}, n_resampling={} and sample_fraction={}'
        .format(alpha, n_resampling, sample_fraction))
    # generate logit response
    y_logit = logit((y.nMut + 0.5) / (y.length * y.N))
    reg = RandomizedLasso(alpha=alpha,
                          n_resampling=n_resampling,
                          sample_fraction=sample_fraction,
                          selection_threshold=1e-3,
                          max_iter=3000,
                          normalize=False,
                          n_jobs=n_threads)
    rndlasso = reg.fit(X, y_logit)
    fi_scores = rndlasso.scores_
    return fi_scores
def lasso_fs(X, y):
    rlasso = RandomizedLasso()
    rlasso.fit(X, y)
    classes = range(0, X.shape[1])

    print "Features sorted by their score:"
    print sorted(zip(map(lambda x: round(x, 4), rlasso.scores_), classes), reverse=True)
Example #6
def featureSelection(train_x, train_y):
    # Create the RFE object and compute a cross-validated score.
    svc = LinearSVC(C=1, class_weight='balanced')
    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    lasso = RandomizedLasso()
    lasso.fit(train_x, train_y)
    rfecv = RFECV(estimator=svc, step=1, cv=5, scoring='accuracy')
    rfecv.fit(train_x, train_y)

    print("Optimal number of features : %d" % rfecv.n_features_)
    rankings = rfecv.ranking_
    lasso_ranks = lasso.get_support()
    lassoFeats = []
    recursiveFeats = []
    shouldUseFeats = []

    for i in range(len(rankings)):
        if lasso_ranks[i]:
            lassoFeats.append(feats[i])
        if rankings[i] == 1:
            recursiveFeats.append(feats[i])
            if lasso_ranks[i]:
                shouldUseFeats.append(feats[i])
    keyboard()
    print 'Should use ' + ', '.join(shouldUseFeats)
    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
Example #7
def _stab_select(x_train, y_train):
    '''Perform stability selection.'''
    rlasso = RandomizedLasso(alpha=0.025)
    rlasso.fit(x_train, y_train)

    for vals in reversed(sorted(zip(rlasso.scores_, x_train.columns))):
        print '\t'.join([str(val) for val in vals])
def stability_randomizedlasso(X,y,**rl_parameters):
    """
    Score predictor based on `scikit-learn`_ randomizedlasso stability selection.

    Args:
        X (pandas.DataFrame): Transcriptor factor gene expressions where rows
            are experimental conditions and columns are transcription factors
        y (pandas.Series): Target gene expression vector where rows are
            experimental conditions
        **rl_parameters: Named parameters for sklearn randomizedlasso
    Returns:
        numpy.array: co-regulation scores.

        The i-th element of the score array represents the score assigned by the
        sklearn randomizedlasso stability selection to the regulatory
        relationship between the target gene and transcription factor i.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index=["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = stability_randomizedlasso(tfs,tg)
        >>> scores
        array([0.11 , 0.17 , 0.085])
    """
    regressor = RandomizedLasso(**rl_parameters)
    regressor.fit(X,y)
    scores = np.abs(regressor.scores_)
    return scores
Example #10
def feature_scoring(X, Y):
    names = ["x%s" % i for i in range(1, 37)]
    ranks = {}

    X = X.values[:, :]
    lr = LinearRegression(normalize=True)
    lr.fit(X, Y)
    ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), names)

    ridge = Ridge(alpha=7)
    ridge.fit(X, Y)
    ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)

    lasso = Lasso(alpha=.05)
    lasso.fit(X, Y)
    ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)

    rlasso = RandomizedLasso(alpha=0.04)
    rlasso.fit(X, Y)
    ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), names)

    #stop the search when 5 features are left (they will get equal scores)
    rfe = RFE(lr, n_features_to_select=5)
    rfe.fit(X, Y)
    ranks["RFE"] = rank_to_dict(map(float, rfe.ranking_), names, order=-1)

    rf = RandomForestRegressor()
    rf.fit(X, Y)
    ranks["RF"] = rank_to_dict(rf.feature_importances_, names)

    f, pval = f_regression(X, Y, center=True)
    ranks["Corr."] = rank_to_dict(f, names)

    print('Starting MIC')
    mine = MINE()
    mic_scores = []

    for i in range(X.shape[1]):
        mine.compute_score(X[:, i], Y)
        m = mine.mic()
        mic_scores.append(m)
        print(i)
    ranks["MIC"] = rank_to_dict(mic_scores, names)

    print('Finished MIC')

    r = {}
    for name in names:
        r[name] = round(
            np.mean([ranks[method][name] for method in ranks.keys()]), 2)
    methods = sorted(ranks.keys())
    ranks["Mean"] = r
    methods.append("Mean")

    print("\t%s" % "\t".join(methods))
    for name in names:
        print("%s\t%s" % (name, "\t".join(
            map(str, [ranks[method][name] for method in methods]))))
Example #11
def randomLassoFeatSelect(data, target):
    column_names = list(data.columns.values)

    rlasso = RandomizedLasso(alpha=0.1)
    rlasso.fit(data, target)

    print "Features sorted by their score:"
    print sorted(zip(map(lambda x: round(x, 4), rlasso.scores_), column_names),
                 reverse=True)
Example #12
def run_rlasso(ranks):
    print('>> run rlasso/Stability')
    # Finally let's run our Selection Stability method with Randomized Lasso
    rlasso = RandomizedLasso(alpha=0.04, verbose=3)
    rlasso.fit(X, Y)
    ranks["rlasso/Stability"] = ranking(np.abs(rlasso.scores_), colnames)
    print('finished')
    print_memory()
    return ranks
Example #13
def stability_selection(option, opt, value, parser):
    rlasso = RandomizedLasso()
    rlasso.fit(X, y)

    print "\nStability Selection: Features sorted by rank:"
    pprint(
        sorted(zip(map(lambda x: round(x, 4), rlasso.scores_), feature_names),
               reverse=True))
    print
Example #14
def select_feature_importance():
    data4columns = dataset.drop(['Max overdue'], axis=1)
    column_names = np.asarray(data4columns.columns.values)
    lasso = RandomizedLasso(alpha=0.025)
    scaled_data = scaler.fit_transform(data)
    lasso.fit(scaled_data, target)
    scores = lasso.scores_
    #  column_names
    #  print scores
    print sorted(zip(map(lambda x: round(x, 4), scores), column_names), reverse=True)
def linear_regression_weight(df, label, black_list=[]):
    # Stability selection is a relatively new method that combines subsampling with a selection
    # algorithm (regression, SVM, or any similar estimator). Its main idea is to run the feature
    # selection algorithm repeatedly on different subsets of the data and of the features, and then
    # aggregate the results, e.g. by counting how often a feature is judged important (the number
    # of times it was selected divided by the number of subsets in which it was tested).
    # Ideally, scores of important features will be close to 100%; slightly weaker features get
    # non-zero scores, and the most useless features score close to 0.
    X = df.drop(black_list, axis=1)
    rlasso = RandomizedLasso(alpha=0.025)
    rlasso.fit(X.values, label)
    d = dict(zip(X.columns, rlasso.scores_))
    return d
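
# A hypothetical usage sketch (not part of the original): build a small random DataFrame, score its
# columns with the function above, and print them by selection frequency; values near 1 indicate
# stably selected features. It assumes the RandomizedLasso import used by the function is available.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame(rng.randn(100, 4), columns=['f1', 'f2', 'f3', 'noise'])
label = 2.0 * df['f1'] + 0.1 * rng.randn(100)   # target driven mainly by f1
weights = linear_regression_weight(df, label, black_list=['noise'])
for name, freq in sorted(weights.items(), key=lambda kv: kv[1], reverse=True):
    print(name, round(freq, 4))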
Example #16
    def stability(self, X, y):
        print("Performing stability (rlasso) analysis")

        from sklearn.linear_model import RandomizedLasso
        rlasso = RandomizedLasso(alpha=0.04)
        rlasso.fit(X, y)

        scores = np.absolute(rlasso.scores_) / np.absolute(
            rlasso.scores_).sum()
        ranks = self.rank_to_dict(np.abs(scores), X.columns.values)
        return ranks
Example #17
def feature_selection(Xnew, Y):
    train_cols = Xnew.columns.tolist()
    rlasso = RandomizedLasso(alpha=0.005)
    rlasso.fit(Xnew, Y)
    print("features sorted by their socre:")
    featureRanks = sorted(zip(map(lambda x: round(x, 4), rlasso.scores_),
                              train_cols),
                          reverse=True)
    print(featureRanks)
    selectedFeats = [feat[1] for feat in featureRanks if feat[0] > 0.01]
    return selectedFeats, featureRanks
Example #18
def rlassoSS(data, labels):

    names = data.columns
    rlasso = RandomizedLasso(alpha=0.025)
    rlasso.fit(data, labels)

    print("Features sorted by their score:")
    result = sorted(zip(map(lambda x: round(x, 4), rlasso.scores_), names),
                    reverse=True)
    for i in result:
        print(i)
Example #19
def run_rndlasso(X_train, ybinom_train, alpha,
    n_resampling=200, sample_fraction=0.4):
    ''' Implement RandomizedLasso provided by sklearn
    '''
    logger.info('Implementing Randomized Lasso with alpha={}, n_resampling={} and sample_fraction={}'.format(alpha, n_resampling, sample_fraction))
    # generate logit response
    ylogit_train = logit(ybinom_train[:,0]/ybinom_train.sum(1))
    clf = RandomizedLasso(alpha=alpha, n_resampling=n_resampling,
        sample_fraction=sample_fraction,
        selection_threshold=1e-3, max_iter=3000, normalize=False)
    rndlasso = clf.fit(X_train, ylogit_train)
    return rndlasso
Example #20
def featureRankingMatrix(data, x, y):
    ranks = {}

    colnames = data.columns

    def ranking(ranks, names, order=1):
        minmax = MinMaxScaler()
        ranks = minmax.fit_transform(order * np.array([ranks]).T).T[0]
        ranks = map(lambda x: round(x, 2), ranks)
        return dict(zip(names, ranks))

    rlasso = RandomizedLasso(alpha=0.04)
    rlasso.fit(x, y)
    ranks["rlasso/Stability"] = ranking(np.abs(rlasso.scores_), colnames)
    lr = LinearRegression(normalize=True)
    lr.fit(x, y)
    rfe = RFE(lr, n_features_to_select=1, verbose=3)
    rfe.fit(x, y)
    ranks["RFE"] = ranking(list(map(float, rfe.ranking_)), colnames, order=-1)

    lr = LinearRegression(normalize=True)
    lr.fit(x, y)
    ranks["LinReg"] = ranking(np.abs(lr.coef_), colnames)

    ridge = Ridge(alpha=7)
    ridge.fit(x, y)
    ranks['Ridge'] = ranking(np.abs(ridge.coef_), colnames)

    lasso = Lasso(alpha=.05)
    lasso.fit(x, y)
    ranks["Lasso"] = ranking(np.abs(lasso.coef_), colnames)

    rf = RandomForestRegressor(n_jobs=-1, n_estimators=50, verbose=3)
    rf.fit(x, y)
    ranks["RF"] = ranking(rf.feature_importances_, colnames)

    r = {}
    for name in colnames:
        r[name] = round(
            np.mean([ranks[method][name] for method in ranks.keys()]), 2)
    methods = sorted(ranks.keys())
    ranks["Mean"] = r
    meanplot = pd.DataFrame(list(r.items()),
                            columns=['Feature', 'Mean Ranking'])
    meanplot = meanplot.sort_values('Mean Ranking', ascending=False)
    sns.factorplot(x="Mean Ranking",
                   y="Feature",
                   data=meanplot,
                   kind="bar",
                   size=14,
                   aspect=1.9,
                   palette='coolwarm')
    def Randomlasso(self, data):
        a1 = data
        a1 = a1.dropna()
        Y = a1['price'].values
        X = a1[a1.columns[5:27]].values
        names = list(range(1, 22))

        rlasso = RandomizedLasso(alpha=0.025)
        rlasso.fit(X, Y)

        print("Features sorted by their score:")
        print(sorted(zip(map(lambda x: round(x, 4), rlasso.scores_),
                         names), reverse=True))
Example #22
    def predict_features(self, df_features, df_target, idx=0, **kwargs):
        alpha = kwargs.get("alpha", 'aic')
        scaling = kwargs.get("scaling", 0.5)
        sample_fraction = kwargs.get("sample_fraction", 0.75)
        n_resampling = kwargs.get("n_resampling", 10)

        randomized_lasso = RandomizedLasso(alpha=alpha,
                                           scaling=scaling,
                                           sample_fraction=sample_fraction,
                                           n_resampling=n_resampling)
        randomized_lasso.fit(df_features.values, np.ravel(df_target.values))

        return randomized_lasso.scores_
Example #23
    def auto_add_lasso(self, threshold):
        """ add features based on randomized lasso """
        logging.info("[DataSelector] Starting randomized lasso...")
        names = self.train_x.columns.tolist()
        rlasso = RandomizedLasso(alpha=0.005)
        rlasso.fit(self.train_x, self.train_y)
        result = sorted(zip(map(lambda x: round(x, 4), rlasso.scores_), names),
                        reverse=True)
        for i, j in result:
            r = re.compile('([a-zA-Z]+)([0-9]+)')
            feature = r.match(j).groups()[0]
            idx = r.match(j).groups()[1]
            print("Feature: {}, idx: {}".format(feature, idx))
            if i >= threshold:
                self.add(feature, idx)
Example #24
def get_feature_selection_model_from_name(type_of_estimator, model_name):
    # TODO(PRESTON): eventually let threshold be user-configurable (or grid_searchable)
    # TODO(PRESTON): optimize the params used here
    model_map = {
        'classifier': {
            'SelectFromModel':
            SelectFromModel(RandomForestClassifier(n_jobs=-1)),
            'RFECV': RFECV(estimator=RandomForestClassifier(n_jobs=-1),
                           step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'RandomizedSparse': RandomizedLogisticRegression(),
            'KeepAll': 'KeepAll'
        },
        'regressor': {
            'SelectFromModel':
            SelectFromModel(RandomForestRegressor(n_jobs=-1)),
            'RFECV': RFECV(estimator=RandomForestRegressor(n_jobs=-1),
                           step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'RandomizedSparse': RandomizedLasso(),
            'KeepAll': 'KeepAll'
        }
    }

    return model_map[type_of_estimator][model_name]
Example #25
def feature_selection(path, target):
    X = load_df(path)
    y = X[target]
    X = X.drop(target, axis=1)

    model = Pipeline([("imputer",
                       Imputer(missing_values='NaN', strategy="mean", axis=1)),
                      ('feature', RandomizedLasso()),
                      ("model", LinearRegression())])

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=0)
    model.fit(X_train, y_train)
    R2 = model.score(X_test, y_test)
    ypred = model.predict(X_test)
    mse = mean_squared_error(y_test, ypred)
    print "R^2 (Linear Regression + feature selection): ", R2
    print "mse (Linear Regression + feature selection): ", mse

    features = model.named_steps['feature']

    selected_features = X.columns[features.transform(np.arange(len(
        X.columns)))].values.tolist()[0]

    return selected_features
def get_feature_selection_model_from_name(type_of_estimator, model_name):
    model_map = {
        'classifier': {
            'SelectFromModel':
            SelectFromModel(RandomForestClassifier(n_jobs=-1,
                                                   max_depth=10,
                                                   n_estimators=15),
                            threshold='20*mean'),
            'RFECV':
            RFECV(estimator=RandomForestClassifier(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect':
            GenericUnivariateSelect(),
            'RandomizedSparse':
            RandomizedLogisticRegression(),
            'KeepAll':
            'KeepAll'
        },
        'regressor': {
            'SelectFromModel':
            SelectFromModel(RandomForestRegressor(n_jobs=-1,
                                                  max_depth=10,
                                                  n_estimators=15),
                            threshold='0.7*mean'),
            'RFECV':
            RFECV(estimator=RandomForestRegressor(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect':
            GenericUnivariateSelect(),
            'RandomizedSparse':
            RandomizedLasso(),
            'KeepAll':
            'KeepAll'
        }
    }

    return model_map[type_of_estimator][model_name]
Example #27
def feature_selection_regression(predictors, responses, test_predictors,
                                 selectfeattech):
    if selectfeattech == 0:
        chk = int(predictors.shape[1] * 0.40)
        # the number of features to select is fixed at k=10 for now.
        model = SelectKBest(f_regression, k=10)
        model = model.fit(predictors, responses)
        predictors_new = model.transform(predictors)
        predictors_test_new = model.transform(test_predictors)
        indices = model.get_support(indices=True)
        print "SelectKBest -> " + str(len(indices))

    if selectfeattech == 1:
        model = RandomizedLasso(alpha='aic',
                                scaling=0.3,
                                sample_fraction=0.60,
                                n_resampling=200,
                                selection_threshold=0.15)
        model = model.fit(predictors, responses)
        predictors_new = model.transform(predictors)
        predictors_test_new = model.transform(test_predictors)
        indices = model.get_support(indices=True)
        print "Randomized Lasso -> " + str(len(indices))

    column_names = predictors.columns[indices]
    predictors_new = pd.DataFrame(predictors_new,
                                  index=predictors.index,
                                  columns=column_names)
    predictors_test_new = pd.DataFrame(predictors_test_new,
                                       index=test_predictors.index,
                                       columns=column_names)
    return predictors_new, predictors_test_new
Example #28
def run_lasso_on_input(df, target):
   
	X_part, y_part, _ = sample_data_frame_return_x_y_column_name(df, True, target, int(0.7*df.shape[0]))

	X_part, _ = scale_input_data(X_part)

	print "#######################################"
	print "Starting LARS CV"
	print "#######################################"

	lars_cv = LassoLarsCV(cv=10).fit(X_part, y_part)

	print "#######################################"
	print "Done with LARS CV"
	print "#######################################"

	#alphas = np.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0], 6)
	
	X, y, column_list_for_sampled = sample_data_frame_return_x_y_column_name(df, True, target, df.shape[0])

	X, _ = scale_input_data(X)

	print "#######################################"
	print "Starting main lasso"
	print "#######################################"

	clf = RandomizedLasso(alpha= lars_cv.alphas_, random_state=12, n_resampling= 400, normalize=True).fit(X, y) 

	print "#######################################"
	print "Done with main lasso"
	print "#######################################"

	return clf, column_list_for_sampled
Example #29
def stability(frame):
    data_y = frame['target']
    data_x = frame.drop('target', axis=1)
    selection = RandomizedLasso(alpha=0.0011,
                                scaling=0.8,
                                sample_fraction=0.6,
                                max_iter=100000).fit_transform(data_x, data_y)
    print(selection.shape)
def feature_extraction_RandomLasso(flag=True):
    from sklearn.linear_model import RandomizedLasso
    if flag == True:
        X_train = pd.read_csv('feature_001.csv')
        X_train.drop('id',axis = 1,inplace = True)
        X_train = parse_nan(X_train)
        y_train = pd.read_csv('target.csv')
        print(type(X_train))
        for i in X_train.columns:
            X_train[i] = X_train[i].astype('float16')
        print(X_train.info(memory_usage = 'deep'))
        print(y_train.info(memory_usage = 'deep'))
        print("稳定性选择法提取特征开始...")
        #print(X_train.isnull().sum().sort_values(ascending=False).head())
        NUM = 20
        randomLasso = RandomizedLasso()
        randomLasso.fit(X_train, y_train)
        features = randomLasso.scores_
        score = X_train.columns
        print(features)
        print(sorted(zip(map(lambda x:round(x,4),features),score),reverse = True))
        featureList = sorted(zip(map(lambda x:round(x,4),features),score),reverse = True)
        featureList = [i[1] for i in featureList][:NUM]
        X_train = X_train[featureList]
        print(X_train.shape)
        if X_train.shape[1]!= NUM:
            raise NotImplementedError("稳定性选择法提取特征处理失败")
        print("稳定性选择法提取特征结束...")
        X_train.to_csv('feature_tree_end.csv')
    else:
        X_train = pd.read_csv('feature_linear_end.csv')
        y_train = pd.read_csv('target.csv')
        X_train.drop('id',axis = 1,inplace = True)
        X_train = parse_nan(X_train)
        print("稳定性选择法提取特征开始...")
        print(X_train.isnull().sum().sort_values(ascending=False).head())
        NUM = 30
        randomLasso = RandomizedLasso()
        randomLasso.fit(X_train, y_train)
        features = randomLasso.scores_
        score = X_train.columns
        print(features)
        print(sorted(zip(map(lambda x:round(x,4),features),score),reverse = True))
        featureList = sorted(zip(map(lambda x:round(x,4),features),score),reverse = True)
        featureList = [i[1] for i in featureList][:NUM]
        X_train = X_train[featureList]
        print(X_train.shape)
        if X_train.shape[1]!= NUM:
            raise NotImplementedError("稳定性选择法提取特征处理失败")
        print("稳定性选择法提取特征结束...")
        X_train.to_csv('feature_linear_best.csv')
    return X_train
Example #31
def rank_features(algorithm, X, y):
    # The RFE approach can be used with various different classifiers
    if algorithm == 'random_forest_rfe':
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.feature_selection import RFE
        estimator = RandomForestClassifier(n_estimators=50,
                                           random_state=R_SEED,
                                           n_jobs=1)
        selector = RFE(estimator, 5, step=0.1)
        selector.fit(X, y)

        for x in sorted(
                zip(map(lambda x: round(x, 4), selector.ranking_), features)):
            print x[1]
    elif algorithm == 'svm_rfe':
        from sklearn.svm import SVC
        from sklearn.feature_selection import RFE
        estimator = SVC(random_state=R_SEED, kernel='linear')
        selector = RFE(estimator, 5, step=0.1)
        selector.fit(X, y)

        for x in sorted(
                zip(map(lambda x: round(x, 4), selector.ranking_), features)):
            print x[1]
    elif algorithm == 'random_logistic_regression':
        # See http://blog.datadive.net/selecting-good-features-part-iv-stability-selection-rfe-and-everything-side-by-side/
        from sklearn.linear_model import RandomizedLogisticRegression
        rlasso = RandomizedLogisticRegression(random_state=R_SEED)
        rlasso.fit(X, y)

        for x in sorted(zip(map(lambda x: round(x, 4), rlasso.scores_),
                            features),
                        reverse=True):
            print x[1]
    elif algorithm == 'random_lasso':
        from sklearn.linear_model import RandomizedLasso
        rlasso = RandomizedLasso(random_state=R_SEED)
        #rlasso = RandomizedLasso(alpha=0.025, random_state=R_SEED)
        rlasso.fit(X, y)

        for x in sorted(zip(map(lambda x: round(x, 4), rlasso.scores_),
                            features),
                        reverse=True):
            print x[1]
    elif algorithm == 'anova':
        from sklearn.feature_selection import f_classif
        F, pval = f_classif(X, y)
        random_array = random.random(len(pval))
        order = lexsort((random_array, pval))  # will break ties by random
        for i in order:
            print features[i]
    else:
        print "Invalid algorithm: %s" % algorithm
        exit(1)
Example #32
def featureSelect(select_fun,
                  train_data,
                  train_label,
                  threshold_num=0,
                  alpha=0.000015):
    X = train_data  # train data
    Y = train_label  # train label
    feture_names = list(train_data.columns)  # names of the existing features
    importance_features_list = []

    if select_fun == 'MeanDecreaseImpurity':
        '''Mean decrease impurity'''
        rf = RandomForestRegressor(random_state=2019)
        rf.fit(X, Y)
        feature_score = sorted(
            zip(feture_names,
                map(lambda x: round(x, 4), rf.feature_importances_)))
    elif select_fun == 'StabilitySelection':
        '''Stability selection'''
        rlasso = RandomizedLasso(alpha,
                                 random_state=2019)  # too large an alpha drives all feature scores to 0; 1 works best
        rlasso.fit(X, Y)
        feature_score = sorted(
            zip(feture_names, map(lambda x: round(x, 4), rlasso.scores_)))
    else:
        importance_features_list = [
            'MeanDecreaseImpurity', 'StabilitySelection',
            'RecursiveFeatureElimination', 'MeanDecreaseAccuracy'
        ]
        print("可选挑选特征的方法名:", importance_features_list)
        return importance_features_list

    for item in feature_score:
        if item[1] > threshold_num:
            importance_features_list.append(item[0])
        else:
            continue

    return importance_features_list
Example #33
def lasso():
    columns = [
        col for col in data.columns if col not in [
            'id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate',
            'question1_tk', 'question2_tk'
        ]
    ]
    columns = [col for col in columns if col not in FEATURES_CORR]
    X = data[columns]
    X.fillna(0, inplace=True)
    Y = data.is_duplicate
    rlasso = RandomizedLasso(alpha=0.025)
    rlasso.fit(X, Y)
    print(
        sorted(zip(map(lambda x: round(x, 4), rlasso.scores_), columns),
               reverse=True))

    svm = LinearSVC(C=0.75)
    svm.fit(X, Y)
    print(
        sorted(zip(map(lambda x: abs(round(x, 4)), svm.coef_[0]), columns),
               reverse=True))
Example #34
def stable_select(df, y, rd_reg_columns, threshold=0.2, model='rlr'):
    X = df.loc[:, rd_reg_columns]
    Y = df[y]
    if model == 'rlr':
        rlr = RLR(scaling=0.5, sample_fraction=0.75, n_resampling=300, selection_threshold=threshold)  # randomized logistic regression
        rlr.fit(X, Y)
        scores = rlr.scores_
    elif model == 'rls':
        rls = RLS(scaling=0.5, sample_fraction=0.75, n_resampling=300, selection_threshold=threshold)  # randomized Lasso regression
        rls.fit(X, Y)
        scores = rls.scores_
    elif model == 'rfr':
        rf = RFR()
        rf.fit(X, Y)
        scores = rf.feature_importances_
    else:
        pass
    result = pd.Series(dict(zip(X.columns, scores))).rename('score').sort_values(ascending=False)
    plt.figure(figsize=(20, 10))
    result.plot.barh(title='Feature Importances', color='lightblue')
    plt.ylabel('Feature Importance Score')
    return result
def lass_varselect(train, num_vars, target, alpha):   
    lass = RandomizedLasso(alpha=alpha, n_resampling=5)
    lass.fit(train[num_vars], train[target])
    return lass.get_support()
def feature_selection(df,dfo,target_column,id_column):
  """
  df = The training dataframe
  dfo = The test dataframe
  target_column = The column containing the target variable
  id_column = The column containing the id variable
  
  Based on the output column type (binary or numeric), it decides on the type of problem we are trying to solve.
  If the output column is binary (0/1), we use Genetic Algorithms for feature selection.
  If the 
  """
    print("IDENTIFYING TYPES...")
    in_model = []
    list_ib = set()  #input binary
    list_icn = set() #input categorical nominal
    list_ico = set() #input categorical ordinal
    list_if = set()  #input numerical continuous (input float)
    list_inputs = set()
    output_var = target_column



    for var_name in df.columns:
        if re.search('^ib_',var_name):
            list_inputs.add(var_name)      
            list_ib.add(var_name)
            print (var_name,"is input binary")
        elif re.search('^icn_',var_name):
            list_inputs.add(var_name)      
            list_icn.add(var_name)
            print (var_name,"is input categorical nominal")
        elif re.search('^ico_',var_name):
            list_inputs.add(var_name)      
            list_ico.add(var_name)
            print (var_name,"is input categorical ordinal")
        elif re.search('^if_',var_name):
            #list_inputs.add(var_name)      
            list_if.add(var_name)
            print (var_name,"is input numerical continuos (input float)")
        elif re.search('^ob_',var_name):
            output_var = var_name
        else:
            print ("ERROR: unable to identify the type of:", var_name)
            
            
    if (df[output_var].isin([0,1]).all()):
        method_type = 'categorical'
    else:
        method_type = 'numerical'
        
    print(method_type)

    if method_type == "categorical":
        methods = ["SVM","Decision Trees","KNNs","Logistic Regression","Naive Bayes"]
    elif method_type == "numerical":
        methods = ["SVM","Ridge","Lasso"]


    if method_type == "categorical":
        print ("GENETIC ALGORITHM FOR FEATURE SELECTION (CLASSIFICATION):")

        #####
        #SETTING UP THE GENETIC ALGORITHM and CALCULATING STARTING POOL (STARTING CANDIDATE POPULATION)
        #####
        creator.create("FitnessMax", base.Fitness, weights=(1.0,))
        creator.create("Individual", list, fitness=creator.FitnessMax)
        toolbox = base.Toolbox()
        toolbox.register("attr_bool", random.randint, 0, 1)
        toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=len(list_inputs))
        toolbox.register("population", tools.initRepeat, list, toolbox.individual)
        def evalOneMax(individual):
            return sum(individual),

        toolbox.register("evaluate", evalOneMax)
        toolbox.register("mate", tools.cxTwoPoint)
        toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
        toolbox.register("select", tools.selTournament, tournsize=3)

        NPOPSIZE = 50 #RANDOM STARTING POOL SIZE
        population = toolbox.population(n=NPOPSIZE)


    #####
    #ASSESSING GINI ON THE STARTING POOL
    #####
    dic_gini={}
    for i in range(np.shape(population)[0]): 

        # TRANSLATING DNA INTO LIST OF VARIABLES (1-81)
        var_model = []    
        for j in range(np.shape(population)[0]): 
            if (population[i])[j]==1:
                var_model.append(list(list_inputs)[j])

        # ASSESSING GINI INDEX FOR EACH INDIVIDUAL IN THE INITIAL POOL
                
        X_train=df[var_model]
        Y_train=df[output_var]

        ######
        # CHANGE_HERE - START: YOU ARE VERY LIKELY USING A DIFFERENT TECHNIQUE BY NOW. SO CHANGE TO YOURS.
        #####     
        if "Logistic Regression" in methods:
            lr = sm.Logit(Y_train, X_train)
            model=lr.fit()   
            Y_predict=model.predict(X_train)
        ######
        # CHANGE_HERE - END: YOU ARE VERY LIKELY USING A DIFFERENT TECHNIQUE BY NOW. SO CHANGE TO YOURS.
        #####             


        ######
        # CHANGE_HERE - START: HERE IT USES THE DEVELOPMENT GINI TO SELECT VARIABLES; YOU SHOULD USE A DIFFERENT GINI, EITHER THE OOT GINI OR SQRT(DEV_GINI*OOT_GINI)
        #####                
            fpr, tpr, thresholds = metrics.roc_curve(Y_train, Y_predict)
            auc = metrics.auc(fpr, tpr)
            gini_power = abs(2*auc-1)
        ######
        # CHANGE_HERE - END: HERE IT USES THE DEVELOPMENT GINI TO SELECT VARIABLES; YOU SHOULD USE A DIFFERENT GINI, EITHER THE OOT GINI OR SQRT(DEV_GINI*OOT_GINI)
        #####                
        
            gini=str(gini_power)+";"+str(population[j]).replace('[','').replace(', ','').replace(']','')
            dic_gini[gini]=population[j]   
        list_gini=sorted(dic_gini.keys(),reverse=True)


    ####
    # ASSESSING RMSE ON THE STARTING POOL
    ####
    if method_type == "numerical":
        X_train=df[var_model]
        Y_train=df[output_var]
        
        names = list(X_train)
        ranks = {}
        
        lr = LinearRegression(normalize=True)
        lr.fit(X_train, Y_train)
        ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), names)

        ridge = Ridge(alpha=7)
        ridge.fit(X_train, Y_train)
        ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)


        lasso = Lasso(alpha=.05)
        lasso.fit(X_train, Y_train)
        ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)


        rlasso = RandomizedLasso(alpha=0.04)
        rlasso.fit(X_train, Y_train)
        ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), names)  
        
        rf = RandomForestRegressor()
        rf.fit(X_train,Y_train)
        ranks["RF"] = rank_to_dict(rf.feature_importances_, names)
        
        f, pval  = f_regression(X_train, Y_train, center=True)
        ranks["Corr."] = rank_to_dict(f, names)

        r = {}
        for name in names:
            r[name] = round(np.mean([ranks[method][name] for method in ranks.keys()]), 2)
            
        

        methods = sorted(ranks.keys())
        ranks["Mean"] = r
        methods.append("Mean")
        print(ranks["Mean"])
        
        print("\t\t%s" % "\t".join(methods))
        for name in names:
            print ("%s\t%s" % (name, "\t".join(map(str, 
                [ranks[method][name] for method in methods]))))
        
        ranks_f = pd.DataFrame(ranks)
        ranks_f.sort_values("RF",0,0,inplace = True)
        
        print(ranks_f)
        
        featureset = ranks_f.index.values[0:5]
        
        print(featureset)

    if method_type == "categorical":   
        #GENETIC ALGORITHM MAIN LOOP - START
        # - ITERATING MANY TIMES UNTIL NO IMPROVEMENT HAPPENS IN ORDER TO FIND THE OPTIMAL SET OF CHARACTERISTICS (VARIABLES)
        #####
        sum_current_gini=0.0
        sum_current_gini_1=0.0
        sum_current_gini_2=0.0
        first=0    
        OK = 1
        a=0
        while OK:  # REPEAT UNTIL THE GINI DOES NOT IMPROVE, AT LEAST A LITTLE, IN 2 GENERATIONS
            a=a+1
            print('loop ', a)
            OK=0

            ####
            # GENERATING OFFSPRING - START
            ####
            offspring = algorithms.varAnd(population, toolbox, cxpb=0.5, mutpb=0.1) #CROSS-X PROBABILITY = 50%, MUTATION PROBABILITY=10%
            fits = toolbox.map(toolbox.evaluate, offspring)
            for fit, ind in zip(fits, offspring):
                ind.fitness.values = fit
            population =toolbox.select(offspring, k=len(population))
            ####
            # GENERATING OFFSPRING - END
            ####

            sum_current_gini_2=sum_current_gini_1
            sum_current_gini_1=sum_current_gini
            sum_current_gini=0.0

            #####
            #ASSESSING GINI ON THE OFFSPRING - START
            #####
            for j in range(np.shape(population)[0]): 
                if population[j] not in dic_gini.values(): 
                    var_model = [] 
                    for i in range(np.shape(population)[0]): 
                        if (population[j])[i]==1:
                            var_model.append(list(list_inputs)[i])

                    X_train=df[var_model]
                    Y_train=df[output_var]

                    ######
                    # CHANGE_HERE - START: YOU ARE VERY LIKELY USING A DIFFERENT TECHNIQUE BY NOW. SO CHANGE TO YOURS.
                    #####            
                    lr = sm.Logit(Y_train, X_train)
                    model=lr.fit()
                    Y_predict=model.predict(X_train)
                    ######
                    # CHANGE_HERE - END: YOU ARE VERY LIKELY USING A DIFFERENT TECHNIQUE BY NOW. SO CHANGE TO YOURS.
                    #####            


                    ######
                    # CHANGE_HERE - START: HERE IT USES THE DEVELOPMENT GINI TO SELECT VARIABLES; YOU SHOULD USE A DIFFERENT GINI, EITHER THE OOT GINI OR SQRT(DEV_GINI*OOT_GINI)
                    #####                       
                    fpr, tpr, thresholds = metrics.roc_curve(Y_train, Y_predict)
                    auc = metrics.auc(fpr, tpr)
                    gini_power = abs(2*auc-1)
                    ######
                    # CHANGE_HERE - END: HERE IT USES THE DEVELOPMENT GINI TO SELECT VARIABLES; YOU SHOULD USE A DIFFERENT GINI, EITHER THE OOT GINI OR SQRT(DEV_GINI*OOT_GINI)
                    #####                       

                    gini=str(gini_power)+";"+str(population[j]).replace('[','').replace(', ','').replace(']','')
                    dic_gini[gini]=population[j]  
            #####
            #ASSESSING GINI ON THE OFFSPRING - END
            #####

            #####
            #SELECTING THE BEST FITTED AMONG ALL EVER CREATED POPULATION AND CURRENT OFFSPRING - START
            #####           
            list_gini=sorted(dic_gini.keys(),reverse=True)
            population=[]
            for i in list_gini[:NPOPSIZE]:
                population.append(dic_gini[i])
                gini=float(i.split(';')[0])
                sum_current_gini+=gini
            #####
            #SELECTING THE BEST FITTED AMONG ALL EVER CREATED POPULATION AND CURRENT OFFSPRING - END
            #####           

            #HAS THE GINI IMPROVED AT LEAST A LITTLE IN THE LAST 2 GENERATIONS?
            print ('sum_current_gini=', sum_current_gini, 'sum_current_gini_1=', sum_current_gini_1, 'sum_current_gini_2=', sum_current_gini_2)
            if(sum_current_gini>sum_current_gini_1+0.0001 or sum_current_gini>sum_current_gini_2+0.0001):
                OK=1
        #####
        #GENETIC ALGORITHM MAIN LOOP - END
        #####

    if method_type == "categorical":
        
        gini_max=list_gini[0]        
        gini=float(gini_max.split(';')[0])
        features=gini_max.split(';')[1]


        ####
        # PRINTING OUT THE LIST OF FEATURES
        #####
        f=0
        for i in range(len(features)):
            if features[i]=='1':
                f+=1
                print('feature ', f, ':', list(list_inputs)[i])
        print ('gini: ', gini)
        
        featureset = features

    return featureset
                       u'ACID', u'WARM', u'MUSKY', u'SWEATY', u'AMMONIA/URINOUS', u'DECAYED', u'WOOD',
                       u'GRASS', u'FLOWER', u'CHEMICAL']):
    attribute[idx] = attr


# In[ ]:

# select  the best features with true values and save them
features = pd.read_csv('all_features.csv',index_col=0).sort()
target = pd.read_csv('targets_for_feature_selection.csv',index_col=0).sort()#replace this with targets_for_feature_selection_LB_incl.csv if LB data is included
for i in range(21):
    print(attribute[i])
    sys.stdout.flush()
    
    
    Y = target[attribute[i]].dropna()
    X = features.loc[Y.index]
    selector = RandomizedLasso(alpha=0.025,selection_threshold=0.025,n_resampling=200,
                               random_state=25).fit(X,Y)
    selected = pd.DataFrame(selector.transform(features))
    selected.index = features.index
    print('shape ', selected.shape)
    
    selected.to_csv('...path to features folder/selected_features/features_'+str(i)+'.csv')


# In[ ]:



# 4. Two "top-level" feature selection approaches

# 4.1 Stability selection: scores fall in [0, 1]
# Its main idea is to run the feature selection algorithm repeatedly on different subsets of the
# data and of the features, and then aggregate the results, e.g. by counting how often a feature
# is judged important (the number of times it was selected divided by the number of subsets in
# which it was tested).

from sklearn.linear_model import RandomizedLasso  # randomized Lasso
from sklearn.datasets import load_boston
boston = load_boston()
# using the Boston housing data.
# Data gets scaled automatically by sklearn's implementation
X = boston["data"]
Y = boston["target"]
names = boston["feature_names"]
rlasso = RandomizedLasso(alpha=0.025)  # alpha can also be chosen automatically
rlasso.fit(X, Y)
print "Features sorted by their score:"      # scores are in rlasso.scores_
print sorted(zip(map(lambda x: round(x, 4), rlasso.scores_), names), reverse=True)

# Conclusion: good features do not get a score of 0 merely because similar or correlated features
# exist, which is different from plain Lasso. For feature selection tasks, stability selection is
# often among the best-performing methods across many datasets and settings.


# 4.2 Recursive feature elimination (RFE): a greedy algorithm for finding the best feature subset.
# It repeatedly builds a model (such as an SVM or a regression model), picks out the best (or worst)
# feature (e.g. according to the coefficients), sets that feature aside, and then repeats the
# process on the remaining features until all features have been visited. The order in which
# features are eliminated gives the feature ranking.

from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression, Ridge
boston = load_boston()
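
# The original snippet is truncated here. A minimal sketch (not part of the source example) of how
# the RFE ranking step might continue, assuming a LinearRegression estimator on the same data:
lr = LinearRegression()
rfe = RFE(lr, n_features_to_select=1)
rfe.fit(boston["data"], boston["target"])
# ranking_ is 1 for the best-ranked feature; larger values were eliminated earlier
print(sorted(zip(rfe.ranking_, boston["feature_names"])))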
Example #39
def run(args):
    X_train = np.nan_to_num(
        np.genfromtxt(args.training_data, delimiter=args.delimiter))
    y_train = np.clip(np.genfromtxt(args.training_labels), 0, 1)

    X_trains = X_train
    if args.scale:
        print "Scaling features (mean removal divided by std)..."
        scaler = StandardScaler().fit(X_train)
        X_trains = scaler.transform(X_train)

    # create output folders
    outF = args.output_folder + "/" + os.path.basename(
        args.training_data) + "--FS_" + str(
        args.select_features) + "--i_" + str(args.iterations)
    buildDir(outF)
    maskF = outF + "/masks/"
    buildDir(maskF)
    #evaluation  features  first_experiments  labels  logs  masks  parameters
    #  predictions  src  suca
    paramF = outF + "/parameters/"
    buildDir(paramF)
    #featF = outF+"/features/"
    #buildDir(featF)    

    #evalF = buildDir(outF+"/evaluation")



    #os.path.basename(
    #        args.training_data)]) + featsel_str + "--" + os.path.basename(
    # test_label



    # initializes numpy random seed
    np.random.seed(args.seed)

    # performs feature selection
    featsel_str = ".all-feats"
    if args.select_features:
        print "Performing feature selection ..."
        # initializes selection estimator
        sel_est = RandomizedLasso(alpha="bic", verbose=True, max_iter=1000,
                                  n_jobs=8, random_state=args.seed,
                                  n_resampling=1000)

        sel_est.fit(X_trains, y_train)
        X_trains = sel_est.transform(X_trains)

        selected_mask = sel_est.get_support()
        selected_features = sel_est.get_support(indices=True)

        sel_feats_path = os.sep.join(
            #    [".", "masks", os.path.basename(args.training_data)])
            [maskF, os.path.basename(args.training_data)])

        # saves indices
        np.savetxt(sel_feats_path + ".idx", selected_features, fmt="%d")
        # saves mask
        np.save(sel_feats_path + ".mask", selected_mask)
        featsel_str = ".randcv"

    estimator = ExtraTreesRegressor(random_state=args.seed, n_jobs=1)

    mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
    #rmse_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

    # performs parameter optimization using random search
    print "Performing parameter optimization ... "


    param_distributions = \
        {"n_estimators": [5, 10, 50, 100, 200, 500],
         "max_depth": [3, 2, 1, None],
         "max_features": ["auto", "sqrt", "log2", int(X_trains.shape[1]/2.0)],
         "min_samples_split": sp_randint(1, 11),
         "min_samples_leaf": sp_randint(1, 11),
         "bootstrap": [True, False]}
         # "criterion": ["gini", "entropy"]}

    search = RandomizedSearchCV(estimator, param_distributions,
                                n_iter=args.iterations,
                                scoring=mae_scorer, n_jobs=8, refit=True,
                                cv=KFold(X_train.shape[0], args.folds, shuffle=True,
                                         random_state=args.seed), verbose=1,
                                random_state=args.seed)

    # fits model using best parameters found
    search.fit(X_trains, y_train)

    # ................SHAHAB ........................ 
    
    models_dir = sorted(glob.glob(args.models_dir + os.sep + "*"))
    
    estimator2 = ExtraTreesRegressor(bootstrap=search.best_params_["bootstrap"], 
                                     max_depth=search.best_params_["max_depth"], 
                                     max_features=search.best_params_["max_features"],
                                     min_samples_leaf=search.best_params_["min_samples_leaf"], 
                                     min_samples_split=search.best_params_["min_samples_split"], 
                                     n_estimators=search.best_params_["n_estimators"], 
                                     verbose=1, 
                                     random_state=42, 
                                     n_jobs=8)
   
    estimator2.fit(X_trains,y_train)
    from sklearn.externals import joblib
    print "koooonnn %s" % args.models_dir
    joblib.dump(estimator2, args.models_dir+"/XRT.pkl")
    joblib.dump(scaler, args.models_dir+"/scaler.pkl")
    joblib.dump(sel_est, args.models_dir+"/sel_est.pkl")
    
#    print "Kioonnn number of feat:\n", n_feature
    # ................SHAHAB ........................

    print "Best parameters: ", search.best_params_

    # saves parameters on yaml file
    #param_path = os.sep.join([".", "parameters", os.path.basename(
    param_path = os.sep.join([paramF, os.path.basename(
        args.training_data)]) + featsel_str + ".params.yaml"
    param_file = codecs.open(param_path, "w", "utf-8")
    yaml.dump(search.best_params_, stream=param_file)
    testF = os.sep.join([outF, "/test/"])
    buildDir(testF)

    m = y_train.mean()

    # evaluates model on the different test sets
    test_features = sorted(glob.glob(args.test_data + os.sep + "*"))
    test_labels = sorted(glob.glob(args.test_labels + os.sep + "*"))
    for test_feature, test_label in zip(test_features, test_labels):
        print "Evaluating on %s" % test_label
    	X_test = np.nan_to_num(
        	np.genfromtxt(test_feature, delimiter=args.delimiter))
    	y_test = np.clip(np.genfromtxt(test_label), 0, 1)

    	X_tests = X_test
    	if args.scale:
        	X_tests = scaler.transform(X_test)

    	if args.select_features:
        	X_tests = sel_est.transform(X_tests)

    	# gets predictions on test set
    	#y_pred = search.predict(X_tests)
    	y_pred = np.clip(search.predict(X_tests), 0, 1)

    	# evaluates on test set
    	mae = mean_absolute_error(y_test, y_pred)
    	rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    	print "Test MAE = %2.8f" % mae
    	print "Test RMSE = %2.8f" % rmse
    	print "Prediction range: [%2.4f, %2.4f]" % (y_pred.min(), y_pred.max())
    	# saves evaluation
    	testFX = testF + "/" + os.path.basename(test_label)
    	buildDir(testFX)
    	buildDir(testFX + "/evaluation/")

    	eval_path = os.sep.join([testFX, "evaluation", os.path.basename(
        	args.training_data)]) + featsel_str + "--" + os.path.basename(
        	test_label)
    	mae_eval = codecs.open(eval_path + ".mae", 'w', "utf-8")
    	mae_eval.write(str(mae) + "\n")
    	rmse_eval = codecs.open(eval_path + ".rmse", 'w', "utf-8")
    	rmse_eval.write(str(rmse) + "\n")

    	mu = m * np.ones(y_test.shape[0])  # baseline on test set
    	maeB = mean_absolute_error(y_test, mu)
    	rmseB = np.sqrt(mean_squared_error(y_test, mu))
    	print "Test MAE Baseline= %2.8f" % maeB
    	print "Test RMSE Baseline= %2.8f" % rmseB
    	mae_eval = codecs.open(eval_path + ".mae.Base", 'w', "utf-8")
    	mae_eval.write(str(maeB) + "\n")
    	rmse_eval = codecs.open(eval_path + ".rmse.Base", 'w', "utf-8")
    	rmse_eval.write(str(rmseB) + "\n")



	# saves predictions
	buildDir(testFX + "/predictions/")
	preds_path = os.sep.join([testFX, "predictions", os.path.basename(
        	args.training_data)]) + featsel_str + "--" + os.path.basename(
        	test_label) + ".preds"
	np.savetxt(preds_path, y_pred, fmt="%2.15f")
Example #40
    def fit(self, X, y):
        """
        Variable Selection and Prediction.

        Variable Selection Model: lasso
        Prediction Models: see self.predict()

        Parameters
        ----------
        X : numpy array or sparse matrix of shape [n_samples,n_features]
            Training data
        y : numpy array of shape [n_samples, n_targets]
            Target values

        Returns
        -------
        self : returns an instance of self.
        """


        ##################################
        ## OLS Train
        ##################################
        #ols_train = linear_model.LinearRegression(fit_intercept=True,
        #                                         normalize=False,
        #                                          copy_X=True)
        #ols_train.fit(X, y)
        #self.rss_ols_train = np.sum((ols_train.predict(X) - y) ** 2)
        """
        fit_intercept=True, center the data
        copy=True, because centering data involves X -= X_mean

        CAUTION:
        normalize=False, otherwise it involves taking squares of X and loses precision

        self.rss_ols_train.shape = (1,1)
        """

        ##################################
        ## Pre Variable Selection Predictions
        ##################################
        self.pre_pred = False
        if self.pre_pred:
            print "Computing ... "
            param_ridge_pre = list(np.arange(1e9,2e9,1e8))
            self.pls_pre, self.ridge_pre = \
                self.run_models(X, y, param_ridge_pre)

        ##################################
        ## Lasso Variable Selection
        ##################################
        self.lasso_cv = LassoLarsCV(fit_intercept=True, normalize=True, precompute='auto',
                            max_iter=X.shape[1]+1000, max_n_alphas=X.shape[1]+1000,
                            eps= 2.2204460492503131e-16,copy_X=True,
                            cv=self.cv, n_jobs=self.n_jobs)
        self.lasso_cv.fit(X, y)
        """
        normalize=True, lasso seems to be able to handle itself
        """

        if self.rlasso_selection_threshold == 0:
            self.lasso_refit = linear_model.LassoLars(alpha=self.lasso_cv.alpha_,
                                fit_intercept=True, normalize=True, precompute='auto',
                                max_iter=X.shape[1]+1000,
                                eps=2.2204460492503131e-16, copy_X=True,
                                fit_path=False)
            self.lasso_refit.fit(X, y)
            self.active = self.lasso_refit.coef_ != 0
            self.active = self.active[0,:]
            X_selected = X[:, self.active]
        else:
            self.rlasso = RandomizedLasso(alpha=self.lasso_cv.alpha_, scaling=0.5,
                                          sample_fraction=0.75, n_resampling=200,
                                          selection_threshold=self.rlasso_selection_threshold, fit_intercept=True,
                                          verbose=False, normalize=True, precompute='auto',
                                          max_iter=500, eps=2.2204460492503131e-16,
                                          random_state=None, n_jobs=self.n_jobs, pre_dispatch='3*n_jobs',)
            self.rlasso.fit(X, y)
            X_selected = self.rlasso.transform(X)

        ##################################
        ## Post Variable Selection Predictions
        ##################################
        self.pls_post, self.ridge_post = \
            self.run_models(X_selected, y, self.param_ridge_post)


        return self
Example #41
def train_and_analyse(_X, _y, features):
	X = _X
	Y = _y
	cv_l = cross_validation.KFold(X.shape[0], n_folds=10,
								shuffle=True, random_state=1)
	ranks = {}

	lr = LinearRegression(normalize=True)
	lr.fit(X, Y)
	ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), features)
	

	ridge = RidgeCV(cv=cv_l)
	ridge.fit(X, Y)
	ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), features)
	
	# Run the RandomizedLasso: we use a path going down to .1*alpha_max
    # to avoid exploring the regime in which very noisy variables enter
    # the model
	lasso = LassoCV(cv=cv_l, n_jobs=2, normalize=True, tol=0.0001, max_iter=170000)
	lasso.fit(X, Y)
	ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), features)
	
	rlasso = RandomizedLasso(alpha=lasso.alpha_, random_state=42)
	rlasso.fit(X, Y)
	ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), features)
	
	rfe = RFE(lr, n_features_to_select=1)
	rfe.fit(X,Y)
	ranks["RFE"] = rank_to_dict(np.array(rfe.ranking_).astype(float), features, order=-1)

	rf = RandomForestRegressor(n_estimators=500)
	rf.fit(X,Y)
	ranks["RF"] = rank_to_dict(rf.feature_importances_, features)

	f, pval  = f_regression(X, Y, center=True)
	ranks["Corr."] = rank_to_dict(np.nan_to_num(f), features)

	mine = MINE()
	mic_scores = []
	for i in range(X.shape[1]):
	   mine.compute_score(X[:,i], Y)
	   m = mine.mic()
	   mic_scores.append(m)
	
	ranks["MIC"] = rank_to_dict(mic_scores, features) 

	r = {}
	for name in features:
	    r[name] = round(np.mean([ranks[method][name] 
	                             for method in ranks.keys()]), 2)
	 
	methods = sorted(ranks.keys())
	ranks["Mean"] = r
	methods.append("Mean")
	
	ranks = pd.DataFrame(ranks)

	selection_feature = ranks[ranks.Mean > 0.12].index.values

	return ranks, selection_feature
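rank_to_dict is not defined in this snippet; below is a minimal sketch of a plausible implementation, assuming it min-max scales the scores into [0, 1] and maps them onto the feature names (order=-1 flips the sign for rankings such as RFE's, where smaller is better):

from sklearn.preprocessing import MinMaxScaler
import numpy as np

def rank_to_dict(scores, names, order=1):
    # scale the (optionally sign-flipped) scores to [0, 1]
    scaled = MinMaxScaler().fit_transform(order * np.array([scores]).T).T[0]
    # map each feature name to its rounded, normalized score
    return dict(zip(names, [round(s, 2) for s in scaled]))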
'from_poi_to_this_person', 'from_messages', \
'from_this_person_to_poi', 'shared_receipt_with_poi','from_poi_fraction','to_poi_fraction',\
'tot_to_salary','tot_to_bonus','restr_to_total']
data = featureFormat(data_dict, features_list)
labels, features = targetFeatureSplit(data)

#SCALE FEATURES:
#For RandomForest and DecisionTree, scaling is not necessary. 

#scaler = MinMaxScaler()
#features = scaler.fit_transform(features)


#Stability Selection:
#http://blog.datadive.net/selecting-good-features-part-iv-stability-selection-rfe-and-everything-side-by-side/
rlasso = RandomizedLasso(random_state=2)
rlasso.fit(features,labels)
scores = rlasso.scores_
print scores

for j in range(len(scores)):
    print features_list[j+1],": ",scores[j]
    
features_list_selected = ['poi']
for j in np.where(scores > 0.3)[0]:
    features_list_selected.append(features_list[j+1])


print "-------------Selected features:-------------"
print features_list_selected
def main():
    print "read train"
    df_train = pd.read_csv('data/train.csv')
    print "read test"
    df_test = pd.read_csv('data/test.csv')
    sample = pd.read_csv('data/sampleSubmission.csv')
    
    cats = ['var1', 'var2', 'var3', 'var4', 'var5', 
            'var6', 'var7', 'var8', 'var9', 'dummy']
            
    print "convert mixed columns to strings"
    df_train.loc[:, cats] = df_train[cats].applymap(str)
    df_test.loc[:, cats] = df_test[cats].applymap(str)
    
    print "one-hot encoding"
    df_train = make_dummies(df_train, cats)
    df_test = make_dummies(df_test, cats)

    print "fill missing values"
    df_train = df_train.fillna(df_train.mean())
    df_test = df_test.fillna(df_test.mean())
    
    print "set binary labels"
    df_train['target_class'] = (df_train.target>0).astype(int)
    
    classes = df_train.target_class.values
    loss = df_train.target.values
    df_train = df_train.drop(['target', 'id', 'target_class'], axis = 1)
    df_test = df_test.drop(['id'], axis = 1)

    build_features = True #flag, determines whether features will be trained or read from file
    
    if build_features:
        print "univariate feature selectors"
        selector_clf = SelectKBest(score_func = f_classif, k = 'all')
        selector_reg = SelectKBest(score_func = f_regression, k = 'all')
        selector_clf.fit(df_train.values, classes)
        selector_reg.fit(df_train.values, loss)
        pvalues_clf = selector_clf.pvalues_
        pvalues_reg = selector_reg.pvalues_
        pvalues_clf[np.isnan(pvalues_clf)] = 1
        pvalues_reg[np.isnan(pvalues_reg)] = 1
        
        #put feature vectors into dictionary
        feats = {}
        feats['univ_sub01'] = (pvalues_clf<0.1)&(pvalues_reg<0.1) 
        feats['univ_sub005'] = (pvalues_clf<0.05)&(pvalues_reg<0.05)
        feats['univ_reg_sub005'] = (pvalues_reg<0.05)
        feats['univ_clf_sub005'] = (pvalues_clf<0.05)
        
        print "randomized lasso feature selector"
        sel_lasso = RandomizedLasso(random_state = 42, n_jobs = 4).fit(df_train.values, loss)
        #put rand_lasso feats into feature dict
        feats['rand_lasso'] = sel_lasso.get_support()
        
        print "l1-based feature selectors"
        X_sp = sparse.coo_matrix(df_train.values)
        sel_svc = LinearSVC(C=0.1, penalty = "l1", dual = False, random_state = 42).fit(X_sp, classes)
        feats['LinearSVC'] = np.ravel(sel_svc.coef_>0)
        sel_log = LogisticRegression(C=0.01, random_state = 42).fit(X_sp, classes)
        feats['LogReg'] = np.ravel(sel_log.coef_>0)
        
        feat_sums = np.zeros(len(feats['rand_lasso']))
        for key in feats:
            feat_sums+=feats[key].astype(int)
        feats['ensemble'] = feat_sums>=5 #take features which get 5 or more votes
        joblib.dump(feats, 'features/feats.pkl', compress = 3)
    
    else:
        feats = joblib.load('features/feats.pkl')
    
    xtrain = df_train.values
    xtest = df_test.values
    
    print "fitting gb-regressor"
    reg_gbr = GradientBoostingRegressor(n_estimators = 3000, learning_rate = 0.001,
                                        max_depth = 5, random_state = 42,
                                        verbose = 100, min_samples_leaf = 5)
    reg_gbr.fit(xtrain[:, feats['ensemble']], loss)
    gbr_preds = reg_gbr.predict(xtest[:, feats['ensemble']])
    sample['target'] = gbr_preds
    sample.to_csv('submissions/gbm_sub.csv', index = False)
    reg_lin = LinearRegression()
    scaler = StandardScaler()
    xtrain = scaler.fit_transform(xtrain)
    xtest = scaler.transform(xtest)
    print "fitting linear regressor"
    reg_lin.fit(xtrain[:, feats['rand_lasso']], loss)
    lin_preds = reg_lin.predict(xtest[:, feats['rand_lasso']])
    gbr_order = gbr_preds.argsort().argsort() #maps smallest value to 0, second-smallest to 1 etc.
    lin_order = lin_preds.argsort().argsort()
    #averaging
    mean_order = np.vstack((gbr_order, lin_order)).mean(0)    
    sample['target'] = mean_order
    sample.to_csv('submissions/mean_sub.csv', index = False)
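A small illustration of the argsort().argsort() rank trick used for the blended submission above; the toy prediction arrays are hypothetical:

import numpy as np

preds_a = np.array([0.2, 3.1, 1.7])
preds_b = np.array([0.5, 2.0, 9.9])
rank_a = preds_a.argsort().argsort()             # -> [0, 2, 1]
rank_b = preds_b.argsort().argsort()             # -> [0, 1, 2]
mean_rank = np.vstack((rank_a, rank_b)).mean(0)  # -> [0. , 1.5, 1.5]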
def main():
    print "read train"
    df_train = pd.read_csv('./data/train.csv')
    print "read test"
    df_test = pd.read_csv('./data/test.csv')
    sample = pd.read_csv('./data/sample_submission.csv')
    
    cats = ['T1_V4', 'T1_V5', 'T1_V6', 'T1_V7', 'T1_V8', 
            'T1_V9', 'T1_V11', 'T1_V12', 'T1_V15', 'T1_V16',
            'T1_V17', 'T2_V3', 'T2_V5', 'T2_V11', 'T2_V12',
            'T2_V13']
            
    print "convert mixed columns to strings"
    df_train.loc[:, cats] = df_train[cats].applymap(str)
    df_test.loc[:, cats] = df_test[cats].applymap(str)
    
    print "one-hot encoding"
    df_train = make_dummies(df_train, cats)
    df_test = make_dummies(df_test, cats)
    
    print "set binary labels"
    df_train['hazard_class'] = (df_train.Hazard==1).astype(int)
    
    classes = df_train.hazard_class.values
    # loss = df_train.target.values
    hazard = df_train.Hazard.values
    df_train = df_train.drop(['Hazard', 'Id', 'hazard_class'], axis = 1)
    df_test = df_test.drop(['Id'], axis = 1)

    build_features = False #flag, determines whether features will be trained or read from file
    
    if build_features:
        print "univariate feature selectors"
        selector_clf = SelectKBest(score_func = f_classif, k = 'all')
        selector_reg = SelectKBest(score_func = f_regression, k = 'all')
        selector_clf.fit(df_train.values, classes)
        selector_reg.fit(df_train.values, hazard)
        pvalues_clf = selector_clf.pvalues_
        pvalues_reg = selector_reg.pvalues_
        pvalues_clf[np.isnan(pvalues_clf)] = 1
        pvalues_reg[np.isnan(pvalues_reg)] = 1
        
        #put feature vectors into dictionary
        feats = {}
        feats['univ_sub01'] = (pvalues_clf<0.1)&(pvalues_reg<0.1) 
        feats['univ_sub005'] = (pvalues_clf<0.05)&(pvalues_reg<0.05)
        feats['univ_reg_sub005'] = (pvalues_reg<0.05)
        feats['univ_clf_sub005'] = (pvalues_clf<0.05)
        
        print "randomized lasso feature selector"
        sel_lasso = RandomizedLasso(random_state = 42).fit(df_train.values, hazard)
        #put rand_lasso feats into feature dict
        feats['rand_lasso'] = sel_lasso.get_support()
        
        print "l1-based feature selectors"
        X_sp = sparse.coo_matrix(df_train.values)
        sel_svc = LinearSVC(C=0.1, penalty = "l1", dual = False, random_state = 42).fit(X_sp, classes)
        feats['LinearSVC'] = np.ravel(sel_svc.coef_>0)
        sel_log = LogisticRegression(C=0.01, random_state = 42).fit(X_sp, classes)
        feats['LogReg'] = np.ravel(sel_log.coef_>0)
        
        feat_sums = np.zeros(len(feats['rand_lasso']))
        for key in feats:
            feat_sums+=feats[key].astype(int)
        feats['ensemble'] = feat_sums>=5 #take features which get 5 or more votes
        joblib.dump(feats, './features/feats.pkl', compress = 3)
    
    else:
        feats = joblib.load('features/feats.pkl')

    xtrain = df_train.values
    xtest = df_test.values

    print "fitting xgb-regressor"
    params = {}
    params["objective"] = "reg:linear"
    params["eta"] = 0.01
    params["max_depth"] = 7
    params["subsample"] = 0.8
    params["colsample_bytree"] = 0.8
    params["min_child_weight"] = 5
    params["silent"] = 1
    plst = list(params.items())
    num_rounds = 600
    #create a train and validation dmatrices 
    xgtrain = xgb.DMatrix(xtrain[:,feats['ensemble']], label=hazard)
    xgtest = xgb.DMatrix(xtest[:,feats['ensemble']])
    reg_xgb = xgb.train(plst, xgtrain, num_rounds)
    xgb_preds = reg_xgb.predict(xgtest)
    sample['Hazard'] = xgb_preds
    sample.to_csv('./submissions/xgb.csv', index = False)
    reg_lin = LinearRegression()
    scaler = StandardScaler()
    xtrain = scaler.fit_transform(xtrain)
    xtest = scaler.transform(xtest)
    print "fitting linear regressor"
    reg_lin.fit(xtrain[:, feats['rand_lasso']], hazard)
    lin_preds = reg_lin.predict(xtest[:, feats['rand_lasso']])
    sample['Hazard'] = lin_preds
    sample.to_csv('./submissions/lin.csv', index = False)
    xgb_order = xgb_preds.argsort().argsort() #maps smallest value to 0, second-smallest to 1 etc.
    lin_order = lin_preds.argsort().argsort()
    #averaging
    mean_order = np.vstack((xgb_order, lin_order)).mean(0)    
    sample['Hazard'] = mean_order
    sample.to_csv('./submissions/mean.csv', index = False)
Example #45
0
def main():
    start = time.time()
    MAX_TRAIN_SIZE = 126838
    train_size = 20000
    val_size = MAX_TRAIN_SIZE - train_size
    data, test_data = get_data('data')
    X = data[0:train_size,0:-1]
    y = [lbl for lbl in data[0:train_size,-1]]
    print(X.shape)
    print(len(y))
    # use randomized lasso for feature selection
    clfR = RandomizedLasso(     alpha='aic', 
                                scaling=0.5, 
                                sample_fraction=0.75, 
                                n_resampling=200, 
                                selection_threshold=0.25, 
                                fit_intercept=True, 
                                verbose=False, 
                                normalize=True, 
                                precompute='auto', 
                                max_iter=500, 
                                eps=2.2204460492503131e-16, 
                                random_state=None, 
                                n_jobs=1, 
                                pre_dispatch='3*n_jobs', 
                                #memory=Memory(cachedir=None)     
                          )  
    # fit the randomized lasso
    clfR.fit(X,y)

    # Transform Train Data to selected features
    X = np.array(X).copy() # little hack to fix assignment dest. read only error
    X_new = clfR.transform(X) 
    X = X_new
    ## transform Quiz Dataset
    test_data = np.array(test_data).copy() # little hack to fix assignment dest. read only error
    transformed_test_data = clfR.transform(test_data)
    test_data = transformed_test_data

    print('Dimensions after feature Reduction: ' + str(X.shape))
    duration = time.time() - start
    print("Elapsed Time For Feature Reduction: " + str(duration))
    
    # Training classifier
    clf1 = DecisionTreeClassifier(criterion='gini',
                                  splitter='best',
                                  max_depth=None,
                                  min_samples_split=2,
                                  min_samples_leaf=1,
                                  min_weight_fraction_leaf=0.0,
                                  max_features=None,
                                  random_state=None,
                                  max_leaf_nodes=None,
                                  class_weight=None,
                                  presort=False)

    # fit sub-classifiers
    clf_start = time.time()
    clf1.fit(X,y)
    # note: only clf1 is trained here; no voting classifier is assembled
    duration = time.time() - clf_start
    print("Elapsed Time For Classifier Training: " + str(duration))

    # predict & calculate training accuracy (fraction of correctly predicted labels)
    y_hat = clf1.predict(X)
    train_acc = 0.0
    for yi, y_hati in zip(y, y_hat):
        train_acc += (yi == y_hati)
    train_acc /= train_size
    print("train: " + str(train_acc))

    # validation data - calculate valdiation error
    val_start = train_size
    val_end = train_size + val_size

    # get validation data set
    # TODO: put this back in
    if MAX_TRAIN_SIZE - train_size > val_size:
         print("Beginning test validation...")
         X_val = data[val_start:val_end,0:-1]
         y_val = [lbl for lbl in data[val_start:val_end,-1]]
         y_val_hat = clf1.predict(X_val)
         val_acc = 0.0  # fraction of correctly predicted validation labels
         for yi, y_hati in zip(y_val, y_val_hat):
             val_acc += (yi == y_hati)
         val_acc /= X_val.shape[0]
         print("val: " + str(val_acc))

    #quiz data
    print("Beginning quiz validation...")
    # test_data = get_data('quiz')
    X_test = test_data[:,:]
    print(X_test.shape)
    y_test = [lbl for lbl in data[:,-1]]
    y_test_hat = clf1.predict(X_test)
    test_err = 1
#    for yi, y_hati in zip(y_test, y_test_hat):
#        test_err += (yi == y_hati)
#    test_err /= X_test.shape[0]
#    print("test: " + str(test_err))
    store_csv(y_test_hat, "prediction")
    end = time.time()
    duration = end - start
    print("Took this many seconds: " + str(duration))
    for key in final_feats:
        final_inputs[x][count] = final_feats[key][x]
        count = count+1

inputs = [input for input in final_inputs.values()]

# Recursive feature elimination

svr = SVR(kernel="linear")
rfe = RFE(svr, step=1)
rfe = rfe.fit(inputs,outputs[1])
rfe.support_
rfe.ranking_


# selected features by RFE
selected_features = []
count = 0
for key in final_feats.keys():
    if (rfe.support_[count] == True):
        selected_features.append(key)
    count = count + 1
    
    
# Randomized Lasso for feature selection
rlasso = RandomizedLasso(alpha=1)
rlasso.fit(inputs, outputs[2])
rlasso.scores_
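A hedged sketch of pairing each feature name with its stability score, assuming final_feats preserves the same column order that was used to build inputs:

for name, score in sorted(zip(final_feats.keys(), rlasso.scores_),
                          key=lambda pair: pair[1], reverse=True):
    print("%s: %.3f" % (name, score))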


Example #47
0
def main(train_label, train_feat, modelsdir, selfeat):

  X_train = np.nan_to_num(np.genfromtxt(train_feat, delimiter=' '))
  y_train = np.nan_to_num(np.genfromtxt(train_label, delimiter=' '))

  X_trains = X_train
  scaler = StandardScaler().fit(X_train)
  X_trains = scaler.transform(X_train)


  # performs feature selection
  featsel_str = ".all-feats"
  if int(selfeat):
    print "Performing feature selection ..."
    # initializes selection estimator
    sel_est = RandomizedLasso(alpha="bic", verbose=True, max_iter=1000,
                              n_jobs=int(config['n_jobs']), random_state=42,
                              n_resampling=1000)
  
    sel_est.fit(X_trains, y_train)
    X_trains = sel_est.transform(X_trains)
  
    selected_mask = sel_est.get_support()
    selected_features = sel_est.get_support(indices=True)
  
    sel_feats_path = os.sep.join([modelsdir, os.path.basename(train_feat)])
  
    # saves indices
    np.savetxt(sel_feats_path + ".idx", selected_features, fmt="%d")
    # saves mask
    np.save(sel_feats_path + ".mask", selected_mask)
    featsel_str = ".randcv"


  estimator = ExtraTreesRegressor(random_state=42, n_jobs=int(config['n_jobs']))

  mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
  #rmse_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

  # performs parameter optimization using random search
  print "Performing parameter optimization ... "


  param_distributions = \
    {"n_estimators": [5, 10, 50, 100, 200, 500],
     "max_depth": [3, 2, 1, None],
     "max_features": ["auto", "sqrt", "log2", int(X_trains.shape[1]/2.0)],
     "min_samples_split": sp_randint(1, 11),
     "min_samples_leaf": sp_randint(1, 11),
     "bootstrap": [True, False]}
   # "criterion": ["gini", "entropy"]}

  search = RandomizedSearchCV(estimator, param_distributions,
            n_iter=int(config['RR_Iter']),
            scoring=mae_scorer, n_jobs=int(config['n_jobs']), refit=True,
            cv=KFold(X_train.shape[0], int(config['folds']), shuffle=True, random_state=42),
            verbose=1, random_state=42)
  
  # fits model using best parameters found
  search.fit(X_trains, y_train)

  # ................SHAHAB ........................ 
  
  models_dir = sorted(glob.glob(modelsdir + os.sep + "*"))
  
  estimator2 = ExtraTreesRegressor(bootstrap=search.best_params_["bootstrap"], 
       max_depth=search.best_params_["max_depth"], 
       max_features=search.best_params_["max_features"],
       min_samples_leaf=search.best_params_["min_samples_leaf"], 
       min_samples_split=search.best_params_["min_samples_split"], 
       n_estimators=search.best_params_["n_estimators"], 
       verbose=1, 
       random_state=42, 
       n_jobs=int(config['n_jobs']))

  print "Train the model with the best parameters ..."
  estimator2.fit(X_trains,y_train)

  from sklearn.externals import joblib
  joblib.dump(estimator2, modelsdir+"/XRT.pkl")
  joblib.dump(scaler, modelsdir+"/scaler.pkl")
  if int(selfeat):
    joblib.dump(sel_est, modelsdir+"/sel_est.pkl")
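A minimal sketch of how the persisted artifacts above might be reused at prediction time; the function name and the test-feature array are hypothetical, and sel_est.pkl only exists when feature selection was performed:

import numpy as np
from sklearn.externals import joblib

def predict_from_saved(modelsdir, test_feats, selfeat=True):
    scaler = joblib.load(modelsdir + "/scaler.pkl")
    model = joblib.load(modelsdir + "/XRT.pkl")
    X = scaler.transform(np.nan_to_num(test_feats))
    if selfeat:
        # apply the same randomized-lasso feature mask used at training time
        sel_est = joblib.load(modelsdir + "/sel_est.pkl")
        X = sel_est.transform(X)
    return model.predict(X)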
    if attribute == "_all":
        continue
    else:
        # select the columns containing the attribute
        attribute_columns=filter(lambda x:re.search(attribute,x), data.iloc[:,10:].columns)
        X = data[attribute_columns[:20]]  # use only the first 20 mode parameters
        
    remove_highly_correlated(X,threshold=0.98)
    print(X.columns.values)
    list_dicts = list()
    for train_index, test_index in skf:
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        print(X_train.shape)
        if feature_selection == "randomized_lasso":
            feature_selector=RandomizedLasso(sample_fraction=0.5,n_resampling=50,verbose=False,n_jobs=-1)
        elif feature_selection == "RFECV_linearSVM":
#            print(feature_selection % "selected")
            feature_selector = RFECV(SVC(kernel="linear"),step=1,cv=StratifiedKFold(y,5),scoring="accuracy")
        else:
            print("Options are: randomized_lasso, RFECV_linearSVM")
            
        feature_selector.fit(X_train,y_train)
        result = {'X_train':X_train,'y_train':y_train,'X_test':X_test,'y_test':y_test,'feature_selector':feature_selector}
        list_dicts.append(result)
        
        
    dict_for_attribute[attribute] = list_dicts
    print("done in %0.3fs" % (time()-t0))

Example #49
0
class LinearAll:
    """
    A repertoire of Linear Variable Selection and Prediction Models

    Parameters
    ----------
    n_jobs : int, optional
        Number of jobs to run in parallel (default 1).
        If -1, all CPUs are used. This only provides a speedup for
        n_targets > 1 and sufficiently large problems.
    pre_dispatch : int or string, optional
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:
        - None, in which case all the jobs are immediately created and
          spawned. Use this for lightweight and fast-running jobs to
          avoid delays due to on-demand spawning of the jobs.
        - An int, giving the exact number of total jobs that are spawned.
        - A string, giving an expression as a function of n_jobs,
          as in '2*n_jobs'.
    refit : boolean
        Refit the best estimator with the entire dataset. If False,
        it is impossible to make predictions using this GridSearchCV
        instance after fitting.
    iid : boolean, optional
        If True, the data is assumed to be identically distributed across
        the folds, and the score is computed from all samples individually,
        and not the mean loss across the folds.
        (If the number of data points is the same across folds, both
        options return the same result.)

    Attributes
    ----------
    ols_train
        OLS fit on the training data (currently commented out in fit)
    pls_pre, ridge_pre
        prediction models before variable selection
    pls_post, ridge_post
        prediction models after variable selection
    """

    def __init__(self, cv=20, scoring='mean_squared_error',
                 n_jobs=1, refit=False, iid=False, pre_pred=True,
                 param_ridge_post=list(np.arange(1, 3, 0.1)),
                 rlasso_selection_threshold=0.5):
        #self.__name__ = '__main__'
        """
        CAUTION: we changed to __main__ so that parallelization works
        """
        self.cv = cv
        self.scoring = scoring
        self.n_jobs = n_jobs
        self.refit = refit
        self.iid = iid
        self.pre_pred = pre_pred
        self.param_ridge_post = param_ridge_post
        self.rlasso_selection_threshold = rlasso_selection_threshold

    def run_models(self, X, y, param_ridge):
        """

        Prediction Models.

        OLS, PLS, Ridge

        """

        ##################################
        ## OLS CV
        ##################################
        #ols = linear_model.LinearRegression(fit_intercept=True,
        #                                          normalize=False,
        #                                          copy_X=True)
        #ols_cv_score = cross_validation.cross_val_score(
        #        ols, X, y,
        #        cv=self.cv, scoring=self.scoring,
        #        n_jobs=self.n_jobs)
        """
        self.ols_cv_score.shape = (cv,)
        """

        ##################################
        ## PLS CV
        ##################################
        tuned_parameters = [{'n_components': range(1, 5)}]
        pls = PLSRegression()
        pls_cv = GridSearchCV(pls, tuned_parameters,
                                cv=self.cv, scoring=self.scoring,
                                n_jobs=self.n_jobs,
                                refit=self.refit, iid=self.iid)
        pls_cv.fit(X, y)


        ##################################
        ## Ridge CV
        ##################################
        tuned_parameters = [{'alpha': param_ridge}]
        ridge = linear_model.Ridge(alpha = 1)
        ridge_cv = GridSearchCV(ridge, tuned_parameters,
                                     cv=self.cv, scoring=self.scoring,
                                     n_jobs=self.n_jobs,
                                     refit=self.refit, iid=self.iid)
        ridge_cv.fit(X, y)

        return (pls_cv, ridge_cv)

    def fit(self, X, y):
        """
        Variable Selection and Prediction.

        Variable Selection Model: lasso
        Prediction Models: see self.predict()

        Parameters
        ----------
        X : numpy array or sparse matrix of shape [n_samples,n_features]
            Training data
        y : numpy array of shape [n_samples, n_targets]
            Target values

        Returns
        -------
        self : returns an instance of self.
        """


        ##################################
        ## OLS Train
        ##################################
        #ols_train = linear_model.LinearRegression(fit_intercept=True,
        #                                         normalize=False,
        #                                          copy_X=True)
        #ols_train.fit(X, y)
        #self.rss_ols_train = np.sum((ols_train.predict(X) - y) ** 2)
        """
        fit_intercept=True, center the data
        copy=True, because centering the data involves X -= X_mean

        CAUTION:
        normalize=False, because normalization involves taking squares of X and loses precision

        self.rss_ols_train.shape = (1,1)
        """

        ##################################
        ## Pre Variable Selection Predictions
        ##################################
        self.pre_pred = False  # hard-coded off: skip the pre-selection predictions regardless of the constructor flag
        if self.pre_pred:
            print "Computing ... "
            param_ridge_pre = list(np.arange(1e9,2e9,1e8))
            self.pls_pre, self.ridge_pre = \
                self.run_models(X, y, param_ridge_pre)

        ##################################
        ## Lasso Variable Selection
        ##################################
        self.lasso_cv = LassoLarsCV(fit_intercept=True, normalize=True, precompute='auto',
                            max_iter=X.shape[1]+1000, max_n_alphas=X.shape[1]+1000,
                            eps= 2.2204460492503131e-16,copy_X=True,
                            cv=self.cv, n_jobs=self.n_jobs)
        self.lasso_cv.fit(X, y)
        """
        normalize=True, lasso seems to be able to handle itself
        """

        if self.rlasso_selection_threshold == 0:
            self.lasso_refit = linear_model.LassoLars(alpha=self.lasso_cv.alpha_,
                                fit_intercept=True, normalize=True, precompute='auto',
                                max_iter=X.shape[1]+1000,
                                eps=2.2204460492503131e-16, copy_X=True,
                                fit_path=False)
            self.lasso_refit.fit(X, y)
            self.active = self.lasso_refit.coef_ != 0
            self.active = self.active[0,:]
            X_selected = X[:, self.active]
        else:
            self.rlasso = RandomizedLasso(alpha=self.lasso_cv.alpha_, scaling=0.5,
                                          sample_fraction=0.75, n_resampling=200,
                                          selection_threshold=self.rlasso_selection_threshold, fit_intercept=True,
                                          verbose=False, normalize=True, precompute='auto',
                                          max_iter=500, eps=2.2204460492503131e-16,
                                          random_state=None, n_jobs=self.n_jobs, pre_dispatch='3*n_jobs',)
            self.rlasso.fit(X, y)
            X_selected = self.rlasso.transform(X)

        ##################################
        ## Post Variable Selection Predictions
        ##################################
        self.pls_post, self.ridge_post = \
            self.run_models(X_selected, y, self.param_ridge_post)


        return self

    def predict(self, X_test):
        assert(self.refit == True)
        if self.pls_post.best_score_ > self.ridge_post.best_score_:
            self.best_model = self.pls_post
            print "Chosen Model: pls"
        else:
            self.best_model = self.ridge_post
            print "Chosen Model: ridge"

        if self.rlasso_selection_threshold == 0:
            X_test_selected = X_test[:, self.active]
        else:
            X_test_selected = self.rlasso.transform(X_test)
        return self.best_model.best_estimator_.predict(X_test_selected)
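A minimal usage sketch for the class above; the toy data is hypothetical, and refit=True is required because predict() asserts it:

import numpy as np

X_demo = np.random.randn(100, 20)                   # hypothetical design matrix
y_demo = X_demo[:, 0] + 0.1 * np.random.randn(100)  # hypothetical target

model = LinearAll(cv=5, n_jobs=1, refit=True,
                  rlasso_selection_threshold=0.5)
model.fit(X_demo, y_demo)
preds = model.predict(X_demo[:10])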
from sklearn.cross_validation import train_test_split
from scipy import io as sio
from tensorflow.python.framework import ops
from dfs2 import DeepFeatureSelectionNew
import numpy as np
from sklearn.datasets import make_classification
from sklearn.preprocessing import normalize
from sklearn.linear_model import RandomizedLasso

# ourdataB = sio.loadmat("/Volumes/TONY/Regeneron/Data/OriginalData/newDataB_2labels.mat")
ourdataB = sio.loadmat("/Users/xupeng.tong/Documents/Data/OriginalData/newDataB_2labels.mat")
# ourdataB = sio.loadmat("/home/REGENERON/xupeng.tong/newDataB_2labels.mat")

inputX = ourdataB['X']
inputX = normalize(inputX, axis=0)
inputY = ourdataB['Y'][0,:]
columnNames = ourdataB['columnNames']

X_train, X_test, y_train, y_test = train_test_split(inputX, inputY, test_size=0.2, random_state=42)

randomized_lasso = RandomizedLasso()
randomized_lasso.fit(X_train, y_train)

featureMask = randomized_lasso.get_support()

X_train_lasso = X_train[:,featureMask]
X_test_lasso = X_test[:,featureMask]

columnNames[0][:100][featureMask]

sio.savemat('RandomLasso-result', {'X_train_lasso':X_train_lasso, \
			'X_test_lasso':X_test_lasso, 'featureMask':featureMask})