def main():
    train_df = munge_data('./data/train.csv', False)
    train_df = train_df.drop('PassengerId', axis=1)
    target_df = train_df['Survived']
    train_df = train_df.drop('Survived', axis=1)
    train_df = train_df.sort(axis=1)

    test_df = munge_data('./data/test.csv')
    test_ids = test_df.PassengerId.values
    test_df = test_df.drop('PassengerId', axis=1)
    test_df = test_df.sort(axis=1)
    
    train_data = train_df.values
    target_data = target_df.values
    test_data = test_df.values

    clf = svm.SVC(kernel='linear')
    selector = RFECV(clf, step=1, cv=5, scoring='accuracy')
    
    train_data, cx_data, target_data, cx_target_data = cross_validation.train_test_split(
        train_data, target_data, test_size=0.2)

    selector = selector.fit(train_data, target_data)
    
    print(selector.score(cx_data, cx_target_data))
    cx_predictions = selector.predict(cx_data)
    print(classification_report(cx_target_data, cx_predictions))
    predictions = selector.predict(test_data)

    with open('output.csv', 'w') as o:
        o.write('PassengerId,Survived\n')
        for passenger, prediction in zip(test_ids, predictions):
            o.write('{},{}\n'.format(passenger, prediction))
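This example uses APIs that later versions removed (pandas' DataFrame.sort and sklearn's cross_validation module). A minimal sketch of the same preparation and split under current pandas/scikit-learn, assuming the munge_data helper and file paths from the example above:

from sklearn.model_selection import train_test_split

train_df = munge_data('./data/train.csv', False)
target_df = train_df['Survived']
train_df = train_df.drop(['PassengerId', 'Survived'], axis=1)
train_df = train_df.sort_index(axis=1)  # replaces the removed df.sort(axis=1)

train_data, cx_data, target_data, cx_target_data = train_test_split(
    train_df.values, target_df.values, test_size=0.2)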
def featureSelection(matrix, survival, genes):
    #train test split
    matrix_train, matrix_test, survival_train, survival_test = train_test_split(
        matrix, survival, test_size=0.2, random_state=42)
    clf = BernoulliNB()
    clf.fit(matrix_train, survival_train)
    print("bernoulli classification accuracy")
    classificationAccuracy(matrix_test, survival_test)

    estimator = BernoulliNB()
    selector = RFECV(estimator, step=50, verbose=1)
    selector = selector.fit(matrix_train, survival_train)

    print(selector.ranking_)
    print(selector.predict(matrix_train))
    print(selector.predict(matrix_test))
    print("train data classification accuracy")
    classificationAccuracy(selector.predict(matrix_train), survival_train)
    print("test data classification accuracy")
    classificationAccuracy(selector.predict(matrix_test), survival_test)

    #PRECISION AND RECALL
    selector.transform(matrix_test)
    survival_score = selector.predict(matrix_test)
    average_precision = average_precision_score(survival_test, survival_score)
    print('Average precision-recall score: {0:0.2f}'.format(average_precision))
    disp = plot_precision_recall_curve(selector, matrix_test, survival_test)
    disp.ax_.set_title('Precision-Recall curve: '
                       'AP={0:0.2f}'.format(average_precision))
    plt.show()

    #GENE SELECTION
    #gene_indices = matrix[np.any(cdist(matrix[:,1:], matrix_train)==0, axis=1)]
    i = 0
    x = []
    while i < len(selector.ranking_):
        if selector.ranking_[i] == 1:
            try:
                entrez = cbioportal.Genes.getGeneUsingGET(
                    geneId=genes[i]).result()
                mutation = cbioportal.Mutations.getMutationsInMolecularProfileBySampleListIdUsingGET(
                    entrezGeneId=entrez.entrezGeneId,
                    molecularProfileId='brca_tcga_pan_can_atlas_2018_mutations',
                    sampleListId='brca_tcga_pan_can_atlas_2018_all').result()
                if len(mutation) > 20:
                    x.append(genes[i])
            except:
                pass
        i = i + 1
    print("training genes {} ".format(x))
def call_function():
    try:
        dataset = loadtxt("/".join([DATASET_FOLDER, 'heart.data']),
                          delimiter=",")
        # split data into X and y
        X = dataset[:, :-1]
        Y = dataset[:, -1]
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            Y,
                                                            test_size=0.23,
                                                            random_state=22)
        #use an XGBoost classifier as the model
        xg = XGBClassifier()
        #rank all features, i.e continue the elimination until the last one
        rfe = RFECV(xg, cv=5, step=1)
        rfe.fit(X_train, y_train)
        y_important_pred = rfe.predict(X_test)

        print("Features sorted by their rank:")
        print(sorted(rfe.ranking_))
        print(sorted(rfe.support_))
        print(accuracy_score(y_test, y_important_pred.round()) * 100)
    except:
        e = sys.exc_info()[0]
        print("<p>Error: %s</p>" % e)
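The prints above emit bare rank and support values; a small sketch, not part of the original, that pairs each RFECV rank with a column label instead. It assumes the rfe selector and X_train from call_function, and the feature names are hypothetical since heart.data carries no header row:

feature_names = ['feat_{}'.format(i) for i in range(X_train.shape[1])]
for rank, name in sorted(zip(rfe.ranking_, feature_names)):
    print('{:>2}  {}'.format(rank, name))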
Example 5
    def random_forest(self, data, labels):
        if self.verbose:
            logging.info('Implementing Random Forest Classifier...')
        train, test, train_labels, test_labels = train_test_split(
                        data.values, labels, test_size=self.test_size, 
                        random_state=self.random_state)

        train_scaled, test_scaled = scaleData(train, test)
        # print(f'Ravel shit RF: {train_labels.values.ravel()}\n')
        # print(f'Train type: {type(train)}')
        # print(f'Ravel type: {type(train_labels.values.ravel())}')

        rfc = RandomForestClassifier(random_state=101)
        # rfecv = RFECV(estimator=rfc, step=1, cv=StratifiedKFold(10), scoring='accuracy')
        rfecv = RFECV(estimator=rfc, step=1, cv=7, scoring='accuracy')
        rfecv.fit(train_scaled, train_labels.values.ravel())

        if self.verbose:
            logging.info('Optimal number of features: {}'.format(rfecv.n_features_))
        if self.verbose == 2: 
            showGraph(rfecv.grid_scores_)
            
        prediction = rfecv.predict(test_scaled)
        score = accuracy_score(test_labels, prediction)

        if self.verbose:
            logging.info(f'RF Predictions Complete')
            logging.info('Score: {:0.2f}%\n'.format(score*100))
        if score*100 >= self.acceptance and self.save:
            savePickle(rfecv, 'RF', score*100, self.sampleNumber)

        return score
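Note that RFECV.grid_scores_, which showGraph consumes above, was deprecated in scikit-learn 1.0 and removed in 1.2. Under a newer scikit-learn, a rough equivalent (assuming the same rfecv object) would be:

mean_cv_scores = rfecv.cv_results_['mean_test_score']  # one mean CV score per feature count tried
showGraph(mean_cv_scores)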
def benchmark_features_selection(clf,name):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    rfecv = RFECV(estimator=clf, step=1, cv=StratifiedKFold(y_train, 2),
              scoring='accuracy')
    rfecv.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    print(name + " optimal number of features: %d" % rfecv.n_features_)

# Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")    
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()

 
    t0 = time()
    pred = rfecv.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

    print("Saving data to database:")
    save_results_data(cursor, name, testing_identifiant_produit_list, pred)
    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr,train_time,test_time
def test_model(model, xtrain, ytrain, feature_list, prefix):
    """ use train_test_split to create validation train/test samples """
    xTrain, xTest, yTrain, yTest = train_test_split(xtrain, ytrain,
                                                    test_size=0.4)

    if DO_RFECV:
        model.fit(xtrain, ytrain)
        if hasattr(model, 'coef_'):
            model = RFECV(estimator=model, verbose=0, step=1,
                          scoring=score_fn, cv=3)

    model.fit(xTrain, yTrain)
    print 'score', model.score(xTest, yTest)
    ypred = model.predict(xTest)
    ### don't allow model to predict negative number of orders
    if any(ypred < 0):
        print ypred[ypred < 0]
        ypred[ypred < 0] = 0

    print 'RMSE', np.sqrt(mean_squared_error(ypred, yTest))

#    debug_output(model, feature_list)

    debug_plots(model, yTest, ypred, prefix)

    return
def predict(X_test, clf_object, x_train, y_train):

    rfecv = RFECV(estimator=clf_object, step=1, cv=5,
                  scoring='accuracy')  #5-fold cross-validation
    rfecv = rfecv.fit(x_train, y_train)
    y_pred = rfecv.predict(X_test)
    return y_pred, rfecv
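An illustrative call to the predict helper above. The linear-kernel SVC and the X_test/x_train/y_train arrays are assumptions; any estimator exposing coef_ or feature_importances_ works with RFECV:

from sklearn.svm import SVC

y_pred, fitted_selector = predict(X_test, SVC(kernel='linear'), x_train, y_train)
print('features kept:', fitted_selector.n_features_)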
Example 9
def get_betareg_model(df_claims, test_size, seed):

    df_claims = replace_nans(df_claims)
    X_Train, X_Test, Y_Train, Y_Test = get_train_test(df_claims, test_size,
                                                      seed)

    encoders = get_encoders()

    # Using separate pipelines for transformer and estimator due to RFECV's bug #6321

    transformer_pipe = Pipeline(encoders)

    linear_model = RFECV(estimator=LinearRegression(),
                         scoring='neg_mean_squared_error',
                         step=1,
                         cv=5)

    transformer_pipe.fit(X_Train)

    X_Train_transformed = transformer_pipe.transform(X_Train)
    X_Test_transformed = transformer_pipe.transform(X_Test)

    linear_model.fit(X_Train_transformed, Y_Train)

    linear_preds = linear_model.predict(X_Test_transformed)

    result = {
        'lreg_model': linear_model,
        'lreg_preds': linear_preds,
        'transformer': transformer_pipe,
        'features': get_transformed_column_names(X_Train)
    }

    return result
Example 10
def train_test(X_train, Y_train, X_test, Y_test, cv_params, custom_grid=False):

    if custom_grid:
        random_grid = load_grid(custom_grid)
    else:
        alpha = np.linspace(30000, 20000, 500)
        #solver = ['svd', 'cholesky', 'lsqr']

        # Create the random grid
        random_grid = {'alpha': alpha}
        #'solver' : solver}
    print_grid(random_grid)
    estimator = Ridge(alpha=90000)
    ridge_random = RFECV(estimator, step=500, cv=5, verbose=10)
    # Random search of parameters, using 3 fold cross validation,
    # search across 100 different combinations, and use all available cores
    #ridge_random = RandomizedSearchCV(selector, param_distributions = random_grid, n_iter = cv_params["n_iter"],
    #                                      cv = cv_params["cv"], verbose=10, random_state=42, n_jobs = cv_params["n_jobs"],
    #                                      pre_dispatch='2*n_jobs')
    ridge_random.fit(X_train, Y_train)

    best_grid_params = {'alpha': 30000}
    best_random = ridge_random.get_support()
    best_model_params = ridge_random.get_params()
    train_predictions = ridge_random.predict(X_train)
    test_predictions = ridge_random.predict(X_test)
    #metrics
    r_train = pearsonr(Y_train, train_predictions)
    r_test = pearsonr(Y_test, test_predictions)
    mse_train = mse(Y_train, train_predictions)
    mse_test = mse(Y_test, test_predictions)
    metrics = {
        "r_train": r_train,
        "r_test": r_test,
        "mse_train": mse_train,
        "mse_test": mse_test
    }
    print(f"pearsonr train: {r_train}")
    print(f"pearsonr test: {r_test}")
    print(f"mse train: {mse_train}")
    print(f"mse test: {mse_test}")
    print(best_model_params)
    return best_grid_params, best_model_params, train_predictions, test_predictions, metrics, {}
Example 11
def data_prediction():
    train, test = data_preprocessing()
    X = train.drop(columns=['gender'])
    y = train['gender']
    print('[INFO]....trainset shape: ', X.shape)
    print('[INFO]....testset shape: ', test.shape)
    encoding_columns = ['first_item_browsed']

    X, test = category_encoding(encoding_columns, 0.2, X, y, test)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        stratify=y,
                                                        random_state=123)

    ##########################FOR BASE LGBM############################################
    '''model = lgb.LGBMClassifier()
    model.fit(X_train,y_train)
    print('score on validation data: ',model.score(X_test,y_test))
    final_pred = model.predict(test)'''

    ##########################FOR LGBM USING RFECV#########################################
    print('[INFO]....Creating an LGBM model')
    print('[INFO]....Applying RFECV to select 150 features')

    model = lgb.LGBMClassifier()
    model = RFECV(estimator=model,
                  step=10,
                  min_features_to_select=150,
                  scoring='accuracy')
    model.fit(X_train, y_train)

    X_train = model.transform(X_train)
    X_test = model.transform(X_test)
    test = model.transform(test)

    print('[INFO]....After tranformation train shape :', X_train.shape)

    model = lgb.LGBMClassifier()
    model.fit(X_train, y_train)
    print('score on validation data: ', model.score(X_test, y_test))
    final_pred = model.predict(test)

    ###########################FOR STACKING PURPOSE ############################################
    '''basemodel_1,basemodel_2,basemodel_3,meta_model = stacking_models(X_train,X_test,y_train,y_test)
    base_pred_test = np.column_stack((basemodel_1.predict_proba(test)[:,1],basemodel_2.predict_proba(test)[:,1],\
                                     basemodel_3.predict_proba(test)[:,1]))
    
    final_pred = meta_model.predict(base_pred_test)'''

    ###########################FOR NEURAL NETWORK PURPOSE#############################################
    #model = neural_net(X_train,y_train,X_train.shape[1])

    #pd.Series(dict(zip(X.columns.tolist(),model.feature_importances_))).sort_values(ascending=False).head(20).plot(kind='bar')
    return (final_pred)
def select_rfecv():
    data, x, y = data_drop()
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=42)
    clf_rf = RandomForestClassifier()
    rfecv = RFECV(estimator=clf_rf, step=1, cv=5, scoring='accuracy')
    rfecv = rfecv.fit(x_train, y_train)
    print('best number: ', rfecv.n_features_)
    print('best features: ', x_train.columns[rfecv.support_])
    ac = accuracy_score(y_test, rfecv.predict(x_test))
    print('Acc is: ', ac)
    cm = confusion_matrix(y_test, rfecv.predict(x_test))
    sns.heatmap(cm, annot=True, fmt='d')
    plt.figure()
    plt.xlabel('Number of features selected')
    plt.ylabel('cross validation score of number of selected features')
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
Example 13
def RFE_score(model, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=1024,
                                                        stratify=y)
    selector = RFECV(model, cv=3, scoring='f1')
    selector.fit(X_train, y_train)
    y_pred = selector.predict(X_test)
    score = f1_score(y_test, y_pred)
    return selector.get_support(indices=True), score
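A hedged usage sketch for RFE_score; the LogisticRegression estimator and the X/y arrays are assumptions rather than part of the original:

from sklearn.linear_model import LogisticRegression

kept_columns, val_f1 = RFE_score(LogisticRegression(max_iter=1000), X, y)
print('selected column indices:', kept_columns)
print('validation f1:', val_f1)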
Example 14
class rfe_LBC(li_LBC):
    def fit(self, X, Y):
        params = self.get_params()
        model = li_LBC(**params)
        self.rfe = RFECV(model)
        self.rfe.fit(X, Y)

    def predict(self, X):
        return self.rfe.predict(X)

    def score(self, X, Y):
        return self.rfe.score(X, Y)
Example 15
class RFR:
    def __init__(self, rfe_cv, *args, **kwargs):
        self.rfe = None
        self.rfe_cv = rfe_cv
        self.model = RandomForestRegressor(*args, **kwargs)

    def fit(self, X, y):
        Z = numpy.concatenate([X, y.reshape(-1, 1)], axis=1)
        Z = numpy.array(Z, dtype=numpy.float32)
        Z[Z == numpy.inf] = numpy.nan
        Z[Z == -numpy.inf] = numpy.nan
        X_, y_ = X[~pandas.isna(Z).any(axis=1), :], y[~pandas.isna(Z).any(
            axis=1)]
        if X_.shape[0] != X.shape[0]:
            print(
                'FIT: the sample contains NaNs, they were dropped\tN of dropped NaNs: {0}'
                .format(X.shape[0] - X_.shape[0]))
        if self.rfe_cv:
            self.rfe = RFECV(self.model)
            self.rfe.fit(X_, y_)
        else:
            self.model.fit(X_, y_)

    def predict(self, X):
        Z = numpy.concatenate([X], axis=1)
        Z = numpy.array(Z, dtype=numpy.float32)
        Z[Z == numpy.inf] = numpy.nan
        Z[Z == -numpy.inf] = numpy.nan
        nan_mask = ~pandas.isna(Z).any(axis=1)
        X_ = X[nan_mask, :]
        if X_.shape[0] != X.shape[0]:
            print(
                'PREDICT: the sample contains NaNs, they were dropped\tN of dropped NaNs: {0}'
                .format(X.shape[0] - X_.shape[0]))
        Z = numpy.full(shape=(X.shape[0], 1),
                       fill_value=numpy.nan,
                       dtype=numpy.float64)
        if self.rfe_cv:
            Z[nan_mask, :] = self.rfe.predict(X_).reshape(-1, 1)
        else:
            Z[nan_mask, :] = self.model.predict(X_).reshape(-1, 1)
        return Z

    def set_params(self, **kwargs):
        self.model.set_params(**kwargs)

    @property
    def feature_importances_(self):
        return self.model.feature_importances_
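A purely illustrative sketch of the RFR wrapper above on synthetic data, with one NaN cell to exercise the row-dropping logic; the data, seed, and n_estimators are assumptions, and the numpy/pandas/scikit-learn imports from the original module are taken as given:

import numpy

rng = numpy.random.RandomState(0)
X_demo = rng.rand(60, 5)
y_demo = 2.0 * X_demo[:, 0] + rng.rand(60)
X_demo[3, 2] = numpy.nan  # one row with a missing value

model = RFR(rfe_cv=True, n_estimators=50)
model.fit(X_demo, y_demo)         # the NaN row is dropped before fitting RFECV
print(model.predict(X_demo)[:5])  # row 3 comes back as NaN in the output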
Example 16
class SupM1DScikit(SupM1D):

    def __init__(self, _model, rfe_enabled=False, grid_cv=None, *args, **kwargs):
        self.rfe = None
        self.rfe_enabled = rfe_enabled
        self.grid = None
        self.grid_cv = grid_cv
        self._model = _model
        self.model = self._model(*args, **kwargs)

    def _fit(self, X, y):

        Z = numpy.concatenate([X, y], axis=1)
        Z = numpy.array(Z, dtype=numpy.float32)
        Z[Z == numpy.inf] = numpy.nan
        Z[Z == -numpy.inf] = numpy.nan
        X_, y_ = X[~pandas.isna(Z).any(axis=1), :], y[~pandas.isna(Z).any(axis=1)]
        if X_.shape[0] != X.shape[0]:
            print('FIT: the sample contains NaNs, they were dropped\tN of dropped NaNs: {0}'.format(X.shape[0] - X_.shape[0]))

        if self.grid_cv is not None:
            self.grid = GridSearchCV(estimator=self.model, param_grid=self.grid_cv)
            self.grid.fit(X_, y_)
            self.model = self._model(**self.grid.best_params_)
            if self.rfe_enabled:
                self.rfe = RFECV(self.model)
                self.rfe.fit(X_, y_)
        elif self.rfe_enabled:
            self.rfe = RFECV(self.model)
            self.rfe.fit(X_, y_)
        else:
            self.model.fit(X_, y_)

    def _predict(self, X):
        Z = numpy.concatenate([X], axis=1)
        Z = numpy.array(Z, dtype=numpy.float32)
        Z[Z == numpy.inf] = numpy.nan
        Z[Z == -numpy.inf] = numpy.nan
        nan_mask = ~pandas.isna(Z).any(axis=1)
        X_ = X[nan_mask, :]
        if X_.shape[0] != X.shape[0]:
            print('PREDICT: the sample contains NaNs, they were dropped\tN of dropped NaNs: {0}'.format(X.shape[0] - X_.shape[0]))
        Z = numpy.full(shape=(X.shape[0], 1), fill_value=numpy.nan, dtype=numpy.float64)
        if self.rfe_enabled:
            Z[nan_mask, :] = self.rfe.predict(X_).reshape(-1, 1)
        else:
            Z[nan_mask, :] = self.model.predict(X_).reshape(-1, 1)
        return Z
Example 17
    def ProcessTicker(self, df, y, pred, skipPredict=False):

        X = df.ix[1:, :-1].values

        # we want to predict using the last record
        pred = pred.ix[:, :-1].values

        y = np.delete(y, (0), axis=0)
        # normalize the data for X
        X = preprocessing.StandardScaler().fit_transform(X)

        # split the data for the accuracy calculations
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=0)

        # get the test classifier
        clf_test = self._clf_test

        # fit the test classifier
        clf_test.fit(X_train, y_train)
        # predict from the X_test
        y_pred = clf_test.predict(X_test)
        # score the test classifier, trained on the training data, against the held-out test set
        acc = clf_test.score(X_test, y_test)
        print("accuracy : {}".format(acc))
        # calculate the confusion matrix
        confusionmatrix = confusion_matrix(y_test, y_pred, labels=[-1, 0, 1])

        final = [0]
        if (skipPredict == True):
            #print(confusionmatrix)
            return acc, confusionmatrix, final

        # get the actual classifier
        clf = self._clf_actual
        # use the actual classifier and recursive feature elimination to select the best number of features.
        m = RFECV(clf, scoring='accuracy')
        # fit the classifier
        m.fit(X, y)
        # predict the final recommendation/decision
        final = m.predict(pred)

        print("Final:{}".format(final))
        return acc, confusionmatrix, final
Example 18
def main():
    dataset = load_my_data()
    X = dataset.data
    y = dataset.target - 1
    # split

    # feature normalization
    X = preprocessing.scale(X)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=42)
    # Create the RFE object and compute a cross-validated score.
    clf = SVC(kernel='linear')
    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    ticks = time.time()

    rfecv = RFECV(estimator=clf,
                  step=1,
                  cv=StratifiedKFold(5),
                  scoring='recall_macro')
    rfecv.fit(X_train, y_train)
    print('Time Elapse: {}'.format(time.time() - ticks))
    print("Optimal number of features : %d" % rfecv.n_features_)

    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()

    # confusion matrix
    y_pred = rfecv.predict(X_test)
    class_names = ['remission', 'hypomania', 'mania']
    plt.figure()
    plot_confusion_matrix(confusion_matrix(y_test, y_pred),
                          classes=class_names,
                          title='Confusion matrix without normalization')
    plt.figure()
    plot_confusion_matrix(confusion_matrix(y_test, y_pred),
                          classes=class_names,
                          normalize=True,
                          title='Confusion matrix')
Example 19
    def execute(self):

        # create model
        estimator = LGBMRegressor(boosting_type='gbdt',
                                  objective='regression',
                                  metric='mae',
                                  num_iterations=10000,
                                  learning_rate=0.001,
                                  num_leaves=350,
                                  max_depth=9,
                                  min_data_in_leaf=100)

        selector = RFECV(estimator,
                         step=1,
                         cv=4,
                         scoring="neg_mean_squared_error",
                         verbose=-1)
        selector.fit(self.partitions.x_train, self.partitions.y_train)

        # # prediction
        # y_pred = selector.predict(self.partitions.x_test)

        # select only the best features
        selector.fit(self.partitions.x_train[:, selector.support_],
                     self.partitions.y_train)
        y_pred = selector.predict(self.partitions.x_test[:, selector.support_])

        # number of best features
        self.n_features = selector.n_features_

        # which categories are best
        self.best_features = selector.support_

        # rank features best (1) to worst
        self.feature_ranking = selector.ranking_

        return y_pred, self.partitions.y_test
Example 20
    def recursive_feature_elimination_cv(config_learning, config_data):

        output = open(os.path.expanduser(config_data.get("Learner", "models")) + "/" + "feature_ranks.txt", "w")

        feature_names = FeatureExtractor.get_features_from_config_file_unsorted(config_data)
        combination_methods = FeatureExtractor.get_combinations_from_config_file_unsorted(config_data)

        x_train = read_features_file(config_learning.get('x_train'), '\t')
        y_train = read_reference_file(config_learning.get('y_train'), '\t')
        x_test = read_features_file(config_learning.get('x_test'), '\t')
        estimator, scorers = learn_model.set_learning_method(config_learning, x_train, y_train)

        scale = config_learning.get("scale", True)

        if scale:
            x_train, x_test = scale_datasets(x_train, x_test)

        rfecv = RFECV(estimator=estimator, step=1, cv=StratifiedKFold(y_train, 2), scoring='accuracy')
        rfecv.fit(x_train, y_train)

        feature_list = []

        for i, feature_name in enumerate(feature_names):
            if combination_methods[i] == 'both':
                feature_list.append(feature_name)
                feature_list.append(feature_name)
            else:
                feature_list.append(feature_name)

        for i, name in enumerate(feature_list):
            output.write(name + "\t" + str(rfecv.ranking_[i]) + "\n")

        output.close()

        predictions = rfecv.predict(x_test)

        return predictions
Example 21
    # ----

    # Sane?
    assert X.shape[0] == chunks.shape[0], "X and chunks length mismatch"       
    assert np.sum(chunks == -1) == 0, "Chunks is malformed"  

# ----
# Classify
# ----

# ----
# Using RFE and linear SVM
clf = SVC(C=10, kernel="linear")
rfecv = RFECV(estimator=clf, step=1, cv=cv, scoring="accuracy")
rfecv.fit(X, y)
prediction = rfecv.predict(X)
print("Optimal feature number {0}".format(rfecv.n_features_))
print("Feature ranks {0}".format(rfecv.ranking_))
accs = accuracy_score(y, prediction)
#print(classification_report(y, prediction))
#print("Overall accuracy: {0}, Chance: {1}.".format(
#    np.round(accuracy_score(y, prediction), decimals=2), 
#    1.0/len(np.unique(y))))
# ----

# ----
# Using GradientBoostingClassifier
#clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, 
#        max_depth=1, random_state=0)
#accs = cross_val_score(clf, X, y=y, scoring="accuracy", cv=cv,
#        n_jobs=1, verbose=0,
Example 22
def predictAndPlot(data, header, features, name):
    print "\n%s" % name

    # First reduce the data to relevant features.
    features_plus_date = np.hstack((0, features))
    analyzed_data = data[:, features_plus_date]

    # Remove rows with missing data.
    for i in range(len(analyzed_data[0])):
        analyzed_data = analyzed_data[analyzed_data[:, i] != '']

    # If it is a retention feature, skip the last X entries.
    if "retention" in name:
        if "1d" in name:
            retention_feature_linesSkipped = 3
        elif "3d" in name:
            retention_feature_linesSkipped = 7
        elif "7d" in name:
            retention_feature_linesSkipped = 15
        elif "14d" in name:
            retention_feature_linesSkipped = 29
        elif "28d" in name:
            retention_feature_linesSkipped = 57
        else:
            retention_feature_linesSkipped = 0
        if retention_feature_linesSkipped > 0:
            analyzed_data = analyzed_data[:-retention_feature_linesSkipped, :]

        # The second-last line is # votes. If smaller than 50, skip this entry.
        # analyzed_data = analyzed_data[analyzed_data[:, -2].astype(float) >= min_daily_regs]

    # I added the date simply for plotting reasons. Just in case; it could be removed if not needed.
    dates = analyzed_data[:, 0]

    # Set best model and best score default values.
    best_model = ""
    best_score = -100

    # Iterate through all models to obtain the best parameters and features via cross validation
    for model_type in list_of_models:
        # Get training data X and y.
        X = analyzed_data[:, 1:-1].astype(float) # Ignore dates (first column) and "y" (last column)
        y = analyzed_data[:, -1].astype(float)

        model = define_model(model_type) # Set model parameters based on model_type

        # Perform differently depending on which model is used.
        # Random Forest has to be treated differently because it doesn't support RFECV.
        if model_type == "RF":
            to_be_used_threshold = "median"  # Default value. Will be overwritten.
            score = -100.

            # Loop through different thresholds. Use the one with the highest score.
            for model_threshold in ("10.*median", "3.*median", "1*median", "0.3*median", "0.1*median", "0.03*median"):
                try:
                    # Use only the "model_threshold" best features.
                    model.fit(X, y)
                    X_new = model.transform(X, threshold=model_threshold)
                    header_new = model.transform(header[features][:-1], threshold=model_threshold)

                    # Fit the model again with reduced features X_new and return out of bag score.
                    model.fit(X_new, y)
                    rf_score = model.oob_score_

                    # I try to keep the amount of features as small as possible.
                    # The rf_score of a model with more features needs to be 2% better to justify more params.
                    # In some cases the score is negative so it also needs to be better overall.
                    if (rf_score > score * 1.02) and (rf_score > score):
                        score = rf_score
                        to_be_used_threshold = model_threshold
                except:
                    # Just a debug output.
                    print "There was an error at model threshold: %s" % model_threshold

            print "Score is %2.3f with threshold: %s" % (score, to_be_used_threshold)
        elif model_type in ("ElasticCV", "Elastic", "linear", "LassoCV"):
            selector = RFECV(model)
            selector = selector.fit(X, y)
            header_new = header[features][:-1]
            score = selector.score(X, y)
            print "Score is %2.3f with model: %s" % (score, model_type)
        else:
            print "Something went wrong!"

        if score > best_score:
            best_score = score
            best_model = model_type

    print "Best score is %2.3f with model: %s" % (best_score, best_model)


    # Predict using the best model, parameters and features, obtained before.
    model_type = best_model
    model = define_model(model_type)

    if model_type == "RF":
        # In some rare cases the model does not work, because all features were discarded.
        # Therefore try to do it again without a threshold, that should always work (model_threshold).
        try:
            model.fit(X, y)
            X_new = model.transform(X, threshold = to_be_used_threshold)
            header_new = model.transform(header[features][:-1], threshold=to_be_used_threshold)

            model.fit(X_new, y)
            prediction = model.predict(X_new)
            score = model.oob_score_
        except:
            print "Fitting the model didn't work! The prediction might be sub-optimal. \nThreshold: %s" % model_threshold
            model.fit(X, y)
            prediction = model.predict(X)
            #score = model.oob_score_
            score = 0
    elif model_type in ("ElasticCV", "Elastic", "linear", "LassoCV"):
        selector = RFECV(model)
        selector = selector.fit(X, y)
        header_new = header[features][:-1]
        prediction = selector.predict(X)
        score = selector.score(X, y)
    else:
        print "lol!"

    # Now derive the importances respectively feature coefficients.
    try:
        # This only works with "RF"
        importances = model.feature_importances_
        importances_list = np.vstack((importances, header_new))
        importances_list = np.transpose(importances_list)
        importances_list = importances_list[importances_list[:, 0].astype(float).argsort()][::-1]
    except:
        # This should work with all other models.
        try:
            X_new = selector.transform(X)
            header_new = selector.transform(header_new)
            model.fit(X_new, y)
            med_value = np.median(X_new, axis=0)
            med_value[med_value == 0] = np.mean(X_new, axis=0)[med_value == 0]
            importances = model.coef_ * np.median(X_new, axis=0)
            importances_list = np.vstack((importances, header_new))
            importances_list = np.transpose(importances_list)
            importances_list = importances_list[importances_list[:, 0].astype(float).argsort()][::1]
        except:
            # If the above doesn't work, just give a blank output.
            importances_list = np.zeros((10, 2))

    score = "%s, %s\nOOB Score = %2.2f" % (name, model_type, score)

    plot_predictionVsActual(prediction, y, score)
    return prediction, y, dates, importances_list
Example 23
def rfecv_classifier(method,
                     train_data,
                     train_class,
                     test_data,
                     CV_=3,
                     fraction_feat_to_keep=0.1,
                     LM_params=get_ML_parameters(),
                     save_model=False):
    n_orig_features = len(list(train_data))
    max_ratio_diff = 1.2
    global have_written_params_to_file
    if have_written_params_to_file is False:
        logging.info("Run settings for models:")
        logging.info(str(LM_params))
        have_written_params_to_file = True
    # set classifier method

    clf = set_up_classifier(method, CV_, LM_params)

    # fit and predict based on whether cross validation is used
    if (CV_ > 1):
        step_elim = (1 - fraction_feat_to_keep) / CV_

        # Recursive feature elimination with Cross Validation
        # CV might have issues if data set classification is poorly balanced and can not split it properly
        try:
            rfecv = RFECV(estimator=clf,
                          step=step_elim,
                          cv=StratifiedKFold(n_splits=CV_, random_state=0),
                          scoring='accuracy')
            rfecv.fit(train_data, train_class)
            preds = rfecv.predict(test_data)

            current_fraction_features = rfecv.n_features_ / n_orig_features
            if (current_fraction_features * max_ratio_diff <
                    fraction_feat_to_keep):
                raise ValueError(
                    "Not enough features kept by RFECV defaulting to RFE")
        except ValueError:
            rfecv = RFE(estimator=clf,
                        step=step_elim,
                        n_features_to_select=int(fraction_feat_to_keep *
                                                 len(list(train_data))))
            rfecv.fit(train_data, train_class)
            preds = rfecv.predict(test_data)

        mask = list(rfecv.support_)
        features = train_data.columns
        features_selected = [
            features[i] for i in range(0, len(mask)) if mask[i]
        ]

        # sometimes RFECV does not eliminate enough features, so then lets run RFE to remove more if more than 20% over
        current_fraction_features = len(features_selected) / n_orig_features
        step_elim = (current_fraction_features - fraction_feat_to_keep) / CV_
        if (current_fraction_features >
                max_ratio_diff * fraction_feat_to_keep) and step_elim > 0:
            rfecv = RFE(estimator=clf,
                        step=step_elim,
                        n_features_to_select=int(fraction_feat_to_keep *
                                                 n_orig_features))
            rfecv.fit(train_data[features_selected], train_class)
            preds = rfecv.predict(test_data[features_selected])
            mask = list(rfecv.support_)
            features = train_data.columns
            features_selected = [
                features[i] for i in range(0, len(mask)) if mask[i]
            ]

    else:
        clf.fit(train_data, train_class)
        preds = clf.predict(test_data)

    return preds, features_selected, sum(mask)
Example 24
feature1 = SVD1.fit_transform(feature1)
feature2 = SVD2.fit_transform(feature2)

feature1 = feature1[labels != 2]
feature2 = feature2[labels != 2]
labels = labels[labels != 2]

feature = np.hstack((feature1, feature2))

from sklearn.feature_selection import RFECV

sfk = cv.StratifiedKFold(labels, 10)
scores = []
for train, test in sfk:
    score = []
    train_set = feature[train]
    test_set = feature[test]
    clf = RFECV(
        LinearSVC(C=100),
        cv=cv.StratifiedKFold(labels[train], 10),
        scoring='f1')
    clf.fit(train_set, labels[train])
    pred = clf.predict(test_set)
    score.append(accuracy_score(labels[test], pred))
    score.append(precision_score(labels[test], pred))
    score.append(recall_score(labels[test], pred))
    score.append(f1_score(labels[test], pred))
    scores.append(score)
avg = np.average(scores, axis=0)
print avg
Example 25
    if PLOT:
        plot_n_features_vs_score(rfecv.grid_scores_)

    # Testing the model
    print('>> Testing model\n')
    if DISJOINT_TESTING:
        TEST_DATA = pd.read_csv(TEST_DATA_PATH, sep='\t', index_col=0).values
        X_test = TEST_DATA[:, 0:-1]
        y_test = TEST_DATA[:, -1]

    # Normalizing the test data
    X_test = normalizer.transform(X_test)
    print(f'Cross validation : Stratified {K_FOLD}-Fold\n')
    print(f'Performance metric used for model optimization : "{SCORING}"\n')
    # Testing the model with the test set
    y_pred = rfecv.predict(X_test)
    # Printing model scores
    print_scores(y_test, y_pred)
    # Stopping the timer
    duration = time() - start
    print(
        'Operation took:', f'{duration:.2f} seconds.\n'
        if duration < 60 else f'{duration / 60:.2f} minutes.\n')
    print(
        f'\nProcess ended at :\n\nDate  :  {dt.today().strftime("%x")}\nTime  :  {dt.today().strftime("%X")}\n'
    )

    # Converting a selected section of the dataset_operations to a numpy array (based on best features)
    data_matrix = DATA[get_selected_features() + ['StepLabel']].values
    X = data_matrix[:, 0:-1]
    y = data_matrix[:, -1]
Example 26
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.33,
                                                    random_state=42)
l1 = LogisticRegression()
l1.fit(X_train, y_train)
p1 = l1.predict(X_test)
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
l2 = LogisticRegression()
rfecv = RFECV(estimator=l2, step=1, cv=StratifiedKFold(2), scoring='accuracy')
rfecv.fit(X_train, y_train)
print(rfecv.transform(X_train)[:1, :])
print(X_train.head(1))
print('By comparing the two we can find the features that were not selected')
print('Number of best suited features using RFECV')
print(rfecv.n_features_)
p2 = rfecv.predict(X_test)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
scaled_data = scaler.transform(X_train)
from sklearn.decomposition import PCA
pca = PCA(n_components=1)
pca.fit(scaled_data)
xtrain_pca = pca.transform(scaled_data)
xtest_pca = pca.transform(scaler.transform(X_test))
l3 = LogisticRegression()
l3.fit(xtrain_pca, y_train)
p3 = l3.predict(xtest_pca)
df_comp = pd.DataFrame(pca.components_, columns=X.columns)
print('PCA components for the features')
print(df_comp)
Example 27
Y=dataset.iloc[:,-1].values

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train ,y_test = train_test_split(X,Y,test_size = 0.2,random_state=0)

#Feature Scalling of training and testing data
scaler = StandardScaler()
scaler.fit(X)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

##After testing everything we used support vector classifier with RFECV##
clfsvm = SVC(kernel = 'linear')
selected=RFECV(clfsvm,n_jobs=-1)
selected.fit(X_train,y_train)
y_pred = selected.predict(X_test)

#Accuracy check
c = 0
for i in range(X_test.shape[0]):
    if y_pred[i] != y_test[i]:
        c = c + 1
print((X_test.shape[0] - c)/X_test.shape[0]*100)


#Answer
#Importing datasets
test_set= pd.read_csv("test.csv")
y_test=test_set.values
#Feature Scaling
y_test = scaler.transform(y_test)
Example 28
targets = data_train['TARGET']
train_data = data_train.drop(labels=['EID','TARGET'],axis=1)

#  split the data into training and test sets
train_x,test_x,train_y,test_y = train_test_split(train_data,targets,test_size=0.5,random_state=66)

# set the model parameters
xgb = XGBClassifier(n_estimators=300,max_depth=5,nthread=20,scale_pos_weight=4,learning_rate=0.07)
# feature selection
rfecv = RFECV(estimator=xgb, step=10, cv=StratifiedKFold(3),n_jobs =20,
              scoring='roc_auc')
rfecv.fit(train_x, train_y)

pre_y = rfecv.predict_proba(test_x)[:,1]
pre_y_categ = rfecv.predict(test_x)
# compute AUC
fpr, tpr, thresholds = metrics.roc_curve(test_y, pre_y)
auc=metrics.auc(fpr, tpr)
f1 = metrics.f1_score(test_y,pre_y_categ)
print("AUC score:")
print(auc)
print('f1-score:')
print(f1)
print('ranking_')
print(rfecv.ranking_ )
print('n_features_')
print(rfecv.n_features_)
print('support_')
print(rfecv.support_)
total_time = time() - t0
Example 29
X1 = parser.iloc[:, 3:len(parser.columns)].values  # [4,5,7,8,9]
Y1 = parser.iloc[:, 0].values
# splitting data set to training set and test set
X1_train, X1_test, Y1_train, Y1_test = non_shuffling_train_test_split(
    X1, Y1, test_size=0.25)

# feature scaling
sc_X = StandardScaler()
X1_train = sc_X.fit_transform(X1_train)
X1_test = sc_X.transform(X1_test)
# feature selection using recursive feature elimination & training classifer
classifier1 = RFECV(SVC(kernel="linear", random_state=0), scoring="accuracy")
# classifier1 = RFECV(LogisticRegression(random_state=0),scoring='accuracy')
classifier1.fit(X1_train, Y1_train)
# predict the test set result
Y1_pred = classifier1.predict(X1_test)

# to be used in part 2
tested_data, result_part1 = Y1_test, Y1_pred

####### performance of part 1

# confusion Matrix
cm = confusion_matrix(tested_data, result_part1)
print("confusion_matrix:\n", cm)
# accuracy
print("accuracy = ", accuracy_score(tested_data, result_part1))
# recall
print("recall = ", recall_score(tested_data, result_part1))
# precision
print("presicion = ", precision_score(tested_data, result_part1))
Example 30
		ratio = live/len(testData[i])
		print("%%live: ",ratio, "| name: ", dataName[i])

	importances = rfe.ranking_
	indices = np.argsort(importances)
	print("Feature ranking:")

	for f in range(n_features):
	    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

##################
# RANDOM FOREST WITH RFE WITH CV

if (fit_data_rfe_cv):
	for i in range(0, len(testData)):
		pred = rfe_cv.predict(testData[i])
		live = sum(pred)
		ratio = live/len(testData[i])
		print("%%live: ",ratio, "| name: ", dataName[i])

	importances = rfe_cv.ranking_
	indices = np.argsort(importances)
	print("Feature ranking:")

	for f in range(n_features):
	    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

if(feat_roc):
	half = int(n_samples / 2)
	x,y = shuffle(x,y,random_state=random_state)
	X_train, X_test = x[0:half], x[half:-1]
data_USA_target = data_USA['target']
data_USA.drop(['num','id','target'],axis = 1, inplace = True)
data_USA = pd.get_dummies(data_USA, columns= ['cp','restecg','slope','thal','loc'])
data_std = Standardize(data_USA)
data_std['target'] = data_USA_target
print("Data preprocessed...")
data = data_std.as_matrix()
train_x, test_x, train_y, test_y = train_test_split(data[:, 0:-1], data[:,-1],train_size=0.75)
names = list(data_USA.columns.values)
print("Executing Recursive Feature Elimination in SVM...")
svc = SVC(kernel="linear", C=5)
rfecv  = RFECV(estimator=svc, step=1, cv=StratifiedKFold(10),
              scoring='accuracy')
rfecv.fit(train_x, train_y)
Training_score = rfecv.score(train_x, train_y)
predicted= rfecv.predict(test_x)
accuracy = accuracy_score(test_y, predicted)
print("The support array \n",rfecv.support_)
print("The ranking array \n",rfecv.ranking_)
print(sorted(zip(map(lambda x: round(x, 4), rfecv.ranking_), names)))
print("Training Accuracy is ", Training_score)
print("Test Accuracy is ", accuracy)
print("The Cross-validation score :" ,max(rfecv.grid_scores_))
print("Optimal number of features : {}" .format(rfecv.n_features_))
# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()
def main(
        iterations=1,
        output_dir='output',
        one_hot_reform=True,
        rfecv_eval=True,
        deterministic=False,
        grid_search_eval=True,
        shuffle_holdout=True,
        plot_rfecv_gridscore=True,
        optimum_gbr_estimate = True,
        max_gbr_iterations = 5000,
        plot_all_gridscores=True,
        holdout_size=0.15,
        crossfolds=5,
        one_hot_reform_categories=ONE_HOT_REFORM_CATEGORIES_,
        database_path=DATABASE_PATH,
        target_data_column_name='depAttEff',
        gbr_parameter_grid_=GBR_PARAMETER_GRID_,
        gbr_initial_params=GBR_INITIAL_PARAMS_):

    # input database
    database_basename = os.path.basename(DATABASE_PATH)

    # output directory
    output_dir = os.path.join(output_dir, 'regressor')
    make_dirs(output_dir)

    # initialize predicted and holdout tracking
    rfecv_y_predicted_track = []
    rfecv_y_holdout_track = []

    # initialize score tracking
    score_track_mae = []
    score_track_r2 = []
    rfecv_gridscore_track = []

    # initialize feature tracking and ranking
    feature_track = []
    feature_rank = []

    training_data = pd.read_csv(database_path)
    target_data = training_data[target_data_column_name]

    # make sure to drop the target data from the training data
    training_data = training_data.drop([target_data_column_name], 1)

    # initialize the regressor with initial params
    clf = GradientBoostingRegressorWithCoef(**gbr_initial_params)

    if one_hot_reform:
        training_data, _, _ = one_hot_dataframe(
            training_data, one_hot_reform_categories, replace=True)

    for run in xrange(iterations):
        print "run: ", run+1
        y_all = np.array(target_data)
        x_all = training_data.as_matrix()

        if shuffle_holdout:
            random_state = _SEED if deterministic else None
            sss = cross_validation.ShuffleSplit(len(y_all),
                                                n_iter=1,
                                                test_size=holdout_size,
                                                random_state=random_state)

            for train_index, test_index in sss:
                x_train, x_holdout = x_all[train_index], x_all[test_index]
                y_train, y_holdout = y_all[train_index], y_all[test_index]

        '''The logic is to optimize the parameters for all the features before
        RFECV'''
        if grid_search_eval:
            if optimum_gbr_estimate:
                # initial params for optimum finding

                # determine minimum number of estimators with least overfitting

                opt_gbr = np.arange(max_gbr_iterations) + 0
                test_score = heldout_score(clf, x_train, y_train, max_gbr_iterations)
                test_score /= test_score[0]
                test_best_iter = opt_gbr[np.argmin(test_score)]
                print test_best_iter

                # use the estimated optimum number of estimators in the grid search.
                gbr_parameter_grid_['n_estimators'] = [test_best_iter]

                # then implement grid search alg.
                grid_searcher = grid_search.GridSearchCV(estimator=clf,
                                                         cv=crossfolds,
                                                         param_grid=gbr_parameter_grid_,
                                                         n_jobs=-1)

                # call the grid search fit using the data
                grid_searcher.fit(x_train, y_train)

                # store and print the best parameters
                best_params = grid_searcher.best_params_

        else:
            ''' The logic is that if we don't do grid search, use initial
                    params as 'best' '''
            best_params = gbr_initial_params

        # re-initialize and fit with the "best params"
        clf = GradientBoostingRegressorWithCoef(**best_params)
        clf.fit(x_train, y_train)

        if rfecv_eval:
            rfecv = RFECV(
                estimator=clf,
                step=1,
                cv=crossfolds,
                scoring='mean_absolute_error')

            # perform rfecv fitting
            rfecv.fit(x_train, y_train)

            # track predicted y values
            rfecv_y_predicted = rfecv.predict(x_holdout)
            rfecv_y_predicted_track.append(rfecv_y_predicted)

            # track truth y_holdout values
            rfecv_y_holdout_track.append(y_holdout)

            # track grid score rankings
            rfecv_gridscore_track.append(rfecv.grid_scores_)

            # track MAE performance of estimator to predict holdout
            score_track_mae.append(metrics.mean_absolute_error(
                rfecv_y_predicted, y_holdout))

            # track overall r2 performance to predict holdout
            score_track_r2.append(metrics.r2_score(
                rfecv_y_predicted, y_holdout))

            # create array of feature ranks (contains all features)
            feature_rank.append(rfecv.ranking_)
            feat_names = np.array(list(training_data), copy=True)

            # create array of only selected features
            rfecv_bool = np.array(rfecv.support_, copy=True)
            sel_feat = list(compress(feat_names, rfecv_bool))
            feature_track.append(sel_feat)

        if plot_rfecv_gridscore and rfecv_eval:
            plt.plot(rfecv_y_predicted, y_holdout, '+')
            plt.plot(y_holdout, y_holdout, 'r-')
            plt.show()

            plt.xlabel("Number of features selected")
            plt.ylabel("Cross validation score (MAE)")
            plt.plot(range(1, len(rfecv.grid_scores_) + 1),
                     rfecv.grid_scores_)
            plt.show()

    # Output used to plot the relative rank of each feature.
    feature_rank_df = pd.DataFrame(feature_rank)
    feature_rank_df.columns = feat_names 
    feature_rank_df = feature_rank_df.transpose()
    feature_rank_df.to_csv('feature_rank_df.csv')

    # Output used to plot only the best features
    feature_track = pd.DataFrame(feature_track)
    feature_track = feature_track.transpose()
    feature_track.to_csv('feature_track.csv')

    # overall r2 value for all runs
    overall_r2 = metrics.r2_score(
        np.array(rfecv_y_predicted_track).ravel(order='C'), np.array(
            rfecv_y_holdout_track).ravel(order='C'))

    # Output to plot the predicted y values 
    rfecv_y_predicted_track = pd.DataFrame(rfecv_y_predicted_track).transpose()
    rfecv_y_predicted_track.to_csv('rfecv_y_predicted_track.csv')

    # Output to plot the holdout y values (truth)
    rfecv_y_holdout_track = pd.DataFrame(rfecv_y_holdout_track).transpose()
    rfecv_y_holdout_track.to_csv('rfecv_y_holdout_track.csv')

    # Output used to plot the optimum model MAE 
    score_track_mae = pd.DataFrame(score_track_mae).transpose()
    score_track_mae.to_csv('score_track_mae.csv')
    print score_track_mae

    # Output used to plot the optimum model r2 
    score_track_r2 = pd.DataFrame(score_track_r2).transpose()
    score_track_r2.to_csv('score_track_r2.csv')

    # transpose dataframe for ease of viewing and plotting
    rfecv_gridscore_track = pd.DataFrame(rfecv_gridscore_track)
    rfecv_gridscore_track = rfecv_gridscore_track.transpose()
    rfecv_gridscore_track.to_csv('rfecv_gridscore_track.csv')

    if plot_all_gridscores:
        rfecv_gridscore_track.plot(kind='line')
        plt.show()
Example 33
def main():
    
    filenameLB = 'mfcc_lb.csv'
    allsongcat = pickle.load(open('allsongcat.p', 'rb'))
    #hcdf = pickle.load(open('hcdf_fv.p', 'rb'))
    
    with open('mfcc_lb.csv') as f:
        reader = csv.reader(f)
        for row in reader:
            labels = row
     
    # select training and test sets
    '''
    TEidx = np.array(random.sample(range(0,1000), 100))
    
    training = []
    test = []
    
    trainingLB = []
    testLB = []

    # make numpy arrays
    for i in range(1000):
        if i in TEidx:
            test.append(featureDict[i])
            testLB.append(int(labels[i]))
        else:
            training.append(featureDict[i])
            trainingLB.append(int(labels[i]))
        
    # fit with classifier and predict
    X = np.array(training)
    Y = np.array(trainingLB)

    '''
    l=[allsongcat]
    all_feats = combineFeatures(l)
    feats_shuf = []
    labels_shuf = []
    index_shuf = range(len(labels))
    shuffle(index_shuf)
    for i in index_shuf:
        feats_shuf.append(all_feats[i])
        labels_shuf.append(int(labels[i]))


    X = np.array(feats_shuf)
    Y = np.array(labels_shuf)

    kf = KFold(1000, n_folds=3)
    cla = SVR(kernel="linear")
    selector = RFECV(cla, step=1, cv=3)
    selector = selector.fit(X,Y)

    scores = 0.0
    cm_all = np.zeros((10,10), dtype=np.int)
    for train, test in kf:
        X_train, X_test, y_train, y_test = X[train], X[test], Y[train], Y[test]
        #cla.fit(X_train, y_train)
        predictions = selector.predict(X_test)
        scores += zero_one_loss(predictions, y_test)

        # Compute confusion matrix
        cm = confusion_matrix(y_test, predictions, labels =[1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
        np.set_printoptions(precision=2)
        #print(cm_all)
        cm_all = np.add(cm_all, cm)
    
    print scores/3
    plt.figure()
    plot_confusion_matrix(cm_all)

    plt.show()
df['SexN']=df['Sex']
df1['SexN']=df1['Sex']


enc=LabelEncoder()



df['SexN']=enc.fit_transform(df['Sex']) 
df1['SexN']=enc.fit_transform(df1['Sex'])



X_train=df[['Pclass','SibSp','Parch','Fare','AgeN','SexN']] 
y_train=df['Survived']
X_test=df1[['Pclass','SibSp','Parch','Fare','AgeN','SexN']]
X_test1=df1[['PassengerId','Pclass','SibSp','Parch','Fare','AgeN','SexN']]
svc=SVC(kernel='linear')
#svc=DecisionTreeClassifier(criterion='entropy')
rfecv=RFECV(estimator=svc, step=1, cv=StratifiedKFold(y_train, 5),scoring='accuracy')
rfecv.fit(X_train,y_train)
predictions=rfecv.predict(X_test)
print rfecv.score(X_train,y_train)
print("Optimal number of features : %d" % rfecv.n_features_)

finlist=zip(X_test1['PassengerId'],predictions)
with open("/Users/prakashchandraprasad/Desktop/datasets/Titanic/Decision_tree_titanic7.csv","wb") as f:
    writer=csv.writer(f)
    writer.writerow(["PassengerId","Survived"])
    writer.writerows(finlist)
Example 35
#Validate
if (options.validate):
    y_true = train_labels.values[:,1].ravel().astype(int)
    validate.KFold(train.values[:,4:], y_true)
    #validate.KFold(train.drop('subject', axis=1).values, y_true)

X_train = train.values[:,4:]
X_test = test.values[:,4:]
pca = PCA(n_components=70)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

print('training')
output = "py_{0}.csv".format(submission_id)
clf = LinearRegression()
y_true = train_labels.values[:,1].ravel().astype(int)
rfecv = RFECV(estimator=clf, step=1, cv=StratifiedKFold(y_true, 4),
              scoring='roc_auc')
rfecv.fit(X_train, y_true)

plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()

preds = rfecv.predict(X_test)
submission['Prediction'] = preds
submission.to_csv(output,index=False)
print('Done')
Example No. 36
X_perc = SelectPercentile(percentile=50).fit(X, Y)
X_selected = X_perc.transform(X)

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
rfeLoR = RFE(LogisticRegression(solver='saga', max_iter=1000), 100)
# The sag solver works well on large datasets but is sensitive to feature scaling; saga also handles sparsity
rfeLoR.fit(X, Y)
rfeLoR.n_features_

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV

m_RFERFC = RFECV(RandomForestClassifier(n_estimators=100), scoring='accuracy')
m_RFERFC.fit(X, Y)  # fit returns the fitted selector
X_RFERFC = m_RFERFC.transform(X)  # keep only the columns selected by RFECV
m_RFERFC.score(X, Y)

from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SelectFromModel
m_lasso = SelectFromModel(LassoCV())
m_lasso.fit(X, Y)
m_lasso.transform(X).shape
X_lasso = m_lasso.transform(X)
m_lasso.get_params()
mask = m_lasso.get_support()
print(mask)
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
X.columns[mask]
#Using CV helps reduce selection bias due to the observations in the training set
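# A minimal, self-contained sketch (not part of the example above) illustrating the
# point in the comment: RFE needs n_features_to_select fixed up front, while RFECV
# chooses it by cross-validation. The synthetic data below is purely for demonstration.
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE, RFECV
from sklearn.linear_model import LogisticRegression

X_demo, y_demo = make_classification(n_samples=300, n_features=20, n_informative=5,
                                     random_state=0)
rfe_fixed = RFE(LogisticRegression(max_iter=1000), n_features_to_select=5).fit(X_demo, y_demo)
rfe_cv = RFECV(LogisticRegression(max_iter=1000), cv=5, scoring='accuracy').fit(X_demo, y_demo)
print(rfe_fixed.n_features_)  # always 5, fixed by hand
print(rfe_cv.n_features_)     # chosen by cross-validation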
Example No. 37
for i in range(0, 6 * piece):
    col.append("max" + str(i + 1))
    col.append("mean" + str(i + 1))
    col.append("std" + str(i + 1))
train_target = train_data.iloc[:, -1]
train_data = train_data[col]

model = LogisticRegression(max_iter=20)
clf = RFECV(model, step=1, cv=5, n_jobs=-1)
clf = clf.fit(train_data, train_target)
pured_data = train_data.iloc[:, clf.support_]

model = LogisticRegression(max_iter=20)
clf = RFECV(model, step=1, cv=5, n_jobs=-1)
clf = clf.fit(pured_data, train_target)
pred = clf.predict(pured_data)
falsePositiveRate, truePositiveRate, thresholds = roc_curve(train_target, pred)
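# Optional additions (not in the original snippet): summarize the ROC curve with AUC.
# Note that `pred` holds hard class predictions; clf.predict_proba(pured_data)[:, 1]
# would give a smoother, more informative curve.
from sklearn.metrics import auc
print("AUC:", auc(falsePositiveRate, truePositiveRate))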

confu_mat = pd.crosstab(train_target,
                        pred,
                        rownames=['True'],
                        colnames=['Predicted'],
                        margins=True)
print("confusion matrix")
print(confu_mat)
print("parameters")
statLogitModel = sm.Logit(train_target, pured_data).fit_regularized()
print(statLogitModel.params)
print("P-values")
scores, pvalues = chi2(pured_data, train_target)
for i in range(len(pvalues)):
    f_statistic, p_value, _ = sm.stats.diagnostic.het_goldfeldquandt(
        y_test, X_test, idx=1, alternative='two-sided')
    print(p_value)

    fig = sm.graphics.qqplot(stud_resid, line='45')
    '''Recursive Feature Elimination with Cross-Validation'''
    # Scoring functions
    msle_func = make_scorer(mean_squared_log_error)
    mse_func = make_scorer(mean_squared_error)
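    # Note (an assumption, not shown in the original snippet): make_scorer defaults to
    # greater_is_better=True, so error metrics such as MSLE/MSE should be wrapped with
    # greater_is_better=False if they are passed to RFECV's `scoring` argument,
    # e.g. make_scorer(mean_squared_log_error, greater_is_better=False)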

    # Recursive Feature Elimination
    estimator = LinearRegression()
    selector = RFECV(estimator, cv=10)
    # selector = RFE(estimator, n_features_to_select=20)
    selector = selector.fit(X_train, np.log(y_train))
    y_pred = selector.predict(X_test)

    # Replace negative (log-space) predictions with the mean of the training target before exponentiating
    y_pred[y_pred < 0] = y_train.mean()
    y_pred = np.exp(y_pred)

    # Scoring
    rmse_score = np.sqrt(mean_squared_error(y_pred, y_test))
    rmsle_score = np.sqrt(mean_squared_log_error(y_pred, y_test))

    print('Selected features: {}'.format(X_train.columns[selector.support_]))
    print('\nRMSE: {}'.format(rmse_score))
    print('RMSLE: {}'.format(rmsle_score))
    '''Ridge Regression'''
    # Set the range of hyper-parameters to search
    params = {'alpha': [1, 0.1, 0.01, 0.001, 0.0001]}
Example No. 39
# print test_X.shape, test_Y.shape

logistic_reg = LogisticRegression()
logistic_reg.fit(train_X, train_Y)
print(logistic_reg.score(test_X_1, test_Y_1))
# test_Y = logistic_reg.predict(test_X)

# result.to_csv('result.csv', encoding='utf-8', index=False)
Svc = SVC()
Svc.fit(train_X, train_Y)
print(Svc.score(test_X_1, test_Y_1))
# test_Y = Svc.predict(test_X)

model = RandomForestClassifier(n_estimators=100)
model.fit(train_X, train_Y)
print(model.score(test_X_1, test_Y_1))
# test_Y = model.predict(test_X)

rfecv = RFECV(estimator=model,
              step=1,
              cv=StratifiedKFold(train_Y, 2),
              scoring='accuracy')
rfecv.fit(train_X, train_Y)
print(rfecv.score(test_X_1, test_Y_1))
test_Y = rfecv.predict(test_X)

passenger_id = full[891:].PassengerId
test = pd.DataFrame({'PassengerId': passenger_id, 'Survived': test_Y})
print(test.shape)
test.to_csv('pred.csv', index=False)
Example No. 40
# Build a classification task using 3 informative features
#X, y = make_classification(n_samples=1000, n_features=25, n_informative=3,
#                          n_redundant=2, n_repeated=0, n_classes=8,
#                         n_clusters_per_class=1, random_state=0)

# Create the RFE object and compute a cross-validated score.
#svc = SVC(kernel="linear")
# The "accuracy" scoring is proportional to the number of correct
# classifications
RF_rfecv = RFECV(estimator=modeltry,
                 step=1,
                 cv=StratifiedKFold(2),
                 scoring='accuracy')
RF_rfecv.fit(train_me_X, train_me_y)
features = RF_rfecv.get_support(indices=False)
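# `features` is a boolean mask over the input columns (indices=False); with
# indices=True, get_support would return the selected column positions instead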
RF_rfecv_predict = RF_rfecv.predict(test_me_X)
RF_rfecv_accuracy = metrics.accuracy_score(RF_rfecv_predict, test_me_y)

print(features)
print("Optimal number of features : %d" % RF_rfecv.n_features_)
print(RF_rfecv_accuracy)

# In[21]:

color_function = {
    0: "blue",
    1: "red"
}  # red will be 1, which means M, and blue will be 0, which means B
colors = data["diagnosis"].map(lambda x: color_function.get(
    x))  # mapping the color function over the diagnosis column
pd.plotting.scatter_matrix(data[features_mean],
Example No. 41
def main():

    filenameLB = 'mfcc_lb.csv'
    allsongcat = pickle.load(open('allsongcat.p', 'rb'))
    #hcdf = pickle.load(open('hcdf_fv.p', 'rb'))

    with open(filenameLB) as f:
        reader = csv.reader(f)
        for row in reader:
            labels = row

    # select training and test sets
    '''
    TEidx = np.array(random.sample(range(0,1000), 100))
    
    training = []
    test = []
    
    trainingLB = []
    testLB = []

    # make numpy arrays
    for i in range(1000):
        if i in TEidx:
            test.append(featureDict[i])
            testLB.append(int(labels[i]))
        else:
            training.append(featureDict[i])
            trainingLB.append(int(labels[i]))
        
    # fit with classifier and predict
    X = np.array(training)
    Y = np.array(trainingLB)

    '''
    l = [allsongcat]
    all_feats = combineFeatures(l)
    feats_shuf = []
    labels_shuf = []
    index_shuf = list(range(len(labels)))
    shuffle(index_shuf)
    for i in index_shuf:
        feats_shuf.append(all_feats[i])
        labels_shuf.append(int(labels[i]))

    X = np.array(feats_shuf)
    Y = np.array(labels_shuf)

    kf = KFold(1000, n_folds=3)
    cla = SVR(kernel="linear")
    selector = RFECV(cla, step=1, cv=3)
    selector = selector.fit(X, Y)

    scores = 0.0
    cm_all = np.zeros((10, 10), dtype=int)
    for train, test in kf:
        X_train, X_test, y_train, y_test = X[train], X[test], Y[train], Y[test]
        #cla.fit(X_train, y_train)
        # SVR outputs continuous values; round to the nearest class label (clipped to 1-10)
        # so that the 0-1 loss and confusion matrix below can be computed
        predictions = np.clip(np.rint(selector.predict(X_test)), 1, 10).astype(int)
        scores += zero_one_loss(predictions, y_test)

        # Compute confusion matrix
        cm = confusion_matrix(y_test,
                              predictions,
                              labels=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
        np.set_printoptions(precision=2)
        #print(cm_all)
        cm_all = np.add(cm_all, cm)

    print(scores / 3)
    plt.figure()
    plot_confusion_matrix(cm_all)

    plt.show()
Example No. 42
    elif rfecv.ranking_[i] == 8:
        print('8: feature ' + str(i) + ': ' + num_to_name[i])
    i += 1
'''
# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()
'''

#now we do prediction

#load test data
new_X_test, new_y_test = load_svmlight_file('test_svm_data.txt')

test_results = rfecv.predict(new_X_test)

print(test_results)

i = 0
with open('test_rest_names.txt', 'r') as f:
    for line in f.readlines():
        name = line.rstrip('\n')
        if test_results[i] == 2:
            print(name + ' will fail in 3 years')
        else:
            print(name + ' is staying alive')
        i += 1

Example No. 43
#plug it into recursive feature elimination with 10-fold cross-validation and MSE scoring metric
rfecv = RFECV(estimator=clf_7, step=1, cv=KFold(10), scoring='neg_mean_squared_error')

#pipeline is gonna help us retrieve the feature names
#name your classifier and estimator whatever you want, and stick em in tuples
pipeline = Pipeline([
    ('rfe_cv',rfecv),
    ('clf',clf_7)
])

#fit that pipe, bro
pipeline.fit(X_train, y_train)

#how'd we do on the test set?
mse = mean_squared_error(y_test, rfecv.predict(X_test))
print("MSE: %.4f" % mse)

#how many features did we really need?
print("Optimal number of features : %d" % rfecv.n_features_)

#the .named_steps attribute from the pipeline can be indexed with whatever you named your RFECV estimator
#from there you use the .support_ attribute to help you get the feature names
#note: support_feat is just a boolean mask you can apply to the full array of features to get just the ones used by the model
support_feat = pipeline.named_steps['rfe_cv'].support_

#aliasing that full array of features
feat_names = np.array(list(gbr_df2.drop(['Overall Achievement Score','Overall Achievement Score Scaled','SPG Score Scaled'],axis=1).columns))

#and pulling out the feature names with boolean masking
feat_names[support_feat]
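#an equivalent shortcut (a small addition, not in the original snippet):
#get_support(indices=True) returns the selected column positions directly
print(feat_names[pipeline.named_steps['rfe_cv'].get_support(indices=True)])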
Example No. 44
scaler = preprocessing.StandardScaler()
scaler.fit(explanatory_df2)
explanatory_df2 = pandas.DataFrame(scaler.transform(explanatory_df2), columns=explanatory_df2.columns)

from sklearn.preprocessing import Imputer

numeric_features = df2.ix[:, df2.dtypes != "object"]
imputer_object = Imputer(missing_values="NaN", strategy="median", axis=0)
imputer_object.fit(numeric_features)
numeric_features = pandas.DataFrame(imputer_object.transform(numeric_features), columns=numeric_features.columns)

from sklearn.feature_selection import RFECV
from sklearn import tree

# The original snippet does not show where rfe_cv is defined; a decision-tree
# estimator (matching the import above) scored with ROC AUC is assumed here
rfe_cv = RFECV(estimator=tree.DecisionTreeClassifier(), step=1, cv=10, scoring='roc_auc')
rfe_cv.fit(explanatory_df2, response_series)
rfe_cv.predict(explanatory_df2)

print "Optimal number of features :{0} of {1} considered".format(rfe_cv.n_features_, len(explanatory_df2.columns))
print rfe_cv.grid_scores_

plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (ROC_AUC)")
plt.plot(range(1, len(rfe_cv.grid_scores_) + 1), rfe_cv.grid_scores_)
plt.show()

features_used = explanatory_df.columns[rfe_cv.get_support()]
print(features_used)


#               IGNORE                  #