def test_f_regression_input_dtype():
    """Check that f_regression gives the same result for int and float targets."""
    rng = np.random.RandomState(0)
    X = rng.rand(10, 20)
    # The builtin ``int``/``float`` replace the ``np.int``/``np.float``
    # aliases, which were deprecated in NumPy 1.20 and removed in 1.24.
    y = np.arange(10).astype(int)

    F1, pv1 = f_regression(X, y)
    F2, pv2 = f_regression(X, y.astype(float))
    # F-scores and p-values must agree to 5 decimals regardless of y's dtype.
    assert_array_almost_equal(F1, F2, 5)
    assert_array_almost_equal(pv1, pv2, 5)
Esempio n. 2
0
def test_f_regression_input_dtype():
    """
    Test whether f_regression returns the same value
    for any numeric data_type
    """
    rng = np.random.RandomState(0)
    X = rng.rand(10, 20)
    # ``np.int`` and ``np.float`` were removed in NumPy 1.24; the builtin
    # types request the same dtypes.
    y = np.arange(10).astype(int)

    F1, pv1 = f_regression(X, y)
    F2, pv2 = f_regression(X, y.astype(float))
    assert_array_almost_equal(F1, F2, 5)
    assert_array_almost_equal(pv1, pv2, 5)
Esempio n. 3
0
def score_features(X, y):
    """Run the univariate F-test of every column of ``X`` against ``y``.

    Returns whatever ``f_regression`` returns for ``(X, y)``. The result was
    previously computed and then discarded, so callers always got ``None``.
    """
    return f_regression(X, y)


#  F-test
# [(460.65647274586092, 'RM'),
#  (668.7235158135851, 'LSTAT'),
#  (179.57156528039974, 'PTRATIO'),
#  (632.59314407749389, 'RM * LSTAT'),
#  (1.7387467537056727, 'RM * PTRATIO'),
#  (533.63937869220888, 'RM * RM'),
#  (365.17851177390713, 'LSTAT * LSTAT'),
#  (748.91444579648237, 'PTRATIO * LSTAT'),
#  (186.86355836745318, 'PTRATIO * PTRATIO')]
Esempio n. 4
0
    def fit_model(self):
        """Train a random-forest classifier on the review data and save it.

        Reads ``data_wrangle/df_final.csv``, keeps the features whose
        univariate F-test p-value against ``overall`` is below 0.05, fits a
        ``RandomForestClassifier`` on an 80% training split and dumps the
        fitted model to ``self.filename``.

        Returns:
            The fitted classifier.
        """
        # Read data from csv.
        df = pd.read_csv('data_wrangle/df_final.csv')

        # Drop reviews that have no score.
        df = df.dropna(subset=['overall'])
        df['overall'] = df['overall'].astype(int)

        # Create dummy variables for chords_scale and chords_key.
        dummy = pd.get_dummies(df['chords_scale'])
        df = pd.concat([df, dummy], axis=1)

        dummy = pd.get_dummies(df['chords_key'])
        df = pd.concat([df, dummy], axis=1)
        columns = ['spectral_complexity', 'average_loudness', 'dissonance', 'pitch_salience',
                   'dynamic_complexity', 'tuning_frequency',
                   'chords_strength', 'chords_changes_rate', 'bpm', 'danceability', 'beats_count', 'length',
                   'A', 'A#', 'B', 'C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'minor', 'major']
        df = shuffle(df)
        y = df['overall']
        x = df[columns]

        # Inspect the per-feature p-values. Selecting on them gave roughly a
        # 1% score improvement with logistic regression and random trees.
        # The F-test is run once; it used to be run separately for F and P.
        f_scores, p_scores = feature_selection.f_regression(x, y)
        p_values = pd.DataFrame()
        p_values['column'] = columns
        p_values['P'] = p_scores
        p_values['F'] = f_scores

        good_cols = []
        for _, row in p_values.iterrows():
            if row.P < 0.05:
                print(row.column, row.P)
                good_cols.append(row.column)

        y2 = df['overall']
        x2 = df[good_cols]

        X_train, X_test, y_train, y_test = model_selection.train_test_split(x2, y2, test_size=0.2)

        #### rfc with best params ####
        class_weight = {1: 0.996, 2: 0.001, 3: 0.001, 4: 0.0015, 5: 0.0005}
        # class_weight = dict({1:0.70, 2:0.15, 3:0.1, 4:0.049, 5:0.001})
        # 'sqrt' is what 'auto' meant for classifiers; 'auto' was deprecated
        # in scikit-learn 1.1 and removed in 1.3.
        rtc_best = RandomForestClassifier(class_weight=class_weight, bootstrap=True, criterion='gini', max_depth=30,
                                          max_features='sqrt', min_samples_split=4, n_estimators=300)

        ## export model ##
        model = rtc_best.fit(X_train, y_train)
        joblib.dump(model, self.filename)
        return model
def test_f_regression_center():
    """Check that f_regression adjusts the degrees of freedom with ``center``.

    Both variates are zero-mean, so the centered and uncentered F-scores
    differ only by the ratio of their degrees of freedom.
    """
    # Toy example: one feature running from -5 to 5 (zero mean).
    X = np.arange(-5, 6).reshape(-1, 1)
    n_samples = X.size
    # Alternating -1/+1 target with the first entry zeroed so Y has zero mean.
    Y = np.where(np.arange(n_samples) % 2 == 0, -1.0, 1.0)
    Y[0] = 0.0

    f_centered = f_regression(X, Y, center=True)[0]
    f_uncentered = f_regression(X, Y, center=False)[0]
    dof_ratio = (n_samples - 1.0) / (n_samples - 2.0)
    assert_array_almost_equal(f_centered * dof_ratio, f_uncentered)
    assert_almost_equal(f_uncentered[0], 0.232558139)  # value from statsmodels OLS
def test_f_regression_center():
    """Verify the dof relationship between centered and uncentered F-scores.

    With two zero-mean variates the uncentered F-score equals the centered
    one scaled by ``(n - 1) / (n - 2)``.
    """
    # Single zero-mean feature: -5, -4, ..., 5.
    X = np.arange(-5, 6).reshape(-1, 1)
    n_samples = X.size
    # Start from all ones, flip every even position, then zero the first
    # entry so the target mean is null.
    Y = np.ones(n_samples)
    Y[0::2] = -Y[0::2]
    Y[0] = 0.0

    centered_scores, _ = f_regression(X, Y, center=True)
    raw_scores, _ = f_regression(X, Y, center=False)
    scaled = centered_scores * (n_samples - 1.0) / (n_samples - 2.0)
    assert_array_almost_equal(scaled, raw_scores)
    assert_almost_equal(raw_scores[0], 0.232558139)  # value from statsmodels OLS
Esempio n. 7
0
def interaction_feature(feature, target):
    """Add pairwise interaction columns and F-test the original features.

    Args:
        feature: DataFrame holding at least the columns ``div_rank``,
            ``record``, ``streak``, ``gb``, ``runs``, ``runs_ma``,
            ``runs_pg``, ``time``, ``innings`` and ``runs_allowed``.
        target: Regression target passed to ``f_regression``.

    Returns:
        A tuple ``(new_features, kept, unkept)``: ``feature`` extended with
        the interaction columns, the subset of it whose original features
        have p-value < 0.05, and the score rows with p-value > 0.05.
    """
    poly = PolynomialFeatures(degree=2, interaction_only=True)

    # NOTE(review): with interaction_only=True the last two products of four
    # inputs are presumably record*gb and streak*gb, so the labels
    # 'record*record' and 'streak*record' look mislabeled. They are kept
    # because callers may depend on the names; confirm.
    record = pd.DataFrame(
        poly.fit_transform(feature[['div_rank', 'record', 'streak', 'gb']]),
        index=feature.index,
        columns=[
            'bias', 'div_rank', 'record', 'streak', 'gb', 'div_rank*record',
            'div_rank*streak', 'div_rank*gb', 'record*streak', 'record*record',
            'streak*record'
        ])

    record_int = record[[
        'div_rank*record', 'div_rank*streak', 'div_rank*gb', 'record*streak',
        'record*record', 'streak*record'
    ]]

    run = pd.DataFrame(
        poly.fit_transform(feature[['runs', 'runs_ma', 'runs_pg']]),
        index=feature.index,
        columns=[
            'bias', 'runs', 'runs_ma', 'runs_pg', 'runs*runs_ma',
            'runs*runs_pg', 'runs_ma*runs_pg'
        ])
    # NOTE(review): unlike the other groups, the constant 'bias' column is
    # carried into the output here; confirm this is intended.
    runs_int = run[['bias', 'runs*runs_ma', 'runs*runs_pg', 'runs_ma*runs_pg']]

    # Named ``time_poly`` so the local does not shadow the ``time`` module name.
    time_poly = pd.DataFrame(
        poly.fit_transform(feature[['time', 'innings', 'runs_allowed']]),
        index=feature.index,
        columns=[
            '1', 'time', 'innings', 'runs_allowed', 'time*innings',
            'time*runs_allowed', 'innings*runs_allowed'
        ])
    time_int = time_poly[[
        'time*innings', 'time*runs_allowed', 'innings*runs_allowed'
    ]]

    new_features = pd.concat([feature, time_int, runs_int, record_int], axis=1)

    # Run the F-test once and reuse both outputs; it used to be run twice.
    f_scores, p_values = f_regression(feature, target)
    feature_p = pd.DataFrame(index=feature.columns,
                             columns=['f_score', 'p_value'])
    feature_p['f_score'] = f_scores
    feature_p['p_value'] = p_values

    kept_features = feature_p[feature_p['p_value'] < 0.05].index
    unkept_features = feature_p[feature_p['p_value'] > 0.05]

    return new_features, new_features[kept_features], unkept_features
Esempio n. 8
0
def uniFeatureReg(index=0, taskID='filesReg'):
    """Return the univariate F-test p-values of the encoded training features.

    Every non-object column returned by ``dataEncoding`` is min-max scaled
    in place before the features are tested against the validation target.
    """
    _, train_frame, target = dataEncoding(index, taskID)

    for column in train_frame.columns:
        if train_frame[column].dtype == 'O':
            # Object columns are left untouched.
            continue
        as_float = train_frame[column].astype('float')
        train_frame[column] = pre.minmax_scale(as_float)

    _, p_values = f_regression(train_frame, target)
    return p_values
Esempio n. 9
0
def run_linear_model(model_class,
                     X_train,
                     X_test,
                     y_train,
                     y_test,
                     num_folds=None,
                     alpha=None,
                     l1_ratio=None,
                     print_results=False):
    """Fit a linear model and record its statistics in ``model_stats_dict``.

    The model flavour (OLS, ridge, lasso or elastic net) is inferred from the
    last component of ``model_class.__module__``.

    Args:
        model_class: Estimator class to instantiate.
        X_train, X_test: Feature frames; ``X_train.columns`` is recorded.
        y_train, y_test: Targets.
        num_folds: If truthy, number of cross-validation folds to run.
        alpha: Regularisation strength for ridge / lasso / elastic net.
        l1_ratio: When given with a coordinate-descent model, selects
            elastic net instead of lasso.
        print_results: If true, call ``print_prediction_metrics``.

    Returns:
        ``None``. An unrecognised ``model_class`` prints a message and
        returns before anything is recorded.
    """
    # Newer scikit-learn releases keep the estimators in private modules
    # (``_base``, ``_ridge``, ``_coordinate_descent``). Stripping leading
    # underscores makes the dispatch work for both layouts.
    module_kind = model_class.__module__.split('.')[-1].lstrip('_')

    if module_kind == 'base':
        model_stats_dict['model_type'] = 'OLS'
        model = model_class()
    elif module_kind == 'ridge':
        model_stats_dict['model_type'] = 'ridge'
        model = model_class(alpha=alpha)
    elif module_kind == 'coordinate_descent' and l1_ratio is None:
        model_stats_dict['model_type'] = 'lasso'
        model = model_class(alpha=alpha)
    elif module_kind == 'coordinate_descent' and l1_ratio is not None:
        model_stats_dict['model_type'] = 'elastic_net'
        model = model_class(alpha=alpha, l1_ratio=l1_ratio)
    else:
        print(f'Unrecognized model class: {model_class}')
        return None

    # Train model
    model.fit(X_train, y_train)
    F, pval = f_regression(X_train, y_train)
    model_stats_dict['train_stats']['r-squared'] = [
        model.score(X_train, y_train)
    ]
    model_stats_dict['train_stats']['features'] = X_train.columns.tolist()
    model_stats_dict['train_stats']['coeffs'] = model.coef_
    model_stats_dict['train_stats']['F-stat'] = F
    model_stats_dict['train_stats']['pval'] = pval

    # Perform cross-validation
    if num_folds:
        cv_scores = cross_val_score(model, X_train, y_train, cv=num_folds)
        model_stats_dict['cv_stats']['mean_r-squared'] = [np.mean(cv_scores)]

    # Evaluate predictions on test set
    y_pred = model.predict(X_test)

    # Save model datasets
    model_stats_dict['test_data']['y_test'] = y_test
    model_stats_dict['test_data']['y_pred'] = y_pred

    # Save model statistics
    model_stats_dict['test_stats']['r-squared'] = [model.score(X_test, y_test)]
    model_stats_dict['test_stats']['mse'] = [mse(y_test, y_pred)]
    model_stats_dict['test_stats']['rmse'] = [rmse(y_test, y_pred)]
    model_stats_dict['test_stats']['mae'] = [mae(y_test, y_pred)]
    model_stats_dict['test_stats']['mape'] = [mape(y_test, y_pred)]

    if print_results:
        print_prediction_metrics(y_test, y_pred)
Esempio n. 10
0
def fun(File1, File2, File3, File4, File5):
    """Compute per-pixel R2 and p-value rasters over the 1980-2018 stack.

    ``File1`` names a reference raster under the hard-coded base folder;
    ``File2 + <year> + File3`` names each yearly raster. The R2 raster is
    written through ``Write`` to ``File4`` and the p-value raster to
    ``File5``. The elapsed wall-clock time is printed at the end.
    """
    start = time.time()
    file1 = r'E:/study/资料/数据/'
    ds = gdal.Open(file1 + File1)       # open the reference file
    im_width = ds.RasterXSize           # number of columns
    im_height = ds.RasterYSize          # number of rows
    im_bands = ds.RasterCount           # number of bands
    img_datatype = ds.GetRasterBand(1).DataType

    first_year, last_year = 1980, 2019
    stack = np.ones((39, im_height, im_width))
    years_axis = np.linspace(1980, 2019, 39)
    r2_map = np.ones((im_height, im_width))
    p_map = np.ones((im_height, im_width))

    for layer, year in enumerate(range(first_year, last_year)):
        ds = gdal.Open(file1 + File2 + str(year) + File3)
        # Read the whole image into an array.
        stack[layer] = ds.ReadAsArray()

    for row in range(im_height):
        for col in range(im_width):
            # The pixel's yearly values act as the single regressor.
            series = stack[:, row, col].reshape(-1, 1)
            regr = LinearRegression()
            regr.fit(series, years_axis)
            fitted = regr.predict(series)
            r2_map[row, col] = r2_score(years_axis, fitted)
            p_map[row, col] = f_regression(series, years_axis)[1][0]

    # ``ds`` is the last yearly dataset opened above.
    Write(File4, r2_map, ds, img_datatype)
    Write(File5, p_map, ds, img_datatype)
    end = time.time()
    print(end - start)
Esempio n. 11
0
 def f_regression_check(self, X, Y, names):
     """Rank features by their centered univariate F-score.

     NaN scores are replaced by 0 before ``rank_to_dict`` is applied.
     """
     print(f'calc f_regression importance {os.getpid()}')
     scores, _ = f_regression(X, Y, center=True)
     nan_positions = np.isnan(scores)
     scores[nan_positions] = 0
     #self.ranks["Corr."] =
     print('calc f_regression finished !')
     return rank_to_dict(scores, names)
Esempio n. 12
0
    def printMetrics(self):
        """Print evaluation metrics for the fitted model.

        Reports the feature names, test-set MSE and explained variance, the
        max-normalised mutual information and centered F-test results on the
        training data, and the stored R2 score.
        """
        banner = "================================================="

        print()
        print(banner)
        print("=========== METRICS =============================")
        print("Features (put in Pandas df): ", self.feature_names)
        test_mse = mean_squared_error(self.y_test, self.y_pred)
        print('Mean squared error: %.2f' % test_mse)
        test_evs = explained_variance_score(self.y_test, self.y_pred)
        print("Explained variance score: ", test_evs)

        # Mutual information scaled so the strongest feature equals 1.
        mutual_info = mutual_info_regression(self.x_train, self.y_train)
        mutual_info = mutual_info / np.max(mutual_info)

        f_scores, p_values = f_regression(self.x_train, self.y_train,
                                          center=True)  # center?

        print("Mutual Information: ", mutual_info)
        print("f_regression: ", f_scores)
        print("pval: ", p_values)
        print("R2 score: ", self.r2_score)
        print("=========== END METRICS =========================")
        print(banner)
        print()
    def run(names, X, Y, filepath):
        """Worker loop that ranks features for every job on the shared queue.

        Each job taken from ``FeatureProfile.queue`` is a
        ``(key, model, coef, order)`` tuple. The ``"Corr."`` key is ranked by
        the centered F-test; any other key fits ``model`` and ranks by the
        absolute value of its ``coef`` attribute. Results are stored in
        ``FeatureProfile.ranking`` and cached to a per-thread pickle file.
        The loop never returns.
        """
        filepath = filepath.replace(".pkl", "_{}.pkl".format(current_thread().name))

        def glob_ranking(folder):
            # Merge every cache file matching "<folder>*.pkl" into one dict.
            ranking = {}

            for cache_file in glob.iglob("{}*.pkl".format(folder)):
                ranking.update(load_cache(cache_file))

            return ranking

        while True:
            timestamp_start = time.time()
            done_ranking = glob_ranking(filepath)

            key, model, coef, order = FeatureProfile.queue.get()
            if key in done_ranking:
                log("{} was done before".format(key), INFO)
            else:
                if key == "Corr.":
                    f_scores, _ = f_regression(X, Y, center=True)
                    FeatureProfile.ranking[key] = FeatureProfile.normalization(f_scores, names)
                else:
                    model.fit(X, Y)
                    FeatureProfile.ranking[key] = FeatureProfile.normalization(np.abs(getattr(model, coef)), names, order)

                save_cache(copy.deepcopy(FeatureProfile.ranking), filepath)

            # The captured end timestamp is now actually used for the elapsed
            # time; it was previously assigned and then ignored.
            timestamp_end = time.time()
            log("Cost {:.4f} secends to finish {}".format(timestamp_end - timestamp_start, key), INFO)

            FeatureProfile.queue.task_done()
Esempio n. 14
0
    def f_test(self, X, y, ci=0.9):
        """Return the columns of X that are significant in a univariate regression on y.

        Each column is F-tested on its own; it is kept when its p-value is
        below ``1 - ci``. Requires both X and y to be data frames.

        Args:
            X: DataFrame of candidate features.
            y: DataFrame holding the target.
            ci: Confidence level; the significance threshold is ``1 - ci``.

        Returns:
            DataFrame with one row per significant feature and the columns
            ``feature``, ``p-value``, ``F score`` and ``r2``.
        """
        sig_cols = []
        pvals = []
        fscores = []
        r2_scores = []
        lm = linear_model.LinearRegression()
        threshold = 1 - ci

        for col in list(X.columns):
            # f_regression needs a 2-D feature matrix, so select the column
            # as a one-column frame (X[[col]]) rather than a 1-D Series.
            single = X[[col]]
            f_stat, p_val = feature_selection.f_regression(single, y)
            if p_val[0] < threshold:
                sig_cols.append(col)
                fscores.append(f_stat[0])
                pvals.append(p_val[0])
                lm.fit(single, y)
                r2_scores.append(metrics.r2_score(y, lm.predict(single)))

        return pd.DataFrame({
            'feature': sig_cols,
            'p-value': pvals,
            'F score': fscores,
            'r2': r2_scores
        })
def best_features(n_features, X_train, y_train, X_test, y_test):
    """Score a linear model restricted to the ``n_features`` highest F-scores.

    Returns the ``score`` of a ``LinearRegression`` fitted on the selected
    training columns and evaluated on the same test columns.
    """
    # NOTE(review): the F-scores come from the module-level ``train``,
    # ``continuous_cols`` and ``insurance_loss`` rather than from the
    # X_train / y_train arguments; confirm the column order matches.
    f_scores, _ = f_regression(train[continuous_cols], train[insurance_loss])
    # Negate so argsort yields the highest scores first.
    top_idx = np.argsort(-f_scores)[:n_features]
    model = LinearRegression()
    model.fit(X_train.iloc[:, top_idx], y_train)
    test_subset = X_test.iloc[:, top_idx]
    return model.score(test_subset, y_test)
Esempio n. 16
0
def regression_feature_selection(train_features, train_labels, test_features,
                                 percent):
    """Keep the features with the highest mean F-score across all targets.

    The F-test is run once per label column. The per-feature scores are
    averaged (ignoring NaN) and the top ``int(n_features / percent)``
    features are copied out of both the train and test matrices.

    Returns:
        ``(new_train_features, new_test_features)`` as float arrays.
    """
    n_features = train_features.shape[1]
    n_targets = train_labels.shape[1]

    per_target_scores = np.zeros((n_features, n_targets))
    for target_idx in range(n_targets):
        f_scores, _ = f_regression(train_features, train_labels[:, target_idx])
        per_target_scores[:, target_idx] = f_scores
    mean_scores = np.nanmean(per_target_scores, axis=1)

    n_keep = int(mean_scores.shape[0] / percent)
    # NOTE(review): when n_keep is 0 the slice [-0:] keeps every feature.
    features_to_keep = np.argsort(mean_scores)[-n_keep:]

    # NOTE(review): the count is printed twice; kept as-is.
    print(len(features_to_keep))
    print(len(features_to_keep))

    # Cast to float64 to match the zero-initialised buffers this replaces.
    new_train_features = train_features[:, features_to_keep].astype(np.float64)
    new_test_features = test_features[:, features_to_keep].astype(np.float64)
    return new_train_features, new_test_features
    def correlations(self):
        """Print F-test results of the movie attributes against box office.

        Reads the CSV at ``imdb_csv_path``, prints the ``f_regression`` result
        for ``Year`` versus ``BoxOffice``, then prints a dict holding the
        ``f_regression`` result for every entry of ``params`` except the last
        one. Returns nothing.
        """
        params = [
            "Title", "Year", "Genre", "Director", "Writer", "Actors",
            "Language", "Country", "Runtime", "BoxOffice"
        ]
        df = pd.DataFrame(pd.read_csv(imdb_csv_path))
        # Year becomes a single-column float matrix, BoxOffice a flat float array.
        year = np.array(list(map(float, df['Year']))).reshape(-1, 1)
        boxoffice = np.array(list(map(float, df['BoxOffice'])))
        year_score = feature_selection.f_regression(year, boxoffice)
        print(year_score)

        def one_hot(string, boxoffice):
            # One-hot encode column ``string`` using its distinct values.
            enc = preprocessing.OneHotEncoder(handle_unknown='ignore')
            enc.fit(np.array(list(set(df[string]))).reshape(-1, 1))
            try:
                feature = enc.transform(np.array(df[string]).reshape(
                    -1, 1)).toarray()
            except ValueError:
                # Print the offending column name and terminate.
                print(string)
                exit()
            # Collapse each encoded row into one string made of the digit runs
            # found in ``str(row)``.
            # NOTE(review): this depends on numpy's textual repr of the row
            # (long rows may be abbreviated) - confirm this is intended.
            feature = np.array([
                ''.join(re.findall(r'\d+', g)) for g in list(map(str, feature))
            ]).reshape(-1, 1)
            score = feature_selection.f_regression(feature, boxoffice)
            return score

        # cal_scores
        # Score every attribute except the last entry ("BoxOffice").
        score = dict()
        for p in params[:-1]:
            score[p] = one_hot(p, boxoffice)
        print(score)
Esempio n. 18
0
    def find_next_best(dt_kmer, y, selected_kmers, to_be_selected_kmers, consider_shift=True):
        """
        Stepwise selection of k-mers that skips motifs similar to ones
        already chosen.

        The candidate with the smallest F-test p-value is moved from
        ``to_be_selected_kmers`` into ``selected_kmers`` (both lists are
        mutated). Candidates too similar to it are dropped, and the search
        recurses on the residual of a linear fit on the selected k-mers.
        Returns ``selected_kmers`` once no candidate remains.
        """
        _, p_values = f_regression(dt_kmer[to_be_selected_kmers], y)
        best_kmer = to_be_selected_kmers.pop(p_values.argmin())
        selected_kmers.append(best_kmer)

        def is_distinct(candidate, reference, consider_shift=True):
            # Reject candidates within Hamming distance 1 of the reference.
            if hamming_distance(candidate, reference) <= 1:
                return False
            if consider_shift:
                # Reject candidates equal to the reference shifted by one.
                if hamming_distance(candidate[1:], reference[:-1]) == 0:
                    return False
                if hamming_distance(candidate[:-1], reference[1:]) == 0:
                    return False
            return True

        remaining = [
            candidate for candidate in to_be_selected_kmers
            if is_distinct(candidate, best_kmer, consider_shift)
        ]
        if not remaining:
            return selected_kmers

        # Regress out the features selected so far and continue on the residual.
        lm = LinearRegression()
        lm.fit(dt_kmer[selected_kmers], y)
        residual = y - lm.predict(dt_kmer[selected_kmers])
        return find_next_best(dt_kmer, residual, selected_kmers, remaining, consider_shift)
Esempio n. 19
0
def linreg_evaluate_model(X: pd.DataFrame, y: pd.Series,
                          y_pred: np.ndarray) -> None:
    """
    Print evaluation metrics for a linear regression model.

    Args:
        X: Features the model was fitted on.
        y: True target values. A Series (as annotated) or a one-column
            DataFrame is accepted.
        y_pred: Model predictions for ``X``.
    """
    # A Series has no ``.columns``; fall back to its name so the annotated
    # input type works instead of raising AttributeError.
    if hasattr(y, "columns"):
        y_label = y.columns[0]
    else:
        y_label = y.name
    X_labels = X.columns

    print("Univariate Linear Regression Model Evaluation")
    meanse = mean_squared_error(y, y_pred)
    print(f"\tMean SE: {meanse:.3f}")

    meanae = mean_absolute_error(y, y_pred)
    print(f"\tMean AE: {meanae:.3f}")

    print()

    medianae = median_absolute_error(y, y_pred)
    print(f"\tMedian AE: {medianae:.3f}")

    print()

    r2 = r2_score(y, y_pred)
    print(f"\t{r2:.2%} of the variance in {y_label} can be explained "
          f"by {X_labels.tolist()}.")
    print()

    print("P-VALUE")
    # Only the p-value of the first feature is reported.
    _, p_vals = f_regression(X, y)
    print(f"\tTrain: {p_vals[0]:.3}")
Esempio n. 20
0
    def select(self, dataframe: 'pd.DataFrame', y_column: str) -> list:
        '''
            Selecting the most important columns by backward elimination on
            univariate p-values.
        :param dataframe: pandas DataFrame
             Data Frame on which the algorithm is applied
        :param y_column: str
             The column name of the value that we want to predict
        :return: list
            The list of features that are selected by the algorithm as the best ones
            (empty when every feature was eliminated)
        '''
        # Defining the list with names of columns except the predicted one
        X_columns = [col for col in dataframe.columns if col != y_column]

        # Creating the F and p-value history dictionaries
        self.F_history = {col: [] for col in X_columns}
        self.p_value_history = {col: [] for col in X_columns}

        # Defining the feature states (1 = still selected, 0 = eliminated)
        feature_state = list(np.ones(len(X_columns)))
        while True:
            self.iter += 1

            # Extracting the selected columns
            X_cols = self.bin_to_cols(feature_state, X_columns)
            # Stop once nothing is left. Without this guard an empty feature
            # matrix reaches the F-test and max() is applied to no p-values.
            if len(X_cols) == 0:
                break
            X = dataframe[X_cols].values
            y = dataframe[y_column].values

            # Choosing a different test depending on whether it is a classification or regression.
            if self.classification:
                F_vals, p_vals = f_classif(X, y)
            else:
                F_vals, p_vals = f_regression(X, y)

            # Record this round's scores; eliminated columns are logged as -1.
            index = 0
            for col in X_columns:
                if col in X_cols:
                    self.F_history[col].append(float(F_vals[index]))
                    self.p_value_history[col].append(float(p_vals[index]))
                    index += 1
                else:
                    self.F_history[col].append(-1)
                    self.p_value_history[col].append(-1)

            # Choosing the max value of p-value
            max_PValue = max(p_vals)

            # Stop when even the worst feature is significant.
            if not max_PValue > self.significance_level:
                break

            # Erasing every column whose p-value equals the max value.
            for j, col in enumerate(X_cols):
                if p_vals[j].astype(float) == max_PValue:
                    feature_state[X_columns.index(col)] = 0

        # Returning the chosen columns.
        self.choosed_cols = self.bin_to_cols(feature_state, X_columns)
        return self.choosed_cols
Esempio n. 21
0
    def work_sequence(self):
        """Yield one deferred work item per leave-one-chromosome-out fold.

        Each yielded callable invokes ``self.dowork`` with the fold index, the
        train/test SNP indices, a result dict (only for fold 0, ``None``
        otherwise), and the loaded genotype and phenotype.

        Raises:
            Exception: If the number of folds differs from ``self.chrom_count``.
        """
        # is it OK to do the intersect and the linear regression 23 extra times?

        # clear
        G, y, snp_name, _ = load_intersect(self.snp_reader, self.pheno_fn)

        # compute linear regression
        _, p_values_lin = f_regression(G, y, center=True)

        # set up empty return structures
        #self.rs = snp_name
        #self.p_values = -np.ones(len(snp_name))

        # get chr names/id
        chr_ids = self.snp_reader.pos[:, 0]

        #self.pos = self.snp_reader.pos

        #loco = [[range(0,5000), range(5000,10000)]]
        loco = LeaveOneChromosomeOut(chr_ids, indices=True)

        # Compare by value: ``is not`` tests object identity, which is not a
        # reliable way to compare integers.
        if len(loco) != self.chrom_count:
            raise Exception("The snp reader has {0} chromosome, not {1} as specified".format(len(loco), self.chrom_count))

        for i, (train_snp_idx, test_snp_idx) in enumerate(loco):
            if i == 0:
                # Only the first work item carries the shared result skeleton.
                result = {"p_values": -np.ones(len(snp_name)),
                          "p_values_lin": p_values_lin,
                          "rs": snp_name,
                          "pos": self.snp_reader.pos}
            else:
                result = None
            yield lambda i=i, train_snp_idx=train_snp_idx, test_snp_idx=test_snp_idx, result=result, G=G, y=y: self.dowork(i, train_snp_idx, test_snp_idx, result, G, y)  # the 'i=i',etc is need to get around a strangeness in Python
Esempio n. 22
0
def summary(X_vars, y_var):
    """Print several univariate feature-relevance summaries for ``X_vars``.

    Printed, in order: the column names, max-normalised F-test scores, their
    p-values, the square roots of the normalised F-scores (labelled
    "t-test"), the same p-values again, max-normalised mutual information,
    and the columns kept by an L1-penalised ``LinearSVC``.

    Mutual information between two random variables is a non-negative value
    measuring their dependency: zero if and only if they are independent,
    higher for stronger dependency.
    """
    cols = X_vars.columns.tolist()
    lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X_vars, y_var)
    model = SelectFromModel(lsvc, prefit=True)
    # Every supported index is mapped to its name. The old ``if x`` filter
    # silently dropped index 0, so the first column could never be listed.
    labels = [cols[idx] for idx in model.get_support(indices=True)]
    mi = mutual_info_regression(X_vars, y_var)
    if np.max(mi) != 0:
        mi /= np.max(mi)
    f_test, f_test_pvals = f_regression(X_vars, y_var)
    if np.max(f_test) != 0:
        f_test /= np.max(f_test)
    t_test = [math.sqrt(elem) for elem in f_test]
    print(cols)
    print("F-test")
    print(list(f_test))
    print("F-test pvals")
    print(list(f_test_pvals))
    print("t-test")
    print(list(t_test))
    print("t-test pvals")
    print(list(f_test_pvals))
    print("mutual information")
    print(list(mi))
    print("Linear SVC Feature reduction")
    print(cols, "->", labels)
Esempio n. 23
0
    def fit(self, X, y=None):
        """Pick, for every column, the transform with the smallest F-test p-value.

        Only the first ``self.float_k`` columns of ``X`` are considered when
        ``float_k`` is set. Each function in ``self.transform_funcs`` is
        applied to them and the result is F-tested against ``y``. A transform
        whose test raises gets a sentinel p-value of 10000 for every column.

        Sets ``self.k_``, ``self.bestTlist`` and ``self.best_T_`` (the latter
        two hold the same per-column transform names) and returns ``self``.
        """
        if self.float_k is not None:
            Xn = X[:, :self.float_k]
        else:
            Xn = X
        self.k_ = Xn.shape[1]
        pvals = []
        self.logger.info(f'Xn.shape:{Xn.shape},Xn:{Xn}')
        for fn in self.transform_funcs.values():
            TXn = fn(Xn)
            try:
                _, p = f_regression(TXn, y)
            except Exception:
                # Catch Exception rather than everything, so KeyboardInterrupt
                # and SystemExit still propagate.
                self.logger.exception('error doing f_regression')
                p = np.array([10000.] * TXn.shape[1])
            pvals.append(p[None, :])

        pval_stack = np.concatenate(pvals, axis=0)  # each row is a transform
        # The first row of the argsort is the index of the smallest p-value.
        bestTloc = np.argsort(pval_stack, axis=0)[0, :]
        transform_names = list(self.transform_funcs.keys())
        self.bestTlist = [transform_names[i] for i in bestTloc]
        self.logger.info(f'bestTlist:{self.bestTlist},')
        # Both attributes are kept because callers may read either.
        self.best_T_ = [transform_names[loc] for loc in bestTloc]
        return self
Esempio n. 24
0
def SignificanceMatrix(data):
    """Build a symmetric matrix of pairwise association p-values.

    For every pair of columns (after dropping rows where either is missing):
    two "continuous" columns use ``f_regression``, two columns of the same
    other type use ``chisq_independence``, and mixed pairs use ``f_classif``
    with the continuous column as the feature. The diagonal is 1 and any
    missing entries are returned as the string "NAN".
    """
    col = data.columns
    colTypes = [check_type(x) for x in data.dtypes]
    relationMatrix = pd.DataFrame(index=col, columns=col)

    for i in range(len(col)):
        for j in range(i, len(col)):
            if i == j:
                pval = 1
                relationMatrix.loc[col[i], col[j]] = pval
            else:
                tempdata = data[[col[i], col[j]]]
                tempdata = tempdata.dropna(axis=0)   # Remember to add warning where missing data is removed
                col1 = tempdata[col[i]]
                # np.asarray replaces Series.ravel(), which is deprecated in
                # pandas; both yield a flat ndarray of the values.
                col2 = np.asarray(tempdata[col[j]])
                if colTypes[i] == colTypes[j]:
                    if colTypes[i] == "continuous":
                        pval = np.round(feature_selection.f_regression(pd.DataFrame(col1), col2)[1][0], 3)
                    else:
                        pval = chisq_independence(tempdata[col[i]], tempdata[col[j]])
                else:
                    if colTypes[i] == "continuous":
                        pval = np.round(feature_selection.f_classif(pd.DataFrame(col1), col2)[1][0], 3)
                    else:
                        pval = np.round(feature_selection.f_classif(pd.DataFrame(col2), col1)[1][0], 3)
                relationMatrix.loc[col[i], col[j]] = pval
                relationMatrix.loc[col[j], col[i]] = pval

    return relationMatrix.fillna("NAN")
Esempio n. 25
0
def linear_trend(datay, *datax):
    """Fit a linear regression of ``datay`` on the given predictors.

    :param datay: ndarray with the response values
    :param datax: one or more ndarrays, each flattened into one predictor column
    :return: ndarray holding the fitted coefficients followed by the trend's
        r2 score and its significance level (p-value); an all-NaN array of the
        same length when any input contains NaN
    :raises ValueError: if no predictor is given
    """
    if not datax:
        # Fail with a clear message rather than an UnboundLocalError from
        # the design matrix never being built.
        raise ValueError("linear_trend requires at least one predictor array")

    x_data = np.hstack([predictor.reshape(-1, 1) for predictor in datax])

    if np.isnan(x_data).any() or np.isnan(datay).any():
        # One slot per coefficient plus r2 and p-value.
        return np.full(len(datax) + 2, np.nan)

    regr = LinearRegression().fit(x_data, datay)
    y_pred = regr.predict(x_data)
    r2score = r2_score(datay, y_pred)
    # Significance of the association between observed and fitted values.
    pvalue = f_regression(datay.reshape(-1, 1), y_pred)[1][0]
    return np.array(list(regr.coef_) + [r2score, pvalue])
Esempio n. 26
0
    def _select_k_best_inorder(self,
                               k_best,
                               X_train,
                               y_train,
                               X_pred,
                               ar_cutoff=52):
        """Keep the ``k_best`` columns with the highest F-scores, best first.

        When ``ar_cutoff`` is not None the first ``ar_cutoff`` columns are
        excluded from the ranking and prepended unchanged to the selected
        columns in both returned matrices.

        Returns:
            ``(X_train_selected, X_pred_selected)``.
        """
        has_cutoff = ar_cutoff is not None
        if has_cutoff:
            train_candidates = X_train[:, ar_cutoff:]
            pred_candidates = X_pred[:, ar_cutoff:]
        else:
            train_candidates = X_train
            pred_candidates = X_pred

        f_scores, _ = f_regression(train_candidates, y_train)
        # Descending order of F-score, truncated to the k best.
        top_columns = np.argsort(f_scores)[::-1][:k_best]

        train_selected = train_candidates[:, top_columns]
        pred_selected = pred_candidates[:, top_columns]

        if has_cutoff:
            train_selected = np.hstack((X_train[:, :ar_cutoff], train_selected))
            pred_selected = np.hstack((X_pred[:, :ar_cutoff], pred_selected))

        return train_selected, pred_selected
Esempio n. 27
0
def f_regression_feature_analyse(_df, store_item_nbrs):
    """Plot the sample-weighted average F-test importance over store/item pairs.

    For each ``(store, item)`` pair (store 35 is skipped) the features are
    F-tested against the target, converted by ``get_importance`` and
    accumulated with the number of rows as weight. The weighted mean is
    handed to ``draw_total_average_importance``.
    """
    frame = _df.copy()
    p.figure()
    X, y = ut.get_train_data(frame)
    # The first two columns are skipped in the feature list.
    feature_list = X.columns.values[2:]
    importance_value = np.zeros(len(feature_list))
    total = 0
    for sno, ino in store_item_nbrs:
        if sno == 35:
            continue
        pair_mask = (X.store_nbr == sno) & (X.item_nbr == ino)
        X_pair = X[pair_mask].drop(['store_nbr', 'item_nbr'], axis=1)
        y_pair = y[X_pair.index.values]
        n_rows = len(X_pair.index)
        f_scores, _ = f_regression(X_pair.values, y_pair.values)
        importance = get_importance(np.nan_to_num(f_scores))
        print(importance)
        # to draw the each (sno, ino) pic need to uncomment underline code
        # draw_feature_importance(importance, features, sno, ino)
        importance_value += n_rows * np.array(importance)
        total = total + n_rows
        print(importance_value)

    importance_value = importance_value / total
    draw_total_average_importance(importance_value, feature_list)
def example_one():
    """Compare the F-test and mutual information on a synthetic problem.

    ``y`` depends linearly on the first feature, sinusoidally on the second
    and not at all on the third. Both scores are max-normalised and shown in
    the titles of three scatter plots.

    Returns:
        ``(x1, x2, x3, y)``: the three feature columns and the target.
    """
    np.random.seed(0)
    n_samples, n_features = 1000, 3
    X = np.random.rand(n_samples, n_features)
    y = X[:, 0] + np.sin(6 * np.pi * X[:, 1]) + 0.1 * np.random.randn(1000)

    f_test = f_regression(X, y)[0]
    f_test /= np.max(f_test)

    mi = mutual_info_regression(X, y)
    mi /= np.max(mi)

    plt.figure(figsize=(15, 5))
    for idx, (f_val, mi_val) in enumerate(zip(f_test, mi)):
        plt.subplot(1, 3, idx + 1)
        plt.scatter(X[:, idx], y, edgecolor='black', s=20)
        plt.xlabel("$x_{}$".format(idx + 1), fontsize=14)
        if idx == 0:
            # Only the leftmost panel carries the y-axis label.
            plt.ylabel("$y$", fontsize=14)
        plt.title("F-test={:.2f}, MI={:.2f}".format(f_val, mi_val),
                  fontsize=16)

    plt.show()

    return X[:, 0], X[:, 1], X[:, 2], y
def FilterInLoading(np_genotype, np_phenotype):
    """Count the genotype features that survive the two quality filters.

    Two criteria exclude low-quality variants. First, features whose variance
    does not exceed ``.95 * (1 - .95)`` are removed by ``VarianceThreshold``.
    Second, each remaining feature is tested against the phenotype with the
    univariate F-test (``f_regression``) and is kept only when
    ``-log10(p-value) > 2``, i.e. p-value < 0.01.

    Args:
        np_genotype (ndarray): 2D array containing genotype data
        np_phenotype (ndarray): 2D array containing phenotype data; the last
            column is used as the target

    Returns:
        (int): number of features that pass both filters, or 0 when any step
        raises an exception.
    """

    try:
        ### variance check (detect variance < 0.05)
        sk_variance = VarianceThreshold(threshold=(.95 * (1 - .95)))
        np_genotype = sk_variance.fit_transform(np_genotype)

        ### f regression feature selection
        p_values = f_regression(np_genotype.astype(int),
                                np_phenotype[:, -1].astype(float))[1]
        np_fRegression = -np.log10(p_values)
        # Vectorised comparison; NaN scores compare False and are dropped.
        np_selectedIdx = np_fRegression > 2
        np_genotype = np_genotype[:, np_selectedIdx]

        return np_genotype.shape[1]

    except Exception:
        # Best effort: any failure counts as "no usable feature". Catching
        # Exception lets KeyboardInterrupt and SystemExit through.
        return 0
Esempio n. 30
0
def forward_select(dataframe, threshold=10):
    """
    Forward feature selection based on univariate F-scores.

    Forward feature selection is the opposite of backward feature
    elimination: it looks for the features that improve the model most
    instead of removing weak ones. The general idea is to train one model
    per feature, keep the best one as the starting variable, then keep
    adding the variable that improves performance most until there is no
    notable gain.

    This implementation scores every feature against ``label`` with
    ``f_regression`` and keeps those whose F-value reaches ``threshold``.

    Args:
        dataframe: DataFrame containing the features and a ``label`` column.
        threshold: Minimum F-value for a feature to be kept (default 10).
    Returns:
        List of the selected feature column names.

        Note: forward selection and backward elimination are slow and
        costly, so they only suit data sets with few input variables.
    """
    from sklearn.feature_selection import f_regression

    # Score the features of the frame that was passed in. The old body read
    # an undefined global ``df`` and assumed ``label`` was the last column.
    features = dataframe.drop(columns='label')

    # f_regression returns the F-values and the p-value for each of them.
    f_values, _ = f_regression(features, dataframe.label)

    # Keep the variables whose F-value is at least the threshold.
    return [
        column for column, f_value in zip(features.columns, f_values)
        if f_value >= threshold
    ]
Esempio n. 31
0
def run_knn_regressor(X_train, X_test, y_train, y_test, k=5, weights='uniform', print_results=False):
    """Fit a k-nearest-neighbours regressor and record its statistics.

    Results are written into the ``model_stats_dict`` object defined outside
    this function, under its ``'train_stats'``, ``'test_data'`` and
    ``'test_stats'`` entries. Nothing is returned.

    Args:
        X_train: training features; must expose ``.columns``.
        X_test: test features.
        y_train: training targets.
        y_test: test targets.
        k: number of neighbours.
        weights: neighbour weighting scheme passed to the regressor.
        print_results: when true, also call ``print_prediction_metrics``.
    """
    regressor = KNeighborsRegressor(n_neighbors=k, weights=weights)
    regressor.fit(X_train, y_train)

    # Univariate F-test of every training feature against the target.
    f_stat, p_values = f_regression(X_train, y_train)

    train_r2 = regressor.score(X_train, y_train)
    train_stats = model_stats_dict['train_stats']
    train_stats['r-squared'] = [train_r2]
    train_stats['features'] = X_train.columns.tolist()
    train_stats['F-stat'] = f_stat
    train_stats['pval'] = p_values

    predictions = regressor.predict(X_test)

    # Keep the raw test targets and predictions next to the metrics.
    test_data = model_stats_dict['test_data']
    test_data['y_test'] = y_test
    test_data['y_pred'] = predictions

    test_r2 = regressor.score(X_test, y_test)
    test_stats = model_stats_dict['test_stats']
    test_stats['r-squared'] = [test_r2]
    test_stats['mse'] = [mse(y_test, predictions)]
    test_stats['rmse'] = [rmse(y_test, predictions)]
    test_stats['mae'] = [mae(y_test, predictions)]
    test_stats['mape'] = [mape(y_test, predictions)]

    if not print_results:
        return
    print_prediction_metrics(y_test, predictions)
Esempio n. 32
0
def linearCoe_hq(X, y, cut=0.05):
    """Flag the features whose univariate F-test p-value is below ``cut``.

    Args:
        X: 2D feature array (``X.shape[1]`` features).
        y: regression target.
        cut: p-value threshold (exclusive).

    Returns:
        Boolean numpy array with one entry per feature; True where the
        feature's p-value is smaller than ``cut``.
    """
    from sklearn.feature_selection import f_regression
    _, p_values = f_regression(X, y, center=True)
    selected = np.full(X.shape[1], False)
    selected[p_values < cut] = True
    return selected
Esempio n. 33
0
def getModel(**args):
    """Assemble the (unfitted) feature-extraction and regression pipeline.

    The pipeline unions content, formatting and follower features, then
    unions a k-means based branch with a pass-through branch, keeps the
    ``all_K`` best columns according to an uncentered F-test and ends in a
    linear SVR. The ``Extractor``, ``PredictTransformer``,
    ``OmitTargetTransformer`` and ``PassThroughTransformer`` helpers are
    defined elsewhere; their exact behavior is not visible here.

    Keyword Args:
        none_var: if it compares equal to True, items with a falsy
            ``context_topic`` yield ``{"none": 1}``; otherwise they yield an
            empty dict.
        smoother: added to the summed topic follower count before the
            logarithm; looked up when the extractor function runs.
        all_K: ``k`` passed to ``SelectKBest``.

    Returns:
        Pipeline: the assembled model.
    """
    def topic_names(item):
        # One indicator per topic name attached to the item.
        return {topic["name"]: 1 for topic in item["topics"]}

    def context_topic_name(item):
        # Indicator for the context topic, or the configured fallback.
        if item["context_topic"]:
            return {item["context_topic"]["name"]: 1}
        return none_dict

    def log_follower_total(item):
        total = sum(topic["followers"] for topic in item["topics"])
        return [math.log(total + args["smoother"])]

    def uncentered_f_regression(X, y):
        return f_regression(X, y, center=False)

    formatting = Pipeline([
        ("other", Extractor(getFormattingFeatures)),
        ("scaler", StandardScaler()),
    ])
    question = Pipeline([
        ("extract", Extractor(getFirstWordDict)),
        ("counter", DictVectorizer()),
    ])
    topics = Pipeline([
        ("extract", Extractor(topic_names)),
        ("counter", DictVectorizer()),
    ])

    none_dict = {"none": 1} if args["none_var"] == True else {}

    ctopic = Pipeline([
        ("extract", Extractor(context_topic_name)),
        ("counter", DictVectorizer()),
    ])
    content_union = FeatureUnion([
        ("question", question),
        ("topics", topics),
        ("ctopic", ctopic),
    ])
    topic_question = Pipeline([("content", content_union)])

    followers = Pipeline([
        ("extract", Extractor(log_follower_total)),
        ("scaler", StandardScaler()),
    ])

    k_means = KMeans(n_clusters=96, random_state=20, n_init=3, max_iter=8,
                     tol=1e-3)
    label_binarizer = LabelBinarizer(sparse_output=True)
    svr = LinearSVR(C=0.04, loss="squared_epsilon_insensitive")

    feature_union = FeatureUnion([
        ("content", topic_question),
        ("formatting", formatting),
        ("followers", followers),
    ])
    transductive = Pipeline([
        ("k_means", PredictTransformer(k_means)),
        ("label_binarizer", OmitTargetTransformer(label_binarizer)),
    ])
    cluster_union = FeatureUnion([
        ("transductive", transductive),
        ("pass_through", PassThroughTransformer()),
    ])
    selector = SelectKBest(score_func=uncentered_f_regression,
                           k=args["all_K"])

    return Pipeline([
        ("union", feature_union),
        ("union2", cluster_union),
        ("f_sel", selector),
        ("svr", svr),
    ])
Esempio n. 34
0
def getModel(**args):
  """Build the unfitted pipeline: feature unions, F-test selection, linear SVR.

  Reads ``args["none_var"]`` (fallback feature for items without a context
  topic), ``args["smoother"]`` (added to the follower total before the
  logarithm, looked up lazily inside the extractor lambda) and
  ``args["all_K"]`` (``k`` for ``SelectKBest``). Returns the assembled
  ``Pipeline``. The project helpers used here (``Extractor`` and the custom
  transformers) are defined elsewhere.
  """
  formatting_steps = [
    ("other", Extractor(getFormattingFeatures)),
    ("scaler", StandardScaler()),
  ]
  formatting = Pipeline(formatting_steps)

  question_steps = [
    ("extract", Extractor(getFirstWordDict)),
    ("counter", DictVectorizer()),
  ]
  question = Pipeline(question_steps)

  # One indicator feature per topic name.
  topics_steps = [
    ("extract", Extractor(lambda x: {t["name"] : 1 for t in x["topics"]})),
    ("counter", DictVectorizer()),
  ]
  topics = Pipeline(topics_steps)

  # Feature dict used when an item has no context topic.
  if args["none_var"] == True:
    none_dict = {"none" : 1}
  else:
    none_dict = {}

  ctopic_steps = [
    ("extract", Extractor(
      lambda x: {x["context_topic"]["name"] : 1} if x["context_topic"] else none_dict)),
    ("counter", DictVectorizer()),
  ]
  ctopic = Pipeline(ctopic_steps)

  content = FeatureUnion([
    ("question", question),
    ("topics", topics),
    ("ctopic", ctopic),
  ])
  topic_question = Pipeline([("content", content)])

  # Log of the smoothed total follower count over the item's topics.
  followers_steps = [
    ("extract", Extractor(
      lambda x: [math.log(sum(t["followers"] for t in x["topics"]) + args["smoother"])])),
    ("scaler", StandardScaler()),
  ]
  followers = Pipeline(followers_steps)

  k_means = KMeans(n_clusters = 96, random_state = 20, n_init = 3,
                   max_iter = 8, tol = 1e-3)
  label_binarizer = LabelBinarizer(sparse_output = True)
  svr = LinearSVR(C = 0.04, loss = "squared_epsilon_insensitive")

  first_union = FeatureUnion([
    ("content", topic_question),
    ("formatting", formatting),
    ("followers", followers),
  ])
  transductive = Pipeline([
    ("k_means", PredictTransformer(k_means)),
    ("label_binarizer", OmitTargetTransformer(label_binarizer)),
  ])
  second_union = FeatureUnion([
    ("transductive", transductive),
    ("pass_through", PassThroughTransformer()),
  ])
  f_sel = SelectKBest(
    score_func = lambda X, y : f_regression(X, y, center = False),
    k = args["all_K"])

  model = Pipeline([
    ("union", first_union),
    ("union2", second_union),
    ("f_sel", f_sel),
    ("svr", svr),
  ])
  return model
Esempio n. 35
0
def f_test_univariate_selection(X, y):
    """Print the F-test score of every feature of X, scaled to a maximum of 1.

    Each feature of the data instances ``X`` is scored on its own against
    the targets ``y`` with ``f_regression``; higher values indicate a
    stronger linear relationship. Nothing is returned.
    """
    scores = f_regression(X, y)[0]
    # Normalise in place so the best feature has score 1.
    scores /= scores.max()
    print('ranked features -- f-test', scores)
def getTopFeaturesF(df, predictor):
    """Score every numeric column of ``df`` against the ``predictor`` column.

    Args:
        df: DataFrame containing the target column and the features.
        predictor: name of the target column.

    Returns:
        tuple: ``(f_by_column, p_by_column)``, two dicts keyed by column
        name. NaN F-values are reported as 0; p-values are left untouched.
    """
    target = df[predictor]
    numeric_features = df.drop([predictor], axis=1)._get_numeric_data()
    names = numeric_features.columns

    f_scores, p_values = f_regression(numeric_features, target, center=True)
    f_scores[np.isnan(f_scores)] = 0

    f_by_column = {name: score for name, score in zip(names, f_scores)}
    p_by_column = {name: p for name, p in zip(names, p_values)}
    return (f_by_column, p_by_column)
Esempio n. 37
0
def feature_correls(X, y, coords):
    """Compute per-feature F-values and spatially corrected p-values.

    The F-values come from ``f_regression``; every p-value it returns is then
    overwritten with the p-value reported by ``spatial.spatial_correlation``
    for that feature column, the label and ``coords``.

    Returns:
        tuple: ``(fvals, pvals)``, one entry per feature.
    """
    fvals, pvals = f_regression(X, y)
    for column in range(len(X[0])):
        # Only the p-value of the spatial test is kept.
        _, pvals[column] = spatial.spatial_correlation(X[:, column], y, coords)
    return fvals, pvals
Esempio n. 38
0
def linear_randomForest_Regression(data, target, network):
    """Compare linear regression with a random forest over 10 folds and plot.

    Both models are refit on every fold; the per-fold RMSE of each is
    collected and plotted, followed by actual-vs-predicted and residual
    plots. ``plt.show()`` is called at the end.

    Args:
        data: feature array, row-indexable with integer index arrays.
        target: target array aligned with ``data``.
        network: table supporting column assignment and ``groupby``
            (presumably a pandas DataFrame). It is modified in place: the
            columns ``predicted_lr`` and ``predicted_rfr`` are added.

    Returns:
        list: RMSE of the linear model on each fold.
    """
    lr = linear_model.LinearRegression(normalize = True)
    rfr = RandomForestRegressor(n_estimators = 30,max_depth = 12, max_features='auto')
    # NOTE(review): this call signature (sample count + n_folds) and the
    # direct iteration below look like the legacy sklearn.cross_validation
    # KFold API; confirm against the installed scikit-learn version.
    kf = KFold(len(target), n_folds=10, shuffle=True, random_state=None)
    RMSE_LINEAR = []
    RMSE_RFR = []
    for train_index, test_index in kf:
        data_train, data_test = data[train_index], data[test_index]
        target_train, target_test = target[train_index], target[test_index]
        lr.fit(data_train, target_train)
        rfr = rfr.fit(data_train, target_train)
        # Root-mean-square error of each model on the held-out fold.
        rmse_linear = sqrt(np.mean((lr.predict(data_test) - target_test) ** 2))
        RMSE_LINEAR.append(rmse_linear)
        rmse_rfr = sqrt(np.mean((rfr.predict(data_test) - target_test) ** 2))
        RMSE_RFR.append(rmse_rfr)
    
    #scores = cross_validation.cross_val_score(rfr,data_test, target_test.ravel, cv=10)
    #print np.mean(scores)
    
    # From here on, lr/rfr are the models fitted on the last fold and
    # data_test is the last fold's held-out data. F and pval are not used
    # afterwards.
    F, pval = f_regression(data_test, lr.predict(data_test))
    print(np.mean(RMSE_RFR))
    # Fold numbers 1..10 for the x axis; assumes exactly ten folds ran.
    test_times = np.arange(1,11)
    plt.figure()
    plt.plot(test_times, RMSE_LINEAR, label = "RMSE in linear regression with 10-fold cv")
    plt.plot(test_times, RMSE_RFR, label = "RMSE in random forest regression with 10-fold cv")
    plt.ylim(0.0, 0.12)
    #plt.title("RMSE comparison between linear regression and random forest regression")
    plt.xlabel("cross validation times")
    plt.ylabel("RMSE")
    plt.legend()

    # Predict on the full data set and aggregate actual and predicted sizes
    # per (week, day of week, start hour).
    network['predicted_lr'] = lr.predict(data);
    network['predicted_rfr'] = rfr.predict(data);
    network_time_target = network.groupby(["Week #", "Day of Week","Backup Start Time - Hour of Day"])["Size of Backup (GB)"].sum()
    network_time_predict_lr = network.groupby(["Week #", "Day of Week","Backup Start Time - Hour of Day"])["predicted_lr"].sum() 
    network_time_predict_rfr = network.groupby(["Week #", "Day of Week","Backup Start Time - Hour of Day"])["predicted_rfr"].sum()
    time = np.arange(1, len(network_time_target)+1)

    plt.figure()
    plt.scatter(time, network_time_target, s = 15, color = 'red', label = "Actual values over time")
    plt.scatter(time, network_time_predict_lr, s = 15, color = 'green', label = "predicted values with linear model")
    plt.xlabel('Time')
    plt.ylabel('Size of backup(GB)')
    plt.ylim(-2,12)
    plt.legend()

    # Only the first 120 aggregated time slots of the forest prediction are drawn.
    plt.figure()
    plt.plot(time[0:120], network_time_predict_rfr[0:120], label = "predicted values with random forest tree model")
    plt.legend()

    # Residuals (prediction minus target) of the linear model against its fitted values.
    plt.figure()
    plt.scatter(lr.predict(data), lr.predict(data) - target, label = "residual VS fitted values")
    plt.xlabel("fitted values")
    plt.ylabel("residual")
    plt.legend()
    plt.ylim(-0.8,0.4)
    plt.show() 
    return RMSE_LINEAR
Esempio n. 39
0
def perform_univariate_linear_regression_tests(X, y):
    """Print max-normalised F-scores and p-values of the features of ``X``.

    The report labels the first nine features (log duration, activeness,
    mean, std, min, 25th, median, 75th, max), so ``X`` needs at least nine
    columns; any further columns are scored but not printed.
    """
    f_test, p_values = f_regression(X, y)
    f_test /= np.max(f_test)

    separator = '+' * 79
    # Shared tail of both report lines; only the leading label differs.
    columns = ('log duration: {0:.4f}, activeness: {1:.4f}, mean: {2:.4f}, std: {3:.4f}, '
               'min: {4:.4f}, 25th: {5:.4f}, median: {6:.4f}, 75th: {7:.4f}, max: {8:.4f}')

    print(separator)
    print(('F1-test on ' + columns).format(*f_test))
    print(('F1 p-value on ' + columns).format(*p_values))
    print(separator)
def f_regression(df, dependent_variable, independent_variables, interaction_terms=None, model_limit=5):
    """Log univariate F-test results for the first model built from ``df``.

    The candidate models are produced by ``construct_models``; only the first
    patsy formula is turned into design matrices and tested. The F-values and
    p-values are written to the module logger.

    Args:
        df: data table the patsy formula is evaluated against.
        dependent_variable: passed through to ``construct_models``.
        independent_variables: passed through to ``construct_models``.
        interaction_terms: passed through to ``construct_models``; defaults
            to an empty list.
        model_limit: accepted for interface compatibility; not used here.

    Returns:
        None.
    """
    # This function shadows scikit-learn's f_regression at module level, so a
    # bare ``f_regression(X, y, center=True)`` call would recurse into this
    # wrapper and fail on the unexpected ``center`` keyword. Import the real
    # implementation under an alias instead.
    from sklearn.feature_selection import f_regression as sklearn_f_regression

    # Fresh list per call instead of a shared mutable default.
    if interaction_terms is None:
        interaction_terms = []

    considered_independent_variables_per_model, patsy_models = \
        construct_models(df, dependent_variable, independent_variables, interaction_terms, table_layout=MCT.ALL_VARIABLES.value)
    y, X = dmatrices(patsy_models[0], df, return_type='dataframe')

    f_test, p_values = sklearn_f_regression(X, y, center=True)
    logger.info(f_test)
    logger.info(p_values)
    return
def test_f_regression():
    """Check that the F test behaves sensibly on a simulated regression.

    The data has 20 features of which the first 5 are informative
    (``shuffle=False`` keeps them first).
    """
    features, target = make_regression(n_samples=200, n_features=20,
                                       n_informative=5, shuffle=False,
                                       random_state=0)

    f_scores, p_values = f_regression(features, target)
    # Scores are positive and p-values lie strictly inside (0, 1).
    assert_true((f_scores > 0).all())
    assert_true((p_values > 0).all())
    assert_true((p_values < 1).all())
    # Informative features are significant, the remaining ones are not extreme.
    informative, noise = p_values[:5], p_values[5:]
    assert_true((informative < 0.05).all())
    assert_true((noise > 1.e-4).all())

    # Without centering, dense and sparse inputs must agree.
    f_dense, p_dense = f_regression(features, target, center=False)
    f_sparse, p_sparse = f_regression(sparse.csr_matrix(features), target,
                                      center=False)
    assert_array_almost_equal(f_sparse, f_dense)
    assert_array_almost_equal(p_sparse, p_dense)
Esempio n. 42
0
    def test_f_regression(self):
        """The ModelFrame accessor must match ``fs.f_regression`` on diabetes."""
        diabetes = datasets.load_diabetes()
        frame = pdml.ModelFrame(diabetes)

        actual = frame.feature_selection.f_regression()
        expected_f, expected_p = fs.f_regression(diabetes.data,
                                                 diabetes.target)

        # The accessor returns the (F-values, p-values) pair.
        self.assertEqual(len(actual), 2)
        self.assert_numpy_array_almost_equal(actual[0], expected_f)
        self.assert_numpy_array_almost_equal(actual[1], expected_p)
def _k_best_indeces(data, targets, selection_method, k):
    """get indices for the k best features depending on the scores"""
    assert k > 0
    if selection_method == 'linear':
        scores, _ = f_regression(data, targets)
    elif selection_method == 'forest':
        rfr_sel = RandomForestRegressor(compute_importances=True, random_state=0)
        scores = rfr_sel.fit(data, targets).feature_importances_
    assert not (scores < 0).any()
    assert len(scores) >= k
    scores[np.isnan(scores)] = 0
    return np.argsort(scores)[-k:]
def test_f_regression():
    """
    Check that the F test gives meaningful results on a simple simulated
    regression problem whose first 5 of 20 features are informative.
    """
    features, target = make_regression(
        n_samples=200,
        n_features=20,
        n_informative=5,
        shuffle=False,
        random_state=0,
    )

    f_scores, p_values = f_regression(features, target)

    # Positive scores, p-values strictly between 0 and 1.
    assert (f_scores > 0).all()
    assert (p_values > 0).all()
    assert (p_values < 1).all()

    informative, noise = p_values[:5], p_values[5:]
    assert (informative < 0.05).all()
    assert (noise > 1.0e-4).all()
Esempio n. 45
0
def scalePCAcorrelate(df_numerical, df_w_mdata, metadata_cols, transformed):
    """Relate the first 100 principal components to each metadata column.

    PCA is fitted on the transposed values of ``df_numerical`` (standardised
    first unless ``transformed`` is true). For every column in
    ``metadata_cols`` the component with the largest Pearson correlation and
    the component with the smallest ``f_regression`` p-value are recorded.

    Args:
        df_numerical: numeric DataFrame; its rows line up with the rows of
            ``df_w_mdata`` (each component has one loading per row).
        df_w_mdata: DataFrame holding the metadata columns.
        metadata_cols: names of the metadata columns to test.
        transformed: if true, skip the StandardScaler step.

    Returns:
        tuple: ``(final_return, final_f_return)``; the correlation-based
        and the F-test-based summary DataFrames, indexed by metadata column,
        with rows whose Correlation / Pvalue is null removed.
    """
    # Number of rows of the input; equals the length of each PCA component.
    rv = df_numerical.shape[0]
    if transformed:
        X_std2 = df_numerical.values.T
    else:
        X_std2 = StandardScaler().fit_transform(df_numerical.values.T)

    # rows_n and cols_n are not used below.
    rows_n, cols_n = X_std2.shape
    print "\nPerforming PCA"
    pca2 = PCA(n_components=100, random_state=42)
    pca2.fit(X_std2)
    no1 = pca2.explained_variance_ratio_[0]
    no2 = pca2.explained_variance_ratio_[1]
    print "Top two components explain {} and {} of variance.".format(no1, no2)
    # Accumulators, one entry per metadata column.
    all_cors, p_comp_n, exp_vars, corr_ps = [], [], [], []
    all_pvals, p_comp_nF, exp_vars2 = [], [], []
    for mdata in metadata_cols:
        md_arr = np.array(df_w_mdata[mdata])
        # range(100) matches n_components=100 above.
        raw_corrs = [ss.pearsonr(pca2.components_[i, :], md_arr) for i in range(100)]
        corrs, c_pvals = zip(*raw_corrs)
        
        # NOTE(review): non-finite values are replaced only after the Pearson
        # correlations were computed, so only the F-test below sees the
        # zero-filled array; confirm this ordering is intended.
        if not np.all(np.isfinite(md_arr)):
            print "Replacing {} not finite # with 0".format((~np.isfinite(md_arr)).sum())        
            md_arr[~np.isfinite(md_arr)] = 0
                   
        # p-value of a univariate F-test of each single component against the metadata.
        pvals = [f_regression(pca2.components_[i, :].reshape(rv, 1), md_arr)[1][0] for i in range(100)]
        all_pvals.append(np.array(pvals).min())
        # Largest signed correlation (not the largest absolute value).
        all_cors.append(np.array(corrs).max())
        pca_comp_no = np.argmax(np.array(corrs))
        corr_ps.append(np.array(c_pvals)[pca_comp_no])
        pca_comp_no2 = np.argmin(np.array(pvals))
        # Component numbers are reported 1-based.
        p_comp_n.append(pca_comp_no+1)
        p_comp_nF.append(pca_comp_no2+1)
        exp_vars.append(pca2.explained_variance_ratio_[pca_comp_no])
        exp_vars2.append(pca2.explained_variance_ratio_[pca_comp_no2])
    # One row per metadata column, one column per recorded statistic.
    data_ = np.vstack((all_cors, p_comp_n, exp_vars, corr_ps)).T
    data_2 = np.vstack((all_pvals, p_comp_nF, exp_vars2)).T

    colset = ['Correlation', 'Component', 'Explained Variance', 'P-value']
    colset2 = ['Pvalue', 'Component_F', 'Explained Variance_F']
    to_return = pd.DataFrame(data=data_, index=metadata_cols, columns=colset)
    f_to_return = pd.DataFrame(data=data_2, index=metadata_cols, columns=colset2)
    f_to_return.sort_values(['Component_F', 'Pvalue'], 
                          ascending=[True, True],
                          inplace=True)
    to_return.sort_values(['Component', 'Correlation'], 
                          ascending=[True, False],
                          inplace=True)
    # Drop metadata columns for which no statistic could be computed.
    final_return = to_return[to_return.Correlation.notnull()]
    final_f_return = f_to_return[f_to_return.Pvalue.notnull()]
    return final_return, final_f_return
Esempio n. 46
0
def process_window_dir(window_dir, model, features):
    """Load one window directory, report selected features and score a model.

    Args:
        window_dir: directory passed to ``load_files``.
        model: estimator class (called with no arguments); must accept an
            ``alpha`` parameter via ``set_params``.
        features: passed through to ``load_files``.

    Returns:
        list of cross-validation scores, one per fold and alpha; or None
        when ``only_feature_selection`` is enabled.
    """
    print 'Processing', window_dir
    alphas = [0.025]
    # Hard-coded switch: with False, the F-statistic report branch below never runs.
    only_feature_selection = False
    X, y = load_files(window_dir, features)
    X = scale(X)

    from sklearn.feature_selection import SelectKBest, f_regression

    # Report the 20 features with the best univariate F-scores.
    featureSelector = SelectKBest(score_func=f_regression,k=20)
    featureSelector.fit(X,y)
    print 'Selected features', [FEATURE_NAMES[i] for i in list(featureSelector.get_support(indices=True))]

    if only_feature_selection:
        baseline_mean(X,y)
        baseline_zero(X,y)

        F, pval = f_regression(X, y)
        for i,f in enumerate(F):
            if i < len(FEATURES):
                name = FEATURES[i]
            else:
                # NOTE(review): the offset 11 presumably equals len(FEATURES); confirm.
                name = NLP_FEATURES[i-11]
            print 'F-Statistic for %s is %f with p-value %f' % (name, f, pval[i])

        return None

    else:
        scores = []
        print 'sum of y is %d' % sum(y)
        # Quick sanity check: fit on the first 30 samples, score on the next 30.
        clf = model()
        clf.fit(X[:30], y[:30])
        print 'small score is %g' % clf.score(X[30:60], y[30:60])

        # K-fold cross_validation
        # NOTE(review): sample count + n_folds looks like the legacy
        # sklearn.cross_validation.KFold signature; confirm the sklearn version.
        kf = cross_validation.KFold(X.shape[0], n_folds=10, shuffle=True)
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            for alpha in alphas:
                # A fresh estimator per fold and alpha.
                clf = model()
                clf.set_params(alpha=alpha)
                clf.fit(X_train, y_train)
                score = clf.score(X_test, y_test)
                scores.append(score)
                print '{} score is {} when alpha is {}'.format(model.__name__, score, alpha)
    return scores
Esempio n. 47
0
def linear_regression(data, target):
    """Evaluate a linear regression with 10 folds and plot diagnostics.

    Plots the per-fold RMSE, actual vs. fitted values and residuals vs.
    fitted values, then calls ``plt.show()``.

    Args:
        data: feature array, row-indexable with integer index arrays.
        target: target array aligned with ``data``.

    Returns:
        list: RMSE of the model on each fold.
    """
    lr = linear_model.LinearRegression(normalize = True)
    # NOTE(review): sample count + n_folds looks like the legacy
    # sklearn.cross_validation.KFold signature; confirm the sklearn version.
    kf = KFold(len(target), n_folds=10, shuffle=True, random_state=None)
    RMSE_LINEAR = []
    for train_index, test_index in kf:
        data_train, data_test = data[train_index], data[test_index]
        target_train, target_test = target[train_index], target[test_index]
        lr.fit(data_train, target_train)
        # Root-mean-square error on the held-out fold.
        rmse_linear = sqrt(np.mean((lr.predict(data_test) - target_test) ** 2))
        RMSE_LINEAR.append(rmse_linear)
    
    #scores = cross_validation.cross_val_score(rfr,data_test, target_test.ravel, cv=10)
    #print np.mean(scores)
    
    # Uses the last fold's test data and the model fitted on the last fold;
    # the F-test is run against the model's own predictions, not the targets.
    F, pval = f_regression(data_test, lr.predict(data_test))
    print(pval)
    
    # Fold numbers 1..10 for the x axis; assumes exactly ten folds ran.
    test_times = np.arange(1,11)
    plt.figure()
    plt.plot(test_times, RMSE_LINEAR, label = "RMSE in linear regression with 10-fold cv")
#    plt.ylim(0.0, 0.12)
    #plt.title("RMSE comparison between linear regression and random forest regression")
    plt.xlabel("cross validation times")
    plt.ylabel("RMSE")
    plt.legend()

    # Fitted values over the whole data set (model from the last fold).
    predicted = lr.predict(data);
    index = np.arange(1, len(predicted)+1)

    plt.figure()
    plt.scatter(index, target, s = 15, color = 'red', label = "Actual")
    plt.scatter(index, predicted, s = 15, color = 'green', label = "Fitted")
    plt.xlabel('Index')
    plt.ylabel('MEDV')
    plt.legend()


    # Residuals (prediction minus target) against fitted values.
    plt.figure()
    plt.scatter(predicted,predicted-target,label = "residual VS fitted values")
    plt.xlabel("fitted values")
    plt.ylabel("residual")
    plt.legend()
#    plt.ylim(-0.8,0.4)
    plt.show() 
    return RMSE_LINEAR
Esempio n. 48
0
def f_regression_select(X, y, maxf=300, pvals=True, names=None, verbose=0,
                        old_idx_sel=None):
    """Select up to ``maxf`` features ranked by uncentered F-statistic.

    Args:
        X: 2D feature matrix.
        y: regression target.
        maxf: maximum number of features to keep.
        pvals: when true, drop features whose p-value is not below 0.05.
        names: feature names; defaults to ``f_1`` ... ``f_n``.
        verbose: 0 is silent, 1 prints the ranked tuples to stderr, 2 or
            more prints one formatted row per feature to stderr.
        old_idx_sel: index to report for each column of ``X``; defaults to
            the column positions.

    Returns:
        Integer numpy array with the selected indices, best feature first.
    """
    n_features = X.shape[1]
    if names is None:
        names = ["f_%d" % (col + 1) for col in range(n_features)]
    if not old_idx_sel:
        old_idx_sel = range(n_features)

    f_values, p_values = f_regression(X, y, center=False)

    # One (F-value, p-value, col, name) tuple per feature.
    ranked = [(f_values[col], p_values[col], old_idx_sel[col], names[col])
              for col in range(n_features)]
    if pvals:
        ranked = [entry for entry in ranked if entry[1] < 0.05]
    ranked.sort(reverse=True)

    top = ranked[:maxf]
    idx_sel = [entry[2] for entry in top]

    if verbose > 1:
        # Tabular report: one row per feature, long lists are elided.
        def show(position):
            row = top[position]
            print("%10s %10.2f %15g %10d" % (row[3],
                  row[0], row[1], row[2]), file=sys.stderr)

        print("F_select(%d):" % len(top), file=sys.stderr)
        shown = min(len(top), maxf)
        head = 90 if shown > 100 else shown
        for position in range(0, head):
            show(position)
        if shown > 100:
            print("...", file=sys.stderr)
            for position in range(len(top) - 10, len(top)):
                show(position)
    elif verbose > 0:
        # Compact report: raw tuples, long lists are elided.
        if min(maxf, len(top)) > 100:
            print("F_select(%d):" % len(top), top[:90], "...", top[-10:],
                  file=sys.stderr)
        else:
            print("F_select(%d):" % len(top), top[:maxf], file=sys.stderr)

    return np.asarray(idx_sel, dtype=int)
Esempio n. 49
0
def RankFeatures(X,Y,names):
    """Rank features with several methods and print a comparison table.

    Each method's scores are converted with ``rank_to_dict`` (defined
    elsewhere) and stored per method; the mean over all methods, rounded to
    three decimals, is added as ``"Mean"``. The per-feature means and a
    tab-separated table are printed; nothing is returned.

    Args:
        X: feature matrix.
        Y: target; reshaped to one dimension.
        names: feature names, one per column of ``X``.
    """
    Y = Y.reshape(len(Y),)
    ranks = {}

    linear = LinearRegression(normalize=True)
    linear.fit(X, Y)
    ranks["Linear.reg"] = rank_to_dict(np.abs(linear.coef_), names)

    ridge_model = Ridge(alpha=7)
    ridge_model.fit(X, Y)
    ranks["Ridge"] = rank_to_dict(np.abs(ridge_model.coef_), names)

    lasso_model = Lasso(alpha=.05)
    lasso_model.fit(X, Y)
    ranks["Lasso"] = rank_to_dict(np.abs(lasso_model.coef_), names)

    stability = RandomizedLasso(alpha=0.04)
    stability.fit(X, Y)
    ranks["Stability"] = rank_to_dict(np.abs(stability.scores_), names)

    # Stop the search when 5 features are left (they will get equal scores).
    eliminator = RFE(linear, n_features_to_select=5)
    eliminator.fit(X, Y)
    rfe_ranking = [float(rank) for rank in eliminator.ranking_]
    ranks["RFE"] = rank_to_dict(rfe_ranking, names, order=-1)

    forest = RandomForestRegressor()
    forest.fit(X, Y)
    ranks["RF"] = rank_to_dict(forest.feature_importances_, names)

    f_scores, _ = f_regression(X, Y, center=True)
    ranks["Corr."] = rank_to_dict(f_scores, names)

    # Mean score of every feature over all methods collected so far.
    mean_ranks = {}
    for name in names:
        per_method = [ranks[method][name] for method in ranks]
        mean_ranks[name] = float(str(round(np.mean(per_method), 3)))
    print(mean_ranks)

    methods = sorted(ranks)
    ranks["Mean"] = mean_ranks
    methods.append("Mean")

    print ("\t%s" % "\t".join(methods))
    for name in names:
        cells = "\t".join(str(ranks[method][name]) for method in methods)
        print ("%s\t%s" % (name, cells))
Esempio n. 50
0
	def provideSuggestion(self):
		"""Suggest predictor variables for the target chosen in the combo box.

		Every other column of ``self.data`` is tested against the target with a
		univariate F-test; variables with a p-value below 0.05 are stored in
		``self.SelectedVariables``, the rest in ``self.nonSelectedVariables``,
		and both tables are rebuilt.
		"""
		y = str(self.comboBox.currentText())
		# NOTE(review): xList is the same list object as self.headerName, so
		# remove() also drops the target from self.headerName itself; confirm
		# this is intended (nonSelectedVariables below depends on it).
		xList = self.headerName
		xList.remove(y)


		ddf = self.data[xList]
		#x=["COST","Gender",'Age','Education']
		#y=["CLICKS"]
		# vardict records, for each model column, the original variable it
		# belongs to (presumably via LazyDict.keylist(keys, value); verify).
		vardict = LazyDict()
		categorical = []
		nonCategorical = []
		for i in xList:
			if self.runnable:
				# Object-typed columns are treated as categorical.
				if ddf[i].dtype=="object":
					categorical.append(i)
				else:
					vardict.keylist([i], i)
					nonCategorical.append(i)
		df = self.data[nonCategorical]

		for j in categorical:
			dummy_b = pd.get_dummies(ddf[j],prefix=j)
			dummy_columns = dummy_b.columns
			# The first dummy column is left out; the remaining ones are added to df.
			cols = list(dummy_columns[1:len(dummy_columns)])
			vardict.keylist(cols, j)
			df[cols] = dummy_b[dummy_columns[1:len(dummy_columns)]]

		variables =  list(df.columns)
		X = df.as_matrix()
		Y = self.data[y].as_matrix()
		F, pval = feature_selection.f_regression(X, Y)
		final_variables = []
		for i in range(0,len(pval)):
			# Keep a variable once if any of its columns is significant at 5%.
			if(pval[i]<0.05):
				if vardict[variables[i]] not in final_variables:
					final_variables.append(vardict[variables[i]])

		self.SelectedVariables = final_variables
		self.nonSelectedVariables = [x for x in self.headerName if x not in final_variables]
		print self.SelectedVariables
		print self.nonSelectedVariables
		self.createNonselectedTable()
		self.createSelectedTable()
Esempio n. 51
0
def test_bias():
    """
    With a single all-ones covariate, both covariate-aware F-tests must
    reproduce scikit-learn's centered ``f_regression``.
    """
    from sklearn.feature_selection import f_regression

    S, y = get_example_data()
    unit_covariate = np.ones((len(y),1))

    f_ref, p_ref = f_regression(S, y, center=True)
    f_cov, p_cov = f_regression_cov(S, unit_covariate, y)
    f_alt, p_alt = f_regression_cov_alt(S, unit_covariate, y)

    # F-values first, then p-values, each compared pairwise.
    comparisons = (
        (f_ref, f_cov),
        (f_cov, f_alt),
        (p_ref, p_cov),
        (p_cov, p_alt),
    )
    for left, right in comparisons:
        np.testing.assert_array_almost_equal(left, right)
Esempio n. 52
0
def getTopFeatures(train_x, train_y, n_features=100):
    """Return the indices of the features with the largest F-statistic.

    Args:
        train_x: feature matrix.
        train_y: regression target.
        n_features: number of feature indices to return. If fewer features
            are available, all of them are returned.

    Returns:
        list of int: column indices ordered from highest to lowest F-value;
        ties keep their original column order.
    """
    f_val, _ = f_regression(train_x, train_y)

    # NaN scores count as 0.0 so they rank with the weakest features.
    scores = [0.0 if math.isnan(value) else value for value in f_val]

    # sorted() is stable, so equal scores stay in column order even with
    # reverse=True. Works on both Python 2 and 3 (no dict.iteritems()).
    ranked = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)

    return ranked[:max(n_features, 0)]
Esempio n. 53
0
def select_bestwords(D, y, nmax = 100, is_classif=True):
    """Select the ``nmax`` words of D (list of dicts) best correlated with y.

    Words are scored with ``f_classif`` when ``is_classif`` is true and with
    ``f_regression`` otherwise; only words with a p-value below 0.05 are
    considered. Returns a set of words; the set is empty when vectorizing D
    raises ``ValueError``.
    """
    y = np.asarray(y)
    vectorizer = DictVectorizer(sparse=True)
    try:
        X = vectorizer.fit_transform(D)
    except ValueError:
        logger.warning("===Except*** in select_bestwords D:%d y:%d",len(D),len(y))
        return set()

    score_func = f_classif if is_classif else f_regression
    f_values, p_values = score_func(X, y)
    words = vectorizer.get_feature_names()

    # (F-value, p-value, word) for the significant words, best first.
    significant = sorted(
        ((f_values[i], p_values[i], words[i])
         for i in range(len(words))
         if p_values[i] < 0.05),
        reverse=True)
    logger.debug("select_bestwords:%s",significant[:16])
    return {entry[2] for entry in significant[:nmax]}
Esempio n. 54
0
def train_f_selection(features_filename, targets_filename, model_name):
	"""Do feature selection using an F-test and fit a Ridge model per output.

	For every output column, features are ranked by F-test p-value, Ridge
	models are cross-validated on the top-k features for several k, and the
	best one (by cross-validated score) is stored as a row of the weight
	matrix. A checkpoint is saved every 100 outputs and the final model is
	saved as ``model_name + '.h5'``.

	Args:
		features_filename: passed to ``read_inputs``.
		targets_filename: passed to ``read_inputs``.
		model_name: base name for checkpoints and the final model file.
	"""
	print('Training model', model_name)

	# Read features and targets
	Xs, y = read_inputs(features_filename, targets_filename)

	# Set regularization parameters and number of features
	regularization_params = {'alpha': np.logspace(2.5, 5, 25)}
	ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000, 2500, 5000]

	# Initialize containers for weights and bias
	num_features = Xs.shape[1]
	num_outputs = y.shape[1]
	weights = np.zeros((num_outputs, num_features))
	bias = np.zeros(num_outputs)

	# And for bookkeeping
	best_models = {'R2': np.full(num_outputs, -np.inf),
				   'alpha': np.zeros(num_outputs), 'k': np.zeros(num_outputs)}

	# Over every output element
	for i in range(num_outputs):
		y_i = y[:, i]

		# Rank features by p-value (most significant first)
		_, p_values = f_regression(Xs, y_i)
		sorted_indices = p_values.argsort()

		# Number of features with p < 0.01 and p < 0.001 (skipped when zero)
		p_value_ks = [(p_values < 0.01).sum(), (p_values < 0.001).sum()]
		p_value_ks = [x for x in p_value_ks if x!=0]

		# Train models with different number of features
		for k in sorted(ks + p_value_ks):
			# Select best k features
			selected_features = sorted_indices[:k]
			Xs_select = Xs[:, selected_features]

			# Train model (searching for best regularization parameter)
			cv = GridSearchCV(Ridge(), regularization_params, cv=5, n_jobs=-1).fit(Xs_select, y_i)
			model = cv.best_estimator_

			# If best model yet, store it
			if cv.best_score_ > best_models['R2'][i]:
				weights[i, :] = 0
				weights[i, selected_features] = model.coef_
				bias[i] = model.intercept_
				best_models['R2'][i] = cv.best_score_
				best_models['alpha'][i] = model.alpha
				best_models['k'][i] = k

		# Report and save checkpoint
		if i%100 == 0:
			# Include the output that was just trained in the report.
			done = i + 1
			print(done, 'out of', num_outputs)
			print('R2:', best_models['R2'][:done])
			print('Alphas:', best_models['alpha'][:done])
			print('Ks:', best_models['k'][:done])

			print('Saving checkpoint...')
			checkpoint_name = model_name + '_' + str(i) + '.h5'
			save_linear_model(checkpoint_name, weights, bias)

	# Print final cross-validation results
	print('Final results')
	# All outputs are done here; do not rely on the loop variable, which is
	# undefined when there are no outputs.
	print(num_outputs, 'out of', num_outputs)
	print('R2:', best_models['R2'])
	print('Average R2:', best_models['R2'].mean())
	print('Alphas:', best_models['alpha'])
	print('Ks:', best_models['k'])
	print('Average K:', best_models['k'].mean())

	# Save model
	save_linear_model(model_name + '.h5', weights, bias)
Esempio n. 55
0
from sklearn import ensemble
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn import svm


# Training window: tweets from 2015-01-17 16:00 to 2015-02-01 16:00.
train_feature,train_target=prj4.feature_seclection(file_str='./tweet_data/tweets_#patriots.txt',start_year=2015,start_month=1,start_day=17,start_hour=16,
                        end_year=2015, end_month=2, end_day=1, end_hour=16)

test_feature,test_target,startmon,startday,starthour=prj4.sample_seclection (file_str='./test_data/sample5_period1.txt')

# Fit the selector once so that the chosen columns can be read back from it
# instead of being rediscovered by comparing column values.
selector = SelectKBest(f_regression, k=10).fit(train_feature, train_target)
new_train_feature = selector.transform(train_feature)
[F,p_value]=f_regression(train_feature, train_target)

[a,b]=np.shape(new_train_feature)
[m,n]=np.shape(train_feature)

# Indices of the selected columns in the original feature matrix.
index = list(selector.get_support(indices=True))

# Apply the same column selection to the test features. The first selected
# column is index[0], which is not necessarily column 0.
new_test_feature = test_feature[:, index]
    
 def _get_column_f_regression_scores(self):
   """Return the F-statistic of every column of ``self.X_no_nan`` against ``self.y``.

   The computation is bracketed by ``misc.start`` / ``misc.stop`` calls
   (presumably timing instrumentation; defined elsewhere).
   """
   label = '_get_column_f_regression_scores'
   misc.start(label)
   f_scores, _ = feature_selection.f_regression(self.X_no_nan, self.y)
   misc.stop(label)
   return f_scores
    # a better choice of alpha:
    # Stop the user warnings outputs- they are not necessary for the example
    # as it is specifically set up to be challenging.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', UserWarning)
        warnings.simplefilter('ignore', ConvergenceWarning)
        lars_cv = LassoLarsCV(cv=6).fit(X, y)

    # Run the RandomizedLasso: we use a paths going down to .1*alpha_max
    # to avoid exploring the regime in which very noisy variables enter
    # the model
    alphas = np.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0], 6)
    clf = RandomizedLasso(alpha=alphas, random_state=42).fit(X, y)
    trees = ExtraTreesRegressor(100).fit(X, y)
    # Compare with F-score
    F, _ = f_regression(X, y)

    plt.figure()
    for name, score in [('F-test', F),
                        ('Stability selection', clf.scores_),
                        ('Lasso coefs', np.abs(lars_cv.coef_)),
                        ('Trees', trees.feature_importances_),
                        ]:
        precision, recall, thresholds = precision_recall_curve(coef != 0,
                                                               score)
        plt.semilogy(np.maximum(score / np.max(score), 1e-4),
                     label="%s. AUC: %.3f" % (name, auc(recall, precision)))

    plt.plot(np.where(coef != 0)[0], [2e-4] * n_relevant_features, 'mo',
             label="Ground truth")
    plt.xlabel("Features")
Esempio n. 58
0
        w = min(W,dx)
        image(temppath,imgx,imgy,width=w)
        imgy = imgy + dy + 20
        os.remove(temppath)
        size(W, HEIGHT+dy+40)
else:
    def pltshow(mplpyplot):
        mplpyplot.show()
# nodebox section end


# Synthetic data: y depends linearly on x_1, sinusoidally on x_2 and not at
# all on x_3, plus Gaussian noise.
np.random.seed(0)
X = np.random.rand(1000, 3)
y = X[:, 0] + np.sin(6 * np.pi * X[:, 1]) + 0.1 * np.random.randn(1000)

# Both scores are scaled so that the best feature gets 1.
f_test = f_regression(X, y)[0]
f_test /= f_test.max()

mi = mutual_info_regression(X, y)
mi /= mi.max()

# One scatter plot per feature, titled with both normalised scores.
plt.figure(figsize=(15, 5))
for i in range(3):
    feature_number = i + 1
    plt.subplot(1, 3, feature_number)
    plt.scatter(X[:, i], y, edgecolor='black', s=20)
    plt.xlabel("$x_{}$".format(feature_number), fontsize=14)
    if feature_number == 1:
        plt.ylabel("$y$", fontsize=14)
    title = "F-test={:.2f}, MI={:.2f}".format(f_test[i], mi[i])
    plt.title(title, fontsize=16)
# plt.show()
Esempio n. 59
0
 def score(self, X, y):
     """Return the univariate F-statistic of each feature of X against y."""
     f_values, _ = skl_fss.f_regression(X, y)
     return f_values