Example 1
def cv_mean_std_array(X, y, alphas, ks, n_a, n_k, cv=20):
    n = n_a * n_k
    cv_mean = np.empty(n)
    cv_std = np.empty(n)
    regressors = pd.DataFrame()

    binarizer = Binarizer(threshold=1400)
    y_binary = binarizer.transform(y).transpose().ravel()

    itt_counter = 0
    print 'size n_a: %d n_k: %d' %(n_a, n_k)
    for i in range(0, n_a):
        print 'reg. column : %d' %(i*n_k)
        temp_string = 'alpha=%f' %alphas[i*n_k]
        print temp_string
        print regressors.shape
        df_temp = pd.DataFrame()
        print 'computing for alpha = %f' %(alphas[i*n_k])
        X_lasso, df_temp[temp_string] = df_Lasso(X, y, alphas[i*n_k])
        regressors = pd.concat([regressors, df_temp], ignore_index=True, axis=1)
        for j in range(0, n_k):
            print 'i:%d, j:%d' %(i, j)
            print 'computing for alpha = %f and k = %f' %(alphas[i*n_k+j], ks[i*n_k+j])
            print 'X_lasso shape:'
            print X_lasso.shape
            cv_mean[i*n_k+j], cv_std[i*n_k+j] = knn_cv_mean_and_std(X_lasso, y_binary, alphas[i*n_k+j], ks[i*n_k+j], cv=cv)
            itt_counter = itt_counter + 1
            print 'completed %dth iteration of knn cv mean:%f std:%f, at pos:%d' % (itt_counter, cv_mean[i*n_k+j], cv_std[i*n_k+j], i*n_k+j)
    return cv_mean, cv_std, regressors
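# Note: df_Lasso and knn_cv_mean_and_std are defined elsewhere and are not part of
# this snippet. Below is a minimal sketch of what knn_cv_mean_and_std might look like,
# assuming it simply cross-validates a KNeighborsClassifier on the Lasso-selected
# features and returns the mean and standard deviation of the fold scores (the
# signature is taken from the call above; the body is an assumption).
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

def knn_cv_mean_and_std(X, y, alpha, k, cv=20):
    # alpha is accepted only to match the call site; the Lasso step that uses it
    # happens in df_Lasso before this function is called
    knn = KNeighborsClassifier(n_neighbors=int(k))
    scores = cross_val_score(knn, X, y, cv=cv)
    return np.mean(scores), np.std(scores)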
Example 2
def initialize():
    images, labels = load_mnist_data()

    binarizer = Binarizer().fit(images)
    images_binarized = binarizer.transform(images)

    knn = KNeighborsClassifier(n_neighbors=3, metric='jaccard')
    knn.fit(images_binarized, labels)

    return knn
def cv_mean_std_array(X, y, alphas, n_a, cv=20):
    binarizer = Binarizer(threshold=1400)
    y_binary = binarizer.transform(y).transpose().ravel()
    cv_ols_means, cv_ols_stds, cv_lasso_means, cv_lasso_stds, cv_ridge_means, cv_ridge_stds = np.empty(n_a), np.empty(n_a), np.empty(n_a), np.empty(n_a), np.empty(n_a), np.empty(n_a)

    for i in range(0, n_a):
        print 'computing for alpha=%f' %alphas[i]
        # y_binary is assumed as the target argument here (it was blank in the source)
        cv_ols_means[i], cv_ols_stds[i], cv_lasso_means[i], cv_lasso_stds[i], cv_ridge_means[i], cv_ridge_stds[i] = lm_cv_mean_and_std(X, y_binary, alphas[i])
        print 'successfully computed iteration %d' %i
    return cv_ols_means, cv_ols_stds, cv_lasso_means, cv_lasso_stds, cv_ridge_means, cv_ridge_stds
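# lm_cv_mean_and_std is likewise defined elsewhere. A hypothetical sketch, assuming it
# cross-validates ordinary least squares, Lasso and Ridge at the given alpha and returns
# the mean and standard deviation of the scores for each model (reconstructed only from
# the call site above):
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import cross_val_score

def lm_cv_mean_and_std(X, y, alpha, cv=20):
    results = []
    for model in (LinearRegression(), Lasso(alpha=alpha), Ridge(alpha=alpha)):
        scores = cross_val_score(model, X, y, cv=cv)
        results.extend([np.mean(scores), np.std(scores)])
    # ols_mean, ols_std, lasso_mean, lasso_std, ridge_mean, ridge_std
    return tuple(results)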
def binarizeMatrix(dataMatrix, threshold):
    """
    Binarizes the inputs to 0/1: values at or below `threshold` map to 0 and values above it map to 1.
    """

    binarizer = Binarizer(threshold=threshold)

    dataMatrix = binarizer.fit_transform(dataMatrix)

    return dataMatrix
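# Illustrative usage (not part of the original snippet), assuming numpy and sklearn's
# Binarizer are available at module level as binarizeMatrix above requires:
import numpy as np
from sklearn.preprocessing import Binarizer

example = np.array([[0.2, 0.8],
                    [0.5, 0.1]])
print(binarizeMatrix(example, threshold=0.4))
# [[0. 1.]
#  [1. 0.]]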
def test_binarizer():
    X_ = np.array([[1, 0, 5], [2, 3, 0]])

    for init in (np.array, sp.csr_matrix, sp.csc_matrix):

        X = init(X_.copy())

        binarizer = Binarizer(threshold=2.0, copy=True)
        X_bin = toarray(binarizer.transform(X))
        assert_equal(np.sum(X_bin == 0), 4)
        assert_equal(np.sum(X_bin == 1), 2)
        X_bin = binarizer.transform(X)
        assert_equal(type(X), type(X_bin))

        binarizer = Binarizer(copy=True).fit(X)
        X_bin = toarray(binarizer.transform(X))
        assert_true(X_bin is not X)
        assert_equal(np.sum(X_bin == 0), 2)
        assert_equal(np.sum(X_bin == 1), 4)

        binarizer = Binarizer(copy=True)
        X_bin = binarizer.transform(X)
        assert_true(X_bin is not X)
        X_bin = toarray(X_bin)
        assert_equal(np.sum(X_bin == 0), 2)
        assert_equal(np.sum(X_bin == 1), 4)

        binarizer = Binarizer(copy=False)
        X_bin = binarizer.transform(X)
        assert_true(X_bin is X)
        X_bin = toarray(X_bin)
        assert_equal(np.sum(X_bin == 0), 2)
        assert_equal(np.sum(X_bin == 1), 4)
def test_binarizer_vs_sklearn():
    # Compare msmbuilder.preprocessing.Binarizer
    # with sklearn.preprocessing.Binarizer

    binarizerr = BinarizerR()
    binarizerr.fit(np.concatenate(trajs))

    binarizer = Binarizer()
    binarizer.fit(trajs)

    y_ref1 = binarizerr.transform(trajs[0])
    y1 = binarizer.transform(trajs)[0]

    np.testing.assert_array_almost_equal(y_ref1, y1)
Example 7
def wine_quality_white():
    # white wine quality dataset

    filename = '../../data/raw/mldata/winequality-white.csv'

    # The data corresponds to the first 11 columns of the csv file
    data = np.loadtxt(filename, usecols=tuple(range(11)), delimiter=';', dtype=float)
    # Read the label
    # We need to binarise the label using a threshold at 4
    bn = Binarizer(threshold=4)
    label = bn.fit_transform(np.loadtxt(filename, usecols=(11,), delimiter=';', dtype=int))
    # We need to invert the label -> 1 becomes 0 and 0 becomes 1
    label = np.ravel(np.abs(label - 1))
    
    np.savez('../../data/clean/uci-wine-quality-white.npz', data=data, label=label)
Example 9
def do_logreg():
    from sklearn.preprocessing import Binarizer, scale
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score,classification_report
    from sklearn.cross_validation import train_test_split
    from sklearn.cross_validation import cross_val_score
    from sklearn.grid_search import GridSearchCV
    from scipy.stats import expon
    import pandas
    import numpy as np

    nfolds = 10  # number of CV folds (value assumed; not defined in this snippet)
    ### load data
    col_names=['mpg','cylinders','displacement','horsepower','weight',
               'acceleration','model_year','origin','car_name']
    df=pandas.read_csv('auto_mpg.csv')
    df.columns=col_names
    df=df.drop('car_name',1)
    
    lr=LogisticRegression()
    bn=Binarizer(threshold=df['mpg'].mean())
    print "Performing binarization of the mpg variable into above/below average classes"
    target=bn.fit_transform(df['mpg'])
    data=df.drop('mpg',1)
    data=scale(data)
    print "Splitting into training and test sets"
    data_train,data_test,target_train,target_test=train_test_split(data,target,test_size=0.5,random_state=0)

    grid=[0.001, 0.01, 0.1, 1, 10, 100, 1000]
    print 'Searching for optimal C in {} using {}-fold validation on test set '.format(grid,nfolds)
    tuned_parameters=[{'C':grid}]
    clf=GridSearchCV(lr,tuned_parameters,cv=nfolds,scoring='accuracy')
    clf.fit(data_train,target_train)
    for params, mean_score,_ in clf.grid_scores_:
        print "{}: Mean accuracy {}".format(params,mean_score)

    
    print  """Cross-validating above/below average mpg prediction
        using {}-fold validation on the test dataset.
        Using the best estimator: {}
        """.format(nfolds,clf.best_estimator_)
        
    mean_cross=np.mean(cross_val_score(clf.best_estimator_,data_test,target_test,cv=nfolds))

    print "Mean cross-validated accuracy after optimization is: {}".format(mean_cross)
Example 10
def us_crime():
    # US crime dataset

    filename = '../../data/raw/mldata/communities.data'

    # Missing data will be considered as NaN
    # Only use the 122 continuous features
    tmp_data = np.genfromtxt(filename, delimiter = ',')
    tmp_data = tmp_data[:, 5:]

    # replace missing value by the mean
    imp = Imputer(verbose = 1)
    tmp_data = imp.fit_transform(tmp_data)

    # extract the data to be saved
    data = tmp_data[:, :-1]
    bn = Binarizer(threshold=0.65)
    label = np.ravel(bn.fit_transform(tmp_data[:, -1]))

    np.savez('../../data/clean/uci-us-crime.npz', data=data, label=label)
  return loss_value, tape.gradient(loss_value, model.trainable_variables)

if __name__ == '__main__':
    here = os.path.dirname(os.path.abspath(__file__))
    #Import mnist dataset
    mnist = tf.keras.datasets.mnist

    #Split in test and train
    (x_train, y_train), (x_test, y_test) = mnist.load_data()

    #Scale entries between(0,1)
    x_train = x_train/255
    x_test = x_test/255

    #Binarize pictures
    binarizer = Binarizer(threshold=0.5)
    x_train_binary = np.array([binarizer.fit_transform(slice) for slice in x_train])
    x_test_binary = np.array([binarizer.fit_transform(slice) for slice in x_test])

    #reshape pictures to be vectors and fix datatype
    x_train_binary = x_train_binary.reshape(x_train_binary.shape[0],-1).astype(np.float32)
    x_test_binary = x_test_binary.reshape(x_test_binary.shape[0],-1).astype(np.float32)

    one_hot_labels = np.zeros((60000,10))
    one_hot_labels[np.arange(60000), y_train] = 1

    dataset_train = tf.data.Dataset.from_tensor_slices(x_train_binary)
    dataset_train_labels = tf.data.Dataset.from_tensor_slices(y_train)
    dcombined = tf.data.Dataset.zip((dataset_train, dataset_train_labels)).batch(32)

Example 12
# -*- encoding: utf-8 -*-
'''
Created on May 22, 2016

@author: LuoPei
'''

# Parallel processing of the whole feature matrix
from numpy import log1p
# from numpy import log
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import Binarizer
from sklearn.pipeline import FeatureUnion

# Create an object that applies a log(1+x) transform to the whole feature matrix
# step2_1=('ToLog',FunctionTransformer(loglp))
step2_1 = ('ToLog', FunctionTransformer(log1p))
# Create an object that binarizes the whole feature matrix
step2_2 = ('ToBinary', Binarizer())

# Create the overall parallel-processing object

# This object also has fit and transform methods; both call the fit/transform methods
# of the wrapped transformers in parallel.
# The transformer_list parameter is the list of transformers to run in parallel, given as
# (name, transformer) 2-tuples: the first element is the object's name, the second the object itself.
step2 = ('FeatureUnion', FeatureUnion(transformer_list=[step2_1, step2_2]))

if __name__ == "__main__":
    pass
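# Illustrative usage of the step2 union defined above (example data, not from the
# original): FeatureUnion concatenates the outputs of its transformers column-wise.
import numpy as np

X_demo = np.array([[0.0, 1.0],
                   [2.0, 3.0]])
union_name, union = step2
X_combined = union.fit_transform(X_demo)
# first two columns: log1p(X_demo); last two columns: Binarizer() output (x > 0 -> 1)
print(X_combined.shape)  # (2, 4)
print(X_combined)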

if __name__ == '__main__':

    #Import mnist dataset
    mnist = tf.keras.datasets.mnist

    #Split in test and train
    (x_train, y_train), (x_test, y_test) = mnist.load_data()

    #Scale entries between(0,1)
    x_train = x_train / 255
    x_test = x_test / 255

    #Binarize pictures
    binarizer = Binarizer(threshold=0.5)
    x_train_binary = np.array(
        [binarizer.fit_transform(slice) for slice in x_train])
    x_test_binary = np.array(
        [binarizer.fit_transform(slice) for slice in x_test])

    #reshape pictures to be vectors and fix datatype
    x_train_binary = x_train_binary.reshape(x_train_binary.shape[0],
                                            -1).astype(np.float32)
    x_test_binary = x_test_binary.reshape(x_test_binary.shape[0],
                                          -1).astype(np.float32)
    '''

    here = os.path.dirname(os.path.abspath(__file__))

    datah5 = dd.io.load(here + '/data/ising/ising_data_L32.hdf5')
Example 14
#---------------------------------------------------------------------------------------
#
#	Comment the section below out if you have already made the pickle files
#
#---------------------------------------------------------------------------------------

all_bigr = ngram(X_train, 'bigram') #starting with all features

print "Starting counting bigrams..."
X_train_bi_counted = count(X_train, all_bigr, 'bigram')
print "Done counting train set"
X_test_bi_counted = count(X_test, all_bigr, 'bigram')
print "Done counting test set"

print "Binarizing and dumping files"
bin = Binarizer()
X_train_bi_binary = bin.fit_transform(X_train_bi_counted)
X_test_bi_binary = bin.transform(X_test_bi_counted)
pickle.dump(X_train_bi_binary, open( "X_train_bi_binary.p", "wb" ) )
pickle.dump(X_test_bi_binary, open( "X_test_bi_binary.p", "wb" ) )
print "Done"


print "Starting tfidf vectors..."
X_train_bi_tfidf, X_test_bi_tfidf = tfidf(X_train_bi_counted, X_test_bi_counted)
pickle.dump(X_train_bi_tfidf, open( "X_train_bi_tfidf.p", "wb" ) )
pickle.dump(X_test_bi_tfidf, open( "X_test_bi_tfidf.p", "wb" ) )
print "Done"


print "Starting feature selection using CART random forests on binary files"
news_data = extracted_data.iloc[:, :-1]      # Take up to the second last column
news_labels = extracted_data[' shares']      # Take shares column for labels

# Data Preprocessing
news_data_transpose = news_data.transpose()
data_into_dict = news_data_transpose.to_dict()
list_data = [v for k, v in data_into_dict.iteritems()]

# Encode
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer()
transformed_data = dv.fit_transform(list_data).toarray()

# Label Encoder - Binarization
from sklearn.preprocessing import Binarizer
binarizer = Binarizer(threshold=1400)                           # Threshold at 1400 because median of shares is 1400
transformed_labels = binarizer.transform(news_labels)
transformed_labels = transformed_labels.transpose().ravel()     # .ravel() is to fix "Too many array indices error"
                                                                # Could be a scikit or pandas bug
############## Classification #################

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC

# Decision Tree Classifier
tree = DecisionTreeClassifier()
knn = KNeighborsClassifier()
gnb = GaussianNB()
Example 16
sd_scaler = StandardScaler()
X_sd_sc = sd_scaler.fit_transform(X) # ndarray
print(X_sd_sc)
# (3) Normalize data: well suited to sparse data.
# Each sample (row) is rescaled so that its norm becomes 1 ("unit vector" scaling).
# Useful for neural networks with weighted inputs and for distance-based KNN.
from sklearn.preprocessing import Normalizer # norm-based scaling
nm_scaler = Normalizer(norm='l2')
X_nm_sc = nm_scaler.fit_transform(X)
print(X_nm_sc)
# (4) Binarize data: produce crisp values or add attributes.
# Values above the threshold become 1, values at or below it become 0.
# Used when generating crisp values or adding new attributes.
from sklearn.preprocessing import Binarizer
bizer = Binarizer(threshold= 0.0)
X_b_sc = bizer.fit_transform(X)
print(X_b_sc)

#%% Chapter 9: Selecting data features
# Feature selection (4 methods)
# How to automatically select the data features used by a machine-learning model with sklearn
# Removing irrelevant and redundant features improves accuracy and reduces training time

#(1) Univariate feature selection
# =============================================================================
# Statistical tests can be used to pick the features that influence the outcome the most.
# sklearn provides the SelectKBest class, which can perform a chi-squared test.
# The chi-squared test measures the association between a categorical predictor and a
# categorical target: assuming the predictor takes N values and the target takes M values,
# it compares the observed and expected frequencies of the samples with x=i & y=j
# and builds the test statistic from those differences.
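# The SelectKBest / chi-squared code that the comments above describe is not included
# in this fragment; a minimal sketch on the iris data (assumed example):
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, chi2

iris = load_iris()
# keep the two features with the highest chi-squared score w.r.t. the target
X_selected = SelectKBest(chi2, k=2).fit_transform(iris.data, iris.target)
print(X_selected.shape)  # (150, 2)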
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Binarizer

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.8333333333333333
exported_pipeline = make_pipeline(
    Binarizer(threshold=0.1),
    LogisticRegression(C=20.0, dual=True, penalty="l2"))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example 18
# Binarization
from sklearn.preprocessing import Binarizer
import pandas as pd

filename = 'indians-diabetes.data.csv'
names = [
    'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'
]

dataframe = pd.read_csv(filename, names=names)
array = dataframe.values

# separate array into input and output components
X = array[:, 0:8]
Y = array[:, 8]

binarizer = Binarizer(threshold=5)

binaryX = binarizer.fit_transform(X)
# summarize transformed data
print(binaryX[0:30, :])
dataSizeStatisticsTest = zeros(numCombinations, dtype=float)

varSizeStatisticsTrain = zeros(numCombinations, dtype=float)
varSizeStatisticsTest = zeros(numCombinations, dtype=float)

a = 0

mnist = fetch_mldata('MNIST original')

# split a training set and a test set
y_train, y_test = mnist.target[:60000], mnist.target[60000:70000]

#vectorizer = CountVectorizer(binary=True)
X_both = mnist.data

binarizer = Binarizer(threshold=50).fit(X_both)  # threshold of 50 assumed; the source called fit(50, X_both)
X_both = binarizer.transform(X_both)

X_train = X_both[:60000]
X_test =  X_both[60000:70000]

#print X_train[1]

#ch2 = SelectKBest(chi2, 750)
#X_train = ch2.fit_transform(X_train, y_train)
#X_test = ch2.transform(X_test)

data_train = X_train
m,n = data_train.shape

print m," ",n
Example 20
def test_binarizer():
    X_ = np.array([[1, 0, 5], [2, 3, -1]])

    for init in (np.array, list, sparse.csr_matrix, sparse.csc_matrix):

        X = init(X_.copy())

        binarizer = Binarizer(threshold=2.0, copy=True)
        X_bin = toarray(binarizer.transform(X))
        assert_equal(np.sum(X_bin == 0), 4)
        assert_equal(np.sum(X_bin == 1), 2)
        X_bin = binarizer.transform(X)
        assert_equal(sparse.issparse(X), sparse.issparse(X_bin))

        binarizer = Binarizer(copy=True).fit(X)
        X_bin = toarray(binarizer.transform(X))
        assert_true(X_bin is not X)
        assert_equal(np.sum(X_bin == 0), 2)
        assert_equal(np.sum(X_bin == 1), 4)

        binarizer = Binarizer(copy=True)
        X_bin = binarizer.transform(X)
        assert_true(X_bin is not X)
        X_bin = toarray(X_bin)
        assert_equal(np.sum(X_bin == 0), 2)
        assert_equal(np.sum(X_bin == 1), 4)

        binarizer = Binarizer(copy=False)
        X_bin = binarizer.transform(X)
        if init is not list:
            assert_true(X_bin is X)
        X_bin = toarray(X_bin)
        assert_equal(np.sum(X_bin == 0), 2)
        assert_equal(np.sum(X_bin == 1), 4)

    binarizer = Binarizer(threshold=-0.5, copy=True)
    for init in (np.array, list):
        X = init(X_.copy())

        X_bin = toarray(binarizer.transform(X))
        assert_equal(np.sum(X_bin == 0), 1)
        assert_equal(np.sum(X_bin == 1), 5)
        X_bin = binarizer.transform(X)

    # Cannot use threshold < 0 for sparse
    assert_raises(ValueError, binarizer.transform, sparse.csc_matrix(X))
Example 21
           'LR': LogisticRegression,
           'LSVC' : LinearSVC,
           'SVC' : SVC
           }

#%%
    
os.chdir(workspace)

dev_idvs_all = numpy.nan_to_num(numpy.load(dev_filename + ".npy"))
val_idvs_all = numpy.nan_to_num(numpy.load(val_filename + ".npy"))

dev_dvs = numpy.nan_to_num(numpy.load(dev_filename + "_dvs.npy"))
val_dvs = numpy.nan_to_num(numpy.load(val_filename + "_dvs.npy"))

binarizer = Binarizer(copy=True, threshold=thresh)
imputer = Imputer(copy = False)

dev_dvs_binary = binarizer.transform(dev_dvs).reshape((dev_dvs.shape[0],))
val_dvs_binary = binarizer.transform(val_dvs).reshape((val_dvs.shape[0],))

"""
from statsmodels.regression import quantile_regression

dev_idvs2 = dev_idvs[:10000,:]
inds = [i for i in xrange(dev_idvs2.shape[1]) if len(unique(dev_idvs2[:,i])) > 1]
dev_dvs2 = dev_dvs[:10000,:].reshape((10000,))

model = quantile_regression.QuantReg(dev_dvs2, dev_idvs2)
model.fit()
"""
Example 22
class Binarizer(TransformerMixin):
    """
    Реализует различные стратегии бинаризации признаков,
    вычисляя оптимальные пороги и производя бинаризацию с данными порогами

    Аргументы:
    ----------
    method: str('random', 'log_odds' or 'bns'), метод бинаризации признаков
    divide_to_bins: bool(optional, default=True),
        индикатор приведения количественных признаков к целочисленным
    bins_number: int(optional, default=10),
        число возможных значений целочисленных признаков при бинаризации
    """
    _UNSUPERVISED_METHODS = ['random']
    _SUPERVISED_METHODS = ['log_odds', 'bns']
    _CONTINGENCY_METHODS = ['log_odds', 'bns']

    def __init__(self, method, divide_to_bins=True, bins_number=10):
        self.method = method
        self.divide_to_bins = divide_to_bins
        self.bins_number = bins_number

    def fit(self, X, y=None):
        """
        Fits the binarizer on the data.
        """
        # print("Fitting binarizer...")
        methods = Binarizer._UNSUPERVISED_METHODS + Binarizer._SUPERVISED_METHODS
        if self.method not in methods:
            raise ValueError("Method should be one of {0}".format(", ".join(methods)))
        X = check_array(X, accept_sparse=['csr', 'csc'])
        if issparse(X):
            X = X.tocsc()
        if self.method in Binarizer._UNSUPERVISED_METHODS:
            self._fit_unsupervised(X)
            self.joint_thresholds_ = self.thresholds_
            self.joint_scores_ = self.scores_
        else:
            if y is None:
                raise ValueError("y must not be None for supervised binarizers.")
            # TODO: move this into a separate function
            # y = np.array(y)
            # if len(y.shape) == 1:
            #     self.classes_, y = np.unique(y, return_inverse=True)
            #     nclasses = self.classes_.shape[0]
            #     Y_new = np.zeros(shape=(y.shape[0], nclasses), dtype=int)
            #     Y_new[np.arange(y.shape[0]), y] = 1
            # else:
            #     self.classes_ = np.arange(y.shape[1])
            #     Y_new = y
            label_binarizer = SK_LabelBinarizer()
            Y_new = label_binarizer.fit_transform(y)
            self.classes_ = label_binarizer.classes_
            if X.shape[0] != Y_new.shape[0]:
                raise ValueError("X and y have incompatible shapes.\n"
                                 "X has %s samples, but y has %s." %
                                 (X.shape[0], Y_new.shape[0]))
            self._fit_supervised(X, Y_new)
            if len(self.classes_) <= 2:
                self.joint_thresholds_ = self.thresholds_[:, 0]
                self.joint_scores_ = self.scores_[:, 0]
            else:
                min_class_scores = np.min(self.scores_, axis=0)
                max_class_scores = np.max(self.scores_, axis=0)
                diffs = max_class_scores - min_class_scores
                diffs[np.where(diffs == 0)] = 1
                normalized_scores = (self.scores_ - min_class_scores) / diffs
                # for each feature, find the class for which it is most useful
                # THIS COULD PROBABLY BE DONE DIFFERENTLY
                optimal_indexes = np.argmax(normalized_scores, axis=1)
                nfeat = self.thresholds_.shape[0]
                # as the binarization threshold of each feature, take the value
                # for the class where that feature is most useful
                self.joint_thresholds_ = self.thresholds_[np.arange(nfeat), optimal_indexes]
                self.joint_scores_ = self.scores_[np.arange(nfeat), optimal_indexes]
        # pass the thresholds on to sklearn's Binarizer (SK_Binarizer)
        self.binarize_transformer_ = SK_Binarizer(self.joint_thresholds_)
        return self

    def transform(self, X):
        """
        Applies the binarizer to the data.
        """
        print("Transforming binarizer...")
        if hasattr(self, 'binarize_transformer_'):
            return self.binarize_transformer_.transform(X)
        else:
            raise ValueError("Transformer is not fitted")

    def _fit_unsupervised(self, X):
        """
        Dispatcher function for unsupervised threshold-selection methods.
        """
        if self.method == 'random':
            # random thresholds and usefulness scores
            if issparse(X):
                minimums = X.min(axis=0).toarray()
                maximums = X.max(axis=0).toarray()
            else:
                minimums = np.min(X, axis=0)
                maximums = np.max(X, axis=0)
            random_numbers = np.random.rand(X.shape[1], 1).reshape((X.shape[1],))
            self.thresholds_ = minimums + (maximums - minimums) * random_numbers
            self.scores_ = np.random.rand(X.shape[1], 1).reshape((X.shape[1],))
        return self

    def _fit_supervised(self, X, y):
        """
        Performs supervised threshold selection.
        """
        # convert X to integer-valued bins if needed
        if self.divide_to_bins:
            bin_divider = BinDivider(bins_number=self.bins_number)
            X = bin_divider.fit_transform(X)
        thresholds, scores = [], []
        for i in range(X.shape[1]):
            threshold, score = self._find_optimal_thresholds(X[:, i], y)
            thresholds.append(threshold)
            scores.append(score)
        self.thresholds_ = np.asarray(thresholds, dtype=np.float64)
        self.scores_ = np.asarray(scores, dtype=np.float64)
        return self

    def _find_optimal_thresholds(self, column, y):
        """
        Computes the thresholds for binarization.

        Arguments:
        ----------
        column: array-like, shape=(nobj,), the column of feature values
        y: array-like, shape=(nobj, nclasses), 0/1 class-membership matrix
        """
        classes_number = y.shape[1]
        # compute how often each feature value occurs in each class
        values, counts = \
            _collect_column_statistics(column, y, classes_number=classes_number, precision=6)
        if self.method in Binarizer._CONTINGENCY_METHODS:
            # binary classification
            if classes_number <= 2:
                counts = [counts]
            else:
                summary_counts = np.sum(counts, axis=1)
                counts = [np.array((summary_counts - counts[:, i], counts[:, i])).T
                          for i in np.arange(classes_number)]
            best_thresholds = [None] * len(counts)
            best_scores = [None] * len(counts)
            for i in np.arange(len(counts)):
                current_thresholds, current_tables = \
                    _collect_contingency_tables(values, counts[i])
                if self.method == "log_odds":
                    func = (lambda x: odds_ratio(x, alpha=0.1))
                elif self.method == 'information_gain':
                    func = information_gain
                elif self.method == 'bns':
                    func = bns
                else:
                    raise ValueError("Wrong binarization method: {0}".format(self.method))
                scores = [func(table) for table in current_tables]
                best_score_index = np.argmax(scores)
                best_thresholds[i] = current_thresholds[best_score_index]
                best_scores[i] = scores[best_score_index]
        return best_thresholds, best_scores
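# Minimal usage sketch for the class above (illustrative, assuming the imports the class
# relies on -- TransformerMixin, check_array, issparse, SK_LabelBinarizer, SK_Binarizer --
# are available). Only the unsupervised 'random' strategy is exercised here, because the
# supervised paths depend on helpers (BinDivider, odds_ratio, bns,
# _collect_column_statistics) that are not shown in this example.
import numpy as np

X_demo = np.random.rand(100, 5)
binarizer = Binarizer(method='random')
X_bin = binarizer.fit(X_demo).transform(X_demo)
print(binarizer.joint_thresholds_)  # one randomly chosen threshold per feature
print(X_bin[:3])                    # entries are 0/1 after thresholding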
Example 23
train.isnull().sum()

sns.distplot(train['Upvotes']); 
 
plt.show()

train = train.drop(train[train.Views > 3000000].index)

labelencoder_X = LabelEncoder()
train['Tag'] = labelencoder_X.fit_transform(train['Tag'])
train.drop(['ID','Username'], axis=1,inplace =True)
target = train['Upvotes']

from sklearn.preprocessing import Binarizer
bn = Binarizer(threshold=7)
pd_watched = bn.transform([train['Answers']])[0]
train['pd_watched'] = pd_watched

train.head()

feature_names = [x for x in train.columns if x not in ['Upvotes']]

x_train, x_val, y_train, y_val = train_test_split(train[feature_names], target,test_size = 0.22,random_state =205)
sc_X = StandardScaler()
x_train = sc_X.fit_transform(x_train)
x_val = sc_X.transform(x_val)

poly_reg = PolynomialFeatures(degree = 4,interaction_only=False, include_bias=True)
X_poly = poly_reg.fit_transform(x_train)
poly_reg.fit(x_train, y_train)
Example 24
from sklearn.kernel_approximation import RBFSampler, Nystroem
from sklearn.cluster import FeatureAgglomeration
from sklearn.feature_selection import SelectFwe, SelectKBest, SelectPercentile, VarianceThreshold
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.ensemble import ExtraTreesClassifier

from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, f1_score
from tpot_metrics import balanced_accuracy_score
from sklearn.pipeline import make_pipeline
import itertools

dataset = sys.argv[1]

preprocessor_list = [Binarizer(), MaxAbsScaler(), MinMaxScaler(), Normalizer(),
                     PolynomialFeatures(), RobustScaler(), StandardScaler(),
                     FastICA(), PCA(), RBFSampler(), Nystroem(), FeatureAgglomeration(),
                     SelectFwe(), SelectKBest(), SelectPercentile(), VarianceThreshold(),
                     SelectFromModel(estimator=ExtraTreesClassifier(n_estimators=100)),
                     RFE(estimator=ExtraTreesClassifier(n_estimators=100))]

# Read the data set into memory
input_data = pd.read_csv(dataset, compression='gzip', sep='\t').sample(frac=1., replace=False, random_state=42)

with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    for (preprocessor, learning_rate, n_estimators, max_depth) in itertools.product(
                preprocessor_list,
                [0.01, 0.1, 0.5, 1.0, 10.0, 50.0, 100.0],
Example 25
from sklearn.cluster import FeatureAgglomeration
from sklearn.feature_selection import SelectFwe, SelectKBest, SelectPercentile, VarianceThreshold
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, f1_score
from tpot_metrics import balanced_accuracy_score
from sklearn.pipeline import make_pipeline
import itertools

dataset = sys.argv[1]

preprocessor_list = [
    Binarizer(),
    MaxAbsScaler(),
    MinMaxScaler(),
    Normalizer(),
    PolynomialFeatures(),
    RobustScaler(),
    StandardScaler(),
    FastICA(),
    PCA(),
    RBFSampler(),
    Nystroem(),
    FeatureAgglomeration(),
    SelectFwe(),
    SelectKBest(),
    SelectPercentile(),
    VarianceThreshold(),
    # print('loaded_text', loaded_text)
    X, y = np.hsplit(loaded_text,  [-1])
    y = y.flatten()
    return X, y

def get_column_names(path):
    with open(path) as fp:
        header = fp.readline().split(',')#[1:-1]
    return header

X, y = get_training_data(data_path)
letter_names = X[:, 0].reshape(-1, 1)
letter_sounds = X[:, 1].reshape(-1, 1)

# Binarize labels
y = Binarizer(threshold=fail_threshold).transform(y.reshape(1, -1))[0]

reading_data = (X, y)

datasets = [
    reading_data
]

# points where we want ticks, as well as the label for that tick
ticks = [
    [0, 0],
    [13, 7],
    [26, 13],
    [39, 20],
    [52, 26]
]
Example 27
from sklearn.neural_network import MLPClassifier

Data_reader = csv_reader.CsvReader('../DataSet')
Data_writer = csv_reader.CsvReader('../output')

if __name__ == '__main__':

    train_data = Data_reader.read_data('train.csv')
    test_data = Data_reader.read_data('test.csv')

    # X_train = train_data.iloc[:, 1:]
    # Y_train = train_data.iloc[:, 0]
    # X_test = test_data

    train_data_bin = Binarizer(threshold=127).fit_transform(train_data)
    X_test_bin = Binarizer(threshold=127).fit_transform(test_data)
    X_train = pd.DataFrame(train_data_bin).iloc[:, 1:]
    Y_train = train_data.iloc[:, 0]
    X_test = pd.DataFrame(X_test_bin)

    train_images, vali_images, train_labels, vali_labels = \
        train_test_split(X_train, Y_train, train_size=0.95,random_state=1)
    print('start predict')

    predict = MLPClassifier(solver='lbfgs',
                            alpha=1e-5,
                            hidden_layer_sizes=(100, ),
                            random_state=5)

    predict.fit(train_images, train_labels)
Example 28

# In[3]:

# Import csv data
raw_data = pd.read_csv('OnlineNewsPopularity_wLabels_deleteNoise.csv').iloc[:, 1:]      # read in csv, omit the first column of url
raw_data = raw_data.iloc[:, :-1] 
news_data = raw_data.iloc[:, :-1]      # Take up to the second last column
news_labels = raw_data.iloc[:, -1]      # Take shares column for labels

# Binarize
print '\nBinary Threshold:'
binary_threshold = np.median(raw_data[' shares'])
news_data = news_data.drop(' n_non_stop_words', 1)
print binary_threshold
binarizer = Binarizer(threshold=binary_threshold)
y_binary = binarizer.transform(news_labels).transpose().ravel() 


# In[ ]:

# Discretize


# In[25]:

# Decision Tree
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier()
print 'Decision Tree Classifier Accuracy Rate'
tree_score = cross_val_score(tree, news_data, y_binary, cv=10)
import deepdish as dd
from bokeh.plotting import figure
from bokeh.io import export_svgs

machine = RBM(32 * 32, 600, 100, (32, 32), 32, 'cd')
machine.from_saved_model(
    '/cluster/home/fdangelo/Restricted-Boltzmann-Machines/logs/scalars/1604-004823'
)
datah5 = dd.io.load(
    '/Users/fdangelo/PycharmProjects/myRBM/data/ising/ising_data_complete.hdf5'
)
data_bin = {}
datah5_norm = {}
#Take spin up as standard configuration
keys = list(datah5.keys())
binarizer = Binarizer(threshold=0)
for key in keys:
    datah5_norm[key] = np.array(
        [np.where(np.sum(slice) < 0, -slice, slice) for slice in datah5[key]])
    data_bin[key] = np.array([
        binarizer.fit_transform(slice) for slice in datah5_norm[key]
    ]).reshape(datah5_norm[key].shape[0], -1).astype(np.float32)

magn_key_mean = []
for key in keys:
    print(data_bin[key].shape[0])
    magn_23 = np.array(
        [np.mean(data_bin[key][i]) for i in range(data_bin[key].shape[0])])
    magn_key_mean.append(np.mean(magn_23))

steps = [200, 1000, 10000, 100000, 1000000]
Example 30
# binarization
from sklearn.preprocessing import Binarizer
import pandas
import numpy
url = "https://goo.gl/vhm1eU"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 
dataframe = pandas.read_csv(url, names=names)
array = dataframe.values
# separate array into input and output components
X = array[:,0:8]
Y = array[:,8]
binarizer = Binarizer(threshold=0.0).fit(X)
binaryX = binarizer.transform(X)
# summarize transformed data
numpy.set_printoptions(precision=3)
print(binaryX[0:5,:])
Example 31
	try:
		hub_ego=nx.ego_graph(Gsim,head_node,radius = 1) # step 1 only
	except NameError:
		head_node = each[0]
		hub_ego=nx.ego_graph(Gsim,head_node,radius = 1)

	index = hub_ego.nodes()
#	pec = random.uniform(0.5,0.8) # percentage of nodes selected between [0.5,0.8]
	pec = 0.8
	random.shuffle(index)
	subidx = index[:int(pec*len(index))]
	Y = np.zeros(num_sample)
	Y[::5] += 3 * (0.5-np.random.rand(num_sample/5)) # add noise to targets
	for each in subidx[1:]:
		Y += np.power(data_mat[:,each],3)
	binarizer = Binarizer()
	label = binarizer.transform(Y)

	# output the gene expression matrix
	ofp = open('nonlinear2.'+str(i)+'.genemat','w')
	for each in sorted(Gsim.nodes()):
		print >> ofp, str(each)+'\t'+'\t'.join(map(str,data_mat[:,each]))
	print >> ofp, 'outcome\t'+'\t'.join(map(str,label))
	ofp.close()
	#print 'significant network',index
	nx.write_adjlist(Gsim,"nonlinear2."+str(i)+".adjlist")
	os.system('epd_python svmnet.py -n nonlinear2.'+str(i)+'.adjlist -g nonlinear2.'+str(i)+'.genemat -o nonlinear2.svm.'+str(i)+'.txt -s 0')
#	os.system('epd_python ../rfnet.py -n nonlinear2.adjlist -g nonlinear2.genemat -o nonlinear2.rf.txt -s 0 -r 20')
	os.system('epd_python knnnet.py -n nonlinear2.'+str(i)+'.adjlist -g nonlinear2.'+str(i)+'.genemat -o nonlinear2.knn.'+str(i)+'.txt -s 0')
	svm_count += count_net('nonlinear2.svm.'+str(i)+'.txt',index)
	#rf_count += count_net('nonlinear2.rf.txt',index)
_, n_features = X.get_shape()

print('Loading test data...')
with open('data/test-svmlight.dat') as infile:
	lines = infile.readlines()
	n_samples = len(lines)
	test = lil_matrix((n_samples, n_features))
	for n,line in enumerate(lines):
		for word_count in line.split():
			fid, count = word_count.split(':')
			test[n, int(fid)] = int(count)
test = test.tocsr()

if opts.binarize:
	print('Binarizing the data...')
	binar = Binarizer(copy=False)
	X = binar.transform(X)
	test = binar.transform(test)

if opts.tfidf:
	print('Transforming word occurrences into TF-IDF...')
	tranny = TfidfTransformer()
	X = tranny.fit_transform(X)
	test = tranny.transform(test)

if opts.select_features:
	k_features = int(opts.k_features)
	if opts.select_features == 'k-best':
		print('Selecting %i best features...' % k_features)
		ch2 = SelectKBest(chi2, k=k_features)
	if opts.select_features == 'pct':
Example 33
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Binarizer
from tpot.builtins import DatasetSelector

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=71)

# Average CV score on the training set was:0.7526956521739131
exported_pipeline = make_pipeline(
    DatasetSelector(sel_subset=4, subset_list="module23.csv"),
    Binarizer(threshold=0.30000000000000004),
    RandomForestClassifier(bootstrap=True,
                           criterion="entropy",
                           max_features=0.25,
                           min_samples_leaf=5,
                           min_samples_split=4,
                           n_estimators=100))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example 34
            except NameError:
                head_node = each[0]
                hub_ego = nx.ego_graph(Gsim, head_node, radius=1)
            index = shuffle(hub_ego.nodes(), random_state=random_state)
            head_set = set([head_node])
            subidx = set(index[:int(pec * len(index))])
            if head == 'in':
                subidx |= head_set
            else:
                subidx -= head_set
            Y = np.zeros(num_sample)
            Y[::5] += 3 * (0.5 - np.random.rand(num_sample / 5)
                           )  # add noise to targets
            for each in subidx:
                Y += np.power(data_mat[:, each], 3)
            binarizer = Binarizer()
            label = binarizer.transform(Y)

            # output the gene expression matrix
            ofp = open('nonlinear.genemat', 'w')
            for each in sorted(Gsim.nodes()):
                print >> ofp, str(each) + '\t' + '\t'.join(
                    map(str, data_mat[:, each]))
            print >> ofp, 'outcome\t' + '\t'.join(map(str, label))
            ofp.close()

            # output the network adjlist file
            nx.write_adjlist(Gsim, "nonlinear.adjlist")

            os.system(
                'epd_python egonet.py -n nonlinear.adjlist -g nonlinear.genemat -o nonlinear.egonet.txt -s 0'
Example 35
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Binarizer

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=None)

# Average CV score on the training set was:0.518333333333
exported_pipeline = make_pipeline(
    Binarizer(threshold=0.45),
    GradientBoostingClassifier(learning_rate=1.0, max_depth=7, max_features=0.55, min_samples_leaf=1, min_samples_split=13, n_estimators=100, subsample=0.35)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example 36
def solucao_casao():
    verilog = open('pla_all_training.v', 'w')

    i, o, p = read_iop("PLA_dump/pla_cifar10_chunk_500_train.pla")
    ts = tt2df("PLA_dump/pla_cifar10_chunk_500_train.pla")
    vs = tt2df("PLA_dump/pla_cifar10_chunk_499_valid.pla")

    print(ts.iloc[:, -10:].values)

    binarizer = Binarizer(threshold=0.0).fit(ts)
    binary_train = binarizer.transform(ts)

    binarizer = Binarizer(threshold=0.0).fit(vs)
    binary_valid = binarizer.transform(vs)

    print(type(ts))
    print(type(binary_train))

    ts = pd.DataFrame(binary_train)
    vs = pd.DataFrame(binary_valid)

    # Split features and the target variable
    X_train = ts.iloc[:, :-10].values
    y_train = ts.iloc[:, -10:].values
    X_val = vs.iloc[:, :-10].values
    y_val = vs.iloc[:, -10:].values

    print(X_train.shape)
    print(y_train.shape)
    print(type(y_train[0]))
    print(type(y_train[0][0]))
    print(y_train[0][0])
    print(y_train[1][0])
    print(y_train[2][0])
    print(y_train[3][0])
    print(y_train)

    # Definition of the classifier
    clf = DecisionTreeClassifier(
        random_state=9856230,
        criterion='gini',
        max_depth=18,
    )

    # Training and validation of the classifier
    clf.fit(X_train, y_train)
    y_predicted = clf.predict(X_val)
    y_predicted_training = clf.predict(X_train)

    # Generate a Verilog description of the classifier.
    # TODO: create a function out of this code snippet.
    n_nodes = clf.tree_.node_count
    children_left = clf.tree_.children_left
    children_right = clf.tree_.children_right
    feature = clf.tree_.feature
    threshold = clf.tree_.threshold

    # print("n_nodes: %s" % str(n_nodes))
    # print("children_left: %s" % str(children_left))
    # print("children_right: %s" % str(children_right))
    # print("feature : %s" % str(feature))
    # print("threshold : %s" % str(threshold))

    node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
    is_leaves = np.zeros(shape=n_nodes, dtype=bool)
    stack = [(0, 0)]
    while len(stack) > 0:
        node_id, depth = stack.pop()
        node_depth[node_id] = depth
        is_split_node = children_left[node_id] != children_right[node_id]
        if is_split_node:
            stack.append((children_left[node_id], depth + 1))
            stack.append((children_right[node_id], depth + 1))
        else:
            is_leaves[node_id] = True

    verilog.write('module top (')
    verilog.write('\t' + ', '.join(['x{}'.format(v)
                                    for v in np.arange(0, i)]) + ', ')
    # verilog.write('\t' + ', '.join(['y{}'.format(v) for v in np.arange(0, o)]) + 'y);\n')
    verilog.write(', '.join(['y{}'.format(v)
                             for v in np.arange(0, o)]) + ');\n')
    # verilog.write('\t' + 'y);\n')
    verilog.write('input ' +
                  ', '.join(['x{}'.format(v)
                             for v in np.arange(0, i)]) + ';\n')
    verilog.write('output ' +
                  ', '.join(['y{}'.format(v)
                             for v in np.arange(0, o)]) + ';\n')
    # verilog.write('output y;\n')
    verilog.write('wire ' +
                  ', '.join(['n{}'.format(v)
                             for v in np.arange(0, n_nodes)]) + ';\n')
    for i in range(n_nodes):
        if is_leaves[i]:
            verilog.write('assign n{node} = {out_class};\n'.format(
                node=i, out_class=np.argmax(clf.tree_.value[i])))
        else:
            verilog.write(
                'assign n{node} = x{feature} ^ 1\'b1 ? n{left} : n{right};\n'.
                format(node=i,
                       feature=feature[i],
                       left=children_left[i],
                       right=children_right[i]))
    verilog.write('assign y = n0;\n')
    verilog.write('endmodule')
    verilog.close()

    with open("tree_test.tree", "w") as arquivo:
        arquivo.write(tree.export_text(clf, max_depth=1000))
Example 37
def test_fit_transform():
    X = np.random.random((5, 4))
    for obj in ((Scaler(), Normalizer(), Binarizer())):
        X_transformed = obj.fit(X).transform(X)
        X_transformed2 = obj.fit_transform(X)
        assert_array_equal(X_transformed, X_transformed2)
        with tf.name_scope('stddev'):
            stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
        tf.summary.scalar('stddev', stddev, step)
        tf.summary.scalar('max', tf.reduce_max(var), step)
        tf.summary.scalar('min', tf.reduce_min(var), step)
        tf.summary.histogram('histogram', var, step = step)

if __name__ == '__main__':
    #Load and preprocess data
    here = os.path.dirname(os.path.abspath(__file__))

    datah5 = dd.io.load('/cluster/scratch/fdangelo/RBM/data/ising/ising_data_L32_large.hdf5')


    # Transform -1 in 0 and take spin up as standard configuration
    binarizer = Binarizer(threshold=0)
    keys = list(datah5.keys())
    # put here the temperature from keys that you want to use for the training
    #class_names = [keys[i] for i in [4, 6, 7, 8, 9, 10, 11, 12, 16]]
    class_names = [keys[i] for i in [6,8,9,10,11,12,16]]
    n_samples = datah5[keys[0]].shape[0]
    datah5_norm = {}
    data_bin = {}
    for key in class_names:
      if key==keys[9]:
        datah5_norm[key] = np.array([np.where(np.sum(slice)<0,-slice,slice) for slice in datah5[key]])
        data_bin[key] = np.array([binarizer.fit_transform(slice) for slice in datah5_norm[key]])
      else:
        #datah5_norm[key] = np.array([np.where(np.sum(slice)<0,-slice,slice) for slice in datah5[key]])
        data_bin[key] = np.array([binarizer.fit_transform(slice) for slice in datah5[key]])
Example 39
def testPreProc():

    iris = load_iris()
    # Rescaling brings data of different magnitudes onto a common scale; the usual
    # methods are standardization and interval (min-max) scaling.
    # Standardization works column-wise on the feature matrix: it converts feature
    # values to z-scores so that all features share the same scale.
    # Normalization works row-wise: its goal is that sample vectors share a common
    # standard (i.e. become "unit vectors") when computing dot products or other
    # kernel similarities.
    from sklearn.preprocessing import StandardScaler
    from sklearn.preprocessing import MinMaxScaler
    MinMaxScaler().fit_transform(iris.data)
    StandardScaler().fit_transform(iris.data)

    # Binarization with the threshold set to 3; returns the binarized data
    from sklearn.preprocessing import Binarizer
    Binarizer(threshold=3).fit_transform(iris.data)

    # Dummy (one-hot) encoding of the IRIS target values; returns the encoded data (note: the input must be 2D)
    # OneHotEncoder(sparse = False).fit_transform( testdata[['age']] )
    from sklearn.preprocessing import OneHotEncoder
    OneHotEncoder().fit_transform(iris.target.reshape((-1, 1)))

    # String-valued categorical variables can first be converted to numbers with LabelEncoder
    # and then encoded with OneHotEncoder.
    # Note that LabelEncoder works on 1D input while OneHotEncoder expects 2D.
    from sklearn.preprocessing import LabelEncoder
    LabelEncoder().fit_transform(iris.data[""])

    # Missing-value imputation; returns the data after imputation.
    # The missing_values parameter gives the representation of missing values (default NaN).
    # The strategy parameter sets how values are filled in (default 'mean').
    from numpy import vstack, array, nan
    from sklearn.preprocessing import Imputer
    Imputer().fit_transform(vstack((array([nan, nan, nan, nan]), iris.data)))

    # Data transformations
    # Polynomial transformation
    from sklearn.preprocessing import PolynomialFeatures  # polynomial expansion; the degree parameter defaults to 2
    PolynomialFeatures().fit_transform(iris.data)
    # Custom transformation using the log function; the first argument is a univariate function
    from numpy import log1p
    from sklearn.preprocessing import FunctionTransformer
    FunctionTransformer(log1p).fit_transform(iris.data)

    # Feature selection: filter methods
    # Variance threshold; returns the data after selection. The threshold parameter is the variance cutoff.
    from sklearn.feature_selection import VarianceThreshold
    VarianceThreshold(threshold=3).fit_transform(iris.data)
    # Select the K best features and return the data after selection.
    # The first argument is a scoring function that takes the feature matrix and the
    # target vector and returns an array of (score, p-value) pairs, where the i-th
    # entry is the score and p-value of the i-th feature. Here it computes the
    # correlation coefficient.
    # The second argument k is the number of features to select.
    from sklearn.feature_selection import SelectKBest
    from scipy.stats import pearsonr
    # scoring function: Pearson correlation coefficient
    SelectKBest(lambda X, Y: array(map(lambda x: pearsonr(x, Y), X.T)).T,
                k=2).fit_transform(iris.data, iris.target)
    # scoring function: chi-squared test
    from sklearn.feature_selection import chi2
    SelectKBest(chi2, k=2).fit_transform(iris.data, iris.target)
    # Mutual-information method
    # from minepy import MINE
    # # MINE is not designed in a functional style, so the mic function wraps it as one,
    # # returning a pair whose second item is a fixed p-value of 0.5
    # def mic(x, y):
    #     m = MINE()
    #     m.compute_score(x, y)
    #     return (m.mic(), 0.5)
    # SelectKBest(lambda X, Y: array(map(lambda x: mic(x, Y), X.T)).T, k=2).fit_transform(iris.data, iris.target)

    # Feature selection: wrapper methods
    # Recursive feature elimination; returns the data after selection.
    # The estimator parameter is the base model.
    # The n_features_to_select parameter is the number of features to keep.
    from sklearn.feature_selection import RFE
    from sklearn.linear_model import LogisticRegression
    RFE(estimator=LogisticRegression(),
        n_features_to_select=2).fit_transform(iris.data, iris.target)

    # Feature selection: embedded methods
    # Using a penalized base model both selects features and reduces dimensionality.
    # Use feature_selection's SelectFromModel together with an L1-penalized logistic regression to select features:
    from sklearn.feature_selection import SelectFromModel
    from sklearn.linear_model import LogisticRegression
    # Feature selection with L1-penalized logistic regression as the base model
    SelectFromModel(LogisticRegression(penalty="l1", C=0.1)).fit_transform(
        iris.data, iris.target)

    # L1-based dimensionality reduction keeps only one of several features that are equally
    # correlated with the target, so an unselected feature is not necessarily unimportant.
    # It can therefore be combined with an L2 penalty: for a feature with L1 weight 1, collect
    # the features whose L2 weights are similar but whose L1 weights are 0 into one group,
    # and split the L1 weight evenly across that group.

    # Feature selection with GBDT as the base model
    from sklearn.ensemble import GradientBoostingClassifier
    SelectFromModel(GradientBoostingClassifier()).fit_transform(
        iris.data, iris.target)
# Load the data
from sklearn.datasets import load_iris
iris = load_iris()
iris.data
iris.target
# Data preprocessing
  # Standardization
from sklearn.preprocessing import StandardScaler
StandardScaler().fit_transform(iris.data)
  # Interval scaling / normalization
from sklearn.preprocessing import MinMaxScaler
MinMaxScaler().fit_transform(iris.data)

  # Binarize quantitative features
from sklearn.preprocessing import Binarizer
Binarizer(threshold=3).fit_transform(iris.data)
  # Dummy-encode qualitative features

  #When a qualitative predictor has more than two levels, a single dummy
#variable cannot represent all possible values. In this situation, we can create
#additional dummy variables. For example, for the ethnicity variable we
#create two dummy variables. The first could be equation 3.28 in AN INTRODUCTION TO STATISTICAL LEARNING;
  #and the second could be equation 3.29 in AN INTRODUCTION TO STATISTICAL LEARNING
from sklearn.preprocessing import OneHotEncoder
OneHotEncoder().fit_transform(iris.target.reshape((-1,1)))

  # Handle missing data
from sklearn.preprocessing import Imputer
from numpy import vstack, array, nan
  # Missing-value imputation; returns the data after imputation
  # The missing_values parameter gives the representation of missing values (default NaN)
    FunctionTransformer,
    KBinsDiscretizer,
    MinMaxScaler,
    Normalizer,
    OneHotEncoder,
    OrdinalEncoder,
    PolynomialFeatures,
    PowerTransformer,
    StandardScaler,
)

from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

_transformers = [
    Binarizer(threshold=2),
    KBinsDiscretizer(n_bins=3, encode="ordinal"),
    StandardScaler(),
    MinMaxScaler(),
    Normalizer(),
    PowerTransformer(),
    FunctionTransformer(np.log, validate=True),
    OrdinalEncoder(),
]

_selectors = [
    SelectFromModel(Lasso(random_state=1)),
    SelectKBest(f_regression, k=2),
    VarianceThreshold(),
    RFE(Lasso(random_state=1)),
]
Example 42
# In[ ]:

model = build_neural_network()
restorer = tf.train.Saver()
with tf.Session() as sess:
    restorer.restore(sess, "./titanic.ckpt")
    feed = {model.inputs: test_data, model.is_training: False}
    test_predict = sess.run(model.predicted, feed_dict=feed)

test_predict[:10]

# In[ ]:

from sklearn.preprocessing import Binarizer

binarizer = Binarizer(0.5)
test_predict_result = binarizer.fit_transform(test_predict)
test_predict_result = test_predict_result.astype(np.int32)
test_predict_result[:10]

# In[ ]:

passenger_id = test_passenger_id.copy()
evaluation = passenger_id.to_frame()
evaluation["Survived"] = test_predict_result
evaluation[:10]

# In[ ]:

evaluation.to_csv("evaluation_submission.csv", index=False)
Example 43
def test_binarizer():
    X_ = np.array([[1, 0, 5], [2, 3, 0]])

    for init in (np.array, sp.csr_matrix):

        X = init(X_.copy())

        binarizer = Binarizer(threshold=2.0, copy=True)
        X_bin = toarray(binarizer.transform(X))
        assert_equal(np.sum(X_bin == 0), 4)
        assert_equal(np.sum(X_bin == 1), 2)

        binarizer = Binarizer(copy=True).fit(X)
        X_bin = toarray(binarizer.transform(X))
        assert_true(X_bin is not X)
        assert_equal(np.sum(X_bin == 0), 2)
        assert_equal(np.sum(X_bin == 1), 4)

        binarizer = Binarizer(copy=True)
        X_bin = binarizer.transform(X)
        assert_true(X_bin is not X)
        X_bin = toarray(X_bin)
        assert_equal(np.sum(X_bin == 0), 2)
        assert_equal(np.sum(X_bin == 1), 4)

        binarizer = Binarizer(copy=False)
        X_bin = binarizer.transform(X)
        assert_true(X_bin is X)
        X_bin = toarray(X_bin)
        assert_equal(np.sum(X_bin == 0), 2)
        assert_equal(np.sum(X_bin == 1), 4)
Example 44
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Binarizer

eider.s3.download('s3://eider-datasets/mlu/Book_Ratings.csv',
                  '/tmp/Book_Ratings.csv')
data = pd.read_csv('/tmp/Book_Ratings.csv',
                   dtype={
                       'User': str,
                       'ASIN': str,
                       'Rating': np.int
                   })
train, test = train_test_split(data,
                               random_state=8675309,
                               stratify=data['ASIN'])

binarizer = Binarizer(threshold=0, copy=True)

S_train = train.pivot_table(index='User',
                            columns='ASIN',
                            values='Rating',
                            fill_value=0).as_matrix()
R_train = binarizer.fit_transform(S_train)
S_test = test.pivot_table(index='User',
                          columns='ASIN',
                          values='Rating',
                          fill_value=0).as_matrix()
R_test = binarizer.fit_transform(S_test)

uniqueUsers = data['User'].unique().tolist()
uniqueASINs = data['ASIN'].unique().tolist()
numUser = len(uniqueUsers)
Example 45
x_all = np.delete(data_train, 0, 1)  # drop the first column

#%% Pre-processing on X training data

#note that these are four methods we tried to normalize the data
#only a single methods was used at a time

# (1) normalize data (TFIDF)
tfdif = TfidfTransformer(norm='l1')
x_tfdif = tfdif.fit_transform(x_all)

# (2) log normalization
x_log = np.log(1 + x_all)

# (3) binary normalization (convert all non-zero entries to 1)
binar = Binarizer()
x_bin = binar.fit_transform(x_all)

# (4) normalize w.r.t each feature
normal = Normalizer(norm='l2')
x_normal = normal.fit_transform(x_all)

#generate a test-train split for validation (does not mean cross-validation)
# note that the random state is set for exact recall
x_train0, x_test0, y_train0, y_test0 = train_test_split(x_all,
                                                        y_all,
                                                        test_size=0.05,
                                                        random_state=0)

#%% Utility function to report best scores
# # Binarization

# In[6]:

watched = np.array(popsong_df['listen_count']) 
watched[watched >= 1] = 1
popsong_df['watched'] = watched
popsong_df.head(10)


# In[7]:

from sklearn.preprocessing import Binarizer

bn = Binarizer(threshold=0.9)
pd_watched = bn.transform([popsong_df['listen_count']])[0]
popsong_df['pd_watched'] = pd_watched
popsong_df.head(11)
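
# A small hedged check (toy counts, not popsong_df): for non-negative listen
# counts the boolean masking above and Binarizer(threshold=0.9) agree, since
# any count of 1 or more exceeds 0.9.
counts = np.array([[0, 1, 4, 0]])
print(bn.transform(counts))          # [[0 1 1 0]]
print((counts >= 1).astype(int))     # [[0 1 1 0]]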


# # Rounding

# In[8]:

items_popularity = pd.read_csv('datasets/item_popularity.csv', encoding='utf-8')
items_popularity


# In[9]:
Esempio n. 47
0
from Models import InteractionFeatures, Model, Bounder, RemoveDuplicateCols, ReturnSame, f1, lad

from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor

#%%

os.chdir(workspace)

logging.config.fileConfig('loggerConfig.properties')

logger = logging.getLogger('alllog')
logger.debug("Starting...")

binarizer = Binarizer(copy=True, threshold=thresh)

featureunion1 = FeatureUnion([
    # ('duplicater', ReturnSame()),
    ('if+', InteractionFeatures(method=lambda x, y: (x + y), threshold=corr_thresh, subsample=1, logger=logger)),
    ('if-', InteractionFeatures(method=lambda x, y: (x - y), threshold=corr_thresh, subsample=1, logger=logger)),
    ('if*', InteractionFeatures(method=lambda x, y: (x * y), threshold=corr_thresh, subsample=1, logger=logger)),
    ('if/', InteractionFeatures(method=lambda x, y: (x / y), threshold=corr_thresh, subsample=1, logger=logger)),
    ('if|', InteractionFeatures(method=lambda x, y: (y / x), threshold=corr_thresh, subsample=1, logger=logger)),
])

pp_pipeline = Pipeline([
    ('removedupes', RemoveDuplicateCols(logger=logger)),
    ('featureextraction', featureunion1),
    ('bounder', Bounder(inf, -inf)),
])
Esempio n. 48
0
def binarize(img, threshold):
    binarizer = Binarizer(threshold=threshold, copy=False)
    return binarizer.fit_transform(img)
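
# Hedged usage sketch (assumes a 2-D grayscale array scaled to [0, 1]; the 0.5
# cut-off is arbitrary and only for illustration):
import numpy as np
img = np.array([[0.1, 0.6], [0.7, 0.3]])
print(binarize(img, 0.5))   # [[0. 1.] [1. 0.]]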
Esempio n. 49
0
def load(opt='custom', x_filename=None, y_filename=None, n_samples=0,
         samples_on='rows', **kwargs):
    """Load a specified dataset.

    This function can be used either to load one of the standard scikit-learn
    datasets or a different dataset saved as X.npy Y.npy in the working
    directory.

    Parameters
    -----------
    opt : {'iris', 'digits', 'diabetes', 'boston', 'circles', 'moons',
          'custom', 'GSEXXXXX'}, default: 'custom'
        Name of a predefined dataset to be loaded. 'iris', 'digits', 'diabetes',
        'boston', 'circles' and 'moons' refer to the corresponding
        `scikit-learn` datasets. 'custom' can be used to load a custom dataset
        whose name is specified in `x_filename` and `y_filename` (optional).

    x_filename : string, default : None
        The data matrix file name.

    y_filename : string, default : None
        The label vector file name.

    n_samples : int
        The number of samples to be loaded. This comes handy when dealing with
        large datasets. When n_samples is less than the actual size of the
        dataset this function performs a random subsampling that is stratified
        w.r.t. the labels (if provided).

    samples_on : string
        Either in ['row', 'rows'] if the samples lie on the rows of the input
        data matrix, or in ['col', 'cols'] if they lie on the columns.

    data_sep : string
        The data separator. For instance comma, tab, blank space, etc.

    Returns
    -----------
    X : array of float, shape : n_samples x n_features
        The input data matrix.

    y : array of float, shape : n_samples
        The label vector; np.nan if missing.

    feature_names : array of integers (or strings), shape : n_features
        The feature names; a range of numbers if missing.

    index : list of integers (or strings)
        The sample identifiers, if provided as the first column (or row) of
        the input file; otherwise an incremental range of size n_samples.
    """
    data = None
    try:
        if opt.lower() == 'iris':
            data = datasets.load_iris()
        elif opt.lower() == 'digits':
            data = datasets.load_digits()
        elif opt.lower() == 'diabetes':
            data = datasets.load_diabetes()
            # binarize the regression target around its mean to obtain a 2-class task
            b = Binarizer(threshold=np.mean(data.target))
            data.target = b.fit_transform(data.target.reshape(-1, 1)).ravel()
        elif opt.lower() == 'boston':
            data = datasets.load_boston()
            b = Binarizer(threshold=np.mean(data.target))
            data.target = b.fit_transform(data.target.reshape(-1, 1)).ravel()
        elif opt.lower() == 'gauss':
            means = np.array([[-1, 1, 1, 1], [0, -1, 0, 0], [1, 1, -1, -1]])
            sigmas = np.array([0.33, 0.33, 0.33])
            if n_samples <= 1:
                n_samples = 333
            xx, yy = generate_gauss(mu=means, std=sigmas, n_sample=n_samples)
            data = datasets.base.Bunch(data=xx, target=yy)
        elif opt.lower() == 'circles':
            if n_samples == 0:
                n_samples = 400
            xx, yy = datasets.make_circles(n_samples=n_samples, factor=.3,
                                           noise=.05)
            data = datasets.base.Bunch(data=xx, target=yy)
        elif opt.lower() == 'moons':
            if n_samples == 0:
                n_samples = 400
            xx, yy = datasets.make_moons(n_samples=n_samples, noise=.01)
            data = datasets.base.Bunch(data=xx, target=yy)
        elif opt.lower() == 'custom':
            data = load_custom(x_filename, y_filename, samples_on, **kwargs)
        elif opt.lower().startswith('gse'):
            raise Exception("Use ade_GEO2csv.py to convert GEO DataSets"
                            "into csv files.")
    except IOError as e:
        print("I/O error({0}): {1}".format(e.errno, e.strerror))

    X, y = data.data, data.target
    if n_samples > 0 and X.shape[0] > n_samples:
        if y is not None:
            try:  # Legacy for sklearn
                sss = StratifiedShuffleSplit(y, test_size=n_samples, n_iter=1)
                # idx = np.random.permutation(X.shape[0])[:n_samples]
            except TypeError:
                sss = StratifiedShuffleSplit(test_size=n_samples) \
                    .split(X, y)
            _, idx = list(sss)[0]
        else:
            idx = np.arange(X.shape[0])
            np.random.shuffle(idx)
            idx = idx[:n_samples]

        X, y = X[idx, :], y[idx]
    else:
        # The length of index must be consistent with the number of samples
        idx = np.arange(X.shape[0])

    feat_names = data.feature_names if hasattr(data, 'feature_names') \
        else np.arange(X.shape[1])
    index = np.array(data.index)[idx] if hasattr(data, 'index') \
        else np.arange(X.shape[0])

    return X, y, feat_names, index
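
# Hedged usage sketch (assumes `from sklearn import datasets` and numpy are
# imported in this module); 'iris' is one of the predefined options listed in
# the docstring above.
X, y, feature_names, index = load(opt='iris')
print(X.shape, y.shape)   # (150, 4) (150,)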
Esempio n. 50
0
from sklearn.preprocessing import Binarizer, LabelEncoder, OneHotEncoder

onehot_encoder = OneHotEncoder()
label_encoder = LabelEncoder()

x = ['a', 'b', 'c']

label_x = label_encoder.fit_transform(x).reshape([len(x), 1])
print(label_x)
print(onehot_encoder.fit_transform(label_x).toarray())

binarizer = Binarizer(threshold=1.0).fit(label_x)
print(binarizer.transform(label_x))
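
# Expected output for this tiny example: label_x is [[0], [1], [2]], the one-hot
# matrix is the 3x3 identity, and with threshold=1.0 only the value 2 is mapped
# to 1, so the last print shows [[0], [0], [1]].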
Esempio n. 51
0
print(X_.shape)
# first-order terms: x1, x2, x3, x4
# second-order interaction terms: x1*x2, x1*x3, x1*x4, x2*x3, x2*x4, x3*x4
# third-order interaction terms: x1*x2*x3, x1*x2*x4, x1*x3*x4, x2*x3*x4

######################################################################
######## Part 4. Type conversion (discretization, numeric encoding, dummy variables)
######################################################################

### Binary discretization into {0, 1}
# Values greater than threshold are marked 1; values less than or equal to threshold are marked 0.
from sklearn.preprocessing import Binarizer

cols = ['年龄', '收入']

est = Binarizer(threshold=50)
X_ = est.fit_transform(df[cols])
print(X_)

### Multi-value discretization into n_bins bins
from sklearn.preprocessing import KBinsDiscretizer

est = KBinsDiscretizer(n_bins=5, encode='ordinal')  #0~n_bins-1
X_ = est.fit_transform(df[cols])
print(X_)

# KBinsDiscretizer
# (n_bins=5, encode='onehot', strategy='quantile')
# n_bins : int or array-like, shape (n_features,) (default=5)
#   number of bins; must be at least 2
# encode : {'onehot', 'onehot-dense', 'ordinal'}, (default='onehot')
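
# A minimal hedged sketch (toy one-column data, not df[cols]) contrasting the
# two encodings; with strategy='uniform' the bin edges split [1, 90] evenly.
import numpy as np
toy = np.array([[1], [7], [15], [40], [90]])
print(KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform').fit_transform(toy))
# -> one ordinal bin index in 0..4 per row: [[0.] [0.] [0.] [2.] [4.]]
print(KBinsDiscretizer(n_bins=5, encode='onehot-dense', strategy='uniform').fit_transform(toy))
# -> a dense 5x5 indicator matrix with a single 1 per row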
Esempio n. 52
0
	def by_threshold(self, threshold=0.0):
		binarizer = Skbin(threshold).fit(self.M)
		return binarizer.transform(self.M)
# Show data
print(houses)

print("\n")
print("\n")
print("\n")

#4.8 Discretizating Features

# Load libraries
import numpy as np
from sklearn.preprocessing import Binarizer
# Create feature
age = np.array([[6], [12], [20], [36], [65]])
# Create binarizer
binarizer = Binarizer(threshold=18)
# Transform feature
binarizer.fit_transform(age)
# Bin feature
np.digitize(age, bins=[20, 30, 64])
# Bin feature
np.digitize(age, bins=[20, 30, 64], right=True)

# Bin feature
np.digitize(age, bins=[18])
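
# For this age array the calls above give (standard NumPy / scikit-learn
# behaviour):
#   Binarizer(threshold=18)                         -> [[0] [0] [1] [1] [1]]
#   np.digitize(age, bins=[20, 30, 64])             -> [[0] [0] [1] [2] [3]]
#   np.digitize(age, bins=[20, 30, 64], right=True) -> [[0] [0] [0] [2] [3]]
#   np.digitize(age, bins=[18])                     -> [[0] [0] [1] [1] [1]]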

#4.9 Grouping Observations Using Clustering
# Load libraries
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
def random_forest_regressor(X, y, threshold, value, k_fold=5):
    X = np.asarray(X)
    y = np.asarray(y)
    y[y == 'NOISE'] = 'a'
    y_unique = np.unique(y)
    #enc = ColumnTransformer([("noise", OneHotEncoder(sparse = False, handle_unknown = 'error'), [0])], remainder = 'passthrough')
    #cat = [['noise', 'dusky', 'ratufa']]
    enc = OneHotEncoder(categories='auto',
                        sparse=False,
                        handle_unknown='error')
    y_regressor = enc.fit_transform(y.reshape(y.shape[0], 1))

    # dividing X, y into train and test data
    sss = StratifiedShuffleSplit(n_splits=k_fold,
                                 test_size=0.2,
                                 random_state=0)

    # Do K fold cross validation
    all_cms = []
    all_accuracies = []
    tp_array = []
    fp_array = []
    tn_array = []
    fn_array = []
    print('Doing {} fold cross validation predictions. Classes: {}'.format(
        k_fold, np.unique(y)))
    for k, (train_index, test_index) in enumerate(sss.split(X, y_regressor)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y_regressor[train_index], y_regressor[test_index]
        y_test_cat = enc.inverse_transform(y_test)
        # fit a random forest regressor on the one-hot targets
        clf = RandomForestRegressor(random_state=0, n_estimators=100)
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        print(predictions[0:10])
        predictions = Binarizer(threshold=threshold).fit_transform(predictions)
        print(predictions[0:10])
        predictions_cat = enc.inverse_transform(predictions)
        print(predictions_cat[0:10])
        y_test_cat[y_test_cat == 'usky'] = 'dusky'
        predictions_cat[predictions_cat == 'usky'] = 'dusky'
        # model accuracy for X_test
        class_scores = f1_score(y_test_cat, predictions_cat, average=None)
        print('{}/{} folds mean accuracy: {}'.format(k + 1, k_fold,
                                                     np.mean(class_scores)))
        all_accuracies.append(class_scores)

        cm_labels = np.unique(y)
        k_cm = confusion_matrix(y_test_cat, predictions_cat, labels=cm_labels)
        FP = k_cm.sum(axis=0) - np.diag(k_cm)
        FN = k_cm.sum(axis=1) - np.diag(k_cm)
        TP = np.diag(k_cm)
        TN = k_cm.sum().sum() - (FP + FN + TP)
        tp_array.append(TP)
        fp_array.append(FP)
        tn_array.append(TN)
        fn_array.append(FN)
        all_cms.append(k_cm)

    # Get averages across K fold cross validation
    final_tp = np.mean(np.asarray(tp_array), axis=0)
    final_tn = np.mean(np.asarray(tn_array), axis=0)
    final_fp = np.mean(np.asarray(fp_array), axis=0)
    final_fn = np.mean(np.asarray(fn_array), axis=0)
    cm_values = [final_tp, final_tn, final_fp, final_fn]
    accuracies = np.mean(np.asarray(all_accuracies), axis=0)
    average_accuracy = np.mean(accuracies)
    print('Average accuracy = {}'.format(average_accuracy))

    cm = np.mean(np.asarray(all_cms), axis=0)

    return cm, cm_labels, average_accuracy, accuracies, cm_values
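
# Hedged illustration of the thresholding step used above (0.5 chosen only for
# the sketch): the regressor emits one soft score per one-hot column, and
# Binarizer turns each into a hard 0/1 vote before inverse_transform.
import numpy as np
from sklearn.preprocessing import Binarizer
print(Binarizer(threshold=0.5).fit_transform(np.array([[0.1, 0.7, 0.2]])))
# -> [[0. 1. 0.]]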
X = (news_data * lasso_est.transpose()) # multiply element wise with lasso estimate
df_Lasso = X[X.columns[(X != 0).any()]] # remove columns where all elements are zero
print df_Lasso.shape # number of columns should significantly shrink depending on choice of alpha
df_Lasso.columns.values.tolist()


# In[104]:

#obtain a split
# from sklearn.cross_validation import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(df_Lasso, news_labels)

#binarize
from sklearn.preprocessing import Binarizer
binarizer = Binarizer(threshold=binary_threshold)
binary_labels = binarizer.transform(news_labels).transpose().ravel()     # .ravel() is to fix "Too many array indices error"
print binary_labels.shape


# In[107]:

from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import cross_val_score

knn = KNeighborsClassifier(n_neighbors=1) # arbitrary k
cv = cross_val_score(knn, df_Lasso, binary_labels, cv=10)
print "Cross Validation Scores"
print cv
print 'Mean Cross Validation Score'
print np.mean(cv)
Esempio n. 56
0
def binarizer(args):
    #https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Binarizer

    return Binarizer(threshold=args['threshold'], copy=True)
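
# Hedged usage sketch: the factory just forwards the threshold, so a call like
# binarizer({'threshold': 0.5}).fit_transform([[0.2, 0.9]]) yields [[0., 1.]].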
Esempio n. 57
0
import numpy as np
import pandas as pd
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Binarizer
from tpot.export_utils import set_param_recursive

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=42)

# Average CV score on the training set was: -36.33392193683913
exported_pipeline = make_pipeline(
    Binarizer(threshold=0.2),
    RidgeCV()
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Esempio n. 58
0
	tokenizer = tokenize(X_train)
	X_tokens = tokenizer.transform(X_train)

	# Train Recurrent Neural Network
	model = train_RNN(tokenizer, X_tokens, y_train)

	y_pred_tr = model.predict(X_tokens).flatten()

	# Check overall performance
	test_tokens = tokenizer.transform(X_test)
	y_pred_tst = model.predict(test_tokens).flatten()

	# Convert predictions to binary
	yhat_train = y_pred_tr.reshape(-1, 1)
	yhat_test  = y_pred_tst.reshape(-1, 1)
	binarizer = Binarizer(threshold=0.5).fit(yhat_train)
	yhat_tr_b = binarizer.transform(yhat_train).astype(int)
	yhat_tst_b = binarizer.transform(yhat_test).astype(int)

	save(model, 'review_score_full.pkl')

	with open('review_tokenizer_full.pkl', 'wb') as fileObject:
		pickle.dump(tokenizer, fileObject)

	# # Save model for future use
	# save(model, 'review_scorer1.pkl')
	# # model = load('review_scorer.pkl')
	# with open('review_tokenizer1.pkl', 'wb') as fileObject:
	#     pickle.dump(tokenizer, fileObject)

	# Scorers to consider
    def fp_vectorizer(self, processed_data):
        binarizer = Binarizer(threshold=5)
        vectorized_data = binarizer.fit_transform(processed_data)
        return vectorized_data
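
    # Hedged usage note: with threshold=5 the vectorizer keeps a 1 only where
    # the processed value is strictly greater than 5, e.g. [[2, 6, 5]] -> [[0, 1, 0]].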