Example No. 1
x_std[7] = (7-1)/(7-1)
'''
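The docstring above is the tail of a min-max scaling walkthrough; a minimal sketch that reproduces the hand computation (the 1..7 column below is an assumption, not the original x):

import numpy as np
from sklearn import preprocessing

x_demo = np.arange(1, 8, dtype=float).reshape(-1, 1)   # assumed column of values 1..7
x_std = preprocessing.MinMaxScaler().fit_transform(x_demo)
print(x_std[-1])   # (7 - 1) / (7 - 1) = 1.0, matching the hand computation above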

print('---------- Standard Scaling -----------')
# StandardScaler
standard = preprocessing.StandardScaler().fit(x)
print(standard.transform(x))
'''
x_std[0] = (1-4)/np.std(x[:,0])
'''
print('--------------------------------------')

#Binarizer scaling
'''
Applies a threshold to each value: entries above the threshold become 1, the rest 0 (e.g. useful for thresholding inputs in a neural net).
'''
print('------------ Binarizer --------------------')
print(preprocessing.Binarizer(threshold=3.0).fit(x).transform(x))
print('-------------------------------------------')

#Normalize
'''
x0 = 1
norm0 = math.sqrt(1+4+9)
x0/norm0
'''
print('--------- Normalize ------------')
print(preprocessing.Normalizer().fit(x).transform(x))

print('--------------------------------')
 def _setBinarizer(self, a_df: pd.DataFrame = pd.DataFrame()) -> pd.DataFrame:
     return pd.DataFrame(preprocessing.Binarizer(threshold=1.4).transform(a_df), columns=a_df.columns,
                         index=a_df.index)
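A hypothetical standalone version of the pattern used by _setBinarizer above (the DataFrame contents are made up): wrapping the Binarizer output in a DataFrame keeps the original index and column names.

import pandas as pd
from sklearn import preprocessing

df_demo = pd.DataFrame({'a': [0.5, 2.0], 'b': [1.3, 1.5]}, index=['r1', 'r2'])
binarized = pd.DataFrame(preprocessing.Binarizer(threshold=1.4).fit_transform(df_demo),
                         columns=df_demo.columns, index=df_demo.index)
print(binarized)  # values > 1.4 become 1.0, everything else 0.0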
Example No. 3
# Standardization (mean removal)
data_standardized = preprocessing.scale(data)
print("\nMean =", data_standardized.mean(axis=0))
print("Std deviation =", data_standardized.std(axis=0))

# Min-max scaling (rescale to [0, 1])
data_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
data_scaled = data_scaler.fit_transform(data)
print("\nMin max scaled data =", data_scaled)

# Normalization (L1)
data_normalized = preprocessing.normalize(data, norm='l1')
print("\nL1 normalized data =", data_normalized)

# Binarization
data_binarized = preprocessing.Binarizer(threshold=1.4).transform(data)
print("\nBinarized data =", data_binarized)

# One-hot encoding
encoder = preprocessing.OneHotEncoder()
encoder.fit([[0, 2, 1, 12], [1, 3, 5, 3], [2, 3, 2, 12], [1, 2, 4, 3]])
encoded_vector = encoder.transform([[2, 3, 5, 3]]).toarray()
print("\nEncoded vector =", encoded_vector)
'''
Analysis:
There are 4 features (columns):
The 1st feature (first column) takes the values [0,1,2,1]; its three categories [0,1,2] are one-hot encoded as [100, 010, 001].
Likewise the 2nd feature has two categories [2,3], encoded as [10, 01].
The 3rd feature has four categories [1,2,4,5], encoded as [1000, 0100, 0010, 0001].
The 4th feature has two categories [3,12], encoded as [10, 01].
So [2, 3, 5, 3] is finally encoded as [0,0,1, 0,1, 0,0,0,1, 1,0].
'''
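As a quick manual check of the mapping described above (a small sketch that reuses encoder and encoded_vector from the code earlier in this example):

import numpy as np
manual = np.array([0, 0, 1,      # feature 1: value 2 -> third of [0, 1, 2]
                   0, 1,         # feature 2: value 3 -> second of [2, 3]
                   0, 0, 0, 1,   # feature 3: value 5 -> fourth of [1, 2, 4, 5]
                   1, 0])        # feature 4: value 3 -> first of [3, 12]
print(np.array_equal(manual, encoded_vector.ravel()))  # expected: True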
Example No. 4
import pandas as pd
import sklearn.ensemble as se
import sklearn.metrics as sm
import sklearn.model_selection as ms
import sklearn.preprocessing as sp
import numpy as np
import sklearn.svm as svm

train_data = pd.read_csv('../train.csv')
test_data = pd.read_csv('../test.csv')
train_data_y = train_data['label'].values
print(type(train_data_y))
train_data_x = train_data.drop('label', axis=1)
print(type(train_data_x))
one_zero = sp.Binarizer(threshold=0)
test_data = one_zero.transform(test_data)
train_data_x = one_zero.transform(train_data_x)
train_x, test_x, train_y, test_y = ms.train_test_split(train_data_x,
                                                       train_data_y,
                                                       test_size=0.15,
                                                       random_state=4)
# params = [{'max_depth':[35,40], 'n_estimators':[1600,1800]}]
# # model = ms.GridSearchCV(se.RandomForestClassifier(random_state=4), params, cv=3)
# model.fit(train_x, train_y)
# for param, score in zip(model.cv_results_['params'], model.cv_results_['mean_test_score']):
#     print(param, score)
# print(model.best_params_)
# print(model.best_score_)
# print(model.best_estimator_)
# model = se.RandomForestClassifier(max_depth=35, n_estimators=1800,random_state=3)
model = svm.SVC(kernel='poly', degree=7)
def binarization():
    data_binarized = preprocessing.Binarizer(
        threshold=1.4).transform(input_data)
    print("\nBinarized data =", data_binarized)
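The function above relies on a module-level input_data and a preprocessing import defined elsewhere; a minimal self-contained variant (the sample array is made up) might look like:

import numpy as np
from sklearn import preprocessing

def binarization(data, threshold=1.4):
    # values strictly greater than the threshold become 1, the rest 0
    data_binarized = preprocessing.Binarizer(threshold=threshold).fit_transform(data)
    print("\nBinarized data =", data_binarized)
    return data_binarized

binarization(np.array([[1.0, 2.5], [0.3, 4.2]]))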
Example No. 6
if __name__ == '__main__':
    ls = lstm(7, 7, 10)
    train_data = reberGrammar.get_n_embedded_examples(1000)
    error = []
    t1 = time.clock()
    for i in xrange(60):
        print '\n', i, '/60'
        err = 0
        for x, y in train_data:
            tmp = ls.train(x, y)
            err += tmp
            print tmp, '\r',
        #print ls.predict(train_data[0][0])
        error.append(err)
    print 'time:', time.clock() - t1
    plt.plot(np.arange(60), error, 'b-')
    plt.xlabel('epochs')
    plt.ylabel('error')
    plt.show()
    print error
    test_data = reberGrammar.get_n_embedded_examples(100)
    binarizer = preprocessing.Binarizer(threshold=0.1)
    error = 0
    for x, y in test_data:
        y_pred = ls.predict(x)
        #print y_pred
        y_pred = binarizer.transform(y_pred)
        for a, b in zip(y, y_pred):
            error += np.mean(a - b)
    print error
Example No. 7
  def fit_transform(self, dataset):
    """
      Transform a dataframe <dataset> into a feature matrix

      params:
      dataset : Pandas DataFrame, the input dataset

      Returns a matrix N samples x M features
    """

    ###First step, select some fields we care about, all of these are numeric, so we can just pick them out
    data = np.array(dataset[[ 'age', 
     'NumberOfDependents' #, 'NumberOfOpenCreditLinesAndLoans', 'RevolvingUtilizationOfUnsecuredLines'
     ]]) # Can add in additional features here

    #dataset['age_to_linescredit'] = dataset['age'] / dataset['RevolvingUtilizationOfUnsecuredLines']
 
    #dataset['debt_to_monthlyincome'] = dataset['DebtRatio'] / dataset['MonthlyIncome']
    

    # debt ratio to monthly income relationship

    # ## You want to perform some more interesting transformations of the data
    # ## For example, ratios
    # dataset['dollar_per_year'] = dataset['MonthlyIncome'] / dataset['age']

    ## One preprocesing step we will need to perform is imputation, fill in missing values
    imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
    data = imputer.fit_transform(data)

    #return np.hstack([data])

    ## Scaling features may be important if you have very large outliers or need more interpretable coefficients
    scaler = preprocessing.StandardScaler()
    scaled_income = scaler.fit_transform(data[:, 1].reshape(-1, 1))

    # Added by Will
    #scaled_debtratio = scaler.fit_transform(data[:,1])

    data =  np.column_stack([data, scaled_income])

    # ## Turning features into discrete features is important if you are using a linear classifier, but the underlying
    # ## data does not have a linear relationship

    ## NOTE: the binarizer turns everything > 0 into 1 and everything <= 0 into 0, so use the StandardScaler first
    binarizer = preprocessing.Binarizer()
    #binned_income = binarizer.fit_transform(scaled_income)
    
    binned_numberoftimes90dayslate = binarizer.fit_transform(dataset[['NumberOfTimes90DaysLate']])
    binned_numberoftime3059dayspastduenotworse = binarizer.fit_transform(dataset[['NumberOfTime30-59DaysPastDueNotWorse']])
    binned_numberoftime6089dayspastduenotworse = binarizer.fit_transform(dataset[['NumberOfTime60-89DaysPastDueNotWorse']])
    binned_numberofopencreditlinesandloans = binarizer.fit_transform(dataset[['NumberOfOpenCreditLinesAndLoans']])
    binned_revolvingutilizationofunsecuredlines = binarizer.fit_transform(dataset[['RevolvingUtilizationOfUnsecuredLines']])
    binned_debtratio = binarizer.fit_transform(0.5 - dataset[['DebtRatio']])

    # Default
    #data =  np.column_stack( [data,binned_income ] ) # column_stack is pretty much an append
    #data =  np.column_stack( [data,binned_income ] ) # column_stack is pretty much an append

    data =  np.column_stack( [data, binned_numberoftimes90dayslate] )
    data =  np.column_stack( [data, binned_numberoftime3059dayspastduenotworse] )
    data =  np.column_stack( [data, binned_numberoftime6089dayspastduenotworse] )
    data =  np.column_stack( [data, binned_numberofopencreditlinesandloans] )
    data =  np.column_stack( [data, binned_revolvingutilizationofunsecuredlines])
    data =  np.column_stack( [data, binned_debtratio])
    return data
Example No. 8
import pandas as pd
import numpy as np
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
train_data = pd.read_csv("/home/amitoj/Downloads/data.csv")
del train_data['id']

X = train_data[train_data.columns[1:30]].values
Y = train_data.loc[:, ['diagnosis']].values
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.3,
                                                    random_state=0)

X_Train = preprocessing.Binarizer().fit_transform(X_train)
print(X_Train)
X_Test = preprocessing.Binarizer(threshold=1.0).fit_transform(X_test)
print(X_Test)
Y_train = (np.ravel(np.array(y_train)))
Y_test = (np.ravel(np.array(y_test)))

le = preprocessing.LabelEncoder()
Y_Train = le.fit_transform(Y_train)
Y_Test = le.fit_transform(Y_test)
# print(Y_Train)
# print(len(Y_Test))

Nb_clf = BernoulliNB()
print(Nb_clf.fit(X_Train, Y_Train))
prediction = Nb_clf.predict(X_Test)
Example No. 9
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
"""
demo04_bin.py   Binarization
"""
import numpy as np
import sklearn.preprocessing as sp

raw_samples = np.array([[17., 100., 4000], [20., 80., 5000], [23., 75., 5500]])

bin = sp.Binarizer(threshold=80)
r = bin.transform(raw_samples)
print(r)

raw_samples[raw_samples <= 80] = 0
raw_samples[raw_samples > 80] = 1
print(raw_samples)
Example No. 10
import pandas as pd
df = pd.DataFrame(features, columns=['features_1', 'features_2'])
print(df.apply(add_ten))

# Handling outliers
houses = pd.DataFrame()
houses['Price'] = [534433, 392333, 293222, 4322032]
houses['Bathrooms'] = [2, 3.5, 2, 116]
houses['Square_Feet'] = [1500, 2500, 1500, 48000]
houses_re = houses[houses['Bathrooms'] < 20]
houses["outlier"] = np.where(houses["Bathrooms"] < 20, 0, 1)
print(houses)

age = np.array([[6], [12], [20], [36], [65]])
binarizer = preprocessing.Binarizer(threshold=18)
print(binarizer.fit_transform(age))
# multiple thresholds
print(np.digitize(age, bins=[20, 30, 64]))

#kmeans
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
features, _ = make_blobs(n_samples=50, n_features=2, centers=3, random_state=1)
dataframe = pd.DataFrame(features, columns=["feature_1", "feature_2"])
clusterer = KMeans(3, random_state=0)
clusterer.fit(features)
dataframe['group'] = clusterer.predict(features)
print(dataframe.head(5))

# Dropping missing values
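The example stops at this comment; a minimal sketch of dropping (or imputing) missing values, using a made-up DataFrame:

dataframe_missing = pd.DataFrame({"feature_1": [1.1, 2.2, None], "feature_2": [3.3, None, 5.5]})
print(dataframe_missing.dropna())  # keep only the rows without missing values
# dataframe_missing.fillna(dataframe_missing.mean())  # or impute the column mean instead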
Example No. 11
def main():
    dataframe = extractData(
    )  #pd.DataFrame([[1,"v",2],[1,2,3]],columns=["name1","name2","name3"])
    # 'Binarizer',
    preprocessing.Binarizer().fit_transform(dataframe)
Example No. 12
 def create_binary_cols(df, cols=[], thresh=0.1):
     cols = cols or df.columns
     for c in cols:
         binzr = preprocessing.Binarizer(threshold=thresh).fit(df[[c]])
         df[c + "_bin"] = binzr.transform(df[[c]]).ravel()
     return df
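A hypothetical call to create_binary_cols, assuming the function and the preprocessing import above are in scope (the DataFrame and threshold are invented for illustration):

import pandas as pd

df_demo = pd.DataFrame({"score": [0.05, 0.2, 0.8], "weight": [0.0, 0.15, 0.3]})
print(create_binary_cols(df_demo, cols=["score"], thresh=0.1))
# adds a "score_bin" column where values > 0.1 become 1.0 and the rest 0.0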
Example No. 13
    np.set_printoptions(precision=3)
    print('MinMaxScaler transformed data:\n{0}\n'.format(rescaledX[0:5, :]))

    # ----------------------------------------
    # multiple transformations
    # ----------------------------------------
    scalers = dict()

    # transform so that smallest data is 0, and largest is 1
    scaler = pp.MinMaxScaler(feature_range=(0, 1)).fit(X)
    scalers['MinMaxScaler'] = (scaler, scaler.transform(X))

    # transform so that data is standard normal Gaussian distribution; ie, mean
    # of 0 and standard deviation of 1
    scaler = pp.StandardScaler().fit(X)
    scalers['StandardScaler'] = (scaler, scaler.transform(X))

    # transform so that the length of each observation (row) has a length of 1
    # (unit vector)
    scaler = pp.Normalizer().fit(X)
    scalers['Normalizer'] = (scaler, scaler.transform(X))

    # transform so that all values above a threshold are 1 and all values below
    # a threshold are 0
    scaler = pp.Binarizer(threshold=2).fit(X)
    scalers['Binarizer'] = (scaler, scaler.transform(X))

    # display results of transformations
    for entry in scalers.items():
        print('{0} transformed data:\n{1}\n'.format(entry[0],
                                                    entry[1][1][0:5, :]))
data = df['amount']  # get the column named 'amount' to be clustered
data_reshape = data.values.reshape((data.shape[0], 1))  # reshape the data
model_kmeans = KMeans(n_clusters=4, random_state=0)  # create a KMeans model with the desired number of clusters
kmeans_result = model_kmeans.fit_predict(data_reshape)  # fit the model and predict cluster labels
df['amount2'] = kmeans_result  # merge the newly discretized values back into the original DataFrame
print(df.head(5))  # print the first 5 rows
# Method 3: discretize using quartiles
df['amount3'] = pd.qcut(df['amount'], 4,
                        labels=['bad', 'medium', 'good', 'awesome'])  # split into quartile bins
df = df.drop('amount', axis=1)  # drop the 'amount' column
print(df.head(5))  # print the first 5 rows

# Binarization of continuous data
binarizer_scaler = preprocessing.Binarizer(
    threshold=df['income'].mean())  # create the Binarizer object
income_tmp = binarizer_scaler.fit_transform(df[['income']])  # apply the Binarizer transform
income_tmp.resize(df['income'].shape)  # reshape the result
df['income'] = income_tmp  # write the binarized values back
print(df.head(5))  # print the first 5 rows

####################################################################
# 3.12.1 Parsing web page data
# Import libraries
import requests  # for sending HTTP requests
from bs4 import BeautifulSoup  # for parsing HTML
import re  # for regex lookups within the HTML
import time  # for timestamping saved file names


# Get the total number of pages
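The snippet breaks off at this comment; purely as illustration (the URL, pager element, and regex below are hypothetical, not from the original source), a page-count fetch with the libraries imported above might look like:

def get_total_page_count(url='https://example.com/list?page=1'):  # hypothetical URL
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html.parser')
    pager = soup.find('div', class_='pager')                      # hypothetical pager element
    match = re.search(r'(\d+)', pager.get_text()) if pager else None
    return int(match.group(1)) if match else 1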
Example No. 15
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn import model_selection
import pandas

dataset = pandas.read_csv('mlExcel_2.csv', header=0)
originalHeaders = list(dataset.columns.values)
array = dataset.values

# converts winning to binary
Y = array[:, -1].astype(float)
Y = Y.reshape(-1, 1)
binarizer = preprocessing.Binarizer().fit(Y)
Y = binarizer.transform(Y)
Y = Y.reshape(len(Y))

dataset = dataset._get_numeric_data()
numericHeaders = list(dataset.columns.values)
array = dataset.values
X = array[:, 0:-1]

validationSize = 0.20
seed = 7
scoring = 'accuracy'

X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, Y, test_size=validationSize, random_state=seed)
 print("origin data")
 print(np.mean(features, axis=0))
 print(np.std(features, axis=0))
 features_new = preprocessing.StandardScaler().fit_transform(features)
 # print(features_new)
 print(np.mean(features_new, axis=0))
 print(np.std(features_new, axis=0))
 """1.1.2 Rescaling: scale feature values into the [0, 1] interval (operates on column vectors)"""
 features_new = preprocessing.MinMaxScaler().fit_transform(features)
 print("max mean")
 print(np.mean(features_new, axis=0))
 """1.1.3 Normalization: turn each row vector into a unit vector (operates on each sample)"""
 features_new = preprocessing.Normalizer().fit_transform(features)
 print(features_new)
 """1.2 Binarize quantitative features: pick a threshold; values above it become 1, values at or below it become 0"""
 features_new = preprocessing.Binarizer(threshold=6).fit_transform(features)
 print(features_new)
 """
 1.3 Encoding qualitative (categorical) features
 1.3.1 one-hot (pandas.get_dummies can also be used)
 1.3.2 label-encoder (omitted)
 """
 enc = preprocessing.OneHotEncoder()
 enc.fit(features)
 result = preprocessing.OneHotEncoder().fit_transform(features)
 print(features[0:5])
 print(result[0:5].toarray())
 print(enc.transform([[0, 1, 3, 1]]).toarray())
 """# 1.4 Missing-value imputation (pandas.fillna can also be used)"""
 imp = preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0)
 null_data = vstack((array([nan, nan, nan, nan]), features))
Example No. 17
def HSBF_main(mode, clf_index, runtimes):

    pwd = os.getcwd()
    print(pwd)
    father_path = os.path.abspath(os.path.dirname(pwd) + os.path.sep + ".")
    # print(father_path)
    datapath = father_path + '/dataset-inOne/'
    spath = father_path + '/results/'
    if not os.path.exists(spath):
        os.mkdir(spath)

    # datasets = [['ant-1.3.csv', 'arc-1.csv', 'camel-1.0.csv', 'ivy-1.4.csv', 'jedit-3.2.csv', 'log4j-1.0.csv', 'lucene-2.0.csv', 'poi-2.0.csv', 'redaktor-1.csv', 'synapse-1.0.csv', 'tomcat-6.0.389418.csv', 'velocity-1.6.csv', 'xalan-2.4.csv', 'xerces-init.csv'],
    #         ['ant-1.7.csv', 'arc-1.csv', 'camel-1.6.csv', 'ivy-2.0.csv', 'jedit-4.3.csv', 'log4j-1.1.csv', 'lucene-2.0.csv', 'poi-2.0.csv', 'redaktor-1.csv', 'synapse-1.2.csv', 'tomcat-6.0.389418.csv', 'velocity-1.6.csv', 'xalan-2.6.csv', 'xerces-1.3.csv'],
    #         ['EQ.csv', 'JDT.csv', 'LC.csv', 'ML.csv', 'PDE.csv'],
    #         ['Apache.csv', 'Safe.csv', 'Zxing.csv']]
    datasets = [['ant-1.3.csv', 'arc-1.csv', 'camel-1.0.csv'],
                ['Apache.csv', 'Safe.csv', 'Zxing.csv']]
    datanum = 0
    for i in range(len(datasets)):
        datanum = datanum + len(datasets[i])
    # print(datanum)
    # mode = [preprocess_mode, train_mode, save_file_name]
    preprocess_mode = mode[0]
    train_mode = mode[1]
    save_file_name = mode[2]
    df_file_measures = pd.DataFrame(
    )  # the measures of all files in all runtimes
    classifiername = []
    # file_list = os.listdir(fpath)

    n = 0
    for i in range(len(datasets)):
        for file_te in datasets[i]:
            n = n + 1
            print('----------%s:%d/%d------' % ('Dataset', n, datanum))
            # print('testfile', file_te)
            start_time = time.time()

            Address_te = datapath + file_te
            Samples_te = NumericStringLabel2BinaryLabel(
                Address_te)  # DataFrame
            data = Samples_te.values  # DataFrame2Array
            X = data[:, :-1]  # test features
            y = data[:, -1]  # test labels
            Sample_tr0 = NumericStringLabel2BinaryLabel(Address_te)
            column_name = Sample_tr0.columns.values  # the column name of the data

            df_r_measures = pd.DataFrame(
            )  # the measures of each file in runtimes
            for r in range(runtimes):
                if train_mode == 'M2O_CPDP':  # train data contains more files different from the test data project
                    X_test = X
                    y_test = y
                    Samples_tr_all = pd.DataFrame(
                    )  # initialize the candidate training data of all cp data
                    trfilelist = []
                    for file_tr in datasets[i]:
                        if file_tr != file_te:
                            # print('train_file:', file_tr)
                            Address_tr = datapath + file_tr
                            trfilelist.append(Address_tr)
                            Samples_tr = NumericStringLabel2BinaryLabel(
                                Address_tr)  # original train data, DataFrame
                            Samples_tr.columns = column_name.tolist()  # rename all columns at once
                            Samples_tr_all = pd.concat(
                                [Samples_tr_all, Samples_tr],
                                ignore_index=False,
                                axis=0,
                                sort=False)
                    # Samples_tr_all.to_csv(f2, index=None, columns=None)  # save the binarized-label data, keeping column names, without adding a row index

                    # random sample 90% negative samples and 90% positive samples
                    # string = 'bug'
                    # Sample_tr_pos, Sample_tr_neg, Sample_pos_index, Sample_neg_index \
                    #     = Random_Stratified_Sample_fraction(Samples_tr_all, string, r=r)
                    # Sample_tr = np.concatenate((Sample_tr_neg, Sample_tr_pos), axis=0)  # vertically concatenate the arrays
                    # data_train_unique = Drop_Duplicate_Samples(pd.DataFrame(Sample_tr))  # drop duplicate samples
                    # source = data_train_unique.values
                    # target = np.c_[X_test, y_test]

                    # *******************HSBF*********************************
                    method_name = mode[1] + '_' + mode[
                        2]  # scenario + filter method
                    print('----------%s:%d/%d------' %
                          (method_name, r + 1, runtimes))
                    df_filter_time, X_train_new, y_train_new, = HSBF(
                        trfilelist, Address_te, k1=10, k2=20, r=r)
                    y_train_new = preprocessing.Binarizer(
                        threshold=0).transform(y_train_new.reshape(-1, 1))  #

                    # Train model: the classifier / model requires labels to belong to {0, 1}.
                    modelname_hsbf, model_hsbf = Selection_Classifications(
                        clf_index, r)  # select classifier
                    classifiername.append(modelname_hsbf)
                    # print("modelname:", modelname)
                    measures_hsbf = Build_Evaluation_Classification_Model(
                        model_hsbf, X_train_new, y_train_new, X_test,
                        y_test)  # build and evaluate models

                    end_time = time.time()
                    run_time = end_time - start_time
                    measures_hsbf.update({
                        'train_len_before': len(Samples_tr_all),
                        'train_len_after': len(X_train_new),
                        'test_len': len(X_test),
                        'runtime': run_time,
                        'clfindex': clf_index,
                        'clfname': modelname_hsbf,
                        'testfile': file_te,
                        'trainfile': 'More1',
                        'runtimes': r + 1
                    })
                    df_m2ocp_measures = pd.DataFrame(measures_hsbf, index=[r])
                    # print('df_m2ocp_measures:\n', df_m2ocp_measures)
                    df_r_measures = pd.concat(
                        [df_r_measures, df_m2ocp_measures],
                        axis=0,
                        sort=False,
                        ignore_index=False)
                else:
                    pass

            # print('df_file_measures:\n', df_file_measures)
            # print('Results of one run over all files:\n', df_file_measures)
            df_file_measures = pd.concat(
                [df_file_measures, df_r_measures],
                axis=0,
                sort=False,
                ignore_index=False)  # the measures of all files in runtimes
            # df_r_measures['testfile'] = file_list
            # print('df_r_measures1:\n', df_r_measures)

    modelname = np.unique(classifiername)
    # pathname = spath + '\\' + (save_file_name + '_clf' + str(clf_index) + '.csv')
    pathname = spath + '\\' + (save_file_name + '_' + modelname[0] + '.csv')
    df_file_measures.to_csv(pathname)
    # print('df_file_measures:\n', df_file_measures)
    return df_file_measures
# 2. Min max scaling X_mm

minmax_scaler = preprocessing.MinMaxScaler().fit(X)
X_mm = minmax_scaler.transform(X)
# let's compare the min and max of the first feature of X and X_mm
print("compare min and max of first features of X and X_mm")
f1_X = X[:, 0]
print('%.2f %.2f' % (np.min(f1_X), np.max(f1_X)))

f1_Xmm = X_mm[:, 0]
print('%.2f %.2f' % (np.min(f1_Xmm), np.max(f1_Xmm)))

# 3. Binarizing X_Binarize

X_Binarize = preprocessing.Binarizer(threshold=0.0).fit(X).transform(X)
# let's compare the binarization of the first sample of X and X_Binarize
print("compare binarization of the first sample of X and X_Binarize")
f1_X = X[0]
print(f1_X)

f1_X_Binarize = X_Binarize[0]
print(f1_X_Binarize)

# 4. Normalizing

X_Normalize = preprocessing.Normalizer().fit(X).transform(X)
# let's compare the normalization of the first sample of X and X_Normalize
print("compare normalization of the first sample of X and X_Normalize")
f1_X = X[0]
print(f1_X)
Example No. 19
def preprocessing_module(Extracted_Features, Coma_Features, Corrected_Features,
                         Norm, ontology):
    # Convert each tab-separated csv into a comma-separated csv and replace categorical variables with an integer index
    lvltrace.lvltrace("LVLEntree dans preprocessing_module data_preproc")
    onto = open(ontology, "w")
    writer = csv.writer(onto, lineterminator=',')
    class_number = 1
    onto.write("Iteration,Class,Class_number,Neuron_name\n")
    Iteration = 1
    for root, dirs, files in os.walk(Extracted_Features):
        for i in files:
            if not i.startswith('.'):
                #LVLprint i
                input_i = Extracted_Features + i
                output_i = Coma_Features + i
                file = open(output_i, "w")
                writer = csv.writer(file, lineterminator=',')
                lines = tools.file_lines(input_i) + 1
                ncol = tools.file_col(input_i) - 1
                for line in xrange(lines):
                    for col in xrange(ncol):
                        if line == 0:
                            if col == 1:  # Skipping neuron names
                                #print "skip neuron name"
                                # to keep this clean I should invert the condition
                                laurent = 1
                            else:
                                file.write(
                                    "%s," %
                                    tools.read_csv_tab(input_i, col, line))
                        else:
                            if col == 0:  # replace class names by an integer
                                file.write("%i," % class_number)

                            else:
                                if col == 1:
                                    #print "skip neuron name"
                                    onto.write("%i,%s,%i,%s\n" %
                                               (Iteration, i, class_number,
                                                tools.read_csv_tab(
                                                    input_i, col, line)))
                                    Iteration = Iteration + 1
                                else:
                                    file.write(
                                        "%s," %
                                        tools.read_csv_tab(input_i, col, line))
                    file.write("\n")
                file.close()
                class_number = class_number + 1
                if lines > 3:
                    input_file = Coma_Features + i
                    data = np.loadtxt(
                        input_file,
                        delimiter=',',
                        usecols=range(ncol - 1),
                        skiprows=1)  # ncol-1 because we skip the class names
                    X = data[:, :ncol]
                    y = data[:, 0].astype(np.int)  # Labels (class)
                    #Replace missing values 'nan' by column mean
                    imp = Imputer(missing_values='NaN',
                                  strategy='mean',
                                  axis=0)
                    imp.fit(X)
                    Imputer(axis=0,
                            copy=True,
                            missing_values='NaN',
                            strategy='mean',
                            verbose=0)
                    # Output replacement "Nan" values
                    Y = imp.transform(X)
                    #Data Standardization
                    if Norm == 'normalize':
                        Z = preprocessing.normalize(Y, axis=0,
                                                    norm='l2')  # Normalize
                    else:
                        if Norm == 'binarize':
                            binarizer = preprocessing.Binarizer().fit(
                                Y)  # Binarize for Bernoulli
                            Z = binarizer.transform(Y)
                        else:
                            if Norm == 'standardize':
                                min_max_scaler = preprocessing.MinMaxScaler(
                                )  # Normalize the data to [0,1]
                                Z = min_max_scaler.fit_transform(Y)
                            else:
                                Z = preprocessing.scale(Y)  #Scaling

                    #Create new files with corrected and standardized data
                    output_file = Corrected_Features + i
                    file = open(output_file, "w")
                    writer = csv.writer(file, lineterminator=',')
                    for line_1 in xrange(lines - 1):
                        for col_1 in xrange(ncol - 1):
                            if col_1 == 0:
                                file.write("%s," % y[line_1])
                            else:
                                file.write("%f," % Z[line_1, col_1])
                        file.write("\n")
                    file.close()
                else:
                    #print "skip class" # We skip the class with not enough data
                    # to keep this clean I should invert the condition
                    laurent = 1
    onto.close()
    lvltrace.lvltrace("LVLSortie dans preprocessing_module data_preproc")
Example No. 20
var = ['school','sex','address','famsize','Pstatus','Mjob','Fjob','reason','guardian','schoolsup','famsup',
       'paid','activities','nursery','higher','internet','romantic']
"""for v in var:
    print('\nFrequency count for variable %s'%v) 
    print(dataset[v].value_counts())
    """
    
#label encode 
from sklearn.preprocessing import LabelEncoder
var_to_encode = ['school','sex','address','famsize','Pstatus','Mjob','Fjob','reason','guardian','schoolsup','famsup',
       'paid','activities','nursery','higher','internet','romantic']
for col in var_to_encode:
    dataset[col] = LabelEncoder().fit_transform(dataset[col])

# Binarize G3<=11: G3=0   G3>11: G3=1
dataset[['G3']] = preprocessing.Binarizer(threshold=11).transform(dataset[['G3']])

x=dataset[dataset.columns.drop('G3')]
y= dataset['G3']

# divide the dataset into train and test sets; the test set is 0.33 of the dataset
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
     x,y,test_size=0.33, random_state=0)



#--------------------------------------END OF PREPROCESSING----------------------
from sklearn import neighbors as nb
kmax=50
Example No. 21

# 5-  Feature Creation
# A feature was created to combine the population size and the house age
dframe['FamilySize'] = dframe['Population'] + dframe['HouseAge']
dframe.head()

# 6- Discretization and Binarization
from sklearn import preprocessing
binarizer = preprocessing.Binarizer(threshold=3.50).fit(dframe[['Population']])
print(binarizer)
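The cell stops after configuring the binarizer; a short follow-up, assuming the same dframe, that actually applies the transform:

population_bin = binarizer.transform(dframe[['Population']])  # values > 3.5 become 1.0, the rest 0.0
print(population_bin[:5])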

# 7- Attribute Transformation
def draw_missing_data_table(dframe):
    total = dframe.isnull().sum().sort_values(ascending=False)
    percent = (dframe.isnull().sum()/dframe.isnull().count()).sort_values(ascending=False)
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    return missing_data
Example No. 22
import numpy as np
from sklearn import datasets
from sklearn import preprocessing

boston = datasets.load_boston()

# Similar to scaling, there is both a function and a class:
#preprocessing.binarize
#preprocessing.Binarizer

new_target = preprocessing.binarize(boston.target,
                                    threshold=boston.target.mean())
print("New Target after Binarization :")
print(new_target[:5])
print("To Verify :")
print((boston.target[:5] > boston.target.mean()).astype(int))

bin = preprocessing.Binarizer(threshold=boston.target.mean())
new_target = bin.fit_transform(boston.target)
print(new_target[:5])

#-----------------------------------------------
# special case for sparse matrix. threshold cannot be less than zero

from scipy.sparse import coo
spar = coo.coo_matrix(np.random.binomial(1, 0.25, 100))
#preprocessing.binarize(spar,threshold=-1)
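The call is left commented out because sklearn rejects a negative threshold on sparse input; a small check of that behaviour (wrapped in try/except so the script keeps running):

try:
    preprocessing.binarize(spar, threshold=-1)
except ValueError as exc:
    print("binarize on sparse input with a negative threshold raised:", exc)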

#-------------------------------------------------
# working with categorical variable
iris = datasets.load_iris()
X = iris.data
y = iris.target
Example No. 23
import numpy as np
from sklearn import preprocessing

# define the sample data
input_data = np.array([[3.1, 2.9, 3.3], [-1.2, 7, 6.1], [3.13, -3.13, 20],
                       [7.28, -9.9, -2.5]])

# binarize the data
data_binarized = preprocessing.Binarizer(threshold=2.1).transform(input_data)
print("\nBinarized data : \n", data_binarized)

# print the mean and standard deviation
print("\n BEFORE : ")
print("Mean = ", input_data.mean(axis=0))
print("Std deviation = ", input_data.std(axis=0))

# remove the mean
data_scaled = preprocessing.scale(input_data)
print("\n AFTER : ")
print("Mean = ", data_scaled.mean(axis=0))
print("Std deviation = ", data_scaled.std(axis=0))

# rescaling
# min / max scaling
data_scaler_minmax = preprocessing.MinMaxScaler(feature_range=(0, 1))
data_scaled_minmax = data_scaler_minmax.fit_transform(input_data)
print("\nMin max scaled data : \n", data_scaled_minmax)

# normalization
# normalize the data
data_normalized_l1 = preprocessing.normalize(input_data, norm='l1')
Example No. 24
# Creating a new feature for outlier:
houses["Outlier"] = np.where(houses["Bathrooms"] < 20, 0, 1)
# adding new feature based on square meter
houses["Log_Of_Square_Feet"] = [np.log(x) for x in houses["Square_Feet"]]
print(houses)


# Discretizating Features (creating categories)

age = np.array([[6],
                [12],
                [20],
                [36],
                [65]])

binarizer = preprocessing.Binarizer(threshold=20)
print(binarizer.fit_transform(age))

# using multiple thresholds
print(np.digitize(age, bins=[20, 30, 64], right=True))  # 0 --> LESS than 20; right=True --> LESS THAN OR EQUAL TO 20


# Grouping Observations Using Clustering

# Make simulated feature matrix
features, _ = make_blobs(n_samples=50,
                         n_features=2,
                         centers=3,
                         random_state=1)
dataframe = pandas.DataFrame(features, columns=["feature_1", "feature_2"])
# Make k-means clusterer
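The snippet is cut off after this comment; a plausible completion, mirroring the clustering step shown in Example No. 10 above:

from sklearn.cluster import KMeans

clusterer = KMeans(n_clusters=3, random_state=0)
clusterer.fit(features)
dataframe["group"] = clusterer.predict(features)  # the cluster id becomes a new categorical feature
print(dataframe.head(5))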
Example No. 25
    # # pyplot.plot(voting, label='MLP')
    #
    # pyplot.legend()
    # pyplot.xlabel('Time')
    # pyplot.ylabel('USD/TRY')
    # pyplot.show()


x_train, y_train, x_test, y_test = load_data(All)
x_train, y_train, x_test, y_test = convert_to_numpy(x_train, y_train, x_test, y_test)

min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler2 = preprocessing.MinMaxScaler()
quantile_transformer = preprocessing.QuantileTransformer(random_state=0)
quantile_transformer2 = preprocessing.QuantileTransformer(random_state=0)
binarizer = preprocessing.Binarizer()
binarizer2 = preprocessing.Binarizer()
max_abs_scaler = preprocessing.MaxAbsScaler()
max_abs_scaler2 = preprocessing.MaxAbsScaler()
yeo_johnson_power_transformer = preprocessing.PowerTransformer(standardize=False)
yeo_johnson_power_transformer2 = preprocessing.PowerTransformer(standardize=False)
yeo_johnson_power_transformer_standardized = preprocessing.PowerTransformer()
yeo_johnson_power_transformer_standardized2 = preprocessing.PowerTransformer()

normalized_train_x = x_train
normalized_test_x = x_test
print('With No Alteration')
run_regressors()

normalized_train_x = preprocessing.scale(x_train)
normalized_test_x = preprocessing.scale(x_test)
print('before scaling, max is %d and min is %d' %
      (np.max(x_train), np.min(x_train)))
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
print('after scaling, max is %d and min is %d' %
      (np.max(x_train), np.min(x_train)))
x_train

# In[12]:

from sklearn import preprocessing
X_scaled = preprocessing.scale(x_train)
X_scaled
X_scaled.mean(axis=0)  # mean is zero
X_scaled.std(axis=0)  # variance is 1

# # Scaling the data to a specific range

# In[13]:

min_max_scaler = preprocessing.MinMaxScaler(
    feature_range=(-1, 1))  #setting the range
X_train_minmax = min_max_scaler.fit_transform(x_train)
X_train_minmax

# # Binarization

# In[14]:

binarizer = preprocessing.Binarizer().fit(X_train_minmax)  # fit does nothing
binarizer.transform(X_train_minmax)
Example No. 27
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn import preprocessing
from sklearn.random_projection import GaussianRandomProjection as GRP
from sklearn.mixture import GaussianMixture
import scipy

dataset1 = pd.read_csv("./DATASET/student/student-por.csv")
var_to_encode = [
    'school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason',
    'guardian', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
    'higher', 'internet', 'romantic'
]
for col in var_to_encode:
    dataset1[col] = LabelEncoder().fit_transform(dataset1[col])
y0 = list(dataset1['G3'])
# Binarize G3<=12: G3=0   G3>12: G3=1
dataset1[['G3']] = preprocessing.Binarizer(threshold=12).transform(dataset1[['G3']])
x1 = dataset1[dataset1.columns.drop('G3')]
y1 = list(dataset1['G3'])
scaler = StandardScaler()
scaler.fit(x1)
x1_n = scaler.transform(x1)
#<-----------------------DATASET1

dataset2 = pd.read_csv("./DATASET/BANK/MT_Train.csv")
dataset2.drop('default', axis=1, inplace=True)
le = LabelEncoder()
var_to_encode = [
    'job', 'marital', 'education', 'day_of_week', 'month', 'housing', 'loan',
    'poutcome'
]
for col in var_to_encode:
Example No. 28
def binarizer(arr0, threshold):
    matrix = np.array(arr0)
    temp = preprocessing.Binarizer(threshold=threshold).fit_transform(matrix)
    # result = data_utility.retrieve_nan_index(temp.tolist(), index)
    result = temp.tolist()
    return result
import numpy as np
from sklearn import preprocessing

input_data = np.array([[1, 2], [3, 4], [5, 6]])

data_binarizer = preprocessing.Binarizer(threshold=4)
binarized = data_binarizer.transform(input_data)
print(binarized)
Example No. 30
import numpy as np
import sklearn.preprocessing as sp
import scipy.misc as sm
import matplotlib.pyplot as mp

a = np.array([[10, 20, 5], [2, 4, 1], [10, 11, 15]])

bin = sp.Binarizer(threshold=10)
result = bin.transform(a)
print(result)

lily = sm.imread('../素材/da_data/lily.jpg', True)
bin = sp.Binarizer(threshold=127)
result = bin.transform(lily)
mp.imshow(result, cmap='gray')
mp.show()