def convert_row(row, scale):
    """Convert a CCEPC row into numpy.ndarrays.

    :param row: row holding the sample ID and the space-separated "A" and "B" series
    :type row: pandas.Series
    :param scale: whether to scale the converted data
    :type scale: bool
    :return: tuple of the sample ID and the converted data as numpy.ndarrays
    :rtype: tuple
    """
    a = row["A"].split(" ")
    b = row["B"].split(" ")

    # Drop the empty strings produced by leading/trailing spaces
    if a[0] == "":
        a.pop(0)
        b.pop(0)

    if a[-1] == "":
        a.pop(-1)
        b.pop(-1)

    a = array([float(i) for i in a])
    b = array([float(i) for i in b])
    if scale:
        a = scaler(a)
        b = scaler(b)
    return row['SampleID'], a, b
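A minimal usage sketch for `convert_row`, assuming it lives in a module where `array` is `numpy.array` and `scaler` is `sklearn.preprocessing.scale` (these aliases are not shown in the snippet), and that the CCEPC data sits in a DataFrame with `SampleID`, `A`, and `B` columns:

import pandas as pd
from numpy import array
from sklearn.preprocessing import scale as scaler  # assumed alias

# Hypothetical toy row; the real CCEPC files hold much longer series.
df = pd.DataFrame({
    "SampleID": ["pair1"],
    "A": [" 1.0 2.0 3.0 "],
    "B": [" 4.0 5.0 6.0 "],
})
sample_id, a, b = convert_row(df.iloc[0], scale=True)
print(sample_id, a.shape, b.shape)  # pair1 (3,) (3,)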
def outlier_transform(X):
    # Scale the numeric columns first, then drop every row that lies
    # more than three standard deviations from a column's mean.
    num_features = list(X.select_dtypes(include=['float64', 'int64']))
    X[num_features] = scaler().fit_transform(X[num_features])
    for var in num_features:
        X = X[np.abs(X[var] - X[var].mean()) <= (3 * X[var].std())]
    return X
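A quick check of `outlier_transform` on synthetic data, assuming `scaler` aliases scikit-learn's `StandardScaler` and `np`/`pd` are imported as usual:

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler as scaler  # assumed alias

rng = np.random.RandomState(0)
df = pd.DataFrame({"x": rng.normal(size=1000)})
df.loc[0, "x"] = 100.0             # plant an obvious outlier
print(len(outlier_transform(df)))  # fewer than 1000 rows remain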
    def preprocess_data(self):

        # Step 1 - One Hot Encode
        self.get_categorical_columns()
        print('Step 2 - Categorical Column Identification Complete ...')

        self.x_train = pd.get_dummies(self.x_train, columns=self.categorical_columns, prefix='one_hot_encoded_')
        self.get_training_columns(self.x_train)
        # Hotfix for XGBoost, which rejects feature names containing '<'
        for column in self.traincols:
            if "<" in column:
                self.x_train.rename(index=str, columns={column: column.replace("<", "")}, inplace=True)
        self.get_training_columns(self.x_train)
        # Keep all but the last one-hot column (the usual drop-one to avoid the dummy-variable trap)
        encoded_columns = [i for i in self.traincols if "one_hot_encoded_" in i][:-1]
        not_encoded_columns = [i for i in self.traincols if "one_hot_encoded_" not in i]
        self.x_train = self.x_train[self.union(encoded_columns, not_encoded_columns)]
        self.get_training_columns(self.x_train)
        print('Step 3 - One Hot Encoding Complete ...')

        # Step 2 - Null Value Impute (Imputer is SimpleImputer in newer scikit-learn)
        imputer = Imputer(strategy='mean', copy=False)
        self.x_train = pd.DataFrame(data=imputer.fit_transform(self.x_train), columns=self.traincols)
        print('Step 4 - Null Value Imputation Complete ...')

        # Step 3 - Feature Scaling (only the non-encoded, numeric columns)
        sc_X = scaler(copy=False)
        self.x_train[not_encoded_columns] = sc_X.fit_transform(self.x_train[not_encoded_columns])
        print('Step 5 - Standardisation Complete ...')

        # Step 4 - Train/Test Split
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.x_train, self.y_train, test_size=0.2, random_state=1)
        print('Step 6 - Train Test Splitting Complete ...')

        print('Shape: ' + str(self.x_train.shape))

        return self.df, self.x_train, self.y_train, self.x_test, self.y_test, self.traincols, self.categorical_columns
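For orientation, a self-contained sketch of the same preprocessing steps (one-hot encoding, mean imputation, standardisation, train/test split) written directly against scikit-learn; the toy columns and parameters here are illustrative only, not part of the class above:

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

df = pd.DataFrame({"colour": ["red", "blue", "red", "green"],
                   "size": [1.0, np.nan, 3.0, 4.0],
                   "target": [0, 1, 0, 1]})
y = df.pop("target")

# One-hot encode the categorical column, impute, then scale the numeric one
X = pd.get_dummies(df, columns=["colour"], prefix="one_hot_encoded_")
X[["size"]] = SimpleImputer(strategy="mean").fit_transform(X[["size"]])
X[["size"]] = StandardScaler().fit_transform(X[["size"]])

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)
print(x_train.shape, x_test.shape)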
def methodANM_IGCI(X, Y):
    answerANM = 0
    answerIGCI = 0
    data = pd.Series({"X": scaler(X), "Y": scaler(Y)})

    m = ANM()
    pred = m.predict(data)
    # ANM convention: 1 if X->Y, -1 if Y->X
    if pred > 0:
        answerANM = 1

    m = IGCI()
    pred = m.predict(data)
    # IGCI convention: >0 if X->Y, <0 if Y->X
    if pred > 0:
        answerIGCI = 1

    return answerANM, answerIGCI
Example #5
    def dataset(self, a, b, scale=False, shape=(-1, 1)):
        """Produce a PairwiseDataset of two variables out of the data.

        Args:
            a (str): Name of the first variable
            b (str): Name of the second variable
            scale (bool): Whether to scale the data to zero mean and unit variance
            shape (tuple): Desired shape of the `torch.Tensor` views of `a` and `b`

        Returns:
            PairwiseDataset: the new pairwise dataset
        """
        a = self.data[:, self.names[a]]
        b = self.data[:, self.names[b]]
        if scale:
            a = scaler(a)
            b = scaler(b)
        return PairwiseDataset(th.Tensor(a).view(*shape),
                               th.Tensor(b).view(*shape))
Example #6
def individual_images_to_pca(ind_image):
    # Downscale the image, extract Canny edges and flatten them to one row
    img = cv2.resize(ind_image, (64, 64))
    edges = cv2.Canny(img, 64, 64)
    edges = edges.reshape(1, 4096)
    # Scale the flattened edge map, then project onto the components
    # explaining 95% of the variance
    pca = PCA(.95)
    s = scaler()
    x = s.fit_transform(edges)
    lower_dimension = pca.fit_transform(x)
    approximation = pca.inverse_transform(lower_dimension)
    return lower_dimension, approximation
Example #7
def clean_df(df):
    df = df.fillna("None")
    df = df.replace(["", " ", "None"], [None, None, None])
    numeric_columns = ["Bateria", "CamaraFrontal", "CamaraPosterior", "Garantia",
                       "MemoriaInterna", "RAM", "Price", "PuntajeAntutu", "PuntajeK"]
    units = ["mAh", "Mpx", "Mpx", "Meses", "GB", "GB", "$", "", ""]
    df = clean_scale_numeric_columns(df, numeric_columns, units)
    df["ResistenciaAgua"] = df.apply(lambda row: get_watter_resistance(row["ResistenciaAgua"]), axis=1)
    df["Resolucion"] = df.apply(lambda row: get_resolution(row["Resolucion"]), axis=1)
    # MinMaxScaler-style rescaling of the resolution into the 0.1-1 range
    df["Resolucion_S"] = scaler(feature_range=(0.1, 1)).fit_transform(np.array(df["Resolucion"]).reshape(-1, 1))
    df["Score"] = df.apply(score, axis=1)
    df["CalidadPrecio"] = df.apply(price_quality, axis=1)

    return df
Example #8
def to_pca(x):
    out_dir = '../Models2/'
    pca = PCA(.95)
    s = scaler()
    x = s.fit_transform(x)
    lower_dimension = pca.fit_transform(x)
    approximation = pca.inverse_transform(lower_dimension)
    dims = pca.n_components_
    # Persist the fitted scaler statistics and the PCA model for later reuse
    joblib.dump(s.scale_, out_dir + 'scaler')
    with open(out_dir + 'pca', 'wb') as file:
        pickle.dump(pca, file)
    np.save(out_dir + 'lower_dimension', lower_dimension)
    np.save(out_dir + 'approximation', approximation)
    return lower_dimension, approximation
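A hypothetical call to `to_pca`, assuming `scaler` aliases `StandardScaler`, the `numpy`/`PCA`/`joblib`/`pickle` imports are in place, and the `../Models2/` output directory already exists:

import numpy as np

X = np.random.rand(200, 50)     # 200 samples, 50 features
low, approx = to_pca(X)
print(low.shape, approx.shape)  # (200, k) and (200, 50) for some k <= 50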
Example #9
def reshape_data(df_data, list_variables, type_variables):

    list_array = []
    dim_variables = {}

    for var in list_variables:
        if type_variables[var] == "Categorical":
            # One-hot encode categorical variables
            data = df_data[var].values
            data = get_dummies(data).to_numpy()

        elif type_variables[var] == "Numerical":
            # Scale numerical variables and make them column vectors
            data = scaler(df_data[var].values)
            data = data.reshape(data.shape[0], 1)

        dim_variables[var] = data.shape[1]
        list_array.append(data)

    return concatenate(list_array, axis=1), dim_variables
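A minimal usage sketch for `reshape_data`, assuming `get_dummies`, `concatenate`, and the `scaler` alias come from pandas, numpy, and `sklearn.preprocessing.scale` respectively (the imports are not shown in the snippet):

import pandas as pd
from pandas import get_dummies
from numpy import concatenate
from sklearn.preprocessing import scale as scaler  # assumed alias

df = pd.DataFrame({"color": ["red", "blue", "red"], "size": [1.0, 2.0, 3.0]})
matrix, dims = reshape_data(df, ["color", "size"],
                            {"color": "Categorical", "size": "Numerical"})
print(matrix.shape, dims)  # (3, 3) {'color': 2, 'size': 1}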
    def __init__(self, data, names=None, device=None, scale=True):
        """Store the data as a torch.Tensor, optionally scaling it.

        :param data: dataset (a pandas.DataFrame, or an array if `names` is given)
        :param names: mapping of variable names to column indexes
        :param device: torch device to move the data to
        :param scale: whether to scale the data to zero mean and unit variance
        """
        super(MetaDataset, self).__init__()
        if names is not None:
            self.names = names
        else:
            try:
                assert isinstance(data, DataFrame)
            except AssertionError:
                raise TypeError('If names is not specified, '
                                'data has to be a pandas.DataFrame')
            self.names = OrderedDict([(i, idx)
                                      for idx, i in enumerate(data.columns)])

        if isinstance(data, DataFrame):
            data = data.values

        if scale:
            self.data = th.Tensor(scaler(data))
        else:
            self.data = th.Tensor(data)

        if device is not None:
            self.data = self.data.to(device)
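A construction sketch for this `__init__`, assuming the class is `MetaDataset` (as the `super` call suggests), `th` is `torch`, `DataFrame`/`OrderedDict` are imported at module level, and `scaler` is `sklearn.preprocessing.scale`:

import pandas as pd

df = pd.DataFrame({"X": [0.1, 0.4, 0.9], "Y": [1.0, 2.0, 3.0]})
ds = MetaDataset(df, scale=True)   # column names become the variable names
print(ds.names)                    # OrderedDict([('X', 0), ('Y', 1)])
print(ds.data.shape)               # torch.Size([3, 2])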
Example #11
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')
import pandas as pd
from sklearn import decomposition
#from sklearn import datasets
from sklearn.preprocessing import StandardScaler as scaler
import os
os.chdir(
    "C:/Users/NgocBien/Desktop/MachineLearningProjet/MachineLearning/TPML/TPML"
)
data2 = pd.read_csv('./crime.csv', sep=';')
X2 = data2.iloc[:, 1:7].values
labels2 = data2.iloc[:, 0].values
pca = decomposition.PCA(n_components=3)
# These basic lines let us see how much information is kept
# when running the PCA.
X2_norm = scaler().fit_transform(X2)
pca.fit(X2_norm)
print(pca.singular_values_)
print(pca.explained_variance_ratio_)
# Recover the PCA coordinates on the 3 axes and project
# onto the first 2 axes (using the standardised data).
X2_pca = pca.fit_transform(X2_norm)
plt.scatter(X2_pca[:, 0], X2_pca[:, 1])
for label, x, y in zip(labels2, X2_pca[:, 0], X2_pca[:, 1]):
    plt.annotate(label,
                 xy=(x, y),
                 xytext=(-0.2, 0.2),
                 textcoords='offset points')
plt.show()
Example #12
def clean_scale_numeric_columns(df, columns, units):
    for column, unit in zip(columns, units):
        # Strip the unit from each value, then rescale into the 0.1-1 range
        df[column] = df.apply(lambda row: clean_numeric_row(row[column], unit), axis=1)
        df["{}_S".format(column)] = scaler(feature_range=(0.1, 1)).fit_transform(np.array(df[column]).reshape(-1, 1))
    return df
Example #13
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Import the dataset
df = pd.read_csv("Position_Salaries.csv")

# Select the independent variable (x) and the target (y)
x = df.iloc[:, 1:2].values
y = df.iloc[:, -1].values

# Feature scaling
from sklearn.preprocessing import StandardScaler as scaler
sc_x = scaler()
sc_y = scaler()
x = sc_x.fit_transform(x)
y = sc_y.fit_transform(y.reshape(-1, 1))

# ------< FIT REGRESSION MODELS >----------

# Fit a linear regression on the dataset

"""
from sklearn.linear_model import LinearRegression
linearR = LinearRegression()
linearR.fit(x, y)
print(linearR)
"""
Example #14
def main():
    np.random.seed(42)

    urls = [
        'http://www.ehu.eus/ccwintco/uploads/6/67/Indian_pines_corrected.mat',
        'http://www.ehu.eus/ccwintco/uploads/c/c4/Indian_pines_gt.mat',
    ]
    for url in urls:
        download_dataset(url)

    gt = load_data(DATA / 'Indian_pines_gt.mat')
    plt.imsave(IMG / 'gt.png', gt)

    ipc = load_data(DATA / 'Indian_pines_corrected.mat')

    p111 = scale2int(ipc[..., 111])
    plt.imsave(IMG / '111.png', p111)
    plt.imsave(IMG / '111_canny.png', canny(p111))

    data = get_data(DATA / 'indian_pines.csv', gt, ipc)

    X = data.copy().astype(np.float64)
    y = X.pop('target').astype(int)
    unique_y = len(y.unique())

    X2 = scaler().fit(X).transform(X)

    n_components = 4

    pca = PCA(n_components=n_components).fit(X2, y)
    X_pca = pca.fit_transform(X2)

    fig, ax = plt.subplots(1, 1)
    ax.set_xlabel('Principal Components')
    ax.set_ylabel('Variance Ratio')
    ax.set_title('Variance ratio for PCA on Indian Pines dataset')
    ax.grid()
    ax.set_xticks(range(1, n_components + 1))
    ax.bar(range(1, n_components + 1), pca.explained_variance_ratio_)
    fig.savefig(IMG / 'pca_components.png')

    colorlist = np.random.choice(list(cnames.keys()), unique_y,
                                 replace=False).tolist()

    colors = y.map(lambda x: colorlist[x])

    df = pd.DataFrame(X_pca[:, :2])
    df = pd.concat([df, y, colors], axis=1)
    df.columns = ['PC1', 'PC2', 'target', 'color']

    df_0 = df[df['target'] != 0]

    fig, ax = plt.subplots(1, 1)
    ax.set_xlabel('PC-1')
    ax.set_ylabel('PC-2')
    ax.set_title('PCA on Indian Pines dataset')
    ax.grid()
    ax.scatter(df_0['PC1'], df_0['PC2'], color=df_0['color'], s=3)
    fig.savefig(IMG / 'pc1_pc2.png')

    img = (df['PC1'] + df['PC2']).values.reshape((145, 145))
    plt.imsave(IMG / 'pc12.png', img)

    c = canny(img,
              sigma=2.,
              low_threshold=.15,
              high_threshold=.6,
              use_quantiles=True)
    plt.imsave(IMG / 'pc12_canny.png', c)

    gt2 = cv2.imread((IMG / 'gt.png').as_posix(), 0)
    plt.imsave(IMG / 'gt_canny.png', canny(gt2))
Example #15
# trX, teX, trY, teY = _read_split(
# 	"../datasets/nd-data/boundary.csv",
# 	read=1,oneHot=0)

# Integrating SMOTE with DAEGO

# Perform SMOTE at an intermediate stage of training via a stacked denoising autoencoder

from algorithms.utils import _read_dat
trX, teX, trY, teY = _read_dat("dataset/page-blocks0.dat",
                               skip=15,
                               read=1,
                               oneHot=0)

sc = scaler()
trX = sc.fit_transform(trX)
# Reuse the statistics fitted on the training set for the test set
teX = sc.transform(teX)
from mlxtend.tf_classifier import TfSoftmaxRegression
trY = trY.astype(int)

print(trX.shape[1], "Input Feature Space")

print("Enter Layers")
layer = eval(input())  # expects a Python list literal, e.g. [400, 200, 100]
print("Enter the layer number after which SMOTE is to be performed")
l_s = int(input())
l_encoder = layer[:l_s]

model_bs = StackedAutoEncoder(dims=l_encoder,
                              activations=['tanh' for i in range(len(l_encoder))],
                              noise='gaussian',
                              epoch=[10000 for i in range(len(l_encoder))],
                              loss='rmse',
def min_max_scaling(features_train, features_test):
    from sklearn.preprocessing import MinMaxScaler as scaler
    sc = scaler()
    # Fit on the training data only, then apply the same scaling to the test data
    features_train = sc.fit_transform(features_train)
    features_test = sc.transform(features_test)
    return features_train, features_test
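A hypothetical call to `min_max_scaling` with random data:

import numpy as np

X_train = np.random.rand(100, 5) * 10
X_test = np.random.rand(20, 5) * 10
X_train_s, X_test_s = min_max_scaling(X_train, X_test)
print(X_train_s.min(), X_train_s.max())  # 0.0 and 1.0 after scaling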
            bbox_inches='tight')
plt.close("all")

# Make an array of the data to be used for clustering,
# and delete pca_slices, scaled_slices, energy and amplitudes
n_pc = 3
data = np.zeros((len(pca_slices), n_pc + 2))
data[:, 2:] = pca_slices[:, :n_pc]
data[:, 0] = energy[:] / np.max(energy)
data[:, 1] = np.abs(amplitudes) / np.max(np.abs(amplitudes))
data = np.concatenate((data, pca_autocorr[:, :3]), axis=-1)
data = np.concatenate((data, conv_pca_slices), axis=-1)

# Standardize features in the data since they
# occupy very uneven scales
standard_data = scaler().fit_transform(data)

# We can whiten the data and potentially use
# diagonal covariances for the GMM to speed things up
# Not sure how much this step helps
data = pca(whiten=True).fit_transform(standard_data)

del pca_slices
del scaled_slices
del energy
del slices_autocorr, scaled_autocorr, pca_autocorr

# Set a threshold on how many datapoints are used to FIT the gmm
dat_thresh = 10e3
# Run GMM, from 2 to max_clusters
for i in range(max_clusters - 1):
Example #19
    remainder="passthrough")  # Leave the rest of the columns untouched
X = onehotencoder.fit_transform(X)
X = X[:, 1:]
#dividir dataset en conjunto de entrenamiento y testing

from sklearn.model_selection import train_test_split as splitter

x_train, x_test, y_train, y_test = splitter(X,
                                            y,
                                            test_size=0.2,
                                            random_state=0)

#Escalado de variables
from sklearn.preprocessing import StandardScaler as scaler

scala_x = scaler()
x_train = scala_x.fit_transform(x_train)
x_test = scala_x.transform(x_test)

#------<AJUSTAR MODELOS DE CLASIFICACION>----------
"""
#Ajustar regresion con el el conjunto de entrenamiento

#Crear modelo de clasificación aqui 

from sklearn.linear_model import LogisticRegression
classifier=LogisticRegression(random_state=0)
classifier.fit(x_train,y_train)
print(classifier)

Example #20
        dict_of_df = {k: pd.DataFrame(v) for k, v in self.result.items()}
        result_df = pd.concat(dict_of_df, axis=1)
        result_df.to_csv(os.path.join(self.path, 'result.csv'))
        with open(os.path.join(self.path, 'grid_search_params'), 'w') as f:
            yaml.dump(dict(self.grid_params), f, default_flow_style=False)


def range_nfo(min_n_samples, n_features, n_points):
    return np.unique(
        np.linspace(2, min(min_n_samples, n_features), n_points,
                    dtype=int)).tolist()


if __name__ == '__main__':

    simple_knn = Pipeline([('scaler', scaler()), ('knn', knn())])
    lmnn_knn = Pipeline([('scaler', scaler()), ('lmnn', lmnn()),
                         ('knn', knn())])
    nca_knn = Pipeline([('scaler', scaler()), ('nca', nca()), ('knn', knn())])
    pca_knn = Pipeline([('scaler', scaler()), ('pca', pca()), ('knn', knn())])

    cfg_file = sys.argv[1]
    with open(cfg_file, 'r') as f:
        config = yaml.safe_load(f)
    datasets = config['datasets']

    gs = GS(n_folds=3, random_state=RANDOM_SEED)

    for dataset_name in datasets:
        print("Benchmarking dataset {}...".format(dataset_name))
        dataset_func = DATASETS[dataset_name]