Example no. 1
def calculate_district_dis(dis_style = "euclidean"):
    print(dis_style)
    cal_what_dis = calculate_function[dis_style]

    poi_df = pd.read_csv(os.path.join(DATA_DIR, CONCRETE_DIR, POI_SHEET_DIR, "poi_data.csv"))
    # get all the poi data in dataframe
    districts_poi = poi_df.values[:, 1:]


    scaler = MaxAbsScaler()
    scalered_districts_poi = scaler.fit_transform(districts_poi)

    if dis_style == "canberra":
        scalered_districts_poi = districts_poi

    result = OrderedDict()
    for based_d in range(districts_poi.shape[0]):
        result[based_d + 1] = OrderedDict()
        based_district_poi = scalered_districts_poi[based_d]
        for c_d in range(districts_poi.shape[0]):
            compare_district_poi = scalered_districts_poi[c_d]

            result[based_d + 1][c_d + 1] = cal_what_dis(based_district_poi, compare_district_poi)
        result[based_d + 1] = sorted(result[based_d + 1].items(), key=lambda d:d[1])

    return result
Example no. 2
def scale(df, scaling=None):
    """Scale data included in pandas dataframe.

    Parameters
    ----------
    df : pandas dataframe
        dataframe to scale
    scaling : 'maxabs', 'minmax', 'std', or None, optional (default None)
        type of scaling to apply
    """

    if scaling is None or scaling.lower() == 'none':
        return df

    df = df.dropna(axis=1, how='any')

    # Scaling data
    if scaling == 'maxabs':
        # Normalizing -1 to 1
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        # Scaling to [0,1]
        scaler = MinMaxScaler()
    else:
        # Standard normalization
        scaler = StandardScaler()

    mat = df.values
    mat = scaler.fit_transform(mat)
    df = pd.DataFrame(mat, columns=df.columns)

    return df
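A minimal usage sketch for the scale helper above, on a made-up two-column DataFrame; it assumes the function and the scikit-learn scalers it references are already in scope as in the snippet.

import pandas as pd

toy = pd.DataFrame({'a': [1.0, -2.0, 4.0], 'b': [10.0, 0.0, 5.0]})

print(scale(toy, scaling='maxabs'))   # each column divided by its max absolute value, range [-1, 1]
print(scale(toy, scaling='minmax'))   # each column rescaled to [0, 1]
print(scale(toy))                     # scaling=None returns the DataFrame unchanged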
Example no. 3
def load_data(shuffle=True, n_cols=None):
    train_path = get_p1_file('http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B1/P1B1.train.csv')
    test_path = get_p1_file('http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B1/P1B1.test.csv')

    usecols = list(range(n_cols)) if n_cols else None

    df_train = pd.read_csv(train_path, engine='c', usecols=usecols)
    df_test = pd.read_csv(test_path, engine='c', usecols=usecols)

    df_train = df_train.drop('case_id', axis=1).astype(np.float32)
    df_test = df_test.drop('case_id', axis=1).astype(np.float32)

    if shuffle:
        df_train = df_train.sample(frac=1, random_state=seed)
        df_test = df_test.sample(frac=1, random_state=seed)

    X_train = df_train.values
    X_test = df_test.values

    scaler = MaxAbsScaler()
    mat = np.concatenate((X_train, X_test), axis=0)
    mat = scaler.fit_transform(mat)

    X_train = mat[:X_train.shape[0], :]
    X_test = mat[X_train.shape[0]:, :]

    return X_train, X_test
Example no. 4
def impute_and_scale(df, scaling='std'):
    """Impute missing values with mean and scale data included in pandas dataframe.

    Parameters
    ----------
    df : pandas dataframe
        dataframe to impute and scale
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    """

    df = df.dropna(axis=1, how='all')

    imputer = Imputer(strategy='mean', axis=0)
    mat = imputer.fit_transform(df)

    if scaling is None or scaling.lower() == 'none':
        return pd.DataFrame(mat, columns=df.columns)

    if scaling == 'maxabs':
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = StandardScaler()

    mat = scaler.fit_transform(mat)

    df = pd.DataFrame(mat, columns=df.columns)

    return df
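The snippet above uses the old sklearn.preprocessing.Imputer; a rough modern equivalent of the same impute-then-scale idea (my own sketch, not part of the original code) would rely on sklearn.impute.SimpleImputer:

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MaxAbsScaler

def impute_and_scale_maxabs(df):
    # drop all-NaN columns, fill remaining NaNs with the column mean, then scale to [-1, 1]
    df = df.dropna(axis=1, how='all')
    mat = SimpleImputer(strategy='mean').fit_transform(df)
    mat = MaxAbsScaler().fit_transform(mat)
    return pd.DataFrame(mat, columns=df.columns)

toy = pd.DataFrame({'a': [1.0, np.nan, -4.0], 'b': [2.0, 0.5, np.nan]})
print(impute_and_scale_maxabs(toy))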
Example no. 5
def normalize_raw_features(X: np.array) -> np.array:
    """Normalize features if column was not OneHot encoded"""
    for col in range(X.shape[1]):
        dense_col = X[:, col].todense()
        if (dense_col > 1.).any() or (dense_col < 0.).any():
            scaler = MaxAbsScaler().fit(dense_col)
            X[:, col] = csr_matrix(scaler.transform(dense_col))
    return X
Example no. 6
    def cluster_user(self):
        user_feature_matrix = self.__extract_user_feature()
        user_feature_matrix = user_feature_matrix.tocsr()
        user_feature_matrix = MaxAbsScaler().fit_transform(user_feature_matrix)
        #model = DBSCAN(eps=0.5, min_samples=100).fit(user_feature_matrix)
        model = MiniBatchKMeans(n_clusters=50, max_iter=10000).fit(user_feature_matrix.toarray())
        labels = model.labels_
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        print('Estimated number of clusters: %d' % n_clusters_)
        user_label_dict = dict()
        for user in self.__user_ix_dict:
            user_label_dict[user] = labels[self.__user_ix_dict[user]]
        return user_label_dict
Example no. 7
def test_maxabsscaler_vs_sklearn():
    # Compare msmbuilder.preprocessing.MaxAbsScaler
    # with sklearn.preprocessing.MaxAbsScaler

    maxabsscalerr = MaxAbsScalerR()
    maxabsscalerr.fit(np.concatenate(trajs))

    maxabsscaler = MaxAbsScaler()
    maxabsscaler.fit(trajs)

    y_ref1 = maxabsscalerr.transform(trajs[0])
    y1 = maxabsscaler.transform(trajs)[0]

    np.testing.assert_array_almost_equal(y_ref1, y1)
Example no. 8
def plotPCA(X_train, y_train, X_test, y_test, outdir):
    #clf = loadClf(term, fold, clfName)
    #try:
    #    decision = clf.decision_function
    #    Vf = numpy.arange(-1.,1.1,0.1)
    #    V = (0.,)
    #except AttributeError:
    #    decision =  lambda x:clf.predict_proba(x)[:,0]
    #    Vf = numpy.arange(0.,1.05,0.05)
    #    V = (0.5,)
    scaler = MaxAbsScaler(copy=False)
    target_names = ("Positive","Negative")
    term = outdir.parent.name.replace("_", " ")
    pca = PCA(n_components=2)
    pca.fit(X_train)
    scaler.fit(pca.transform(X_train))
    #delta = 0.025
    #a=numpy.arange(-1., 1., delta)
    #b=numpy.arange(-1., 1., delta)
    #A,B = numpy.meshgrid(a,b)
    #C=numpy.empty(A.shape)
    for X, y, n in ((X_train, y_train, 'training'), (X_test, y_test, 'testing')):
        X_r = scaler.transform(pca.transform(X))
        inlier = (numpy.abs(X_r[:,0]) <= 1) & (numpy.abs(X_r[:,1]) <= 1)
        #print(X_r)
        plt.clf()

        #for k,l in product(range(len(a)),range(len(b))):
        #    C[k][l] = decision(pca.inverse_transform(scaler.inverse_transform(((A[k][l],B[k][l]),))))
        #print(C)
        #cfp = plt.contourf(A,B,C,Vf,cmap=plt.cm.bone)
        #cfp.cmap.set_under('black')
        #cfp.cmap.set_over('white')
        #plt.contour(A,B,C,V,colors=("b",))
        #y=clf.predict(X)
        for c, i, target_name in zip("rg", (0, 1), target_names):
            plt.scatter(X_r[(y == i) & inlier, 0], X_r[(y == i) & inlier, 1],
                    c = c,
                    label = target_name,
                    marker = ",",
                    s = 1,#0.8,#1/numpy.sqrt(2),
                    #edgecolors='none',
                    linewidth = 0,
                    alpha = 0.7)
        plt.legend()
        plt.title('PCA for %s on %s data' % (term, n))
        plt.savefig(str(outdir/('pca-%s.png' % (n,))))
        plt.savefig(str(outdir/('pca-%s.ps' % (n,))))
Example no. 9
def _train_test_split():
    # Build the store_weather dataframe
    store_weather_filename = Config.save_dir + "store_weather.pkl"
    if os.path.exists(store_weather_filename):
        store_weather = utils.from_pickle(store_weather_filename)
    else:
        store_weather = _preprocess_data()

    # Split train test for each store
    train = pd.DataFrame({})
    test = pd.DataFrame({})
    store_ids = store_weather.store_id_bk.unique()
    for sid in store_ids:
        c_store = store_weather[store_weather.store_id_bk == sid]
        s_train = c_store[:-Config.test_size]
        s_test = c_store[-Config.test_size:]
        train = pd.concat([train, s_train]).reset_index(drop=True)
        test = pd.concat([test, s_test]).reset_index(drop=True)

    # Scale numeric columns
    num_cols = ["p_total_revenue", "p_total_volume", "mean_temp",
                "total_precipitation", "total_snow"]
    scaler = MaxAbsScaler().fit(train.loc[:, num_cols])
    train.loc[:, num_cols] = scaler.transform(train.loc[:, num_cols])
    test.loc[:, num_cols] = scaler.transform(test.loc[:, num_cols])

    # Scale 2 output columns
    revenue_scale = MaxAbsScaler().fit(train.loc[:, ["total_revenue"]])
    volume_scale = MaxAbsScaler().fit(train.loc[:, ["total_volume"]])
    train.loc[:, ["total_revenue"]] = revenue_scale.transform(
        train.loc[:, ["total_revenue"]])
    test.loc[:, ["total_revenue"]] = revenue_scale.transform(
        test.loc[:, ["total_revenue"]])
    train.loc[:, ["total_volume"]] = volume_scale.transform(
        train.loc[:, ["total_volume"]])
    test.loc[:, ["total_volume"]] = volume_scale.transform(
        test.loc[:, ["total_volume"]])

    # Save the train/test dataframes to pickle objects
    utils.to_pickle(Config.save_dir + "train_set.pkl", train)
    utils.to_pickle(Config.save_dir + "test_set.pkl", test)

    # Save the 2 scaler for later use
    utils.to_pickle(Config.save_dir + "revenue_scale", revenue_scale)
    utils.to_pickle(Config.save_dir + "volume_scale", volume_scale)

    # Save store_ids
    utils.to_pickle(Config.save_dir + "store_id.pkl", store_ids)

    return train, test
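The pattern above (fit each scaler on the training rows only, transform both splits, and keep the fitted scaler so predictions can later be mapped back with inverse_transform) shown in isolation on made-up revenue values:

import numpy as np
from sklearn.preprocessing import MaxAbsScaler

train_rev = np.array([[100.0], [250.0], [400.0]])
test_rev = np.array([[300.0], [500.0]])            # test values may exceed the training maximum

revenue_scale = MaxAbsScaler().fit(train_rev)      # fit on training data only
print(revenue_scale.transform(test_rev))           # [[0.75], [1.25]] -- values above 1 are possible
print(revenue_scale.inverse_transform(
    revenue_scale.transform(test_rev)))            # back to the original scale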
Example no. 10
def scale_data(x_train, x_test):

    """
        We only scale the continuous features. No need to scale binary features
    """

    
    idx_binary = [] # columns with boolean values
    for k in range(x_train.shape[1]):
        idx_binary.append( np.array_equal(x_train[:,k], x_train[:,k].astype(bool)) ) # checking if a column is binary
    idx_cont = np.logical_not(idx_binary)


    sc = MaxAbsScaler()
    sc.fit(x_train[:, idx_cont])
    
    x_train[:, idx_cont] = sc.transform(x_train[:, idx_cont])
    x_test[:, idx_cont] = sc.transform(x_test[:, idx_cont])

    return
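A small illustration of the binary-column detection in scale_data above, on made-up arrays; note that the function modifies x_train and x_test in place and returns nothing, so it assumes the caller keeps references to the original arrays.

import numpy as np

# column 0 is binary and is left untouched; column 1 is continuous and gets scaled by its max absolute value
x_tr = np.array([[0.0, 10.0], [1.0, -40.0], [1.0, 20.0]])
x_te = np.array([[1.0, 8.0], [0.0, -20.0]])

scale_data(x_tr, x_te)
print(x_tr)   # column 1 now lies in [-1, 1]
print(x_te)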
Example no. 11
def impute_and_scale(df, scaling=None):
    """Impute missing values with mean and scale data included in pandas dataframe.
        
    Parameters
    ----------
    df : pandas dataframe
        dataframe to impute and scale
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default None)
        type of scaling to apply
    """

    df = df.dropna(axis=1, how='all')

    imputer = Imputer(strategy='mean', axis=0)
    mat = imputer.fit_transform(df)
    # print(mat.shape)
    
    if scaling is None:
        return pd.DataFrame(mat, columns=df.columns)

    # Scaling data
    if scaling == 'maxabs':
        # Normalizing -1 to 1
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        # Scaling to [0,1]
        scaler = MinMaxScaler()
    else:
        # Standard normalization
        scaler = StandardScaler()

    mat = scaler.fit_transform(mat)

    # print(mat.shape)
    df = pd.DataFrame(mat, columns=df.columns)
    
    return df
Example no. 12
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MaxAbsScaler
from sklearn.neighbors import KNeighborsClassifier

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR')
training_indices, testing_indices = train_test_split(tpot_data.index, stratify = tpot_data['class'].values, train_size=0.75, test_size=0.25)


result1 = tpot_data.copy()

# Use Scikit-learn's MaxAbsScaler to scale the features
training_features = result1.loc[training_indices].drop('class', axis=1)

if len(training_features.columns.values) > 0:
    scaler = MaxAbsScaler()
    scaler.fit(training_features.values.astype(np.float64))
    scaled_features = scaler.transform(result1.drop('class', axis=1).values.astype(np.float64))
    result1 = pd.DataFrame(data=scaled_features)
    result1['class'] = tpot_data['class'].values
else:
    result1 = result1.copy()

# Perform classification with a k-nearest neighbor classifier
knnc2 = KNeighborsClassifier(n_neighbors=min(10, len(training_indices)))
knnc2.fit(result1.loc[training_indices].drop('class', axis=1).values, result1.loc[training_indices, 'class'].values)
result2 = result1.copy()
result2['knnc2-classification'] = knnc2.predict(result2.drop('class', axis=1).values)
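For comparison, roughly the same flow as the TPOT export above can be written as a single scikit-learn Pipeline; this compact version is my own sketch (the placeholder path and the 'class' column name are carried over from the snippet), not TPOT output.

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler
from sklearn.neighbors import KNeighborsClassifier

tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR')
X = tpot_data.drop('class', axis=1)
y = tpot_data['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, train_size=0.75)

# the scaler is fit on the training fold only, then reused on the test fold inside the pipeline
model = make_pipeline(MaxAbsScaler(), KNeighborsClassifier(n_neighbors=10))
model.fit(X_train, y_train)
print(model.score(X_test, y_test))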
Example no. 13
 # Load dataset:
path = '/media/DATA/tmp/datasets/regionais/meteo_regions/csv_regions/TAG/yearly/'
file = 'yearly_clip_R1_OK_TAG.csv'
df = pd.read_csv(os.path.join(path, file), sep=',', decimal='.')

# Split into input (X) and output (Y) variables:
df2 = df[['36V', '89V', '166V', '190V']]
#x = df2.reindex(columns=cols)
x = df2[['36V', '89V', '166V', '190V']]
y = df[['TagRain']]

# Scaling the input parameters:

scaler_min_max = MinMaxScaler()
x_minmax = scaler_min_max.fit_transform(x)

scaler_abs_max = MaxAbsScaler()
x_abs_max = scaler_abs_max.fit_transform(x)

stand_sc = StandardScaler()
x_stand_sc = stand_sc.fit_transform(x)

norm_sc = Normalizer()
x_norm = norm_sc.fit_transform(x)

x_power_box = PowerTransformer(method='box-cox').fit_transform(x)
x_power_yeo = PowerTransformer(method='yeo-johnson').fit_transform(x)

x_quantil = QuantileTransformer(output_distribution = 'uniform').fit_transform(x)
Example no. 14
# ;    'P17','P18','P19','P20','P21','P22','P23','P24',
# ;    'P25','P26','P27','P28','P29','P30','P31','P32',
# ;    'P33','P34','P35','P36','P37','P38','P39','P40',
# ;    'P41','P42','P43','P44','P45','P46','P47','P48',
# ;    'P49','P50','P51','P52','P53','P54','P55','P56',
# ;    'P57','P58'], header=None)
    print(df_train.head())

    df_train = pd.get_dummies(df_train)
    df_train = df_train.fillna(df_train.mean())

    y = df_train['P29'].values
    X_train = df_train[['P0','P1','P2','P3','P4','P5','P6','P7','P8','P9','P10','P11','P12','P13',\
                        'P14','P15','P16','P17','P18','P19','P20','P21','P22','P23','P24','P25','P26','P27','P28']]


    # y = df_train['P58'].values
    # X_train = df_train[['P0','P1','P2','P3','P4','P5','P6','P7','P8','P9','P10','P11','P12','P13',\
    #                 'P14','P15','P16','P17','P18','P19','P20','P21','P22','P23','P24','P25','P26',
    #                 'P27','P28','P29','P30','P31','P32','P33','P34','P35','P36','P37','P38','P39',
    #                 'P40','P41','P42','P43','P44','P45','P46','P47','P48','P49','P50','P51','P52',
    #                 'P53','P54','P55','P56','P57']]


    X_train = MaxAbsScaler().fit_transform(X_train)
    seed = 30
    np.random.seed(seed)
    X_train, X_test, y_train, y_test = train_test_split(X_train, y, test_size=0.064, random_state=seed)

    model = load_network(filename)
    evaluated(model,X_test, y_test)
Example no. 15
def masked_randomForest(X_train, y_train, preprocess='Std'):

    # PREPROCESSING = FEATURES SCALING
    if preprocess == 'MaxMin':
        preprocessing = MaxAbsScaler()
        preprocessing.fit(X_train)
        X_train = preprocessing.transform(X_train)
        print('preprocess %s completed' % preprocess)

    if preprocess == 'Binarization':
        preprocessing = Binarizer()
        preprocessing.fit(X_train)
        X_train = preprocessing.transform(X_train)
        print('preprocess %s completed' % preprocess)

    if preprocess == 'Std':
        preprocessing = StandardScaler(with_mean=False)
        preprocessing.fit(X_train)
        X_train = preprocessing.transform(X_train)
        print('preprocess %s completed' % preprocess)

    if preprocess == 'full_std':
        preprocessing = StandardScaler()
        X_train = preprocessing.fit_transform(X_train.toarray())
        print('preprocess %s completed' % preprocess)

    if preprocess == 'norm':
        X_train = normalize(X_train.toarray(), axis=0, norm='l1')
        print('preprocess %s completed' % preprocess)

    clf = RandomForestClassifier(n_jobs=-1, n_estimators=50)
    clf.fit(X_train, y_train)
    importances = clf.feature_importances_

    inter = [np.percentile(importances, qq) for qq in np.linspace(0, 100, 6)]
    count = 0
    RF_features = np.zeros((1, len(importances)))
    for low, high in zip(inter[:len(inter)], inter[1:]):
        if low == -1:
            RF_features += count * np.logical_and(importances <= high,
                                                  importances >= low)
        else:
            RF_features += count * np.logical_and(importances <= high,
                                                  importances > low)
        count += 1
        RF_features = RF_features.astype(int)

    importances.sort()
    fig, ax1 = plt.subplots(1)
    x = np.arange(len(importances))
    ax1.plot(x, importances[::-1], 'b-')
    ax1.set_xlabel('features')
    # Make the y-axis label, ticks and tick labels match the line color.
    ax1.set_ylabel('Random Forest importance', color='b')
    ax1.tick_params('y', colors='b')
    ax1.set_yscale('log')
    fig.tight_layout()
    plt.title(' RandomForest importance ')
    plt.show()

    return RF_features
Example no. 16
class Simulation(object):
    """Class glueing all the pieces together. Performs whole simulation.

    :param dataset: Dataset which extends :py:obj:`mutabledataset.SimMixin`
    :param AgentCl: Class defining agent behavior, namely `benefit` and `cost`.
    :param learner: Class defining learner behavior, namely `fit` and `predict`.
    :param split: Defines portion used for fitting the learner. Rest is used for determining `eps` value, regarding the epsilon equilibrium. Simulation is done on the whole dataset.
    :param cost_distribution: Passed on to AgentTransformer.
    :param cost_distribution_dep: Passed on to AgentTransformer.
    :param no_neighbors: Passed on to AgentTransformer.
    :param max_it: Passed on to AgentTransformer.
    :param collect_incentive_data: Passed on to AgentTransformer.
    """
    def __init__(self,
                 dataset,
                 AgentCl,
                 learner,
                 cost_distribution,
                 split=[0.5],
                 collect_incentive_data=False,
                 no_neighbors=60,
                 cost_distribution_dep=None,
                 max_it=130):
        self.dataset = dataset
        self.no_neighbors = no_neighbors
        self.cost_distribution = cost_distribution
        self.max_it = max_it
        self.learner = learner
        self.split = split
        self.AgentCl = AgentCl
        self.collect_incentive_data = collect_incentive_data
        self.cost_distribution_dep = cost_distribution_dep

    def no_classes(self, dataset):
        """
        :param dataset: Some AIF360 dataset
        :returns: Number of distinct labels (classes)
        """
        return len(set(dataset.labels.ravel()))

    def start_simulation(self, runs=1, scale=True):
        """
        :param runs: Run simulation multiple times with the same parameters
        :param scale: Perform scaling on dataset features.
        :returns: Modified dataset including new ground truth labels
        :rtype: :py:obj:`simulation.SimulationResultSet`
        """
        res_list = []
        for i in range(runs):
            res_list.append(self._simulate(scale))
        return SimulationResultSet(res_list, runs=runs)

    def _simulate(self, scale):
        """
        Private entrypoint to perform a single simulation

        :param scale: Perform scaling on dataset features
        :returns: Modified dataset including new ground truth labels
        :rtype: :py:obj:`simulation.SimulationResult`
        """
        self.scaler = MaxAbsScaler()
        dataset = self.dataset.copy(deepcopy=True)
        # we need at least one example for each class in each of the two splits
        while True:
            train, test = dataset.split(self.split, shuffle=False)
            if self.no_classes(train) >= 2 and self.no_classes(test) >= 2:
                break
        train_indices = list(map(int, train.instance_names))
        test_indices = list(map(int, test.instance_names))

        self.train, self.test = train, test
        if scale:
            train.features = self.scaler.fit_transform(train.features)
            test.features = self.scaler.transform(test.features)
            dataset.features = self.scaler.transform(dataset.features)

        dataset.infer_domain()

        # learner moves
        self.learner.fit(train)

        ft_names = dataset.protected_attribute_names
        ft_indices = list(
            map(lambda x: not x in ft_names, dataset.feature_names))

        self.Y_predicted = self.learner.predict(dataset.features)
        self.Y_predicted_pr = self.learner.predict_proba(dataset.features)

        # agents move
        at = AgentTransformer(
            self.AgentCl,
            self.learner,
            self.cost_distribution,
            collect_incentive_data=self.collect_incentive_data,
            no_neighbors=self.no_neighbors,
            cost_distribution_dep=self.cost_distribution_dep,
            max_it=self.max_it)

        dataset_ = at.transform(dataset)

        train_ = utils.dataset_from_matrix(
            np.hstack((dataset_.features[train_indices, :],
                       dataset_.labels[train_indices])), dataset)
        test_ = utils.dataset_from_matrix(
            np.hstack((dataset_.features[test_indices, :],
                       dataset_.labels[test_indices])), dataset)

        acc_h = self.learner.accuracy(test)

        # update changed features

        #dataset_ = dataset_from_matrix(np.hstack((np.vstack((train_.features, test_.features)), np.vstack((train_.labels, test_.labels)))), dataset)
        self.Y_new_predicted = self.learner.predict(dataset_.features)
        self.Y_new_predicted_pr = self.learner.predict_proba(dataset_.features)

        acc_h_post = self.learner.accuracy(test_)

        # fit data again, see if accuracy changes
        self.learner.fit(train_)
        acc_h_star_post = self.learner.accuracy(test_)

        # construct datasets for features
        # including predicted label
        if scale:
            dataset.features = self.scaler.inverse_transform(dataset.features)
        dataset_df = dataset.convert_to_dataframe(de_dummy_code=True)[0]
        dataset_df['credit_h'] = pd.Series(self.Y_predicted,
                                           index=dataset_df.index)
        dataset_df['credit_h_pr'] = pd.Series(self.Y_predicted_pr,
                                              index=dataset_df.index)
        if scale:
            dataset_.features = self.scaler.inverse_transform(
                dataset_.features)
        dataset_new_df = dataset_.convert_to_dataframe(de_dummy_code=True)[0]
        dataset_new_df['credit_h'] = pd.Series(self.Y_new_predicted,
                                               index=dataset_new_df.index)
        dataset_new_df['credit_h_pr'] = pd.Series(self.Y_new_predicted_pr,
                                                  index=dataset_new_df.index)

        res = SimulationResult()
        res.df = dataset_df
        res.df_new = dataset_new_df
        res.eps = abs(acc_h_star_post - acc_h_post)
        res.acc_h = acc_h
        res.acc_h_post = acc_h_post
        res.acc_h_star_post = acc_h_star_post
        res.incentives = at.incentives
        return res
Example no. 17
import numpy as np
import scipy.sparse as sp
from keras import backend as K
from sklearn.metrics import roc_auc_score as auc_score
from sklearn.metrics import average_precision_score as ap_score
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler
from utils import generate_data, batch_data, compute_masked_accuracy
from utils_gcn import load_citation_data, split_citation_data
from ae_LPNC import autoencoder_multitask

# Using the 'citeseer' dataset as an example to walk through the multi-task setup
dataset = 'citeseer'
print('\nLoading dataset {:s}...\n'.format(dataset))
adj, feats, y_train, y_val, y_test, mask_train, mask_val, mask_test = load_citation_data(
    dataset)
feats = MaxAbsScaler().fit_transform(feats).tolil()
train = adj.copy()

test_inds = split_citation_data(adj)
test_inds = np.vstack({tuple(row) for row in test_inds})
test_r = test_inds[:, 0]
test_c = test_inds[:, 1]
labels = []
labels.extend(np.squeeze(adj[test_r, test_c].toarray()))
labels.extend(np.squeeze(adj[test_c, test_r].toarray()))

multitask = True
if multitask:
    # If multitask, simultaneously perform link prediction and
    # semi-supervised node classification on incomplete graph with
    # 10% held-out positive links and same number of negative links.
Example no. 18
def getviz_kpca(X, y, figPath, fig_prefix='KPCA_viz'):

    # this is a non optimized visualization ! Just for thoughts
    preprocessing = MaxAbsScaler()
    X_train = preprocessing.fit_transform(X)
    print('preprocessing MaxAbs done')

    os.chdir(figPath)

    reds = y == 0
    blues = y == 1
    kernels = ['cosine', 'rbf', 'regular']
    gammas = [1e-4, 1e-3, 1e-2]

    for k in kernels:
        if k == 'rbf':
            for g in gammas:
                plt.figure()
                kpca = KernelPCA(kernel=k, gamma=g, n_components=2, n_jobs=-1)
                X_kpca = kpca.fit_transform(X_train)
                plt.plot(X_kpca[reds, 0], X_kpca[reds, 1], "ro", label='csp-')
                plt.plot(X_kpca[blues, 0],
                         X_kpca[blues, 1],
                         "bo",
                         label='csp+')
                plt.title("Projection by PCA with %s kernel, gamma = %f" %
                          (k, g))
                plt.xlabel("1st principal component")
                plt.ylabel("2nd component")
                plt.legend(loc="lower right", prop={'size': 6})
                plt.savefig('img/' + fig_prefix + k + 'gamma_' + str(g))
            print('rbf PCA done')

        elif k == 'regular':
            plt.figure()
            kpca = PCA()
            X_kpca = kpca.fit_transform(X_train)
            plt.plot(X_kpca[reds, 0], X_kpca[reds, 1], "ro", label='csp-')
            plt.plot(X_kpca[blues, 0], X_kpca[blues, 1], "bo", label='csp+')
            plt.title("Projection by PCA")
            plt.xlabel("1st principal component")
            plt.ylabel("2nd component")
            plt.legend(loc="lower right", prop={'size': 6})
            plt.savefig('img/' + fig_prefix + k)

            plt.figure()
            plt.plot(kpca.explained_variance_, linewidth=2)
            plt.xlabel('n_components')
            plt.ylabel('explained_variance_')
            plt.title("Projection by PCA")
            plt.savefig('img/' + fig_prefix + k + 'explained_variance')

            print('PCA done')

        elif k == 'cosine':
            plt.figure()
            kpca = KernelPCA(kernel=k, n_components=2, n_jobs=-1)
            X_kpca = kpca.fit_transform(X_train)
            plt.plot(X_kpca[reds, 0], X_kpca[reds, 1], "ro", label='csp-')
            plt.plot(X_kpca[blues, 0], X_kpca[blues, 1], "bo", label='csp+')
            plt.title("Projection by PCA with %s kernel" % (k))
            plt.xlabel("1st principal component")
            plt.ylabel("2nd component")
            plt.legend(loc="lower right", prop={'size': 6})
            plt.savefig('img/' + fig_prefix + k)

            print('cosine PCA done')
Example no. 19
def find_best_solution(train_filename):
    unscaled_dist = extract_dist(train_filename)
    # unscaled_dist.info()

    unscaled_dist = clean_distribution(unscaled_dist)
    # unscaled_dist.info()
    # standardised_csv.to_csv('scaledData.csv', index=False)

    preprocessors = {
        'StandardScaler':
        StandardScaler(),
        # The below remove anomalies
        'RobustScaler':
        RobustScaler(),
        'PowerTransformer(method="yeo-johnson")':
        PowerTransformer(method='yeo-johnson'),
        'QuantileTransformer(output_distribution="normal")':
        QuantileTransformer(output_distribution="normal"),
        'QuantileTransformer(output_distribution="uniform")':
        QuantileTransformer(output_distribution="uniform"),
        'MinMaxScaler':
        MinMaxScaler(),
        'MaxAbsScaler':
        MaxAbsScaler(),
        'Normalizer':
        Normalizer(),
    }
    distributions = {
        'unscaled':
        unscaled_dist,
        'StandardScaler':
        scale_distribution(unscaled_dist, StandardScaler()),
        'RobustScaler':
        scale_distribution(unscaled_dist, RobustScaler()),
        'PowerTransformer(method="yeo-johnson")':
        scale_distribution(unscaled_dist,
                           PowerTransformer(method='yeo-johnson')),
        'QuantileTransformer(output_distribution="normal")':
        scale_distribution(unscaled_dist,
                           QuantileTransformer(output_distribution="normal")),
        'QuantileTransformer(output_distribution="uniform")':
        scale_distribution(unscaled_dist,
                           QuantileTransformer(output_distribution="uniform")),
        'MinMaxScaler':
        scale_distribution(unscaled_dist, MinMaxScaler()),
        'MaxAbsScaler':
        scale_distribution(unscaled_dist, MaxAbsScaler()),
        'Normalizer':
        scale_distribution(unscaled_dist, Normalizer()),
    }

    classifiers = {
        # Standard classifiers
        'KNeighborsClassifier': {
            'configurable':
            lambda configuration: KNeighborsClassifier(n_neighbors=
                                                       configuration),
            'score_function':
            get_classifier_score_knn,
        },
        'LinearSVC': {
            'configurable':
            lambda configuration: LinearSVC(max_iter=configuration),
            'score_function': get_classifier_score_linear_svc,
        },
        'LogisticRegression': {
            'configurable':
            lambda configuration: LogisticRegression(solver=configuration),
            'score_function':
            get_classifier_score_logistic_regression,
        },
        # DL
        # 'Sequential': {
        #     # 'configurable': # TODO get this working
        #     'score_function': get_model_score_sequential,
        # },
    }

    test_sizes = [0.2, 0.25, 0.3]
    # test_sizes = [0.2]

    results_of_all_configurations = []

    log(blue(f'\n############# Trying Configurations #############'))
    for dist_name, dist in distributions.items():
        log(bold(f'\n{dist_name}'))
        X, y = get_x_matrix_and_y_vector(
            dist
        )  # set up X matrix and y vectors for the test and training sets
        for test_size in test_sizes:
            log(f'    test_size = {test_size}')
            train_test_data = (X, y, test_size)
            for classifier_name, classifier in classifiers.items():
                log(f'        {classifier_name}')
                configuration_results, classifier_average_score = try_configuration(
                    train_test_data, classifier)
                for result in configuration_results:
                    result['preprocessor_name'] = dist_name
                    result['test_size'] = test_size
                    result['classifier_name'] = classifier_name
                results_of_all_configurations.extend(configuration_results)

    results_of_all_configurations.sort(
        key=lambda result: result['configuration_score'],
        reverse=True,
    )
    logTopConfigurations(results_of_all_configurations, 10)

    return extract_best_solution(preprocessors, classifiers,
                                 results_of_all_configurations)
Example no. 20
class ParallelCoordinates(DataVisualizer):
    """
    Parallel coordinates displays each feature as a vertical axis spaced
    evenly along the horizontal, and each instance as a line drawn between
    each individual axis.

    Parameters
    ----------

    ax : matplotlib Axes, default: None
        The axis to plot the figure on. If None is passed in the current axes
        will be used (or generated if required).

    features : list, default: None
        a list of feature names to use
        If a DataFrame is passed to fit and features is None, feature
        names are selected as the columns of the DataFrame.

    classes : list, default: None
        a list of class names for the legend
        If classes is None and a y value is passed to fit then the classes
        are selected from the target vector.

    normalize : string or None, default: None
        specifies which normalization method to use, if any
        Current supported options are 'minmax', 'maxabs', 'standard', 'l1',
        and 'l2'.

    sample : float or int, default: 1.0
        specifies how many examples to display from the data
        If int, specifies the maximum number of samples to display.
        If float, specifies a fraction between 0 and 1 to display.

    color : list or tuple, default: None
        optional list or tuple of colors to colorize lines
        Use either color to colorize the lines on a per class basis or
        colormap to color them on a continuous scale.

    colormap : string or cmap, default: None
        optional string or matplotlib cmap to colorize lines
        Use either color to colorize the lines on a per class basis or
        colormap to color them on a continuous scale.

    vlines : boolean, default: True
        flag to determine vertical line display

    vlines_kwds : dict, default: None
        options to style or display the vertical lines, default: None

    kwargs : dict
        Keyword arguments that are passed to the base class and may influence
        the visualization as defined in other Visualizers.

    Examples
    --------

    >>> visualizer = ParallelCoordinates()
    >>> visualizer.fit(X, y)
    >>> visualizer.transform(X)
    >>> visualizer.poof()

    Notes
    -----

    These parameters can be influenced later on in the visualization
    process, but can and should be set as early as possible.
    """

    normalizers = {
        'minmax': MinMaxScaler(),
        'maxabs': MaxAbsScaler(),
        'standard': StandardScaler(),
        'l1': Normalizer('l1'),
        'l2': Normalizer('l2'),
    }

    def __init__(self, ax=None, features=None, classes=None, normalize=None,
                 sample=1.0, color=None, colormap=None, vlines=True,
                 vlines_kwds=None, **kwargs):
        super(ParallelCoordinates, self).__init__(
            ax, features, classes, color, colormap, **kwargs
        )

        # Validate 'normalize' argument
        if normalize in self.normalizers or normalize is None:
            self.normalize = normalize
        else:
            raise YellowbrickValueError(
                "'{}' is an unrecognized normalization method"
                .format(normalize)
            )

        # Validate 'sample' argument
        if isinstance(sample, int):
            if sample < 1:
                raise YellowbrickValueError(
                    "`sample` parameter of type `int` must be greater than 1"
                )
        elif isinstance(sample, float):
            if sample <= 0 or sample > 1:
                raise YellowbrickValueError(
                    "`sample` parameter of type `float` must be between 0 and 1"
                )
        else:
            raise YellowbrickTypeError(
                "`sample` parameter must be int or float"
            )
        self.sample = sample

        # Visual Parameters
        self.show_vlines = vlines
        self.vlines_kwds = vlines_kwds or {
            'linewidth': 1, 'color': 'black'
        }

    def draw(self, X, y, **kwargs):
        """
        Called from the fit method, this method creates the parallel
        coordinates canvas and draws each instance and vertical lines on it.
        """
        # Convert from dataframe
        if is_dataframe(X):
            X = X.values

        # Choose a subset of samples
        # TODO: allow selection of a random subset of samples instead of head

        if isinstance(self.sample, int):
            self.n_samples = min([self.sample, len(X)])
        elif isinstance(self.sample, float):
            self.n_samples = int(len(X) * self.sample)
        X = X[:self.n_samples, :]

        # Normalize
        if self.normalize is not None:
            X = self.normalizers[self.normalize].fit_transform(X)

        # Get the shape of the data
        nrows, ncols = X.shape

        # Create the xticks for each column
        # TODO: Allow the user to specify this feature
        x = list(range(ncols))

        # Create the colors
        # TODO: Allow both colormap, listed colors, and palette definition
        # TODO: Make this an independent function or property for override!
        color_values = resolve_colors(
            n_colors=len(self.classes_), colormap=self.colormap, colors=self.color
        )
        colors = dict(zip(self.classes_, color_values))

        # Track which labels are already in the legend
        used_legends = set([])

        # TODO: Make this function compatible with DataFrames!
        # TODO: Make an independent function to allow addition of instances!
        for idx, row in enumerate(X):
            # TODO: How to map classmap to labels?
            label = y[idx] # Get the label for the row
            label = self.classes_[label]

            if label not in used_legends:
                used_legends.add(label)
                self.ax.plot(x, row, color=colors[label], alpha=0.25, label=label, **kwargs)
            else:
                self.ax.plot(x, row, color=colors[label], alpha=0.25, **kwargs)

        # Add the vertical lines
        # TODO: Make an independent function for override!
        if self.show_vlines:
            for idx in x:
                self.ax.axvline(idx, **self.vlines_kwds)

        # Set the limits
        self.ax.set_xticks(x)
        self.ax.set_xticklabels(self.features_)
        self.ax.set_xlim(x[0], x[-1])

    def finalize(self, **kwargs):
        """
        Finalize executes any subclass-specific axes finalization steps.
        The user calls poof and poof calls finalize.

        Parameters
        ----------
        kwargs: generic keyword arguments.

        """
        # Set the title
        self.set_title(
            'Parallel Coordinates for {} Features'.format(len(self.features_))
        )

        # Set the legend and the grid
        self.ax.legend(loc='best')
        self.ax.grid()
Example no. 21
article = df.loc['Cristiano Ronaldo']
# Compute the dot products: similarities
similarities = df.dot(article)
# Display those with the largest cosine similarity
print(similarities.nlargest())

# ===================== #
# ==== Exercise 30 ==== #
# ===================== #

# Perform the necessary imports
from sklearn.decomposition import NMF
from sklearn.preprocessing import Normalizer, MaxAbsScaler
from sklearn.pipeline import make_pipeline
# Create a MaxAbsScaler: scaler
scaler = MaxAbsScaler()
# Create an NMF model: nmf
nmf = NMF(n_components = 20)
# Create a Normalizer: normalizer
normalizer = Normalizer()
# Create a pipeline: pipeline
pipeline = make_pipeline(scaler, nmf, normalizer)
# Apply fit_transform to artists: norm_features
norm_features = pipeline.fit_transform(artists)

# ===================== #
# ==== Exercise 31 ==== #
# ===================== #

# Import pandas
import pandas as pd
Example no. 22
#Import Libraries
from sklearn.datasets import make_regression
from sklearn.preprocessing import MaxAbsScaler
#----------------------------------------------------
'''
MaxAbsScaler works on columns (per feature).

class sklearn.preprocessing.MaxAbsScaler(copy=True)
'''
# ----------------------------------------------------
# MaxAbsScaler Data

X, y = make_regression(n_samples=500, n_features=3, shuffle=True)
X = X * 100
# showing data
print('X \n', X[:5])
print('y \n', y[:5])

scaler = MaxAbsScaler(copy=True)
X = scaler.fit_transform(X)

#showing data
print('X \n', X[:5])
print('y \n', y[:5])
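As the comment block above says, MaxAbsScaler works on columns: each feature is divided by its maximum absolute value, which preserves sign and sparsity. A quick check of that behaviour (my own sketch):

import numpy as np
from sklearn.preprocessing import MaxAbsScaler

X = np.array([[1.0, -50.0],
              [-2.0, 25.0],
              [4.0, 10.0]])

scaled = MaxAbsScaler().fit_transform(X)
manual = X / np.abs(X).max(axis=0)   # column-wise division by the max absolute value

print(np.allclose(scaled, manual))   # True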
Example no. 23
# %% [code]
train_data.info()

# %% [code]
train_data.hist(bins=10)

# %% [code]
from sklearn.preprocessing import MaxAbsScaler, RobustScaler, MinMaxScaler

cols = ["Fare"]
X = train_data.drop(["Survived"], axis=1)

fare_scaled_r = RobustScaler().fit(X[cols])
X[cols] = fare_scaled_r.transform(X[cols])

fare_scaled_ma = MaxAbsScaler().fit(X[cols])
#X[cols] = fare_scaled_ma.transform(X[cols])

# %% [code]
X.head()

# %% [code]
y = train_data["Survived"]

# %% [code]
X.shape, y.shape

# %% [code]
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.85)
Example no. 24
weather[weather["maxWd"].isnull()]

weather["maxWd"].fillna(method='bfill', limit=1, inplace=True)

std_weather = pd.DataFrame(StandardScaler().fit_transform(
    weather.drop(columns="tm")),
                           columns=weather.drop(columns="tm").columns)
std_weather["tm"] = weather["tm"]

mm_weather = pd.DataFrame(MinMaxScaler().fit_transform(
    weather.drop(columns="tm")),
                          columns=weather.drop(columns="tm").columns)
mm_weather["tm"] = weather["tm"]

ma_weather = pd.DataFrame(MaxAbsScaler().fit_transform(
    weather.drop(columns="tm")),
                          columns=weather.drop(columns="tm").columns)
ma_weather["tm"] = weather["tm"]

rb_weather = pd.DataFrame(RobustScaler().fit_transform(
    weather.drop(columns="tm")),
                          columns=weather.drop(columns="tm").columns)
rb_weather["tm"] = weather["tm"]

feature_name = [
    "avgTa", "sumRn", "avgWs", "maxWd", "avgTd", "minRhm", "sumGsr", "avgTs"
]


def makeDecisionTree(citerion, x_train, y_train, depthNum):
    if depthNum > 0:
Example no. 25
from sklearn.cluster import FeatureAgglomeration
from sklearn.feature_selection import SelectFwe, SelectKBest, SelectPercentile, VarianceThreshold
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, f1_score
from tpot_metrics import balanced_accuracy_score
from sklearn.pipeline import make_pipeline

dataset = sys.argv[1]

preprocessor_list = [
    Binarizer(),
    MaxAbsScaler(),
    MinMaxScaler(),
    Normalizer(),
    PolynomialFeatures(),
    RobustScaler(),
    StandardScaler(),
    FastICA(),
    PCA(),
    RBFSampler(),
    Nystroem(),
    FeatureAgglomeration(),
    SelectFwe(),
    SelectKBest(),
    SelectPercentile(),
    VarianceThreshold(),
    SelectFromModel(estimator=ExtraTreesClassifier(n_estimators=100)),
Example no. 26
def hyperParamSearch(X_train,
                     y_train,
                     X_test,
                     y_test,
                     clf="logistic",
                     scoring='accuracy',
                     preprocess='MaxMin'):
    tuned_parameters = dict()
    if preprocess == 'MaxMin':
        preprocessing = ('MaxMin', MaxAbsScaler())
    if preprocess == 'Binarization':
        preprocessing = ('Bin', Binarizer())

    if clf == "logistic":
        #Parameters of pipelines can be set using ‘__’ separated parameter names:
        tuned_parameters = [{
            'logistic__penalty': ['l1', 'l2'],
            'logistic__C':
            [0.000001, 0.00001, 0.0001, 0.005, 0.001, 0.05, 0.01],
            'logistic__class_weight': [None, 'balanced']
        }]
        pipe = Pipeline(
            steps=[preprocessing, ('logistic', LogisticRegression(n_jobs=-1))])
    if clf == "randomForest":
        tuned_parameters = [{
            'randomForest__n_estimators': [100, 300, 500],
            'randomForest__min_samples_leaf': [1, 2, 5, 10, 25, 50],
            'randomForest__class_weight': [None, 'balanced']
        }]
        pipe = Pipeline(steps=[
            preprocessing, ('randomForest', RandomForestClassifier(n_jobs=-1))
        ])
    if clf == "KNN":
        tuned_parameters = [{
            'KNN__n_neighbors': [5, 10, 20, 40],
            'KNN__weights': ['distance', 'uniform'],
            'KNN__metric': ['euclidean', 'manhattan']
        }]
        pipe = Pipeline(
            steps=[preprocessing, ('KNN', KNeighborsClassifier(n_jobs=-1))])
    for score in scoring:
        estimator = GridSearchCV(pipe,
                                 tuned_parameters,
                                 cv=3,
                                 scoring=score,
                                 error_score=-1,
                                 n_jobs=-1)
        estimator.fit(X_train, y_train)
        save_name = "final_%s(%s based_%s preprocessed).pkl" % (clf, score,
                                                                preprocess)
        # print information
        print("INFO: %s model (preprocessed by %s crossvalid based on %s)" %
              (clf, preprocess, score))
        print("Best parameters set found on development set:")
        print(estimator.best_params_)

        print("%s scores on development set:" % (score))
        means = estimator.cv_results_['mean_test_score']
        stds = estimator.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds,
                                     estimator.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
        y_true, y_pred = y_test, estimator.predict(X_test)
        print(classification_report(y_true, y_pred))
        print()
        joblib.dump(estimator, save_name, compress=True)
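The '__' naming convention mentioned in the comment above addresses parameters of a step inside a Pipeline; a minimal standalone sketch (step names chosen here for illustration):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler
from sklearn.linear_model import LogisticRegression

pipe = Pipeline(steps=[('MaxMin', MaxAbsScaler()),
                       ('logistic', LogisticRegression())])

# <step name>__<parameter name> reaches into the corresponding pipeline step
pipe.set_params(logistic__C=0.01, logistic__class_weight='balanced')
print(pipe.get_params()['logistic__C'])   # 0.01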
Example no. 27
def hyperParamSearch_SMOTE(X_train,
                           y_train,
                           X_test,
                           y_test,
                           clf="logistic",
                           scoring='accuracy',
                           preprocess='MaxMin',
                           method='agglo_custom'):
    sm = SMOTE(random_state=1, n_jobs=-1)
    X, y_train = sm.fit_sample(X_train.toarray(), y_train)
    X_train = csr_matrix(X)
    tuned_parameters = dict()
    if preprocess == 'MaxMin':
        preprocessing = ('MaxMin', MaxAbsScaler())
    if preprocess == 'Binarization':
        preprocessing = ('Bin', Binarizer())

    if clf == "logistic":
        # Parameters of pipelines can be set using ‘__’ separated parameter names:
        tuned_parameters = [{
            'logistic__penalty': ['l1', 'l2'],
            'logistic__C': [0.0001, 0.001, 0.1, 1, 10],
            'logistic__class_weight': [None, 'balanced']
        }]
        if method == 'agglo_custom':
            tuned_parameters[0]['featuresaggregationscore__clusters'] = [
                50, 100, 200, 500
            ]
            pipe = Pipeline(steps=[
                preprocessing,
                ('featuresaggregationscore', FeaturesAggregationScore()
                 ), ('logistic', LogisticRegression(n_jobs=-1))
            ])
        elif method == 'reduce_dim':
            tuned_parameters[0]['kernelpca__n_components'] = [
                50, 100, 200, 500
            ]
            pipe = Pipeline(steps=[
                preprocessing,
                ('kernelpca', KernelPCA(kernel='cosine', n_jobs=-1)
                 ), ('logistic', LogisticRegression(n_jobs=-1))
            ])
        elif method == 'feat_select':
            fselect = SelectPercentile(chi2)
            tuned_parameters[0]['fselect__percentile'] = [20, 40, 60, 80]
            pipe = Pipeline(steps=[
                preprocessing, (
                    'fselect',
                    fselect), ('logistic', LogisticRegression(n_jobs=-1))
            ])

    if clf == "randomForest":
        tuned_parameters = [{
            'randomForest__n_estimators': [100, 500],
            'randomForest__min_samples_leaf': [1, 10, 25],
            'randomForest__class_weight': [None, 'balanced']
        }]
        if method == 'agglo_custom':
            tuned_parameters[0]['featuresaggregationscore__clusters'] = [
                50, 100, 200, 500
            ]
            pipe = Pipeline(steps=[
                preprocessing,
                ('featuresaggregationscore', FeaturesAggregationScore()
                 ), ('randomForest', RandomForestClassifier(n_jobs=-1))
            ])
        elif method == 'reduce_dim':
            tuned_parameters[0]['kernelpca__n_components'] = [
                50, 100, 200, 500
            ]
            pipe = Pipeline(steps=[
                preprocessing,
                ('kernelpca', KernelPCA(kernel='cosine', n_jobs=-1)
                 ), ('randomForest', RandomForestClassifier(n_jobs=-1))
            ])
        elif method == 'feat_select':
            fselect = SelectPercentile(chi2)
            tuned_parameters[0]['fselect__percentile'] = [20, 40, 60, 80]
            pipe = Pipeline(steps=[
                preprocessing, ('fselect', fselect),
                ('randomForest', RandomForestClassifier(n_jobs=-1))
            ])

    for score in scoring:
        estimator = GridSearchCV(pipe,
                                 tuned_parameters,
                                 cv=3,
                                 scoring=score,
                                 error_score=-1,
                                 n_jobs=-1)
        estimator.fit(X_train, y_train)
        save_name = "final_%s(%s based_%s preprocessed).pkl" % (clf, score,
                                                                preprocess)
        # print information
        print("INFO: %s model (preprocessed by %s crossvalid based on %s)" %
              (clf, preprocess, score))
        print("Best parameters set found on development set:")
        print(estimator.best_params_)

        print("%s scores on development set:" % (score))
        means = estimator.cv_results_['mean_test_score']
        stds = estimator.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds,
                                     estimator.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
        y_true, y_pred = y_test, estimator.predict(X_test)
        print(classification_report(y_true, y_pred))
        print()
        joblib.dump(estimator, save_name, compress=True)
Example no. 28
        input_scaler.fit(X)
        # transform training dataset
        X = input_scaler.transform(X)
    if output_scaler is not None:
        # reshape 1d arrays to 2d arrays
        y = y  #.reshape(len(y), 1)
        # fit scaler on training dataset
        output_scaler.fit(y)
        # transform training dataset
        y = output_scaler.transform(y)
    return X, y


ss = StandardScaler()
mm = MinMaxScaler()
mas = MaxAbsScaler()
rs = RobustScaler(quantile_range=(25, 75))
pt1 = PowerTransformer(method="yeo-johnson")
pt1 = PowerTransformer(method="box-cox")
qt1 = QuantileTransformer(output_distribution="uniform")
qt2 = QuantileTransformer(output_distribution="normal")
n = Normalizer()

X_none, y_none = get_dataset(None, None)
X_ss, y_ss = get_dataset(ss, ss)
X_mm, y_mm = get_dataset(mm, mm)
X_mas, y_mas = get_dataset(mas, mas)
X_rs, y_rs = get_dataset(rs, rs)
X_pt1, y_pt1 = get_dataset(pt1, pt1)
X_pt2, y_pt2 = get_dataset(pt2, pt2)
X_qt1, y_qt1 = get_dataset(qt1, qt1)
Example no. 29
# (9584, 22144)

# ------------------------------------------------------------------------
# Normalize
# The generator ends with a tanh activation.
# Since tanh outputs lie between -1 and 1, the data must be rescaled to a maximum of 1 and a minimum of -1.

print(np.max(f_ds), np.min(f_ds))
# 3.8146973e-06 -80.0

from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, StandardScaler, RobustScaler
scaler1 = StandardScaler()
scaler1.fit(f_ds)
f_ds = scaler1.transform(f_ds)

scaler2 = MaxAbsScaler()
scaler2.fit(f_ds)
f_ds = scaler2.transform(f_ds)

# Check that these values now lie between -1 and 1
print(np.max(f_ds), np.min(f_ds))
# 1.0 -1.0
# Now roughly in the target range!

# For reference ---------------------------------
# MaxAbsScaler
# 4.7683717e-08 -1.0

# MinMaxScaler
# 1.000001 0.0
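The two-step scaling above (StandardScaler followed by MaxAbsScaler) can also be chained with a Pipeline; this small sketch on synthetic data (not the f_ds array from the snippet) only confirms that the result stays within the [-1, 1] range a tanh generator expects.

import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MaxAbsScaler

rng = np.random.default_rng(0)
data = rng.normal(loc=-40.0, scale=20.0, size=(1000, 16))

pipe = make_pipeline(StandardScaler(), MaxAbsScaler())
out = pipe.fit_transform(data)

print(out.min(), out.max())   # bounded by -1.0 and 1.0
assert -1.0 <= out.min() and out.max() <= 1.0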
Example no. 30
def build_model():
    """
    Parameters
    ----------

    None
    
    Returns
    -------
    
    grid : model (GridSearchCV object)
        GridSearchCV object fitted on the training dataset using MaxAbsScaler and Ridge regression.
    
    X_test : ndarray
        Numpy array holding the feature matrix for the test set.
    
    y_test : ndarray
        Numpy array holding values for response variable for the test set.
        
    Notes
    -----   
    Creates and builds a machine learning model to predict the monthly rent of an apartment using only features that apply to pricing 
    of an apartment that is not currently rented. 
    
    I. Assumptions:
    1. Market doesn't increase so the rent for a new tenant is the same as for the current tenant.
    2. The features selected do not include current tenant/occupant details, expenditures, out-of-pocket rents, etc.
    3. We have not included most of the continuous variables as they contain details related to occupied units and not vacant ones
    4. Recode and Flag variables have not been considered
    
    II. Feature selection and generation

    The data consists of 15342 rows and 197 columns. The columns are a mix of categorical and continuous variables.
    Not all of them influence the rent. After careful analysis, only 91 columns - 87 categorical and 3 continuous were chosen, which are 
    expected to influence the rent of a vacant apartment. The complete list of included variables is given in the .xls file.
    
    For all categorical variables, one-hot encoding is performed. Missing/NA values are not imputed as their information is preserved by creating separate binary columns for 
    each of the values. 
    
    'uf17' is chosen as the response variable y.
    
    All rows for which response variable y is missing or above topcode value, are dropped. The final dataset contains 10138 rows and 430 columns. 
    
    No imputation is necessary since the selected rows have no missing/NA values for the continuous variables.
    
    The dataset is then split into X(feature matrix) and y(response variable) and is split into a training and testing set in a 80:20 ratio.
    
    
    III. Model generation and selection
    
    
    Various linear models were tried on the dataset including Linear Regression, KNNRegressor, Lasso regression, Ridge regression, Elastic Net.
    Out of these Ridge regression gave the highest accuracy of 59.26% and was chosen to model this data.
    
    For training and modeling, pipelining was used to first scale the data using MaxAbsScaler. MaxAbsScaler was chosen since the data with its large number of binary columns has a large number of zero values and is sparse.
    GridSearchCV was then used to perform cross validation with 5 folds to determine the best alpha value for Ridge regression. The model with selected alpha value was then fit on the training dataset.
    
    """

    df = pd.read_csv('data.csv')

    non_cat_index = [
        1, 32, 35, 36, 52, 54, 56, 58, 62, 72, 73, 82, 84, 85, 87, 89, 91, 92,
        99, 141, 142, 143, 144, 145, 147, 149, 151, 153, 155, 157, 159, 161,
        164, 165, 168, 169, 170
    ]
    keep = list(range(2, 31)) + [40, 41, 45, 46, 47, 61, 63, 64] + list(
        range(66, 82)) + [83, 86, 88, 90] + list(range(92, 99)) + list(
            range(100, 116)) + [
                118, 126, 127, 128, 129, 130, 137, 138, 139, 140, 163
            ]

    col_names = list(df.columns.values)

    cat_names = []

    for i in range(0, 197):
        if (i + 1) not in non_cat_index:
            cat_names.append(col_names[i])

    keep_index = []

    for i in keep:
        keep_index.append(i - 1)

    keep_df = df.iloc[:, keep_index]

    cat_names_new = []
    for name in cat_names:
        if name in list(keep_df.columns.values):
            cat_names_new.append(name)

    keep_df_exp = pd.get_dummies(keep_df, columns=cat_names_new)

    non_cat_names_in_keep_df = []

    for name in list(keep_df.columns.values):
        if name not in cat_names_new:
            non_cat_names_in_keep_df.append(name)

    # removing all rows where rent is not applicable or given

    keep_df_exp['uf17'].replace([99999], [np.NaN], inplace=True)
    keep_df_exp['uf17'].replace([7999], [np.NaN], inplace=True)
    keep_df_exp_new = keep_df_exp[keep_df_exp.uf17.notnull()]

    #keep_df_exp_new is the expanded chosen columns after dropping all non applicable rent rows

    filter_df = keep_df_exp_new

    X, y = filter_df.loc[:, filter_df.columns != 'uf17'], filter_df.loc[:,
                                                                        'uf17']

    X = X.values
    y = np.array(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    ridge_pipe = make_pipeline(MaxAbsScaler(), Ridge())
    param_grid = {'ridge__alpha': np.logspace(.1, 1, 10)}
    grid = GridSearchCV(ridge_pipe, param_grid, cv=5)
    grid.fit(X_train, y_train)

    return grid, X_test, y_test
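
# Usage sketch (assumption: the code above wrapped in a function that returns
# `grid, X_test, y_test`): the best Ridge pipeline found by GridSearchCV can
# then be evaluated on the held-out split; Ridge's default score is R^2, which
# is the figure quoted in the docstring. The wrapper name below is hypothetical.
#
#   grid, X_test, y_test = build_rent_model()
#   print(grid.best_params_)            # chosen ridge__alpha
#   print(grid.score(X_test, y_test))   # R^2 on the 20% hold-out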
Esempio n. 31
0
    def dataset_segment(self):  # TODO: also write a file storing the slidWin bookkeeping info, e.g. the length of each training set
        temp = None
        scaler_list = np.zeros((8, 1))  # matrix holding the scaling factors of the 8 channels
        plt.figure()
        for i in range(8):
            U_T = np.loadtxt(
                os.path.join(self.target_addr, 'PCA_Martix',
                             'PCAM_Ch%d.out' % i))  # load the principal component matrix (20 rows x 3 columns)
            X = np.loadtxt(
                os.path.join(self.target_addr, 'slidWin_Data',
                             'slidWin_Ch%d.out' % i))  # load the sliding-window data
            pca_data = np.dot(X, U_T)
            X_ = np.loadtxt(
                os.path.join(self.target_addr, 'PCA_Data',
                             'X_%d.out' % i))  # load the reconstructed PCA data
            # pca_r2 = r2_score(X[6500:8000, 0], X_[6500:8000, 0])
            # print("The r2 value of Ch%d is  %f" % (i, pca_r2))

            # plt.subplot(8, 1, i+1)
            # plt.plot(X[:, 0])
            # plt.plot(X_[:, 0], '--')
            """
            数据归一化,同一通道的所有主元采用第一主元的缩放因子        
            """
            scaler = MaxAbsScaler()  # 数据标准化
            scaler.fit(pca_data[:,
                                0].reshape(-1,
                                           1))  # 仅取第一主元的缩放因子,所有成分都用第一主元的缩放因子
            scaler_list[i, 0] = scaler.scale_
            pca_data = pca_data / scaler.scale_

            if i == 0:
                temp = pca_data
            else:
                temp = np.hstack((temp, pca_data))  # stack the channels horizontally
        plt.plot(X[6500:8000, 0])
        plt.plot(X_[6500:8000, 0], '--')
        plt.show()

        if not os.path.isdir(os.path.join(self.target_addr,
                                          'Scaler_Factors')):  # create the directory for the scaling factors
            os.mkdir(os.path.join(
                self.target_addr, 'Scaler_Factors'))  # directories must be created one level at a time, otherwise the deepest one is not found
        np.savetxt(
            os.path.join(self.target_addr, 'Scaler_Factors',
                         'pca_layer_MAS_scale.out'), scaler_list)
        '''
        Split into training and test sets
        '''
        if np.sum(self.report) != temp.shape[0]:
            print("Warning! The data report does not match the database contents!")
        if np.size(self.report) == 1:
            x_train = temp[:, :]
            x_test = x_train
        else:
            x_train = temp[0:self.seg_boundary, :]
            x_test = temp[self.seg_boundary:, :]
        '''
        Save the data
        '''
        if not os.path.isdir(os.path.join(self.target_addr,
                                          'PCA_Data')):  # create the directory if it does not exist
            os.mkdir(os.path.join(self.target_addr, 'PCA_Data'))
        np.savetxt(os.path.join(self.target_addr, 'PCA_Data', 'pca_train.out'),
                   x_train)
        np.savetxt(os.path.join(self.target_addr, 'PCA_Data', 'pca_test.out'),
                   x_test)
# Import functional utilities
from sklearn.preprocessing import FunctionTransformer, MaxAbsScaler
from sklearn.pipeline import FeatureUnion

# Perform preprocessing
get_text_data = FunctionTransformer(combine_text_columns, validate=False)
get_numeric_data = FunctionTransformer(lambda x: x[NUMERIC_COLUMNS],
                                       validate=False)

# Create the token pattern: TOKENS_ALPHANUMERIC
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

# Instantiate pipeline: pl
pl = Pipeline([
    ('union',
     FeatureUnion(
         transformer_list=[('numeric_features',
                            Pipeline([('selector',
                                       get_numeric_data), ('imputer',
                                                           Imputer())])),
                           ('text_features',
                            Pipeline([('selector', get_text_data),
                                      ('vectorizer',
                                       CountVectorizer(
                                           token_pattern=TOKENS_ALPHANUMERIC,
                                           ngram_range=(1, 2))
                                       ), ('dim_red',
                                           SelectKBest(chi2, chi_k))]))])),
    ('scale', MaxAbsScaler()),
    ('clf', OneVsRestClassifier(LogisticRegression()))
])
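
# Usage sketch (assumptions: a dataframe `df` containing NUMERIC_COLUMNS plus
# free-text columns, a label matrix `y`, and a hold-out frame `df_holdout`;
# none of these appear in the snippet above):
#
#   pl.fit(df, y)
#   probas = pl.predict_proba(df_holdout)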
Esempio n. 33
0
    def _simulate(self, scale):
        """
        Private entrypoint to perform a single simulation

        :param scale: Perform scaling on dataset features
        :returns: Modified dataset including new ground truth labels
        :rtype: :py:obj:`simulation.SimulationResult`
        """
        self.scaler = MaxAbsScaler()
        dataset = self.dataset.copy(deepcopy=True)
        # we need at least one example for each class in each of the two splits;
        # note that the unconditional break below currently bypasses that check,
        # so only a single deterministic split is taken
        while True:
            train, test = dataset.split(self.split, shuffle=False)
            break
            if self.no_classes(train) >= 2 and self.no_classes(test) >= 2:
                break
        train_indices = list(map(int, train.instance_names))
        test_indices = list(map(int, test.instance_names))

        self.train, self.test = train, test
        if scale:
            train.features = self.scaler.fit_transform(train.features)
            test.features = self.scaler.transform(test.features)
            dataset.features = self.scaler.transform(dataset.features)

        dataset.infer_domain()

        # learner moves
        self.learner.fit(train)

        ft_names = dataset.protected_attribute_names
        ft_indices = list(
            map(lambda x: x not in ft_names, dataset.feature_names))

        self.Y_predicted = self.learner.predict(dataset.features)
        self.Y_predicted_pr = self.learner.predict_proba(dataset.features)

        # agents move
        at = AgentTransformer(
            self.AgentCl,
            self.learner,
            self.cost_distribution,
            collect_incentive_data=self.collect_incentive_data,
            no_neighbors=self.no_neighbors,
            cost_distribution_dep=self.cost_distribution_dep,
            max_it=self.max_it)

        dataset_ = at.transform(dataset)

        train_ = utils.dataset_from_matrix(
            np.hstack((dataset_.features[train_indices, :],
                       dataset_.labels[train_indices])), dataset)
        test_ = utils.dataset_from_matrix(
            np.hstack((dataset_.features[test_indices, :],
                       dataset_.labels[test_indices])), dataset)

        acc_h = self.learner.accuracy(test)

        # update changed features

        #dataset_ = dataset_from_matrix(np.hstack((np.vstack((train_.features, test_.features)), np.vstack((train_.labels, test_.labels)))), dataset)
        self.Y_new_predicted = self.learner.predict(dataset_.features)
        self.Y_new_predicted_pr = self.learner.predict_proba(dataset_.features)

        acc_h_post = self.learner.accuracy(test_)

        # fit data again, see if accuracy changes
        self.learner.fit(train_)
        acc_h_star_post = self.learner.accuracy(test_)

        # construct datasets for features
        # including predicted label
        if scale:
            dataset.features = self.scaler.inverse_transform(dataset.features)
        dataset_df = dataset.convert_to_dataframe(de_dummy_code=True)[0]
        dataset_df['credit_h'] = pd.Series(self.Y_predicted,
                                           index=dataset_df.index)
        dataset_df['credit_h_pr'] = pd.Series(self.Y_predicted_pr,
                                              index=dataset_df.index)
        if scale:
            dataset_.features = self.scaler.inverse_transform(
                dataset_.features)
        dataset_new_df = dataset_.convert_to_dataframe(de_dummy_code=True)[0]
        dataset_new_df['credit_h'] = pd.Series(self.Y_new_predicted,
                                               index=dataset_new_df.index)
        dataset_new_df['credit_h_pr'] = pd.Series(self.Y_new_predicted_pr,
                                                  index=dataset_new_df.index)

        res = SimulationResult()
        res.df = dataset_df
        res.df_new = dataset_new_df
        res.eps = abs(acc_h_star_post - acc_h_post)
        res.acc_h = acc_h
        res.acc_h_post = acc_h_post
        res.acc_h_star_post = acc_h_star_post
        res.incentives = at.incentives
        return res
    def test_max_abs_scaler_onnx(self, rtol=1e-06, atol=1e-06):
        model = MaxAbsScaler()
        onnx_ml_pred, onnx_pred = self._test_scaler_converter(model)

        # Check that predicted values match
        np.testing.assert_allclose(onnx_ml_pred, onnx_pred, rtol=rtol, atol=atol)
Esempio n. 35
0
def hyperParamSearch(X_train,
                     y_train,
                     X_test,
                     y_test,
                     clf_name="logistic",
                     preprocess='Std',
                     metric='euclidean'):

    # PREPROCESSING = FEATURES SCALING

    if preprocess == 'MaxMin':
        preprocessing = MaxAbsScaler()  # note: despite the 'MaxMin' label, MaxAbsScaler is used here
        #preprocessing.fit(X_train)
        X_train = preprocessing.fit_transform(X_train)
        X_test = preprocessing.transform(X_test)  # reuse the scaler fitted on the training set

    if preprocess == 'Binarization':
        preprocessing = Binarizer()
        #preprocessing.fit(X_train)
        X_train = preprocessing.fit_transform(X_train)
        X_test = preprocessing.transform(X_test)
        print 'preprocess %s completed' % (preprocess)

    if preprocess == 'Std':
        preprocessing = StandardScaler(with_mean=False)
        #preprocessing.fit(X_train)
        X_train = preprocessing.fit_transform(X_train)
        X_test = preprocessing.transform(X_test)
        print 'preprocess %s completed' % (preprocess)

    if preprocess == 'full_std':
        preprocessing = StandardScaler()
        X_train = preprocessing.fit_transform(X_train.toarray())
        X_test = preprocessing.transform(X_test.toarray())
        print 'preprocess %s completed' % (preprocess)

    if preprocess == 'norm':
        X_train = normalize(X_train.toarray(), axis=0, norm='l1')
        print 'preprocess %s completed' % (preprocess)

    if clf_name == "logistic":
        params = 10**np.linspace(-8, 2, 13)
        N = len(params)
        HSV_tuples = [(x * 1.0 / N, 0.5, 0.5) for x in range(N)]
        RGB_tuples = map(lambda x: colorsys.hsv_to_rgb(*x), HSV_tuples)

        print 'colors generation'
        i = 0
        plt.figure()
        lineWidth = 2

        for C in params:
            clf = LogisticRegression(n_jobs=-1, penalty='l2', C=C)
            clf.fit(X_train, y_train)
            print 'Logistic regression fitted : %d/%d' % (i + 1, N)

            ### Accuracy Part ###
            y_test_pred = clf.predict(X_test)
            confus = confusion_matrix(y_test, y_test_pred)
            print confus
            True_neg = confus[0, 0]
            True_pos = confus[1, 1]
            # ability to detect the csp+ class
            sensitivity = True_pos * 1.0 / sum(confus[1, ::])
            # ability to detect the csp- class
            specificity = True_neg * 1.0 / sum(confus[0, ::])
            accuracy = (True_pos + True_neg) * 1.0 / sum(sum(confus))
            print 'accuracy : %s' % (accuracy)
            print 'sensitivity : %s' % (sensitivity)
            print 'specificity : %s' % (specificity)

            ### ROC Part ###
            proba = clf.predict_proba(X_test)
            ROC_scores = proba[:, 1]  # thresholds for ROC
            fpr, tpr, _ = roc_curve(y_test, ROC_scores, pos_label=1)
            ROC_auc = auc(fpr, tpr)
            plt.plot(fpr,
                     tpr,
                     color=RGB_tuples[i],
                     lw=lineWidth,
                     label='parameters : C=' + str(C) +
                     ' (area = %0.2f)' % ROC_auc)
            i += 1
            joblib.dump(clf, clf_name + '_' + str(C), compress=True)
        plt.plot([0, 1], [0, 1],
                 color='navy',
                 lw=lineWidth,
                 linestyle='--',
                 label='Monkey')
        plt.xlim([0.0, 1.05])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curves for %s classifier with %s preprocessing' %
                  (clf_name, preprocess))
        plt.legend(loc="lower right", prop={'size': 6})
        plt.show()
        print 'ROC curves done'

    if clf_name == "randomforest":
        params = [50, 100, 200]
        N = len(params)
        HSV_tuples = [(x * 1.0 / N, 0.5, 0.5) for x in range(N)]
        RGB_tuples = map(lambda x: colorsys.hsv_to_rgb(*x), HSV_tuples)

        print 'colors generation'
        i = 0
        plt.figure()
        lineWidth = 2

        for nb in params:
            clf = RandomForestClassifier(n_jobs=-1, n_estimators=nb)
            clf.fit(X_train, y_train)
            print 'randomForest fitted : %d/%d' % (i + 1, N)
            ### Accuracy Part ###
            y_test_pred = clf.predict(X_test)
            confus = confusion_matrix(y_test, y_test_pred)
            print confus
            True_neg = confus[0, 0]
            True_pos = confus[1, 1]
            # ability to detect the csp+ class
            sensitivity = True_pos * 1.0 / sum(confus[1, ::])
            # ability to detect the csp- class
            specificity = True_neg * 1.0 / sum(confus[0, ::])
            accuracy = (True_pos + True_neg) * 1.0 / sum(sum(confus))
            print 'accuracy : %s' % (accuracy)
            print 'sensitivity : %s' % (sensitivity)
            print 'specificity : %s' % (specificity)

            ### ROC Part ###
            proba = clf.predict_proba(X_test)
            ROC_scores = proba[:, 1]  # thresholds for ROC
            fpr, tpr, _ = roc_curve(y_test, ROC_scores, pos_label=1)
            ROC_auc = auc(fpr, tpr)
            plt.plot(fpr,
                     tpr,
                     color=RGB_tuples[i],
                     lw=lineWidth,
                     label='parameters : N=' + str(nb) +
                     ' (area = %0.2f)' % ROC_auc)
            i += 1
            joblib.dump(clf, clf_name + '_' + str(nb), compress=True)
        plt.plot([0, 1], [0, 1],
                 color='navy',
                 lw=lineWidth,
                 linestyle='--',
                 label='Monkey')
        plt.xlim([0.0, 1.05])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curves for %s classifier with %s preprocessing' %
                  (clf_name, preprocess))
        plt.legend(loc="lower right", prop={'size': 6})
        plt.show()
        print 'ROC curves done'

    if clf_name == "kNN":
        params = [10, 50, 75, 100]
        N = len(params)
        HSV_tuples = [(x * 1.0 / N, 0.5, 0.5) for x in range(N)]
        RGB_tuples = map(lambda x: colorsys.hsv_to_rgb(*x), HSV_tuples)

        print 'colors generation'
        i = 0
        plt.figure()
        lineWidth = 2

        for k in params:
            clf = KNeighborsClassifier(n_jobs=-1, n_neighbors=k, metric=metric)
            clf.fit(X_train, y_train)
            print 'kNN fitted : %d/%d' % (i + 1, N)
            ### Accuracy Part ###
            y_test_pred = clf.predict(X_test)
            confus = confusion_matrix(y_test, y_test_pred)
            print confus
            True_neg = confus[0, 0]
            True_pos = confus[1, 1]
            # ability to detect the csp+ class
            sensitivity = True_pos * 1.0 / sum(confus[1, ::])
            # ability to detect the csp- class
            specificity = True_neg * 1.0 / sum(confus[0, ::])
            accuracy = (True_pos + True_neg) * 1.0 / sum(sum(confus))
            print 'accuracy : %s' % (accuracy)
            print 'sensitivity : %s' % (sensitivity)
            print 'specificity : %s' % (specificity)

            ### ROC Part ###
            proba = clf.predict_proba(X_test)
            ROC_scores = proba[:, 1]  # thresholds for ROC
            fpr, tpr, _ = roc_curve(y_test, ROC_scores, pos_label=1)
            ROC_auc = auc(fpr, tpr)
            plt.plot(fpr,
                     tpr,
                     color=RGB_tuples[i],
                     lw=lineWidth,
                     label='parameters : k=' + str(k) +
                     ' (area = %0.2f)' % ROC_auc)
            i += 1
            joblib.dump(clf, clf_name + '_' + str(k), compress=True)
        plt.plot([0, 1], [0, 1],
                 color='navy',
                 lw=lineWidth,
                 linestyle='--',
                 label='Monkey')
        plt.xlim([0.0, 1.05])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curves for %s classifier with %s preprocessing' %
                  (clf_name, preprocess))
        plt.legend(loc="lower right", prop={'size': 6})
        plt.show()
        print 'ROC curves done'

    if clf_name == "naiveBayesM":
        params = [1.]
        N = len(params)
        HSV_tuples = [(x * 1.0 / N, 0.5, 0.5) for x in range(N)]
        RGB_tuples = map(lambda x: colorsys.hsv_to_rgb(*x), HSV_tuples)

        print 'colors generation'
        i = 0
        plt.figure()
        lineWidth = 2

        for a in params:
            clf = MultinomialNB(alpha=a)
            clf.fit(X_train, y_train)
            print 'naiveBayes fitted : %d/%d' % (i + 1, N)
            ### Accuracy Part ###
            y_test_pred = clf.predict(X_test)
            confus = confusion_matrix(y_test, y_test_pred)
            print confus
            True_neg = confus[0, 0]
            True_pos = confus[1, 1]
            # ability to detect the csp+ class
            sensitivity = True_pos * 1.0 / sum(confus[1, ::])
            # ability to detect the csp- class
            specificity = True_neg * 1.0 / sum(confus[0, ::])
            accuracy = (True_pos + True_neg) * 1.0 / sum(sum(confus))
            print 'accuracy : %s' % (accuracy)
            print 'sensitivity : %s' % (sensitivity)
            print 'specificity : %s' % (specificity)

            ### ROC Part ###
            proba = clf.predict_proba(X_test)
            ROC_scores = proba[:, 1]  # thresholds for ROC
            fpr, tpr, _ = roc_curve(y_test, ROC_scores, pos_label=1)
            ROC_auc = auc(fpr, tpr)
            plt.plot(fpr,
                     tpr,
                     color=RGB_tuples[i],
                     lw=lineWidth,
                     label='parameters : a=' + str(a) +
                     ' (area = %0.2f)' % ROC_auc)
            i += 1
            joblib.dump(clf, clf_name + '_' + str(a), compress=True)
        plt.plot([0, 1], [0, 1],
                 color='navy',
                 lw=lineWidth,
                 linestyle='--',
                 label='Monkey')
        plt.xlim([0.0, 1.05])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curves for %s classifier with %s preprocessing' %
                  (clf_name, preprocess))
        plt.legend(loc="lower right", prop={'size': 6})
        plt.show()
        print 'ROC curves done'

    if clf_name == "linSVM":
        params = 10**np.linspace(-5, 1, 6)
        N = len(params)
        HSV_tuples = [(x * 1.0 / N, 0.5, 0.5) for x in range(N)]
        RGB_tuples = map(lambda x: colorsys.hsv_to_rgb(*x), HSV_tuples)

        print 'colors generation'
        i = 0
        plt.figure()
        lineWidth = 2

        for c in params:
            clf = LinearSVC(C=c, penalty='l2', loss='squared_hinge')
            clf.fit(X_train, y_train)
            print 'linSVM fitted : %d/%d' % (i + 1, N)
            ROC_scores = clf.decision_function(X_test)  # LinearSVC has no predict_proba; use decision scores for the ROC curve
            fpr, tpr, _ = roc_curve(y_test, ROC_scores, pos_label=1)
            ROC_auc = auc(fpr, tpr)
            plt.plot(fpr,
                     tpr,
                     color=RGB_tuples[i],
                     lw=lineWidth,
                     label='parameters : C=' + str(c) +
                     ' (area = %0.2f)' % ROC_auc)
            i += 1
            joblib.dump(clf, clf_name + '_' + str(c), compress=True)
        plt.plot([0, 1], [0, 1],
                 color='navy',
                 lw=lineWidth,
                 linestyle='--',
                 label='Monkey')
        plt.xlim([0.0, 1.05])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curves for %s classifier with %s preprocessing' %
                  (clf_name, preprocess))
        plt.legend(loc="lower right", prop={'size': 6})
        plt.show()
        print 'ROC curves done'

    if clf_name == "voting":
        params = ['soft', 'hard']
        N = len(params)
        HSV_tuples = [(x * 1.0 / N, 0.5, 0.5) for x in range(N)]
        RGB_tuples = map(lambda x: colorsys.hsv_to_rgb(*x), HSV_tuples)

        print 'colors generation'
        i = 0
        plt.figure()
        lineWidth = 2

        for v in params:
            clf1 = LogisticRegression(n_jobs=-1, penalty='l2', C=1e-6)
            clf2 = RandomForestClassifier(n_jobs=-1, n_estimators=50)
            #clf4 = MultinomialNB()
            #clf5 = KNeighborsClassifier(n_jobs=-1, n_neighbors = 10, metric = 'euclidean')

            #clf = VotingClassifier(estimators=[('lr', clf1), ('RF', clf2), ('mnb', clf4), ('kNN', clf5)],
            #                        voting=v)
            clf = VotingClassifier(estimators=[('lr', clf1), ('RF', clf2)],
                                   voting=v)
            clf.fit(X_train, y_train)
            print 'Voting fitted : %d/%d' % (i + 1, N)

            ### Accuracy Part ###
            y_test_pred = clf.predict(X_test)
            confus = confusion_matrix(y_test, y_test_pred)
            print confus
            True_neg = confus[0, 0]
            True_pos = confus[1, 1]
            # ability to detect the csp+ class
            sensitivity = True_pos * 1.0 / sum(confus[1, ::])
            # ability to detect the csp- class
            specificity = True_neg * 1.0 / sum(confus[0, ::])
            accuracy = (True_pos + True_neg) * 1.0 / sum(sum(confus))
            print 'accuracy : %s' % (accuracy)
            print 'sensitivity : %s' % (sensitivity)
            print 'specificity : %s' % (specificity)

            proba = clf.predict_proba(X_test)
            ROC_scores = proba[:, 1]  # thresholds for ROC
            fpr, tpr, _ = roc_curve(y_test, ROC_scores, pos_label=1)
            ROC_auc = auc(fpr, tpr)
            plt.plot(fpr,
                     tpr,
                     color=RGB_tuples[i],
                     lw=lineWidth,
                     label='parameters : v=' + str(v) +
                     ' (area = %0.2f)' % ROC_auc)
            i += 1
            joblib.dump(clf, clf_name + '_' + str(v), compress=True)
        plt.plot([0, 1], [0, 1],
                 color='navy',
                 lw=lineWidth,
                 linestyle='--',
                 label='Monkey')
        plt.xlim([0.0, 1.05])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curves for %s classifier with %s preprocessing' %
                  (clf_name, preprocess))
        plt.legend(loc="lower right", prop={'size': 6})
        plt.show()
        print 'ROC curves done'

    if clf_name == "AdaBoost":
        params = [1, 2, 3, 4, 5]
        N = len(params)
        HSV_tuples = [(x * 1.0 / N, 0.5, 0.5) for x in range(N)]
        RGB_tuples = map(lambda x: colorsys.hsv_to_rgb(*x), HSV_tuples)

        print 'colors generation'
        i = 0
        plt.figure()
        lineWidth = 2

        for m in params:
            clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=m),
                                     n_estimators=20)
            clf.fit(X_train, y_train)
            print 'Voting fitted : %d/%d' % (i + 1, N)

            ### Accuracy Part ###
            y_test_pred = clf.predict(X_test)
            confus = confusion_matrix(y_test, y_test_pred)
            print confus
            True_neg = confus[0, 0]
            True_pos = confus[1, 1]
            # ability to detect the csp+ class
            sensitivity = True_pos * 1.0 / sum(confus[1, ::])
            # ability to detect the csp- class
            specificity = True_neg * 1.0 / sum(confus[0, ::])
            accuracy = (True_pos + True_neg) * 1.0 / sum(sum(confus))
            print 'accuracy : %s' % (accuracy)
            print 'sensitivity : %s' % (sensitivity)
            print 'specificity : %s' % (specificity)

            proba = clf.predict_proba(X_test)
            ROC_scores = proba[:, 1]  # thresholds for ROC
            fpr, tpr, _ = roc_curve(y_test, ROC_scores, pos_label=1)
            ROC_auc = auc(fpr, tpr)
            plt.plot(fpr,
                     tpr,
                     color=RGB_tuples[i],
                     lw=lineWidth,
                     label='parameters : m=' + str(m) +
                     ' (area = %0.2f)' % ROC_auc)
            i += 1
            joblib.dump(clf, clf_name + '_' + str(m), compress=True)
        plt.plot([0, 1], [0, 1],
                 color='navy',
                 lw=lineWidth,
                 linestyle='--',
                 label='Monkey')
        plt.xlim([0.0, 1.05])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curves for %s classifier with %s preprocessing' %
                  (clf_name, preprocess))
        plt.legend(loc="lower right", prop={'size': 6})
        plt.show()
        print 'ROC curves done'

    if clf_name == "xgb":
        cv_params = {'max_depth': [3, 5, 7], 'min_child_weight': [1, 3, 5]}
        ind_params = {
            'learning_rate': 0.1,
            'n_estimators': 1000,
            'seed': 0,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'objective': 'binary:logistic'
        }
        optimized_GBM = GridSearchCV(xgb.XGBClassifier(**ind_params),
                                     cv_params,
                                     scoring='accuracy',
                                     cv=5,
                                     n_jobs=-1)
        optimized_GBM.fit(X_train, y_train)
        print optimized_GBM.grid_scores_
        print 'Cross-validation !'
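
# Refactoring sketch (not from the original source): the accuracy /
# sensitivity / specificity block above is repeated verbatim in every
# classifier branch and could be computed by one helper. The helper name and
# the assumption of a 2x2 confusion matrix are mine.
def _confusion_summary(confus):
    """Return (accuracy, sensitivity, specificity) from a 2x2 confusion matrix."""
    true_neg, true_pos = confus[0, 0], confus[1, 1]
    sensitivity = true_pos * 1.0 / confus[1, :].sum()  # ability to detect the csp+ class
    specificity = true_neg * 1.0 / confus[0, :].sum()  # ability to detect the csp- class
    accuracy = (true_pos + true_neg) * 1.0 / confus.sum()
    return accuracy, sensitivity, specificity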
Esempio n. 36
0
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler

id = [i for i in range(1, 21)]
score = [42, 47, 59, 27, 84, 49, 72, 43, 73, 59,
         52, 49, 89, 27, 54, 49, 92, 45, 37, 95]
data = pd.DataFrame({'ID': id, 'Score': score})
MM_Scaler = MinMaxScaler()
MA_Scaler = MaxAbsScaler()
Std_Scaler = StandardScaler()
data1 = MM_Scaler.fit_transform(data['Score'].values.reshape(-1, 1))
data2 = MA_Scaler.fit_transform(data['Score'].values.reshape(-1, 1))
data3 = Std_Scaler.fit_transform(data['Score'].values.reshape(-1, 1))
data1 = pd.DataFrame({'ID': id, 'Score': data1.flatten()})
data2 = pd.DataFrame({'ID': id, 'Score': data2.flatten()})
data3 = pd.DataFrame({'ID': id, 'Score': data3.flatten()})
print(data1)
print(data2)
print(data3)
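
# Note on the comparison above: MinMaxScaler maps the scores to [0, 1],
# MaxAbsScaler divides by the largest absolute value (95 here), so for these
# positive scores the result stays in (0, 1], and StandardScaler centres the
# scores to zero mean and unit variance (and so produces negative values too).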
def main():
    #args = prepare_optparser()
    # setting:
    #seed = args.seed
    seed = 1
    np.random.seed(seed)
    torch.manual_seed(seed)
    batch_size = 64
    lr = 0.0001
    weight_decay = 5e-4
    model_name = "VAE_test_v50"
    outdir = model_path

    k = 50
    latent = 10
    input_dim = 108633
    encode_dim = [2048, 128]
    decode_dim = [1024]#[2048]
    max_iter = 3000

    dims = [input_dim, latent, encode_dim, decode_dim]
    model = CR_VAE(dims, n_centroids=k)

    normalizer = MaxAbsScaler()

    expected_file_name = "/%s/DNase_test_peaks_top1000.csv" %(data_path)
    dataset = DNaseDataset(expected_file_name,expected_file_name,transpose = False)
    loader_params = {'batch_size': batch_size, 'shuffle': False,'num_workers': 16, 'drop_last': False, 'pin_memory': True}
    testloader = DataLoader(dataset,**loader_params)

    # Training

    model.init_gmm_params(testloader)

    #model.load_state_dict(torch.load('/%s/CR_VAE_model_VAE_test_v40.ckpt' %(model_path))) #*****

    input_files = []
    for iteration in range(0,80):
        input_file_name = "/%s/DNase_train_peaks_top1000_%s.csv" %(data_path,str(iteration))
        expected_file_name = "/%s/DNase_test_peaks_top1000.csv" %(data_path)
        input_files.append([input_file_name,expected_file_name])

    model.fit(input_files,
              lr=lr,
              batch_size = batch_size,
              weight_decay = weight_decay,
              device = device,
              max_iter= max_iter,
              name = model_name,
              outdir = outdir
              )

    torch.save(model.state_dict(), '/%s/CR_VAE_model_%s.ckpt' %(model_path,model_name))
    torch.save(model, '/%s/CR_VAE_model_%s.tmp' %(model_path,model_name)) # Save the whole model

    '''
    # output
    input = pd.read_csv(expected_file_name,sep="\t",header=0,index_col=0)
    model.load_state_dict(torch.load('/%s/CR_VAE_model_%s.ckpt' %(model_path,model_name), map_location=lambda storage, loc: storage),strict=False)
    feature = model.encodeBatch(testloader, device=device, out='z')
    feature = pd.DataFrame(feature)
    feature.index = input.columns.values
    feature.to_csv(os.path.join(outdir, '%s_feature.txt' %(model_name)), sep='\t', header=False,index=True)
    '''

    '''
# Here are the steps of the program

# 1. Import the libraries
import numpy as np
import cPickle as pickle
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MaxAbsScaler

# 2. Load the data
raw_X = np.load("../../Test Data/2016-03-13/ratio_25/training_features.npy")
raw_X = raw_X.astype("float64")
raw_y = np.load("../../Test Data/2016-03-13/ratio_25/bots.npy")

# 4. Set up the normalization
normalizer = MaxAbsScaler()


# 5. List of index numbers
A = [1291, 885, 656, 1527, 1491, 89, 845, 293, 1296, 1076, 1303, 1278, 1185, 705, 184, 634,
	484, 1104, 9, 1422, 623, 525, 1427, 1189, 252, 1055, 1226, 458, 1323, 442, 972,
	1177, 30, 167, 959, 83, 22, 1159, 468, 183, 1324, 420, 1134, 1530, 730, 766,
	640, 606, 750, 816, 823, 1410, 1094, 862, 1210, 1219, 1172, 1218, 1498, 627, 1168,
	1175, 255, 191, 111, 437, 26, 1142, 609, 698, 616, 822, 1438, 861, 1256, 737,
	25, 1520, 1160, 1035, 973, 1111, 327, 1044, 542, 1378, 250, 1307, 452, 613, 625,
	1337, 825, 1261, 241, 407, 745, 733, 1505, 1077, 1435, 1039, 1349, 201, 511, 955,
	1157, 336, 431, 230, 365, 610, 283, 1289, 62, 446, 1023, 423, 757, 641, 333,
	33, 32, 907, 863, 7, 968, 204, 904, 99, 1116, 799, 1030, 740, 851, 805,
	612, 1109, 648, 1020, 1545, 1117, 366, 633, 94, 769, 216, 670, 667, 1139, 887,
	1231, 1453, 13, 1203, 1382, 12, 1114, 1353, 645, 435, 1411, 1536, 101, 245, 1188,
	1412, 776, 1067, 52, 454, 1263, 1450, 704, 240, 351, 80, 1001, 923, 962, 1370,
class SNNAP:
    def __init__(self,
                 clip_runtime=True,
                 feature_selection='chi-squared',
                 top_n=3,
                 k_neighbours=60):
        self._name = 'snnap'
        self._clip_runtime = clip_runtime
        self._feature_selection = feature_selection
        self._top_n = top_n
        self._k_neighbours = k_neighbours
        self._imputer = SimpleImputer()
        self._scaler = MaxAbsScaler()
        self._runtime_scaler = StandardScaler()
        self._models = []
        self._rfr_params = {
            'n_estimators': 100,
            'criterion': 'mse',
            'max_depth': None,
            'min_samples_split': 2
        }

    def get_name(self):
        return self._name

    def fit(self, scenario: ASlibScenario, fold: int, num_instances: int):
        self._num_algorithms = len(scenario.algorithms)
        self._top_n = min(self._num_algorithms, self._top_n)

        # resample `amount_of_training_instances` instances and preprocess them accordingly
        features, performances = self._resample_instances(
            scenario.feature_data.values,
            scenario.performance_data.values,
            num_instances,
            random_state=fold)
        # TODO: apply feature filtering such as chi-squared based selection technique
        features, performances = self._preprocess_scenario(
            scenario, features, performances)

        # train runtime prediction model for each model
        self._models = [
            RandomForestRegressor(random_state=fold, **self._rfr_params)
            for alg in range(self._num_algorithms)
        ]
        for num, model in enumerate(self._models):
            model.fit(features, performances[:, num])

        # build index to retrieve k nearest neighbours based on Jaccard distance of best n solvers
        self._index = BallTree(performances,
                               leaf_size=30,
                               metric='pyfunc',
                               func=SNNAP._top_n_jaccard,
                               metric_params={'top_n': self._top_n})
        self._performances = np.copy(performances)

    def predict(self, features, instance_id: int):
        assert (features.ndim == 1), '`features` must be one dimensional'
        features = np.expand_dims(features, axis=0)
        features = self._imputer.transform(features)
        features = self._scaler.transform(features)

        # predict runtimes and get k nearest neighbours based on Jaccard distance of best n solvers
        predicted = np.asarray([
            model.predict(features) for model in self._models
        ]).reshape(1, -1)
        neighbour_idx = np.squeeze(
            self._index.query(predicted,
                              self._k_neighbours,
                              return_distance=False))

        # find best solver on the instance's k nearest neighbours (best avg. runtime / PAR10 score)
        sub_performances = self._performances[neighbour_idx, :]

        # the summed performance induces a valid ranking
        return np.sum(sub_performances, axis=0)

    def _resample_instances(self, feature_data, performance_data,
                            num_instances, random_state):
        num_instances = min(num_instances, np.size(
            performance_data, axis=0)) if num_instances > 0 else np.size(
                performance_data, axis=0)
        return resample(feature_data,
                        performance_data,
                        n_samples=num_instances,
                        random_state=random_state)

    def _preprocess_scenario(self, scenario: ASlibScenario, features,
                             performances):
        # TODO: paper does not explicitly mention feature imputation & feature scaling
        features = self._imputer.fit_transform(features)
        features = self._scaler.fit_transform(features)

        # train predictors and select algorithms on running time instead of PAR10 if warranted
        if self._clip_runtime:
            performances = np.clip(performances,
                                   a_min=np.NINF,
                                   a_max=scenario.algorithm_cutoff_time)

        # scale performances to zero mean and unitary standard deviation
        performances = self._runtime_scaler.fit_transform(performances)

        return features, performances

    @staticmethod
    def _top_n_jaccard(x, y, **kwargs):
        top_n = kwargs['metric_params']['top_n']
        top_n_1 = set(np.argpartition(x, top_n)[:top_n])
        top_n_2 = set(np.argpartition(y, top_n)[:top_n])

        return len(top_n_1.intersection(top_n_2)) / float(
            len(top_n_1.union(top_n_2)))
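
# Worked example (illustration only) of the top-n measure used by
# `_top_n_jaccard` above: for two runtime vectors, take the indices of the
# `top_n` fastest solvers and compare the two index sets.
#
#   x = np.array([1.0, 9.0, 2.0, 8.0])   # two fastest solvers: {0, 2}
#   y = np.array([2.0, 1.5, 9.0, 8.0])   # two fastest solvers: {0, 1}
#   # intersection {0} has size 1, union {0, 1, 2} has size 3 -> value 1/3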
Esempio n. 40
0
from estimators import LSHNearestNeighbors
from preprocessors import text_preprocess


if __name__ == "__main__":
    df = pandas.read_csv("/media/alexander/b32bf4b4-8724-4107-9d19-abf6615c2f60/alexander/HELP_FILE/query.yaHotelId.showInTop.sure.final.tsv", sep="\t")
    print("Изначальная размерность данных:", df.shape,";", "Количество отелей:", len(df["yaHotelId"].unique()))
    sure_df = df[df["sure"]]
    print(sure_df.shape)
    filtered_values = [value[0] for value in sure_df["yaHotelId"].value_counts().iteritems() if value[1] >= 5]
    filtered_df = sure_df[sure_df["yaHotelId"].isin(filtered_values)]
    print("Получившаяся размерность данных:", filtered_df.shape, ";", "Количество отелей:", len(filtered_df["yaHotelId"].unique()))

    vectorizer = TfidfVectorizer(preprocessor=text_preprocess)
    y = np.array(filtered_df["yaHotelId"])
    X = vectorizer.fit_transform(filtered_df["query"])
    print("X shape:", X.shape)

    scaler = MaxAbsScaler()
    X = scaler.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
    clf = LSHNearestNeighbors(n_estimators=10, n_candidates=100, n_neighbors=9, mode="parzen window")
    clf.fit(X_train, y_train)
    t1 = time.time()
    y_pred = clf.predict(X_test)
    t2 = time.time() - t1
    print("delta time:", t2)
    print("mean time for one query:", t2/X_test.shape[0])
    print("accuracy:", accuracy_score(y_test, y_pred))
Esempio n. 41
0
    model = Sequential()
    model.add(MaxoutDense(100, input_dim=42))
    model.add(Activation('relu'))
    model.add(GaussianNoise(0.00001))
    model.add(Dropout(0.3))

    model.add(MaxoutDense(1, input_dim=100))
    model.add(Activation('sigmoid'))

    #ada = Adagrad(lr=0.001)
    ada = SGD(lr=0.0003, momentum=0.9, decay=0.0001, nesterov=True)
    model.compile(optimizer=ada,
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    scaler = MaxAbsScaler()
    train_train_scaled = scaler.fit_transform(train_train[features])
    train_test_scaled = scaler.transform(train_test[features])

    model.fit(train_train_scaled, train_train.target.values, nb_epoch=150, batch_size=100)

    train_train_pred = model.predict(train_train_scaled, batch_size=100)
    train_test_pred = model.predict(train_test_scaled, batch_size=100)

    train_score = log_loss(train_train.target.values, train_train_pred)
    test_score = log_loss(train_test.target.values, train_test_pred)

    #test_poly = poly.transform(test[features])
    test_scaled = scaler.transform(test[features])
    test_pred = model.predict(test_scaled, batch_size=100)
Esempio n. 42
0
def _digits_dataset(dtype=np.float32):
    X, y = load_digits(return_X_y=True)
    X = X.astype(dtype, copy=False)
    X = MaxAbsScaler().fit_transform(X)
    return X, y
# classColours['Water'] = [157,212,255]
classColours['VegWater'] = [191, 255, 0]

# define variables for the classification
variables = [
    'VVMin', 'VHMin', 'VVdivVHMin', 'VVMax', 'VHMax', 'VVdivVHMax', 'VVAvg',
    'VHAvg', 'VVdivVHAvg', 'VVStd', 'VHStd', 'VVdivVHStd'
]
# run the classification
classratutils.classifyWithinRAT(outputClumps,
                                classesIntCol,
                                classesNameCol,
                                variables,
                                classifier=classifier,
                                classColours=classColours,
                                preProcessor=MaxAbsScaler())

# export rat column to image

gdalformat = 'GTiff'
datatype = rsgislib.TYPE_8INT
fields = ['OutClass']

rastergis.exportCols2GDALImage(outputClumps, outimage, gdalformat, datatype,
                               fields)

os.system('afplay /System/Library/Sounds/Tink.aiff')
os.system('afplay /System/Library/Sounds/Tink.aiff')

print('It took {0:0.1f} minutes'.format(
    (time.time() - start) / 60))  # elapsed time in minutes
Esempio n. 44
0
def _max_abs_scaler(column):
    sc = MaxAbsScaler()
    sc.fit(column.reshape(-1,1))
    new_col = sc.transform(column.reshape(-1,1))
    return(new_col)
Esempio n. 45
0
def normalize_features(X: np.array) -> np.array:
    """Normalize features by scaling to [0,1]"""
    scaler = MaxAbsScaler().fit(X)
    return scaler.transform(X)
Trainset2 = xfcss_train.values[rannums,:]
Testset = x_train[test_set,:]
Testset2 = xfcss_train.values[test_set,:]
# Trainy= y_gt[rannums,:]
# Testy = y_gt[test_set,:]
Trainy= y_train[rannums,:]
Testy = y_train[test_set,:]


# In[20]:


# sc_X2 = StandardScaler()
# sc_y = StandardScaler()

sc_X2 = MaxAbsScaler()
sc_y = MaxAbsScaler()


# In[21]:


Xtrainz = Trainset
Xtrainz2 = Trainset2
ytrainz = Trainy
X = Xtrainz
X2 = sc_X2.fit_transform(Xtrainz2)
y = sc_y.fit_transform(ytrainz)


# In[22]:
Esempio n. 47
0
def _mnist_dataset(dtype=np.float32):
    X, y = fetch_openml('mnist_784', version=1, return_X_y=True)
    X = X.astype(dtype, copy=False)
    X = MaxAbsScaler().fit_transform(X)
    return X, y
def main():
    X, y = get_data('../../data/train.csv')
    sclr = MaxAbsScaler()
    X = sclr.fit_transform(X)

    # pickle.dump(sclr, open('./dumps/scaler_pickle', 'wb+'))
    X_test, y_test = get_data('../../data/val.csv')
    X_test = sclr.transform(X_test)
    X_fin, y_fin = get_data('../../data/test.csv')
    X_fin = sclr.transform(X_fin)
    other, yo = get_data('../../data/other.csv')
    other = sclr.transform(other)

    lin = linear_model.LogisticRegression(
        C=10000,
    )
    # selector = RFE(lin, 21, step=1)
    # selector.fit(X, y)
    # X = selector.transform(X)
    # X_test = selector.transform(X_test)
    # X_fin = selector.transform(X_fin)
    # for i in range(len(selector.support_)):
    #     print i+1, selector.support_[i]

    lin.fit(X, y)
    # pickle.dump(lin, open('./dumps/lin_reg_pickle', 'wb+'))
    x1 = lin.predict_proba(X)
    x1_test = lin.predict_proba(X_test)
    # x1_fin = lin.predict_proba(X_fin)
    # o1 = lin.predict_proba(other)
    print 'lin'
    print metrics.classification_report(y, lin.predict(X))
    print metrics.classification_report(y_test, lin.predict(X_test))
    print metrics.classification_report(y_fin, lin.predict(X_fin))
    roc = lin.predict_proba(X_fin)
    # r = lin.predict(X_test)
    # l1 = []
    # l2 = []
    # for i in range(len(roc)):
    #     if max(roc[i]) > 0.5:
    #         l1.append(y_fin[i])
    #         l2.append(r[i])
    # print 'dsfasdfasd'
    # print metrics.classification_report(l1, l2)
    # return

    fpr_grd0, tpr_grd0, _ = metrics.roc_curve(y_fin, roc[:, 0], pos_label=0)
    fpr_grd1, tpr_grd1, _ = metrics.roc_curve(y_fin, roc[:, 1], pos_label=1)
    fpr_grd2, tpr_grd2, _ = metrics.roc_curve(y_fin, roc[:, 2], pos_label=2)
    plt.plot(fpr_grd0, tpr_grd0, label='NRP')
    plt.plot(fpr_grd1, tpr_grd1, label='RiPP')
    plt.plot(fpr_grd2, tpr_grd2, label='Polyketide')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    plt.show()
    # print lin.coef_

    # print sum(lin.predict_proba(X_test)[0])
    svm_model = SVC(
        C=5000,
        # kernel='linear',
        # degree=2,
        coef0=100,
        # probability=True,
        # shrinking=True,
        # class_weight='balanced',
        probability=True,
        # decision_function_shape='ovr'
    )
    svm_model.fit(X, y)
    x2 = svm_model.predict_proba(X)
    x2_test = svm_model.predict_proba(X_test)
    x2_fin = svm_model.predict_proba(X_fin)
    o2 = svm_model.predict_proba(other)
    print 'svm'
    print metrics.classification_report(y, svm_model.predict(X))
    print metrics.classification_report(y_test, svm_model.predict(X_test))