Example #1
def test_scaler_without_centering():
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero

    scaler = Scaler(with_mean=False)
    X_scaled = scaler.fit(X).transform(X, copy=True)
    assert not np.any(np.isnan(X_scaled))

    assert_array_almost_equal(
        X_scaled.mean(axis=0), [0., -0.01,  2.24, -0.35, -0.78], 2)
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has been copied
    assert X_scaled is not X

    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert X_scaled_back is not X
    assert X_scaled_back is not X_scaled
    assert_array_almost_equal(X_scaled_back, X)

    X_scaled = scale(X, with_mean=False)
    assert not np.any(np.isnan(X_scaled))

    assert_array_almost_equal(
        X_scaled.mean(axis=0), [0., -0.01,  2.24, -0.35, -0.78], 2)
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has been copied
    assert X_scaled is not X

    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert X_scaled_back is not X
    assert X_scaled_back is not X_scaled
    assert_array_almost_equal(X_scaled_back, X)
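Note: in these older scikit-learn test snippets, Scaler is the earlier name of what is now StandardScaler (a few later examples instead alias Scaler to another scaler class). As a point of reference, a minimal sketch of the same round-trip written against the modern name, under that assumption:

import numpy as np
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(42)
X = rng.randn(4, 5)
X[:, 0] = 0.0  # first feature is always zero

scaler = StandardScaler(with_mean=False)     # scale to unit variance without centering
X_scaled = scaler.fit_transform(X)           # the zero-variance column is left untouched
X_back = scaler.inverse_transform(X_scaled)  # undoes the scaling
assert np.allclose(X_back, X)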
Example #2
def test_scaler_1d():
    """Test scaling of dataset along single axis"""
    rng = np.random.RandomState(0)
    X = rng.randn(5)
    X_orig_copy = X.copy()

    scaler = Scaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_array_almost_equal(X_scaled_back, X_orig_copy)

    # Test with 1D list
    X = [0., 1., 2, 0.4, 1.]
    scaler = Scaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    X_scaled = scale(X)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)
def test_regressors_train():
    estimators = all_estimators()
    regressors = [(name, E) for name, E in estimators
                  if issubclass(E, RegressorMixin)]
    boston = load_boston()
    X, y = boston.data, boston.target
    X, y = shuffle(X, y, random_state=0)
    # TODO: test with intercept
    # TODO: test with multiple responses
    X = Scaler().fit_transform(X)
    y = Scaler().fit_transform(y)
    for name, Reg in regressors:
        if Reg in dont_test or Reg in meta_estimators:
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            reg = Reg()
        if hasattr(reg, 'alpha'):
            reg.set_params(alpha=0.01)

        # raises error on malformed input for fit
        assert_raises(ValueError, reg.fit, X, y[:-1])
        # fit
        reg.fit(X, y)
        reg.predict(X)
        assert_greater(reg.score(X, y), 0.5)
def test_scaler_without_copy():
    """Check that Scaler.fit does not change input"""
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero
    X_csr = sp.csr_matrix(X)

    X_copy = X.copy()
    Scaler(copy=False).fit(X)
    assert_array_equal(X, X_copy)

    X_csr_copy = X_csr.copy()
    Scaler(with_mean=False, copy=False).fit(X_csr)
    assert_array_equal(X_csr.toarray(), X_csr_copy.toarray())
Example #5
def test_scale_sparse_with_mean_raise_exception():
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X_csr = sp.csr_matrix(X)

    # check scaling and fit with direct calls on sparse data
    assert_raises(ValueError, scale, X_csr, with_mean=True)
    assert_raises(ValueError, Scaler(with_mean=True).fit, X_csr)

    # check transform and inverse_transform after a fit on a dense array
    scaler = Scaler(with_mean=True).fit(X)
    assert_raises(ValueError, scaler.transform, X_csr)

    X_transformed_csr = sp.csr_matrix(scaler.transform(X))
    assert_raises(ValueError, scaler.inverse_transform, X_transformed_csr)
def utl_scaleImpute(X_data, p_imputeColumns, p_scaleColumns, p_scalers = None):
    X_data = X_data.copy()
    v_imputeColumns = [column for column in p_imputeColumns if column in X_data.columns]
    v_scaleColumns  = [column for column in p_scaleColumns  if column in X_data.columns]
    
    for column in v_imputeColumns:
        v_values = X_data[column].astype(float).values.reshape(-1, 1)
        if np.isnan(v_values).all():
            X_data[column] = -999
        else:            
            try:
                X_data[column] = Imputer(strategy = 'mean', axis = 0).fit_transform(v_values)
            except:
                values = [np.unique(-999 if np.isnan(ll).all() else ll) for ll in v_values.reshape(1, -1)]
                print(column, values)
                raise
    
    if p_scalers is None:
        p_scalers = {}
        for column in v_scaleColumns:
            v_values = X_data[column].value_counts(dropna = True).index.tolist()
            p_scalers[column] = Scaler().fit(np.array(v_values).reshape(-1, 1))
    
    for column in v_scaleColumns:
        X_data[column] = p_scalers[column].transform(X_data[column].values.reshape(-1, 1) )
    
    return X_data, p_scalers
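A hedged usage sketch for the helper above, assuming the pre-0.20 scikit-learn classes it relies on are importable and that Scaler is bound to a standard scaler; the DataFrame and its column names are made up for illustration:

import numpy as np
import pandas as pd
# Assumption: scikit-learn < 0.22, where Imputer still exists; Scaler is assumed to be StandardScaler.
from sklearn.preprocessing import Imputer, StandardScaler as Scaler

df = pd.DataFrame({
    'age':    [25.0, np.nan, 40.0, 31.0],    # imputed with the column mean
    'income': [30e3, 55e3, np.nan, 48e3],    # imputed, then scaled
})

# First call fits fresh scalers and returns them for reuse on new data.
train_df, scalers = utl_scaleImpute(df, p_imputeColumns=['age', 'income'],
                                    p_scaleColumns=['income'])
# Later data is transformed with the already-fitted scalers.
test_df, _ = utl_scaleImpute(df.tail(2), p_imputeColumns=['age', 'income'],
                             p_scaleColumns=['income'], p_scalers=scalers)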
Example #7
def getPreparedData():
    data = getData()
    attributes = data.columns
    # convert categorical attributes to integer codes
    data['school'] = data['school'].apply(school_to_numeric)
    data['sex'] = data['sex'].apply(sex_to_numeric)
    data['address'] = data['address'].apply(address_to_numeric)
    data['famsize'] = data['famsize'].apply(famsize_to_numeric)
    data['Pstatus'] = data['Pstatus'].apply(Pstatus_to_numeric)
    data['Mjob'] = data['Mjob'].apply(job_to_numeric)
    data['Fjob'] = data['Fjob'].apply(job_to_numeric)
    data['reason'] = data['reason'].apply(reason_to_numeric)
    data['guardian'] = data['guardian'].apply(guardian_to_numeric)
    data['schoolsup'] = data['schoolsup'].apply(yesno_to_numeric)
    data['famsup'] = data['famsup'].apply(yesno_to_numeric)
    data['paid'] = data['paid'].apply(yesno_to_numeric)
    data['activities'] = data['activities'].apply(yesno_to_numeric)
    data['nursery'] = data['nursery'].apply(yesno_to_numeric)
    data['higher'] = data['higher'].apply(yesno_to_numeric)
    data['internet'] = data['internet'].apply(yesno_to_numeric)
    data['romantic'] = data['romantic'].apply(yesno_to_numeric)

    # scale all data into the interval [0, 1]
    scaler = Scaler(feature_range=(0, 1)).fit(data)
    data = pd.DataFrame(scaler.transform(data), columns=attributes)
    return np.array(data)
Example #8
def test_fit_transform():
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    for obj in ((Scaler(), Normalizer(), Binarizer())):
        X_transformed = obj.fit(X).transform(X)
        X_transformed2 = obj.fit_transform(X)
        assert_array_equal(X_transformed, X_transformed2)
Example #9
def cluster(playlist):
    arq = 'Total ' + playlist + '.csv'
    n_clusters = 0
    Full_data = pd.read_csv(arq)
    Full_data = Full_data.dropna(axis=1, how='all')
    Full_data = Full_data.dropna(axis=0, how='any')
    ID = Full_data['id']
    Mode = Full_data['mode']
    length = Full_data['duration_ms']
    artist = Full_data['artist']
    Full_data = Full_data.drop(
        columns=['track', 'album_id', 'artist', 'id', 'mode'])
    Fdata = Full_data.values
    scaler = Scaler()
    data_u = scaler.fit_transform(Fdata)
    # pca_transf = PCA(0.8)
    # PCA_data = pca_transf.fit_transform(data_u)
    clusterer = AffinityPropagation(random_state=None, preference=-500)
    # clusterer = HDBSCAN(min_cluster_size=20)
    # clusterer = MeanShift()
    labels = clusterer.fit_predict(data_u)
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    labels.shape = (len(labels), 1)
    Full_data['cluster'] = labels + 1
    Full_data['id'] = ID
    Full_data['mode'] = Mode
    Full_data['artist'] = artist
    Full_data['duration_ms'] = length
    # Full_data.sort_values(by='cluster')
    Full_data.to_csv('clustered.csv', index=False)
    # sns.pairplot(Full_data, hue="cluster", palette='YlGnBu')
    # plt.show()
    return n_clusters
Example #10
def cluster(playlist):
    arq = 'Total ' + playlist + '.csv'
    n_clusters = 0
    Full_data = pd.read_csv(arq)
    Full_data = Full_data.dropna(axis=1, how='all')
    Full_data = Full_data.dropna(axis=0, how='any')
    ID = Full_data['id']
    Mode = Full_data['mode']
    length = Full_data['duration_ms']
    artist = Full_data['artist']
    key = Full_data['key']
    time_signature = Full_data['time_signature']
    Full_data = Full_data.drop(columns=[
        'track', 'album_id', 'artist', 'artist_id', 'id', 'mode',
        'duration_ms', 'key', 'time_signature'
    ])
    Fdata = Full_data.values
    scaler = Scaler()
    data_u = scaler.fit_transform(Fdata)
    clusterer = AgglomerativeClustering(n_clusters=None, distance_threshold=35)
    labels = clusterer.fit_predict(data_u)
    score = silhouette_score(data_u, labels)
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    labels.shape = (len(labels), 1)
    Full_data['cluster'] = labels + 1
    Full_data['id'] = ID
    Full_data['mode'] = Mode
    Full_data['artist'] = artist
    Full_data['duration_ms'] = length
    Full_data['key'] = key
    Full_data['time_signature'] = time_signature
    Full_data.to_csv('clustered.csv', index=False)
    return n_clusters, score
Example #11
    def __init__(self, env):        
        self.env = env 
        
        # sampling environment states in order to featurize them
        observation_examples = np.array([self.env.observation_space.sample() for x in range(10000)])
        
        # Feature Preprocessing: Normalize to zero mean and unit variance
        # We use a few samples from the observation space to do this
        self.scaler = Scaler()
        self.scaler.fit(observation_examples)
                
        # Used to convert a state to a featurized representation.
        # We use RBF kernels with different variances to cover different parts of the space
        self.featurizer = FeatureUnion([
                ("rbf1", RBF(gamma=5.0, n_components=100)),
                ("rbf2", RBF(gamma=2.0, n_components=100)),
                ("rbf3", RBF(gamma=1.0, n_components=100)),
                ("rbf4", RBF(gamma=0.5, n_components=100))
                ])
        self.featurizer.fit(self.scaler.transform(observation_examples))

        # action model for SGD regressor
        self.action_models = []
        self.nA = self.env.action_space.n
        
        for na in range(self.nA):
            model = SGD(learning_rate="constant")
            model.partial_fit([self.__featurize_state(self.env.reset())], [0])
            self.action_models.append(model)
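The constructor above calls a private helper self.__featurize_state that is not included in the snippet. A minimal sketch of what such a helper typically looks like in this scaler-plus-RBF pattern (the method name is taken from the call above; the body is an assumption):

    def __featurize_state(self, state):
        # Scale the raw observation, then project it onto the RBF features.
        scaled = self.scaler.transform([state])
        featurized = self.featurizer.transform(scaled)
        return featurized[0]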
Example #12
def test_pipeline_methods_preprocessing_svm():
    """Test the various methods of the pipeline (preprocessing + svm)."""
    iris = load_iris()
    X = iris.data
    y = iris.target
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))
    scaler = Scaler()
    pca = RandomizedPCA(n_components=2, whiten=True)
    clf = SVC(probability=True)

    for preprocessing in [scaler, pca]:
        pipe = Pipeline([('preprocess', preprocessing), ('svc', clf)])
        pipe.fit(X, y)

        # check shapes of various prediction functions
        predict = pipe.predict(X)
        assert_equal(predict.shape, (n_samples, ))

        proba = pipe.predict_proba(X)
        assert_equal(proba.shape, (n_samples, n_classes))

        log_proba = pipe.predict_log_proba(X)
        assert_equal(log_proba.shape, (n_samples, n_classes))

        decision_function = pipe.decision_function(X)
        assert_equal(decision_function.shape, (n_samples, n_classes))

        pipe.score(X, y)
Example #13
def test_regressors_int():
    # test if regressors can cope with integer labels (by converting them to
    # float)
    estimators = all_estimators()
    regressors = [(name, E) for name, E in estimators if issubclass(E,
        RegressorMixin)]
    boston = load_boston()
    X, y = boston.data, boston.target
    X, y = shuffle(X, y, random_state=0)
    X = Scaler().fit_transform(X)
    y = np.random.randint(2, size=X.shape[0])
    for name, Reg in regressors:
        if Reg in dont_test or Reg in meta_estimators:
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            # separate estimators to control random seeds
            reg1 = Reg()
            reg2 = Reg()
        if hasattr(reg1, 'alpha'):
            reg1.set_params(alpha=0.01)
            reg2.set_params(alpha=0.01)
        if hasattr(reg1, 'random_state'):
            reg1.set_params(random_state=0)
            reg2.set_params(random_state=0)

        # fit
        reg1.fit(X, y)
        pred1 = reg1.predict(X)
        reg2.fit(X, y.astype(float))
        pred2 = reg2.predict(X)
        assert_array_almost_equal(pred1, pred2, 2)
Example #14
def load_data(path, label_encoder):
    data = {
        "train_o": [],
        "train_i": [],
        "val_o": [],
        "val_i": [],
        "test_o": [],
        "test_i": []
    }
    for data_type in ["train", "val", "test"]:
        csv_files_path = list(
            paths.list_files(os.path.join(path, data_type), ".csv"))
        for csv_path in tqdm(csv_files_path):
            df = pd.read_csv(csv_path)
            df = df["close"]

            input_data = df.values
            input_data = Scaler().fit_transform(input_data.reshape(
                -1, 1)).flatten()

            output_data = os.path.normpath(csv_path)
            output_data = output_data.split(os.path.sep)[-2]

            data[f"{data_type}_o"].append(label_encoder[output_data])
            data[f"{data_type}_i"].append(input_data)
    for key in data:
        data[key] = np.asarray(data[key])
    return data
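For context, load_data expects a layout of <path>/{train,val,test}/<label>/*.csv, where each CSV provides a close column and the label is read from the parent directory name. A hedged usage sketch with hypothetical path and label names:

# Hypothetical layout: price_data/train/uptrend/*.csv, price_data/train/downtrend/*.csv, ...
label_encoder = {"uptrend": 0, "downtrend": 1}
data = load_data("price_data", label_encoder)
print(data["train_i"].shape, data["train_o"].shape)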
Example #15
def test_classifiers_classes():
    # test if classifiers can cope with non-consecutive classes
    estimators = all_estimators()
    classifiers = [(name, E) for name, E in estimators
                   if issubclass(E, ClassifierMixin)]
    iris = load_iris()
    X, y = iris.data, iris.target
    X, y = shuffle(X, y, random_state=7)
    X = Scaler().fit_transform(X)
    y = 2 * y + 1
    # TODO: make work with next line :)
    #y = y.astype(np.str)
    for name, Clf in classifiers:
        if Clf in dont_test or Clf in meta_estimators:
            continue
        if Clf in [MultinomialNB, BernoulliNB]:
            # TODO also test these!
            continue

        # catch deprecation warnings
        with warnings.catch_warnings(record=True) as w:
            clf = Clf()
        # fit
        clf.fit(X, y)
        y_pred = clf.predict(X)
        # training set performance
        assert_array_equal(np.unique(y), np.unique(y_pred))
        assert_greater(zero_one_score(y, y_pred), 0.78)
Example #16
def dimension(fnames, fp='ECFP', alg='PCA', maximum=int(1e5), ref='GPCR'):
    df = pd.DataFrame()
    for i, fname in enumerate(fnames):
        sub = pd.read_table(fname).dropna(subset=['Smiles'])
        sub = sub[sub.VALID == True]
        if maximum is not None and len(sub) > maximum:
            sub = sub.sample(maximum)
        if ref not in fname:
            sub = sub[sub.DESIRE == True]
        sub = sub.drop_duplicates(subset='Smiles')
        sub['LABEL'] = i
        df = df.append(sub)

    if fp == 'similarity':
        ref = df[(df.LABEL == 0) & (df.DESIRE == True)]
        refs = Predictor.calc_ecfp(ref.Smiles)
        fps = Predictor.calc_ecfp(df.Smiles)
        from rdkit.Chem import DataStructs
        fps = np.array(
            [DataStructs.BulkTanimotoSimilarity(fp, refs) for fp in fps])
    else:
        fp_alg = Predictor.calc_ecfp if fp == 'ECFP' else Predictor.calc_physchem
        fps = fp_alg(df.Smiles)
    fps = Scaler().fit_transform(fps)
    pca = PCA(n_components=2) if alg == 'PCA' else TSNE(n_components=2)
    xy = pca.fit_transform(fps)
    df['X'], df['Y'] = xy[:, 0], xy[:, 1]
    if alg == 'PCA':
        ratio = pca.explained_variance_ratio_[:2]
        return df, ratio
    else:
        return df
Example #17
def reorder(data):
    F = bm.TSPCF
    scaler = Scaler()
    c_data = data.drop(
        columns=['cluster', 'id', 'artist', 'duration_ms', 'time_signature'])
    c_data = scaler.fit_transform(c_data)
    dist = distance_matrix(c_data, c_data)
    M = 2 * np.max(dist)
    np.fill_diagonal(dist, M)
    L = len(c_data)
    limite = [L * [0], L * [(L - 1)]]
    p = L * [1]
    A, B, C, D = PSO.run(F, 1000, limite, 5, 10, dist, passo=p, permut=True)
    edge_lengths = [dist[-1][0]]
    for i in range(len(A) - 1):
        d = dist[i][i + 1]
        edge_lengths.append(d)
    m_d = max(edge_lengths)
    M = edge_lengths.index(m_d)
    A = np.append(A[M:], A[:M])
    B -= m_d
    try:
        L_range = list(range(len(A)))
        N_order = [list(A).index(val) for val in L_range]
        data['new_order'] = N_order
        data = data.sort_values(by='new_order')
        dataset = data.drop(columns=['new_order'])
        try:
            dataset = dataset.drop(columns=['Unnamed: 0'])
        except:
            pass
        return dataset
    except:
        return data
Example #18
def test_scaler_2d_arrays():
    """Test scaling of 2d array along first axis"""
    rng = np.random.RandomState(0)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero

    scaler = Scaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))

    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has been copied
    assert_true(X_scaled is not X)

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_true(X_scaled_back is not X)
    assert_true(X_scaled_back is not X_scaled)
    assert_array_almost_equal(X_scaled_back, X)

    X_scaled = scale(X, axis=1, with_std=False)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
    X_scaled = scale(X, axis=1, with_std=True)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=1), 4 * [1.0])
    # Check that the data hasn't been modified
    assert_true(X_scaled is not X)

    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    assert_true(X_scaled is X)

    X = rng.randn(4, 5)
    X[:, 0] = 1.0  # first feature is a constant, non zero feature
    scaler = Scaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has been copied
    assert_true(X_scaled is not X)
Example #19
def Classifier():
    scaler = Scaler()
    es = callbacks.EarlyStopping(
        monitor="val_loss",
        min_delta=0,
        patience=500,
        verbose=0,
        mode="auto",
        baseline=None,
        restore_best_weights=True,
    )
    mc = tf.keras.callbacks.ModelCheckpoint(
        'best_model.h5',
        monitor="val_accuracy",
        verbose=0,
        save_best_only=True,
        save_weights_only=False,
        mode="auto",
        save_freq="epoch",
    )
    in_data = 'clustered.csv'
    data = pd.read_csv(in_data)
    data = data.dropna()
    id = data['id']
    data = data.drop(columns=['id'])
    step = 1
    train_label = data['cluster']
    train_data = data.drop(columns=['cluster', 'artist'])
    train_data = train_data.fillna(0)
    train_label = train_label.fillna(0)
    train_data = scaler.fit_transform(train_data)
    n_layers = 2
    model = Sequential()
    model.add(Dense(64, input_dim=len(train_data[0]), activation='relu'))
    for _ in range(n_layers):
        model.add(Dense(64, activation='relu'))
        model.add(Dropout(0.2))
    model.add(Dense((1 + np.max(train_label.values)), activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.fit(train_data,
              train_label,
              epochs=1000,
              batch_size=50,
              verbose=2,
              callbacks=[es, mc],
              validation_split=0.2,
              shuffle=True)
    model.load_weights('best_model.h5')
    probs = model.predict(train_data)
    predictions = np.argmax(probs, axis=-1)
    P = pd.DataFrame(probs)
    dp = pd.DataFrame(predictions, columns=['predicted_cluster'])
    dp['predicted_prob'] = np.max(probs, axis=-1)
    P.to_csv('Song_Probs.csv', index=False)
    dp.to_csv('Song_preds.csv', index=False)
Example #20
 def __init__(self, set_name='AWA1', batch_size=256, **kwargs):
     super().__init__(set_name, batch_size, **kwargs)
     self.content = [
         'feat', 'label', 'label_emb', 's_cls', 'u_cls', 'cls_emb',
         's_cls_emb', 'u_cls_emb'
     ]
     self.parts = ['training', 'seen', 'unseen']
     self.scaler = Scaler()
     self._init_scaler()
Example #21
def test_regressors_train():
    estimators = all_estimators()
    regressors = [(name, E) for name, E in estimators
                  if issubclass(E, RegressorMixin)]
    boston = load_boston()
    X, y = boston.data, boston.target
    X, y = shuffle(X, y, random_state=0)
    # TODO: test with intercept
    # TODO: test with multiple responses
    X = Scaler().fit_transform(X)
    y = Scaler().fit_transform(y)
    succeeded = True
    for name, Reg in regressors:
        if Reg in dont_test or Reg in meta_estimators:
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            reg = Reg()
        if hasattr(reg, 'alpha'):
            reg.set_params(alpha=0.01)

        # raises error on malformed input for fit
        assert_raises(ValueError, reg.fit, X, y[:-1])
        # fit
        try:
            if Reg in (_PLS, PLSCanonical, PLSRegression, CCA):
                y_ = np.vstack([y, 2 * y + np.random.randint(2, size=len(y))])
                y_ = y_.T
            else:
                y_ = y
            reg.fit(X, y_)
            reg.predict(X)

            if Reg not in (PLSCanonical, CCA):  # TODO: find out why
                assert_greater(reg.score(X, y_), 0.5)
        except Exception as e:
            print(reg)
            print(e)
            print()
            succeeded = False

    assert_true(succeeded)
Example #22
def test_classifiers_train():
    # test if classifiers do something sensible on training set
    # also test all shapes / shape errors
    estimators = all_estimators()
    classifiers = [(name, E) for name, E in estimators
                   if issubclass(E, ClassifierMixin)]
    iris = load_iris()
    X, y = iris.data, iris.target
    X, y = shuffle(X, y, random_state=7)
    n_samples, n_features = X.shape
    n_labels = len(np.unique(y))
    X = Scaler().fit_transform(X)
    for name, Clf in classifiers:
        if Clf in dont_test or Clf in meta_estimators:
            continue
        if Clf in [MultinomialNB, BernoulliNB]:
            # TODO also test these!
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True) as w:
            clf = Clf()
        # fit
        clf.fit(X, y)
        y_pred = clf.predict(X)
        assert_equal(y_pred.shape, (n_samples, ))
        # training set performance
        assert_greater(zero_one_score(y, y_pred), 0.78)

        # raises error on malformed input for predict
        assert_raises(ValueError, clf.predict, X.T)
        if hasattr(clf, "decision_function"):
            try:
                # decision_function agrees with predict:
                decision = clf.decision_function(X)
                assert_equal(decision.shape, (n_samples, n_labels))
                if not isinstance(clf, BaseLibSVM):
                    # one-vs-one decision_function of LibSVM works differently
                    assert_array_equal(np.argmax(decision, axis=1), y_pred)
                # raises error on malformed input for decision_function
                assert_raises(ValueError, clf.decision_function, X.T)
            except NotImplementedError:
                pass
        if hasattr(clf, "predict_proba"):
            try:
                # predict_proba agrees with predict:
                y_prob = clf.predict_proba(X)
                assert_equal(y_prob.shape, (n_samples, n_labels))
                assert_array_equal(np.argmax(y_prob, axis=1), y_pred)
                # raises error on malformed input for predict_proba
                assert_raises(ValueError, clf.predict_proba, X.T)
            except NotImplementedError:
                pass
Example #23
def single_task(feat, alg='RF', reg=False, is_extra=True):
    df = pd.read_table('data/LIGAND_RAW.tsv').dropna(subset=pair[1:2])
    df = df[df[pair[0]] == feat]
    df = df[pair].set_index(pair[1])
    year = df[pair[-1:]].groupby(pair[1]).min().dropna()
    test = year[year[pair[-1]] > 2015].index
    numery = df[pair[2]].groupby(pair[1]).mean().dropna()

    comments = df[(df.Comment.str.contains('Not Active') == True)]
    inhibits = df[(df.Standard_Type == 'Inhibition')
                  & df.Standard_Relation.isin(['<', '<='])]
    relations = df[df.Standard_Type.isin(['EC50', 'IC50', 'Kd', 'Ki'])
                   & df.Standard_Relation.isin(['>', '>='])]
    binary = pd.concat([comments, inhibits, relations], axis=0)
    binary = binary[~binary.index.isin(numery.index)]
    binary[pair[2]] = 3.99
    binary = binary[pair[2]].groupby(binary.index).first()
    df = numery.append(binary) if is_extra else numery
    if not reg:
        df = (df > th).astype(float)
    df = df.sample(len(df))
    print(feat, len(numery[numery >= th]), len(numery[numery < th]),
          len(binary))

    test_ix = set(df.index).intersection(test)
    test = df.loc[test_ix].dropna()
    data = df.drop(test.index)

    test_x = utils.Predictor.calc_fp(
        [Chem.MolFromSmiles(mol) for mol in test.index])
    data_x = utils.Predictor.calc_fp(
        [Chem.MolFromSmiles(mol) for mol in data.index])
    out = 'output/single/%s_%s_%s' % (alg, 'REG' if reg else 'CLS', feat)
    if alg != 'RF':
        scaler = Scaler()
        scaler.fit(data_x)
        test_x = scaler.transform(test_x)
        data_x = scaler.transform(data_x)
    else:
        X = np.concatenate([data_x, test_x], axis=0)
        y = np.concatenate([data.values, test.values], axis=0)
        Train_RF(X, y[:, 0], out=out + '.pkg', reg=reg)
    data, test = data.to_frame(name='Label'), test.to_frame(name='Label')
    data['Score'], test['Score'] = cross_validation(data_x,
                                                    data.values,
                                                    test_x,
                                                    test.values,
                                                    alg,
                                                    out,
                                                    reg=reg)
    data.to_csv(out + '.cv.tsv', sep='\t')
    test.to_csv(out + '.ind.tsv', sep='\t')
Example #24
 def __init__(self):
     """
     Model initialization
     """
     self.model = Pipeline(
         steps=[('imputer', Imputer(strategy='median')),
                ('feature_union',
                 FeatureUnion(
                     n_jobs=1,
                     transformer_list=[
                         ('band_1_pipe',
                          Pipeline(steps=[
                              ('band_1', FunctionTransformer(self.band1)),
                              ('band_1_standard_scale', Scaler()),
                          ])),
                         ('band_2_pipe',
                          Pipeline(steps=[
                              ('band_2', FunctionTransformer(self.band2)),
                              ('band_2_standard_scale', Scaler()),
                          ])), ('inc_angle',
                                FunctionTransformer(self.angle))
                     ])), ('xgboost', XGBClassifier(n_estimators=100))])
Example #25
def test_scaler_without_centering():
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero
    X_csr = sp.csr_matrix(X)

    scaler = Scaler(with_mean=False).fit(X)
    X_scaled = scaler.transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))

    scaler_csr = Scaler(with_mean=False).fit(X_csr)
    X_csr_scaled = scaler_csr.transform(X_csr, copy=True)
    assert_false(np.any(np.isnan(X_csr_scaled.data)))

    assert_equal(scaler.mean_, scaler_csr.mean_)
    assert_array_almost_equal(scaler.std_, scaler_csr.std_)

    assert_array_almost_equal(X_scaled.mean(axis=0),
                              [0., -0.01, 2.24, -0.35, -0.78], 2)
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])

    X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis0(X_csr_scaled)
    assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
    assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))

    # Check that X has not been modified (copy)
    assert_true(X_scaled is not X)
    assert_true(X_csr_scaled is not X_csr)

    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_true(X_scaled_back is not X)
    assert_true(X_scaled_back is not X_scaled)
    assert_array_almost_equal(X_scaled_back, X)

    X_csr_scaled_back = scaler_csr.inverse_transform(X_csr_scaled)
    assert_true(X_csr_scaled_back is not X_csr)
    assert_true(X_csr_scaled_back is not X_csr_scaled)
    assert_array_almost_equal(X_csr_scaled_back.toarray(), X)
def test_regressors_int():
    estimators = all_estimators()
    regressors = [(name, E) for name, E in estimators if issubclass(E,
        RegressorMixin)]
    boston = load_boston()
    X, y = boston.data, boston.target
    X, y = shuffle(X, y, random_state=0)
    # TODO: test with intercept
    # TODO: test with multiple responses
    X = Scaler().fit_transform(X)
    succeeded = True
    for name, Reg in regressors:
        if Reg in dont_test or Reg in meta_estimators:
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            # separate estimators to control random seeds
            reg1 = Reg()
            reg2 = Reg()
        if hasattr(reg1, 'random_state'):
            reg1.set_params(random_state=0)
            reg2.set_params(random_state=0)
def sparser(dataset):
    d = dataset.drop(columns=['id', 'artist', 'cluster', 'probs'])
    scaler = Scaler()
    scaled_d = scaler.fit_transform(d)
    # scaled_d = scaled_d[-20:]
    dist = np.triu(spy.distance_matrix(scaled_d, scaled_d))
    scaled_dist = scaler.fit_transform(dist)
    arg = np.max(scaled_dist)
    indexes = np.argwhere(dist > (.25 * arg))
    I = indexes.T
    keep = list(set(I[0]))
    sparse_d = dataset.iloc[keep, :]
    sparse_d = sparse_d.sort_values('probs', ascending=False)
    return sparse_d
Example #28
def test_pipeline_methods_scaler_svm():
    """Test the various methods of the pipeline (scaler + svm)."""
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Test with Scaler + SVC
    clf = SVC(probability=True)
    scaler = Scaler()
    pipe = Pipeline([('scaler', scaler), ('svc', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
Example #29
def processData(filename, split, trainingSet, testSet):
    DATASET_PATH = './'
    data_path = os.path.join(DATASET_PATH, filename)
    dataset = pd.read_csv(data_path, header=None)
    dataset.columns = [
        "Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin",
        "BMI", "DiabetesPedigreeFunction", "Age", "Outcome"
    ]

    median_bmi = dataset['BMI'].median()
    dataset['BMI'] = dataset['BMI'].replace(to_replace=0, value=median_bmi)

    median_bloodp = dataset['BloodPressure'].median()
    dataset['BloodPressure'] = dataset['BloodPressure'].replace(
        to_replace=0, value=median_bloodp)

    median_plglcconc = dataset['Glucose'].median()
    dataset['Glucose'] = dataset['Glucose'].replace(to_replace=0,
                                                    value=median_plglcconc)

    median_skinthick = dataset['SkinThickness'].median()
    dataset['SkinThickness'] = dataset['SkinThickness'].replace(
        to_replace=0, value=median_skinthick)

    median_twohourserins = dataset['Insulin'].median()
    dataset['Insulin'] = dataset['Insulin'].replace(to_replace=0,
                                                    value=median_twohourserins)
    # print(dataset.head())
    # print(dataset['BMI'][0])
    scaler = Scaler()
    scaler.fit(dataset)
    scaled_dataset = scaler.transform(dataset)
    # print(dataset['BMI'][0])
    df = pd.DataFrame(data=scaled_dataset)
    # print(df.head())
    # datasetlst = pd.
    # datasetlst = list(dataset)
    datasetlst = df.values.tolist()
    # map(datasetlst,dataset.values)
    # datasetlst = list(dataset.values)
    # print(datasetlst[0][5])
    print(len(datasetlst))
    for x in range(0, len(datasetlst) - 1):
        for y in range(9):
            datasetlst[x][y] = float(datasetlst[x][y])
        if random.random() < split:
            trainingSet.append(datasetlst[x])
        else:
            testSet.append(datasetlst[x])
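A hedged usage sketch for the loader above; the filename is hypothetical, and the column list inside the function fixes the expected nine-column, headerless Pima-style schema:

trainingSet, testSet = [], []
# split is the probability that a given row ends up in the training set
processData('diabetes.csv', 0.67, trainingSet, testSet)
print(len(trainingSet), 'training rows,', len(testSet), 'test rows')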
Example #30
def SVM_fit(X_in, y_in, X_out, gamma, C):

    M = len(X_in[0])  #Number of features
    seed(time())

    #To prevent data snooping, breaks the input set into train, cross-validation and test sets, with sizes proportional to 8-1-1

    #First puts aside 10% of the data for the tests
    test_indices, train_indices = split_indices(len(X_in),
                                                int(round(0.1 * len(X_in))))

    shuffle(X_in, y_in)

    X_test = [X_in[i] for i in test_indices]
    y_test = [y_in[i] for i in test_indices]
    X_in = [X_in[i] for i in train_indices]
    y_in = [y_in[i] for i in train_indices]

    #scale data first
    scaler = Scaler(copy=False)  #in place modification
    #Normalize the data and stores as inner parameters the mean and standard deviation
    #To avoid data snooping, normalization is computed on training set only, and then reported on data
    scaler.fit(X_in)
    X_in = scaler.transform(X_in)
    X_test = scaler.transform(X_test)
    X_out = scaler.transform(
        X_out)  #uses the same transformation (same mean_ and std_) fit before

    std_test = X_test.std(axis=0)
    f_indices = [j for j in range(M) if std_test[j] > 1e-7]

    #Removes feature with null variance
    X_in = [[X_in[i][j] for j in f_indices] for i in range(len(X_in))]
    X_test = [[X_test[i][j] for j in f_indices] for i in range(len(X_test))]
    X_out = [[X_out[i][j] for j in f_indices] for i in range(len(X_out))]

    M = len(f_indices)
    #Then, on the remaining data, performs a ten-fold cross validation over the number of features considered
    svc = svm.SVC(kernel='rbf',
                  C=C,
                  gamma=gamma,
                  verbose=False,
                  cache_size=4092,
                  tol=1e-5)
    svc.fit(X_in, y_in)

    y_out = svc.predict(X_out)
    return y_out