Code example #1
def normalize_cols(tr, val, train, test, cols):
    # Map the selected columns to a normal distribution. The transformer is
    # fit on the training fold and applied to the validation fold, then refit
    # on the full training set before transforming the test set.
    qnt = QuantileTransformer(output_distribution="normal")
    tr[cols] = qnt.fit_transform(tr[cols]).astype(np.float32)
    val[cols] = qnt.transform(val[cols]).astype(np.float32)

    train[cols] = qnt.fit_transform(train[cols]).astype(np.float32)
    test[cols] = qnt.transform(test[cols]).astype(np.float32)
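A minimal usage sketch for normalize_cols, assuming the usual imports; the frames, sizes, and column names below are illustrative, not from the original project:

# Hypothetical usage; frames and column names are made up for illustration.
import numpy as np
import pandas as pd
from sklearn.preprocessing import QuantileTransformer

rng = np.random.default_rng(0)
cols = ["f0", "f1"]
tr, val, train, test = [
    pd.DataFrame(rng.lognormal(size=(1000, 2)), columns=cols)
    for _ in range(4)
]
normalize_cols(tr, val, train, test, cols)  # mutates the frames in place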
Code example #2
def predict_er(X, E, window=0.21, step=10, q=5, use_box_cox=True):
    qt = QuantileTransformer(n_quantiles=q, random_state=0)

    lr = HuberRegressor()
    lr.fit(X, E)

    # Sort everything by the target so the rolling window moves along E.
    E_pred = lr.predict(X)
    idx_sorted = np.argsort(E)
    X = X[idx_sorted]
    E_pred = E_pred[idx_sorted]
    E = E[idx_sorted]

    # Use a log transform (Box-Cox with lambda = 0) plus a quantile
    # transformation so that the data lies uniformly in the interval [0, 1].
    if use_box_cox:
        E_quantile = qt.fit_transform(np.log(E).reshape(-1, 1)).reshape(-1)
    else:
        E_quantile = qt.fit_transform(E.reshape(-1, 1)).reshape(-1)

    x, y = rolling_window_er(E_quantile, (E - E_pred) / E,
                             window=window, step=step)
    if use_box_cox:
        x = np.exp(qt.inverse_transform(x.reshape(-1, 1)).reshape(-1))
    else:
        x = qt.inverse_transform(x.reshape(-1, 1)).reshape(-1)

    return x, y
Code example #3
def train():
    bankdata = pd.read_csv('data/trainingbin_.csv')
    X = bankdata.drop('class_label', axis=1)
    y = bankdata['class_label']
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
    scaler = QuantileTransformer(output_distribution='uniform')
    X_train = scaler.fit_transform(X_train)
    #y_train= scaler.fit_transform(y_train)
    X_test = scaler.transform(X_test)  # transform only; fitting on test data would leak
    #y_test= scaler.fit_transform(y_test)
    #from sklearn.ensemble import RandomForestClassifier
    clf = svm.SVC(kernel='linear', C=512.0, gamma=0.0078125)  # gamma is ignored by the linear kernel
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    from sklearn.metrics import classification_report, confusion_matrix
    from sklearn import metrics
    cnf_matrix = confusion_matrix(y_test, y_pred)
    print(cnf_matrix)
    #print(classification_report(y_test,y_pred))
    FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)
    FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
    TP = np.diag(cnf_matrix)
    TN = cnf_matrix.sum() - (FP + FN + TP)

    FP = FP.astype(float)
    FN = FN.astype(float)
    TP = TP.astype(float)
    TN = TN.astype(float)

    # Sensitivity, hit rate, recall, or true positive rate
    TPR = TP / (TP + FN)
    # Specificity or true negative rate
    TNR = TN / (TN + FP)
    # Precision or positive predictive value
    PPV = TP / (TP + FP)
    # Negative predictive value
    NPV = TN / (TN + FN)
    # Fall out or false positive rate
    FPR = FP / (FP + TN)
    # False negative rate
    FNR = FN / (TP + FN)
    # False discovery rate
    FDR = FP / (TP + FP)

    # Overall accuracy
    ACC = (TP + TN) / (TP + FP + FN + TN)

    print("FPR:", sum(FPR) / 55)
    print("FNR:", sum(FNR) / 55)
    print("ACC:", 100 * (sum(ACC) / 55))
    print(classification_report(y_test, y_pred))
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
Code example #4
def test_clustering(n_runs=20, alpha=0.5):
    nmis_both = []
    nmis_attributes = []
    nmis_structure = []
    for i in range(n_runs):
        print("Run number {0}".format(i))
        ensemble_density_huge('file.csv', "\t")
        dist_dense = pd.read_csv("./matrix.csv", delimiter="\t",
                                 header=None).values
        dist_dense = dist_dense[:, :-1]

        sims_attributes = ensemble_attributes("file_attributes.csv", "\t")
        sim_attributes = pd.read_csv("./matrix_uet.csv",
                                     delimiter="\t",
                                     header=None).values
        sim_attributes = sim_attributes[:, :-1]

        dist_attributes = sim_to_dist(np.array(sim_attributes))
        dist = alpha * dist_dense + (1 - alpha) * dist_attributes
        dist = dist / 2
        # `true` holds the ground-truth labels (a module-level global).
        model_kmeans = KMeans(n_clusters=len(set(true)))
        scaler = QuantileTransformer(n_quantiles=10)
        dist_scaled = scaler.fit_transform(dist)
        dist_dense_scaled = scaler.fit_transform(dist_dense)
        dist_attributes_scaled = scaler.fit_transform(dist_attributes)
        results_dense = TSNE(
            metric="precomputed").fit_transform(dist_dense_scaled)

        results_dense_both = TSNE(
            metric="precomputed").fit_transform(dist_scaled)
        results_dense_attributes = TSNE(
            metric="precomputed").fit_transform(dist_attributes_scaled)
        labels_dense_kmeans_both = model_kmeans.fit_predict(results_dense_both)
        labels_dense_kmeans_attributes = model_kmeans.fit_predict(
            results_dense_attributes)
        labels_dense_kmeans_structure = model_kmeans.fit_predict(results_dense)

        nmis_both.append(
            nmi(labels_dense_kmeans_both, true, average_method="arithmetic"))
        nmis_attributes.append(
            nmi(labels_dense_kmeans_attributes,
                true,
                average_method="arithmetic"))
        nmis_structure.append(
            nmi(labels_dense_kmeans_structure,
                true,
                average_method="arithmetic"))
    print("Structure : {0}, {1}".format(np.mean(nmis_structure),
                                        np.std(nmis_structure)))
    print("Attributes : {0}, {1}".format(np.mean(nmis_attributes),
                                         np.std(nmis_attributes)))
    print("Both : {0}, {1}".format(np.mean(nmis_both), np.std(nmis_both)))

    return (nmis_structure, nmis_attributes, nmis_both)
Code example #5
def _test_quantile_transformer(shape, n_quantiles):
    from sklearn.preprocessing import QuantileTransformer

    st_helper = SklearnTestHelper()

    rng = np.random.RandomState(0)
    data = np.sort(rng.normal(loc=0.5, scale=0.25, size=shape), axis=0)

    qt = QuantileTransformer(n_quantiles=n_quantiles, random_state=0)
    qt.fit_transform(data)

    dshape = (relay.Any(), len(data[0]))
    _test_model_impl(st_helper, qt, dshape, data.astype("float32"))
Code example #6
File: 1505027.py Project: snat1505027/ML-Offlines
def loadCreditCardData(label):
    dataframe = pd.read_csv("Data/creditcardfraud/creditcard.csv",
                            delimiter=",",
                            engine='python')

    dataframe = dataframe.dropna(axis=0, subset=[label])
    dataframe = dataframe.reset_index(drop=True)

    quantile_scaler = QuantileTransformer(random_state=0,
                                          output_distribution='uniform')

    # Flatten back to 1-D before inserting as DataFrame columns.
    Scaled_amount = quantile_scaler.fit_transform(
        dataframe['Amount'].values.reshape(-1, 1)).ravel()
    Scaled_time = quantile_scaler.fit_transform(
        dataframe['Time'].values.reshape(-1, 1)).ravel()

    dataframe.drop(['Time', 'Amount'], axis=1, inplace=True)

    dataframe.insert(0, 'Amount', Scaled_amount)
    dataframe.insert(1, 'Time', Scaled_time)

    # target_col = label
    # other_cols = [x for x in dataframe.columns if x not in target_col]

    Y_true = dataframe.loc[dataframe[label] == 1]
    Y_false = dataframe.loc[dataframe[label] == 0]
    Y_false = Y_false.sample(n=20000)
    subdata = pd.concat([Y_true, Y_false], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
    dataframe = subdata.sample(frac=1)

    dataframe = dataframe.reset_index(drop=True)

    print('No Frauds',
          round(dataframe[label].value_counts()[0] / len(dataframe) * 100, 2),
          '% of the dataset')
    print('Frauds',
          round(dataframe[label].value_counts()[1] / len(dataframe) * 100, 2),
          '% of the dataset')

    numeric_cols = dataframe.select_dtypes(include=np.number).columns
    # cat_cols = [x for x in dataframe.columns if x not in numeric_cols]

    for col in numeric_cols:
        median = dataframe[col].median()
        dataframe[col] = dataframe[col].fillna(median)

    for col in numeric_cols:
        if (col == label):
            continue
        est = KBinsDiscretizer(n_bins=13, encode='ordinal', strategy='uniform')
        dataframe[col] = est.fit_transform(dataframe[[col]])

    return dataframe
Code example #7
def train_neural_network(x):

    logits = recurrent_neural_network(x)
    prediction = tf.nn.softmax(logits)
    #prediction = recurrent_neural_network(x)
    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y))
    optimizer = tf.train.AdamOptimizer().minimize(cost)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        #from sklearn.model_selection import train_test_split
        #epoch_x, test_x, epoch_y, test_y = train_test_split(x1, y1, test_size = 0.25)
        #print(epoch_x.shape)
        epoch_x1, epoch_y1, test_x1, test_y1 = readCSV(train_path)
        # Fit separate scalers on the training split only; transforming the
        # test split with fit_transform would leak test statistics.
        sc_x = QuantileTransformer(output_distribution='uniform')
        sc_y = QuantileTransformer(output_distribution='uniform')
        epoch_x1 = sc_x.fit_transform(epoch_x1)
        epoch_y1 = sc_y.fit_transform(epoch_y1)
        test_x1 = sc_x.transform(test_x1)
        test_y1 = sc_y.transform(test_y1)
        epoch_x1 = np.split(epoch_x1, 55)
        #print(epoch_x1)
        epoch_y1 = np.split(epoch_y1, 55)
        for epoch in range(hm_epochs):
            epoch_loss = 0
            #epoch_y=np.split(epoch_y,20)
            for i, j in zip(epoch_x1, epoch_y1):
                e_x = i
                e_y = j
                e_x = e_x.reshape((batch_size, n_chunks, chunk_size))
                #e_x = .reshape(e_x, shape=[batch_size,n_chunks,chunk_size])

                _, c = sess.run([optimizer, cost], feed_dict={x: e_x, y: e_y})
                epoch_loss += c

            print('Epoch', epoch, 'completed out of', hm_epochs, 'loss:',
                  epoch_loss)

        correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))

        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
        print(
            'Accuracy:',
            accuracy.eval({
                x: test_x1.reshape((330, n_chunks, chunk_size)),
                y: test_y1
            }))
        # pred = prediction.eval({x: test_x})
Code example #8
def test_transform_default_params():
    N = 1000
    rng = np.random.RandomState(22922)
    data = np.stack([
        rng.lognormal(10, 5, N),
        rng.uniform(-10, 0, N),
        rng.normal(10, 10, N),
        rng.normal(-1, 1, N)
    ],
                    axis=1)
    transformer = QuantileTransformer(output_distribution="normal",
                                      random_state=3434)
    data_transformed_sk = transformer.fit_transform(data)
    data_double_transformed_sk = transformer.inverse_transform(
        data_transformed_sk)
    np.testing.assert_allclose(data, data_double_transformed_sk)

    transformer_tf = QuantileTransformerTF(transformer)
    data_transformed_tf = transformer_tf.transform(data.astype(np.float64),
                                                   False)
    data_double_transformed_tf = transformer_tf.transform(
        data_transformed_tf, True)

    with tf.Session() as session:
        data_transformed_tf_val, data_double_transformed_tf_val = session.run(
            [data_transformed_tf, data_double_transformed_tf])
    np.testing.assert_allclose(data_transformed_sk, data_transformed_tf_val)
    np.testing.assert_allclose(data, data_double_transformed_tf_val)
Code example #9
def transform_spectrum(spectrum,
                       baseline_max=None,
                       baseline_min=None,
                       qt=None):
    if spectrum.ndim == 1:
        spectrum = spectrum.reshape(-1, len(spectrum))
    local_spectrum, _, _ = normalise_spectrum(spectrum)
    min_spectrum = spectrum.min(axis=1)
    max_spectrum = spectrum.max(axis=1)
    if (baseline_max is None) or (baseline_min is None):
        baseline_norm, baseline_max, baseline_min = normalise(
            min_spectrum, min_spectrum.max(), min_spectrum.min())
    else:
        baseline_norm, baseline_max, baseline_min = normalise(
            min_spectrum, baseline_max, baseline_min)
    baseline_norm *= 2
    height = max_spectrum - min_spectrum
    if qt is None:
        qt = QuantileTransformer(n_quantiles=50, random_state=0)
        height_norm = qt.fit_transform(height.reshape(-1, 1))
    else:
        height_norm = qt.transform(height.reshape(-1, 1))
    height_norm *= 2
    height_norm += 1

    t_spectrum = np.transpose(local_spectrum.T * height_norm.flatten() +
                              baseline_norm.T)

    return t_spectrum, baseline_max, baseline_min, qt
Code example #10
File: Offbeatr.py Project: jeffrose20/Offbeat_App
    def get_songs(self,
                  songfile=None,
                  host='35.196.88.209',
                  user='******',
                  password='******',
                  database='SPOTIFY'):
        """As a security measure, IP must be whitelisted in Google Cloud
        prior to getting song data."""
        if not songfile:
            # Use the method's connection parameters rather than re-hardcoding
            # the credentials inline.
            conn = pymysql.connect(host=host, user=user,
                                   password=password, database=database)
            query = """
                    SELECT *
                    FROM songs
                    """
            print('fetching songs from database')
            self.songs = pd.read_sql(query, conn)
            conn.close()
        else:
            print('reading songs from local file')
            self.songs = pd.read_csv(songfile, skiprows=[1])
        self.N = self.songs.shape[0]
        self.songs_labeled_ = self.songs[['song_id']].copy()
        qt = QuantileTransformer(output_distribution='normal',
                                 random_state=self.rng)
        self.raw_data = qt.fit_transform(np.array(self.songs[self.keepers]))
        dump(qt, 'qt.pickle')
        print("Saved transformer to file: 'qt.pickle'")
Code example #11
class LinearClassification:
    def __init__(self, random_seed=82):
        self.random_seed = random_seed
        self.transformer_params = {'random_state': self.random_seed + 1}
        self.transformer = QuantileTransformer(**self.transformer_params)
        self.model_params = {
            'penalty': 'l2',
            'C': 5.0,
            'class_weight': 'balanced',
            'random_state': self.random_seed + 2,
            'solver': 'saga',
            'max_iter': 250,
            'n_jobs': 4,
        }
        self.model = None

    def train(self, data, label):
        self.fillna_values = data.mean()
        data = self.transformer.fit_transform(data.fillna(self.fillna_values))
        self.model = LogisticRegression(**self.model_params)
        self.model.fit(data, label)

    def predict(self, data):
        data = self.transformer.transform(data.fillna(self.fillna_values))
        preds = self.model.predict_proba(data)[:, 1]

        return preds
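A hypothetical usage sketch for LinearClassification on synthetic data; the imports and toy frame below are assumptions, not part of the original snippet:

# Hypothetical usage; assumes the imports the class relies on are in scope
# (pandas, numpy, QuantileTransformer, LogisticRegression).
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
data = pd.DataFrame(rng.normal(size=(2000, 5)), columns=list("abcde"))
label = (data["a"] + rng.normal(scale=0.5, size=2000) > 0).astype(int)

clf = LinearClassification()
clf.train(data, label)
probs = clf.predict(data)  # probability of the positive class per row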
Code example #12
def get_full_dataset(path, dtype=np.float32):
    df_list = []
    for file_name in os.listdir(path):
        if file_name.endswith(".csv"):
            df_list.append(load_file(os.path.join(path, file_name), dtype))
    data_full = pd.concat(df_list, ignore_index=True, join="inner",
                          copy=False)[ONE_AND_TRUE_COLUMNS_ORDER]
    # Since we'll be doing quite some parameter search,
    # we'll ignore the test for now
    data_train, data_val, _ = split(data_full)
    columns_to_rescale = dll_columns + feature_columns
    scaler = QuantileTransformer(output_distribution="normal",
                                 n_quantiles=int(1e5),
                                 subsample=int(1e10),
                                 copy=False)
    # TODO(kazeev) does it copy?
    print("It will now print a warning, but still work. Most likely due to"
          " a copy of data_full made by train_test_split, but feel free to"
          " investigate.")
    data_train.loc[:, columns_to_rescale] = scaler.fit_transform(
        data_train.loc[:, columns_to_rescale].values).astype(dtype)
    data_val.loc[:, columns_to_rescale] = scaler.transform(
        data_val.loc[:, columns_to_rescale].values).astype(dtype)

    return data_train, data_val, scaler
Code example #13
def quantile_transform2(input_path, output_path):
    folder_creator(output_path, 1)
    for crypto in os.listdir(input_path):
        df = pd.read_csv(input_path + crypto, sep=",", header=0)
        for feature in df.columns.values:
            # TODO: rework this with the while loop here...
            if feature not in [
                    'Date', 'Open', 'High', 'Close', 'Low', 'Adj Close'
            ]:
                stat, p = stats.normaltest(df[feature])
                if p <= 0.05:
                    print('transforming:' + feature)
                    p = -1
                    n_t = 1
                    while p <= 0.05:
                        qt = QuantileTransformer(n_quantiles=n_t,
                                                 random_state=0,
                                                 output_distribution="normal")
                        quantile_values = qt.fit_transform(
                            df[feature].values.reshape(-1, 1))
                        new_values = pd.Series(quantile_values.reshape(-1))
                        stat, p = stats.normaltest(new_values)
                        if p > 0.05:
                            df[feature] = pd.Series(new_values)
                            print('num_quantiles:' + str(n_t))
                        elif (n_t < 100):
                            n_t += 1
                        else:
                            break

        df.to_csv(output_path + crypto, sep=",", index=False)
Code example #14
def quantile_transform(input_path, output_path):
    folder_creator(output_path, 1)
    for crypto in os.listdir(input_path):
        print(crypto)
        df = pd.read_csv(input_path + crypto, sep=",", header=0)

        for feature in df.columns.values:
            if feature != "Date":
                print('transforming:' + feature)
                p = -1
                n_t = 1
                # NOTE: unlike quantile_transform2, there is no upper bound on
                # n_t here, so this search may not terminate for some features.
                while p <= 0.05:
                    qt = QuantileTransformer(n_quantiles=n_t,
                                             random_state=0,
                                             output_distribution="normal")
                    quantile_values = qt.fit_transform(
                        df[feature].values.reshape(-1, 1))
                    new_values = pd.Series(quantile_values.reshape(-1))
                    stat, p = stats.normaltest(new_values)
                    if p > 0.05:
                        df[feature] = pd.Series(new_values)
                        print('num_quantiles:' + str(n_t))
                    else:
                        n_t += 1
        df.to_csv(output_path + crypto, sep=",", index=False)
Code example #15
    def predict_on_nrc_liwc(self, raw_test_data):
        # This method generates predictions based on NRC/LIWC data
        liwc_test_df = raw_test_data.get_liwc().copy()
        nrc_test_df = raw_test_data.get_nrc().copy()
        liwc_test_df.columns = [x.lower() for x in liwc_test_df.columns]
        nrc_test_df.columns = [x.lower() for x in nrc_test_df.columns]
        test_users = raw_test_data.get_profiles()['userid']
        liwc_data = pd.merge(test_users, liwc_test_df, on="userid")

        # We fuse the NRC and LIWC data
        nrc_liwc_data = pd.merge(liwc_data, nrc_test_df, on="userid")

        X_test = nrc_liwc_data.drop(['userid'], axis=1)

        # We scale the data using sklearn's QuantileTransformer. Note that the
        # transformer is fit on the test data itself, since no training data
        # is available in this method.
        q_scaler = QuantileTransformer(n_quantiles=100)
        X_scaled = q_scaler.fit_transform(X_test)

        prediction = self._nrc_liwc_model.predict_classes(X_scaled)

        print("NRC LIWC prediction: ", len(prediction))

        df = pd.DataFrame()
        df['userid'] = nrc_liwc_data['userid']
        df['gender'] = prediction
        return df
Code example #16
def my_quantile_transform(train_targets, non_train_targets):
    transformer = QuantileTransformer(output_distribution="uniform")
    train_targets[train_targets.columns] = transformer.fit_transform(
        train_targets.values)
    non_train_targets[train_targets.columns] = transformer.transform(
        non_train_targets.values)
    return train_targets, non_train_targets
Code example #17
def rankgauss(X_train, y, X_test, id_test):
    rg = QuantileTransformer(n_quantiles=100,
                             random_state=0,
                             output_distribution='normal')
    X_train.iloc[:, :] = rg.fit_transform(X_train)
    X_test.iloc[:, :] = rg.transform(X_test)
    return X_train, y, X_test, id_test
Code example #18
def get_and_process_boston_dataset(random_state: int = 42,
                                   normalize_y: bool = True,
                                   normalize_X: bool = True):
    # Load (note: load_boston was removed in scikit-learn 1.2)
    X, y = load_boston(return_X_y=True)
    # Split into train/test
    X_train, X_test, y_train, y_test = continious_stratification(
        X, y, random_state=random_state)
    # Normalize target
    if normalize_y:
        tgt_trans = QuantileTransformer(n_quantiles=300,
                                        output_distribution="normal",
                                        random_state=random_state)
        y_train = tgt_trans.fit_transform(y_train[:, None])
        y_test = tgt_trans.transform(y_test[:, None])
    else:
        y_train = y_train[:, None]
        y_test = y_test[:, None]
    # Normalize features
    if normalize_X:
        feature_trans = StandardScaler()
        X_train = feature_trans.fit_transform(X_train)
        X_test = feature_trans.transform(X_test)

    return X_train, X_test, y_train, y_test
Code example #19
def quantile_transformer(data):

    data_columns = list(data.columns)
    transformer = QuantileTransformer()
    data = transformer.fit_transform(data)
    data = pd.DataFrame(data, columns=data_columns)
    return data
Code example #20
    def fit(self, X: list[Config[T]], y: list[Performance]) -> None:
        """
        Uses the provided data to fit a model which is able to predict the
        target variables from the input.

        Args:
            X: The input configurations.
            y: The performance values associated with the input configurations.
        """
        y_numpy = self.performance_transformer.fit_transform(y)

        # If we apply any normalization, we do so independently per dataset
        if self.output_normalization is not None:
            # Initialize the transformer
            if self.output_normalization == "quantile":
                transformer = QuantileTransformer()
            else:
                transformer = StandardScaler()

            # Assign indices according to datasets
            encoder = LabelEncoder()
            dataset_indices = encoder.fit_transform(
                [x.dataset.name() for x in X])

            # Then, iterate over datasets and transform the objectives
            result = np.empty_like(y_numpy)
            for i in range(len(encoder.classes_)):
                mask = dataset_indices == i
                result[mask] = transformer.fit_transform(y_numpy[mask])

            # And eventually re-assign the result
            y_numpy = result

        self._fit(X, y_numpy)
Code example #21
def test_transform():
    N = 10000
    rng = np.random.RandomState(223532)
    data_2 = rng.normal(0, 1, N // 4)
    data = np.stack([
        rng.uniform(-10, 10, N),
        rng.lognormal(10, 5, N),
        np.concatenate([data_2] * 4),
        rng.normal(-1, 1, N)
    ],
                    axis=1)
    transformer = QuantileTransformer(output_distribution="normal",
                                      random_state=34214)
    data_transformed_sk = transformer.fit_transform(data)
    data_double_transformed_sk = transformer.inverse_transform(
        data_transformed_sk)
    np.testing.assert_allclose(data, data_double_transformed_sk)

    # To test that QuantileTransformerTF picks up the right columns
    # we ask it only for [1, 2, 3] columns and when testing use data[:, 1:]
    transformer_tf = QuantileTransformerTF(transformer, [1, 2, 3],
                                           dtype=np.float64)
    data_transformed_tf = transformer_tf.transform(
        data[:, 1:].astype(np.float64), False)
    data_double_transformed_tf = transformer_tf.inverse_transform(
        data_transformed_tf)

    with tf.Session() as session:
        data_transformed_tf_val, data_double_transformed_tf_val = session.run(
            [data_transformed_tf, data_double_transformed_tf])
    np.testing.assert_allclose(data_transformed_sk[:, 1:],
                               data_transformed_tf_val)
    np.testing.assert_allclose(data[:, 1:], data_double_transformed_tf_val)
Code example #22
def rankGauss(train, test, col):
    transformer = QuantileTransformer(n_quantiles=100,
                                      random_state=0,
                                      output_distribution="normal")
    train[col] = transformer.fit_transform(train[col].values)
    test[col] = transformer.transform(test[col].values)
    return train, test
Code example #23
def normalize(trn, val, test):
    """
    Performs quantile normalization on the train, validation and test data. The
    QuantileTransformer is fit on the train data and then applied to the
    validation and test data.
    
    Args:
            trn: train data - pandas dataframe.
            val: validation data - pandas dataframe.
            test: test data - pandas dataframe.
    
    Returns:
            trn_norm: normalized train data - pandas dataframe.
            val_norm: normalized validation - pandas dataframe.
            test_norm: normalized test data - pandas dataframe.
    """
    norm_model = QuantileTransformer(n_quantiles=100,
                                     random_state=0,
                                     output_distribution="normal")
    trn_norm = pd.DataFrame(norm_model.fit_transform(trn),
                            index=trn.index,
                            columns=trn.columns)
    val_norm = pd.DataFrame(norm_model.transform(val),
                            index=val.index,
                            columns=val.columns)
    test_norm = pd.DataFrame(norm_model.transform(test),
                             index=test.index,
                             columns=test.columns)
    return trn_norm, val_norm, test_norm
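A small usage sketch for normalize on toy frames; the data and column names are ours, assuming pandas, numpy, and QuantileTransformer are imported:

# Hypothetical usage with toy frames; illustrative only.
import numpy as np
import pandas as pd
from sklearn.preprocessing import QuantileTransformer

rng = np.random.default_rng(0)
cols = ["g0", "g1", "g2"]
trn = pd.DataFrame(rng.exponential(size=(200, 3)), columns=cols)
val = pd.DataFrame(rng.exponential(size=(50, 3)), columns=cols)
test = pd.DataFrame(rng.exponential(size=(50, 3)), columns=cols)

trn_norm, val_norm, test_norm = normalize(trn, val, test)
print(trn_norm.mean().round(2), trn_norm.std().round(2))  # roughly N(0, 1)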
Code example #24
class LinearRegression:
    def __init__(self, random_seed=82):
        self.random_seed = random_seed
        self.transformer_params = {'random_state': self.random_seed + 1}
        self.transformer = QuantileTransformer(**self.transformer_params)
        self.model_params = {'max_iter': 1000, 'random_state': self.random_seed + 2}
        self.model = None

    def train(self, data, label, ds=None, train_tl=200):
        start_time = time.time()
        self.fillna_values = data.mean()
        data.fillna(self.fillna_values, inplace=True)
        self.model = Lasso(**self.model_params, alpha=0.1)
        self.model.fit(self.transformer.fit_transform(data), label)
        model_train_time = time.time() - start_time

        try:  # search
            if ds is not None:
                data['ds'] = ds
                cv = TimeSeriesCV(n_splits=min(6, data.shape[0] // 30))
                folds = list(cv.split(data))
                data.drop('ds', axis=1, inplace=True)
            else:
                cv = KFold(n_splits=3, shuffle=True, random_state=self.random_seed + 3)
                folds = list(cv.split(data))

            n_alphas = int(min(35, (train_tl - 2 * model_train_time) / (model_train_time * len(folds))))
            lasso_alpha, lasso_rmse = self._search_params(data, label, model=Lasso, search_space=np.logspace(-2, 0, n_alphas), folds=folds)
            Model, best_alpha = Lasso, lasso_alpha

            n_alphas = int(min(10, (train_tl - (time.time() - start_time) - model_train_time) / (model_train_time * 1.5 * len(folds))))
            if n_alphas > 2:
                ridge_alpha, ridge_rmse = self._search_params(data, label, model=Ridge, search_space=np.logspace(-2, 2, n_alphas), folds=folds)
                if lasso_rmse * 0.99 < ridge_rmse:
                    best_alpha = ridge_alpha
                    Model = Ridge

            self.model_params.update({'alpha': best_alpha, 'random_state': self.random_seed + 4})
            self.model = Model(**self.model_params)
            self.model.fit(self.transformer.transform(data), label)
        except Exception:
            # Fall back to the Lasso fitted before the search.
            pass

    def predict(self, data):
        data = self.transformer.transform(data.fillna(self.fillna_values))
        preds = self.model.predict(data)

        return preds

    def _search_params(self, data, label, model, search_space, folds=3, scorer=None):
        scorer = scorer or make_scorer(_rmse, greater_is_better=False)
        pipeline = Pipeline([
            ('t', QuantileTransformer(**self.transformer_params)),
            ('m', model(**self.model_params))
        ])
        gs = GridSearchCV(pipeline, {'m__alpha': search_space}, scoring=scorer, cv=folds)
        gs.fit(data, label)

        return gs.best_params_['m__alpha'], gs.best_score_
Code example #25
def normal_transform(df, scale_target):
    qt = QuantileTransformer(output_distribution="normal")
    if not scale_target:
        target = df['MedHouseVal']
    df = pd.DataFrame(qt.fit_transform(df), columns=df.columns)
    if not scale_target:
        df['MedHouseVal'] = target
    return df
Code example #26
def bad_quantile_transform(train_targets, non_train_targets):
    transformer = QuantileTransformer(output_distribution="normal",
                                      n_quantiles=100)
    train_targets[train_targets.columns] = transformer.fit_transform(
        train_targets.values)
    non_train_targets[train_targets.columns] = transformer.transform(
        non_train_targets.values)
    return train_targets, non_train_targets, "i am the wrong type for an inversion result"
Code example #27
    def _scale(self, stsc, lab, dev=True):
        ctrans = ColumnTransformer(
            [('scale_all', StandardScaler(), stsc),
             ('cats', OneHotEncoder(categories='auto'), lab)])

        # xtsc = StandardScaler()
        xtsc = QuantileTransformer(output_distribution='normal', random_state=self.rand)
        # ytsc = StandardScaler()
        ytsc = QuantileTransformer(output_distribution='normal', random_state=self.rand)
        mmx = MinMaxScaler(feature_range=(-1, 1))
        mmy = MinMaxScaler(feature_range=(-1, 1))
        # wtsc = StandardScaler(with_mean=False)

        self.X_train_ft = ctrans.fit_transform(self.X_train_ft)
        self.X_test_ft = ctrans.transform(self.X_test_ft)
        self.X_train_ts = xtsc.fit_transform(self.X_train_ts)
        self.X_test_ts = xtsc.transform(self.X_test_ts)

        self.X_train_ts = mmx.fit_transform(self.X_train_ts)
        self.X_test_ts = mmx.transform(self.X_test_ts)

        if self.ts:
            self.x_train = self.X_train_ts
            self.x_test = self.X_test_ts
        else:
            self.x_train = np.concatenate([self.X_train_ft, self.X_train_ts], axis=1)
            self.x_test = np.concatenate([self.X_test_ft, self.X_test_ts], axis=1)

        # self.train_wt = wtsc.fit_transform(self.train_wt)
        # self.test_wt = wtsc.transform(self.test_wt)

        self.y_train_sc = ytsc.fit_transform(self.y_train)
        self.y_test_sc = ytsc.transform(self.y_test)
        self.y_train_sc = mmy.fit_transform(self.y_train_sc)
        self.y_test_sc = mmy.transform(self.y_test_sc)

        if self.wt:
            self.y_train = np.concatenate([self.y_train, self.train_wt], axis=1)
            self.y_test = np.concatenate([self.y_test, self.test_wt], axis=1)

        self.xtrans_sc = xtsc
        self.xtrans_mm = mmx
        self.ytrans_mm = mmy
        self.ytrans_sc = ytsc
        self.ftrans = ctrans
Code example #28
def folder_readpile(path, npset, samples, list_index):
    '''Importer for files stored in a folder in which we have more than one strand. It quantile normalizes the data to make it comparable between samples.
	-path: Location of the files.
	-npset: np.array to add the files in the path.
	-samples: list of samples selected. If not it will take everything in the file.
	-list_index: stores the samples id to recognize each row of the npset.'''
    for file in os.listdir(path):
        if samples is not None:
            if file.replace('_read.pile', "") in samples:
                tmp = pd.read_csv(path + file, sep='\t')
                if 'Unnamed: 3' in tmp.columns:
                    tmp = tmp.drop(['Unnamed: 3'], axis=1)
                if 'pos' in tmp.columns:
                    tmp = tmp.drop(['pos'], axis=1)
                qqnorm = QuantileTransformer(n_quantiles=1000,
                                             output_distribution='uniform',
                                             random_state=0)
                tmpnorm = qqnorm.fit_transform(tmp)
                tmp.loc[:, :] = tmpnorm
                tmp_np = np.reshape(tmp.to_numpy(),
                                    (1, tmp.shape[0], tmp.shape[1]))
            else:
                continue
        elif samples is None:
            tmp = pd.read_csv(path + file, sep='\t')
            if 'Unnamed: 3' in tmp.columns:
                tmp = tmp.drop(['Unnamed: 3'], axis=1)
            if 'pos' in tmp.columns:
                tmp = tmp.drop(['pos'], axis=1)
            qqnorm = QuantileTransformer(n_quantiles=1000,
                                         output_distribution='uniform',
                                         random_state=0)
            tmpnorm = qqnorm.fit_transform(tmp)
            tmp.loc[:, :] = tmpnorm
            tmp_np = np.reshape(tmp.to_numpy(),
                                (1, tmp.shape[0], tmp.shape[1]))
        # At this point either samples is None or the file already passed the
        # membership check above, so the sample can be appended unconditionally
        # (the original membership re-check raised TypeError when samples was
        # None).
        if npset is None:
            npset = tmp_np
        else:
            npset = np.append(npset, tmp_np, axis=0)
        list_index.append(file.replace('_read.pile', ""))
    return npset, list_index
Code example #29
def quantile_transform_no_invert(train_targets, non_train_targets):
    transformer = QuantileTransformer(output_distribution="normal",
                                      n_quantiles=100)
    train_targets[train_targets.columns] = transformer.fit_transform(
        train_targets.values)
    non_train_targets[train_targets.columns] = transformer.transform(
        non_train_targets.values)
    return train_targets, non_train_targets
Code example #30
def map_2_uniform(X):
    '''Maps N*M data from any distribution to a distribution as close to
    uniform as possible, with values between 0 and 1, one row at a time.'''
    quantile_transformer = QuantileTransformer(random_state=1993)
    # Each row's M values are treated as M samples of a single feature
    # (reshape(-1, 1)); reshaping to (1, -1) would give every feature a
    # single sample and collapse the transform.
    data = [
        quantile_transformer.fit_transform(X[i].reshape(-1, 1)).ravel()
        for i in range(0, X.shape[0])
    ]
    return np.array(data).astype('float32')
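A quick sanity check for the row-wise mapping above; the toy array and expected shapes are our assumptions:

# Hypothetical sanity check; the input array is illustrative only.
import numpy as np
from sklearn.preprocessing import QuantileTransformer

X = np.random.default_rng(7).lognormal(size=(3, 1500))
U = map_2_uniform(X)
print(U.shape)            # (3, 1500)
print(U.min(), U.max())   # values lie in [0, 1]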