Example #1
)
clf = neighbors.KNeighborsClassifier(n_neighbors=1)
clf.fit(X, y)

############################################################################
# You can also ask for meta-data to automatically preprocess the data.
#
# * e.g. categorical features -> do feature encoding
dataset = openml.datasets.get_dataset(17)
X, y, categorical_indicator, attribute_names = dataset.get_data(
    dataset_format='array',
    target=dataset.default_target_attribute
)
print("Categorical features: {}".format(categorical_indicator))
transformer = compose.ColumnTransformer(
    [('one_hot_encoder', preprocessing.OneHotEncoder(categories='auto'), categorical_indicator)])
X = transformer.fit_transform(X)
clf.fit(X, y)

############################################################################
# Runs: Easily explore models
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^
# We can run (many) scikit-learn algorithms on (many) OpenML tasks.

# Get a task
task = openml.tasks.get_task(403)

# Build any classifier or pipeline
clf = tree.ExtraTreeClassifier()

# Run the flow
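# A minimal sketch of that step, assuming the standard openml-python API
# (openml.runs.run_model_on_task); publishing requires an API key, so it is left commented out.
run = openml.runs.run_model_on_task(clf, task)
# run.publish()
print(run)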
Example #2
    X[pos:pos + len(temptrain), ] = temptrain
    y[pos:pos + len(temptrain), ] = tempytrain
    pos += len(temptrain)
    X[pos:pos + len(tempvalid), ] = tempvalid
    y[pos:pos + len(tempvalid), ] = tempyvalid
    pos += len(tempvalid)
    X[pos:pos + len(temptest), ] = temptest
    y[pos:pos + len(temptest), ] = tempytest
    pos += len(temptest)

select = (y[:, 0] == 1) | (y[:, 4] == 1)
X = X[select, :]
y = y[select, :]
y = np.argmax(y, axis=1)
y[y == 4] = 1
encoder = preprocessing.OneHotEncoder(n_values=2)
y = encoder.fit_transform(np.reshape(y, (len(y), 1))).toarray()

with h5py.File(hdf5_file, 'w') as f:
    X_train = f.create_dataset("X_train", (1000, width), compression="gzip")
    X_valid = f.create_dataset("X_valid", (100, width), compression="gzip")
    X_test = f.create_dataset("X_test", (100, width), compression="gzip")
    y_train = f.create_dataset("y_train", (1000, 2), compression="gzip")
    y_valid = f.create_dataset("y_valid", (100, 2), compression="gzip")
    y_test = f.create_dataset("y_test", (100, 2), compression="gzip")

    X_train[:, ] = X[:1000, :]
    X_valid[:, ] = X[1000:1100, :]
    X_test[:, ] = X[1100:1200, :]
    y_train[:, ] = y[:1000, :]
    y_valid[:, ] = y[1000:1100, :]
    def load(self):
        """
        Load this dataset into an undirected heterogeneous graph, downloading it if required.

        The graph has two types of nodes (``user`` and ``movie``) and one type of edge (``rating``).

        The dataset includes some node features on both users and movies: on users, they consist of
        categorical features (``gender`` and ``job``) which are one-hot encoded into binary
        features, and an ``age`` feature that is scaled to have mean = 0 and standard deviation = 1.

        Returns:
            A tuple where the first element is a :class:`StellarGraph` instance containing the graph
            data and features, and the second element is a pandas DataFrame of edges, with columns
            ``user_id``, ``movie_id`` and ``rating`` (a label from 1 to 5).
        """
        self.download()

        ratings, users, movies, *_ = [
            self._resolve_path(path) for path in self.expected_files
        ]

        edges = pd.read_csv(
            ratings,
            sep="\t",
            header=None,
            names=["user_id", "movie_id", "rating", "timestamp"],
            usecols=["user_id", "movie_id", "rating"],
        )

        users = pd.read_csv(
            users,
            sep="|",
            header=None,
            names=["user_id", "age", "gender", "job", "zipcode"],
            usecols=["user_id", "age", "gender", "job"],
        )

        movie_columns = [
            "movie_id",
            "title",
            "release_date",
            "video_release_date",
            "imdb_url",
            # features from here:
            "unknown",
            "action",
            "adventure",
            "animation",
            "childrens",
            "comedy",
            "crime",
            "documentary",
            "drama",
            "fantasy",
            "film_noir",
            "horror",
            "musical",
            "mystery",
            "romance",
            "sci_fi",
            "thriller",
            "war",
            "western",
        ]
        movies = pd.read_csv(
            movies,
            sep="|",
            header=None,
            names=movie_columns,
            usecols=["movie_id"] + movie_columns[5:],
        )

        # manage the IDs
        def u(users):
            return "u_" + users.astype(str)

        def m(movies):
            return "m_" + movies.astype(str)

        users_ids = u(users["user_id"])

        movies["movie_id"] = m(movies["movie_id"])
        movies.set_index("movie_id", inplace=True)

        edges["user_id"] = u(edges["user_id"])
        edges["movie_id"] = m(edges["movie_id"])

        # convert categorical user features to numeric, and normalize age
        feature_encoding = preprocessing.OneHotEncoder(sparse=False)
        onehot = feature_encoding.fit_transform(users[["gender", "job"]])
        scaled_age = preprocessing.scale(users["age"])
        encoded_users = pd.DataFrame(
            onehot, index=users_ids).assign(scaled_age=scaled_age)

        g = StellarGraph(
            {
                "user": encoded_users,
                "movie": movies
            },
            {"rating": edges[["user_id", "movie_id"]]},
            source_column="user_id",
            target_column="movie_id",
        )
        return g, edges
def main():
    """
    Fit models and make predictions.
    We'll use one-hot encoding to transform our categorical features
    into binary features.
    y and X will be numpy array objects.
    """
    
    filename="main_logit_3way" # nam prefix
    model = LogisticRegression(C=0.7, penalty="l2")  # the classifier we'll use
    
    # === load data in memory === #
    print "loading data"
    y, X = load_data('train.csv')
    y_test, X_test = load_data('test.csv', use_labels=False)
    
    X, X_test = Make_3way(X, X_test)  # add interactions
    # === one-hot encoding === #
    # we want to encode the category IDs encountered both in
    # the training and the test set, so we fit the encoder on both
    encoder = preprocessing.OneHotEncoder()
    encoder.fit(np.vstack((X, X_test)))
    X = encoder.transform(X)  # returns a sparse matrix (see scipy.sparse)
    X_test = encoder.transform(X_test)


    # if you want to create new features, you'll need to compute them
    # before the encoding, and append them to your dataset after
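    # A hedged sketch of that idea (hypothetical names, left commented out so the
    # flow above is unchanged): keep a copy of the raw matrix before encoding,
    # derive the new column from it, then stack it onto the encoded sparse matrix:
    #   from scipy import sparse
    #   extra = (X_raw[:, 0] == X_raw[:, 1]).astype(float).reshape(-1, 1)
    #   X = sparse.hstack((X, extra)).tocsr()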

    # create arrays to hold cv and test predictions
    train_stacker = [0.0 for k in range(0, X.shape[0])]

    # === training & metrics === #
    mean_auc = 0.0
    bagging = 1  # number of models trained with different seeds
    n = 5  # number of folds in stratified cv
    kfolder = StratifiedKFold(y, n_folds=n, shuffle=True, random_state=SEED)
    i = 0
    for train_index, test_index in kfolder:  # for each train/test pair of indices in the kfolder object
        # create training and validation sets
        X_train, X_cv = X[train_index], X[test_index]
        y_train, y_cv = np.array(y)[train_index], np.array(y)[test_index]
        #print (" train size: %d. test size: %d, cols: %d " % ((X_train.shape[0]) ,(X_cv.shape[0]) ,(X_train.shape[1]) ))

        # if you want to perform feature selection / hyperparameter
        # optimization, this is where you want to do it

        # train model and make predictions 
        preds = bagged_set(X_train, y_train, model, SEED, bagging, X_cv, update_seed=True)
        

        # compute AUC metric for this CV fold
        roc_auc = roc_auc_score(y_cv, preds)
        print "AUC (fold %d/%d): %f" % (i + 1, n, roc_auc)
        mean_auc += roc_auc
        
        no = 0
        for real_index in test_index:
            train_stacker[real_index] = preds[no]
            no += 1
        i += 1
        

    mean_auc /= n
    print("Average AUC: %f" % mean_auc)
    print("printing train dataset")
    printfilcsve(np.array(train_stacker), filename + ".train.csv")

    # === Predictions === #
    # When making predictions, retrain the model on the whole training set
    preds = bagged_set(X, y, model, SEED, bagging, X_test, update_seed=True)

    
    # create submission file
    printfilcsve(np.array(preds), filename + ".test.csv")
Example #5
from sklearn.externals import joblib

# Importing dataset
df = pd.read_csv('Churn_Modelling.csv')
X = df.iloc[:, 3:13].values
y = df.iloc[:, 13].values

# Encoding categorical data
#Encoding gender:
le_gender = preprocessing.LabelEncoder()
X[:, 2] = le_gender.fit_transform(X[:, 2])

# Encoding country: use one-hot encoding to avoid nonsensical averages
le_country = preprocessing.LabelEncoder()
X[:, 1] = le_country.fit_transform(X[:, 1])
ohe_country = preprocessing.OneHotEncoder(categorical_features=[1])
X = ohe_country.fit_transform(X).toarray()
X = X[:, 1:]

# Splitting dataset to train and test
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=0)

# Feature scaling
sc = preprocessing.StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
scaler_filename = input('*Enter filename for scaler to be saved: ') + '.bin'
joblib.dump(sc, open(scaler_filename, 'wb'))

# Training the ANN
Example #6
    def __init__(self,
                 survey='OGLE3',
                 band='I',
                 use_time=True,
                 use_err=True,
                 norm=True,
                 folded=True,
                 machine='Jorges-MBP',
                 seq_len=600,
                 phy_params='',
                 subsample=False):
        """
        Parameters
        ----------
        survey     : str
            Name of survey to be used (only OGLE3 available for now)
        band       : str
            Name of passband for a given survey name 
            (OGLE3 uses I-band light curves for now)
        use_time   : bool, optional
            return light curves with time or not
        use_err    : bool, optional
            return light curves with error measurements or not
        norm       : bool, optional
            normalize light curves or not
        folded     : bool, optional
            use folded light curves or not
        machine    : str, optional
            which machine is being used (colab, exalearn, local)
        seq_len    : int, optional
            length of the light curves to be used
        phy_params : str, optional
            which physical parameters will be provided with the loader
        subsample  : bool, optional
            whether to subsample the entire dataset
        """

        if machine == 'Jorges-MBP':
            root = local_root
        elif machine == 'colab':
            root = colab_root
        elif machine == 'exalearn':
            root = exalearn_root
        else:
            print('Wrong machine, please select local, colab or exalearn')
            sys.exit()
        if not folded:
            data_path = ('%s/time_series/real' % (root) +
                         '/%s_lcs_%s_meta_snr5_augmented_trim%i.pkl' %
                         (survey, band, seq_len))
        else:
            data_path = (
                '%s/time_series/real' % (root) +
                '/%s_lcs_%s_meta_snr5_augmented_folded_trim%i.npy.gz' %
                (survey, band, seq_len))
        print('Loading from:\n', data_path)
        with gzip.open(data_path, 'rb') as f:
            self.aux = np.load(f, allow_pickle=True)
        self.lcs = self.aux.item()['lcs']
        self.meta = self.aux.item()['meta']
        del self.aux
        if subsample:
            idx = np.random.randint(0, self.lcs.shape[0], 20000)
            self.lcs = self.lcs[idx]
            self.meta = self.meta.iloc[idx].reset_index(drop=True)
        self.labels = self.meta['Type'].values
        ## integer encoding of labels
        self.label_int_enc = preprocessing.LabelEncoder()
        self.label_int_enc.fit(self.labels)
        self.labels_int = self.label_int_enc.transform(self.labels)
        ## one-hot encoding of labels
        self.label_onehot_enc = preprocessing.OneHotEncoder(sparse=False,
                                                            categories='auto',
                                                            dtype=np.float32)
        self.label_onehot_enc.fit(self.labels.reshape(-1, 1))
        self.labels_onehot = self.label_onehot_enc.transform(
            self.labels.reshape(-1, 1))

        if use_time and not use_err:
            self.lcs = self.lcs[:, :, 0:2]
        if not use_time and not use_err:
            self.lcs = self.lcs[:, :, 1:2]

        if not 'folded' in data_path:
            self.lcs = return_dt(self.lcs)
        if norm:
            self.lcs = normalize_each(self.lcs,
                                      n_feat=self.lcs.shape[2],
                                      scale_to=[.0001, .9999],
                                      norm_time=use_time)

        self.phy_names = []
        if len(phy_params) > 0:
            if 'p' in phy_params or 'P' in phy_params:
                self.phy_names.append('Period')
            if 't' in phy_params or 'T' in phy_params:
                self.phy_names.append('teff_val')
            if 'm' in phy_params or 'M' in phy_params:
                self.phy_names.append('[Fe/H]_J95')
            if 'c' in phy_params or 'C' in phy_params:
                self.phy_names.append('bp_rp')
            if 'a' in phy_params or 'A' in phy_params:
                self.phy_names.append('abs_Gmag')
            if 'r' in phy_params or 'R' in phy_params:
                self.phy_names.append('radius_val')
            if 'l' in phy_params or 'L' in phy_params:
                self.phy_names.append('lum_val')
            self.phy_aux = self.phy_names
        else:
            self.phy_aux = ['Period']

        self.mm_scaler = preprocessing.MinMaxScaler()
        self.mm_scaler.fit(self.meta.loc[:, self.phy_aux].values.astype(
            np.float32))
        self.meta_p = self.mm_scaler.transform(
            self.meta.loc[:, self.phy_aux].values.astype(np.float32))
Example #7
def pre_processing_train(train_data, test_data):
    X_train = train_data.loc[:, train_data.columns != 'SalePrice']
    X_test = test_data.loc[:, test_data.columns != 'SalePrice']

    # In[9]:

    X_combined = X_train.append(X_test, ignore_index=True)
    X_combined.shape

    # In[10]:

    def nulls(X):
        null_train = X.isnull().sum()
        null_train = null_train[null_train > 0]
        return null_train

    # In[11]:

    null_combined = nulls(X_combined)

    # In[12]:

    def dropColumns(X, nulls):
        for i in np.arange(len(nulls)):
            if nulls.values[i] > .5 * len(X):
                X = X.drop([nulls.index[i]], axis=1, inplace=False)
        return X

    # In[13]:

    X_combined = dropColumns(X_combined, null_combined)
    null_combined = nulls(X_combined)

    # In[14]:

    def impute(X, nulls):
        for i in nulls.index:
            #     print(str(data[i].dtype.name) + " " + str(i))
            #     impute_mode = Imputer(missing_values = 'NaN', strategy = 'most_frequent', axis = 0)
            #     impute_mean = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)
            if X[i].nunique() < 50:
                X[i] = X[i].fillna(X[i].mode()[0])
            else:
                X[i] = X[i].fillna(X[i].mean())
        return X

    # In[16]:

    X_combined = impute(X_combined, null_combined)
    X_combined.isnull().sum()

    # In[17]:

    def get_objectIndices(X):
        objectIndices = []
        for column in X:
            if X[column].nunique() < 50:
                objectIndices.append(X.columns.get_loc(column))
        return objectIndices

    def get_numericIndices(X):
        numericIndices = []
        for column in X:
            if X[column].nunique() >= 50:
                numericIndices.append(X.columns.get_loc(column))
        return numericIndices

    def get_numericColumnName(X):
        numericColumnName = []
        for column in X:
            if X[column].nunique() >= 50:
                numericColumnName.append(column)
        return numericColumnName

    # In[19]:

    # In[20]:
    # def remove_num_corr(X):
    #     numericColumnName = get_numericColumnName(X)
    #     numeric_combined = X.loc[:, numericColumnName]
    #     numeric_combined_corr = numeric_combined.corr()
    #     for i in numeric_combined_corr:
    #         numericCorrCount = numeric_combined_corr[i].where(lambda x: abs(x) >= .25).count()
    #         if numericCorrCount > 5:
    #             X = X.drop(i, axis=1, inplace=False)
    #     return X
    #
    # X_combined = remove_num_corr(X_combined)

    numericColumnName = get_numericColumnName(X_combined)
    scaler = preprocessing.StandardScaler()
    scaler.fit(X_combined[numericColumnName])
    X_combined[numericColumnName] = scaler.transform(
        X_combined[numericColumnName])
    # scaler = preprocessing.MinMaxScaler()
    # scaler.fit(X_combined[numericColumnName])
    # X_combined[numericColumnName] = scaler.transform(X_combined[numericColumnName])
    # scaler = preprocessing.RobustScaler()
    # scaler.fit(X_combined[numericColumnName])
    # X_combined[numericColumnName] = scaler.transform(X_combined[numericColumnName])

    # In[22]:

    # In[23]:

    # pca = PCA(.9)
    # pca.fit(X_combined[numericColumnName])
    # a = pca.transform(X_combined[numericColumnName])
    # a.shape
    # X_combined = X_combined.drop(numericColumnName, axis=1, inplace=False)

    objectIndices = get_objectIndices(X_combined)
    le = preprocessing.LabelEncoder()
    X_combined = X_combined.apply(le.fit_transform)
    onehotencoder = preprocessing.OneHotEncoder(
        categorical_features=objectIndices)
    X_combined = onehotencoder.fit_transform(X_combined).toarray()

    # X_combined = np.concatenate((X_combined, a), axis=1)

    return X_combined
def one_hot_transform(formalarray, input_label):  # convert the original codes into a one-hot encoding
    enc = pre.OneHotEncoder()
    enc.fit(formalarray)
    return enc.transform(input_label).toarray()
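# A minimal usage sketch for the helper above (hypothetical data; the import alias
# below is an assumption standing in for the truncated original import):
from sklearn import preprocessing as pre  # assumed alias
formal = [[0, 1], [1, 0], [2, 1]]             # fit on the full table of category codes
print(one_hot_transform(formal, [[1, 1]]))    # -> [[0. 1. 0. 0. 1.]]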
        freq_of_common_class[cls] = freq_of_common_class.get(cls, 0) + 1
###print("\nThe Frequency of occurrence of the common classes in the testing data set", freq_of_each_class)
# Pick 3 Classes with the most number of images from the common Classes
counts = nlargest(3, freq_of_common_class.values())
classes_to_be_considered = [
    key for key, value in freq_of_common_class.items() if value in counts
]
print("The 3 Common Classes have a total number of images: ", sum(counts))
print("The 3 Common Classes that have the highest number of images are: ",
      classes_to_be_considered)
###print("\nClasses that will be considered: ", classes_to_be_considered)
# Transform labels from a list of strings to a list of Numbers
numerical_form_classes = np.asarray(
    [classes_to_be_considered.index(t) for t in classes_to_be_considered])
###print("\nClasses that will be considered in Numerical form: ", numerical_form_classes)
classes_onehot = preprocessing.OneHotEncoder(sparse=False).fit_transform(
    numerical_form_classes.reshape(-1, 1))
###print("One hot vector of Classes that will be considered is", classes_onehot)
# Dictionary with Class name and Corresponding label
corres_label = dict(zip(classes_to_be_considered, classes_onehot))
###print("\nDictionary with Class name and Corresponding label", corres_label)

# The total number of images that will be considered is:

# Build the model's Training Data set
training_data = []
readable_training_data = []
for folder in training_classes:
    if folder in classes_to_be_considered:
        path = TRAIN_DIR + folder
        files = os.listdir(path)
        # print(files)
Example #10
    X_label = []
    X_label1 = []
    for i in range(0, len(my_data[0])):
        if i in unique_list:
            for j in range(0, len(unique_list[i])):
                X_label.append(unique_list[i][j] + str(i))
        else:
            X_label1.append(my_data[0][i] + str(i))
    i = 0
    print(X_label1)
    for i in range(0, len(X_label1)):
        X_label.append(X_label1[i])

    X = [X_label]

    print(len(X))

    enc = preprocessing.OneHotEncoder(categorical_features=categories)
    enc.fit(encoder)
    for row in my_data[1:]:
        X_small = enc.transform(encode(row)).toarray()
        X.append(X_small[0].tolist())

    #myfile = open("processed_data_new_v2.csv",'wb')
    #wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    #for row in X:
    #	wr.writerow(row)

    building_model_accuracy(X)
def run(fold):
    # load the full training data with folds
    df = pd.read_csv("../inputs/train-folds.csv")

    # list of all numerical columns
    num_cols = [
        "age", "fnlwgt", "capital-gain", "capital-loss", "hours-per-week"
    ]

    # drop the numerical columns for simplicity
    df = df.drop(num_cols, axis=1)

    # remove white-spacing from the values of income column
    df["income"] = df.income.str.strip()

    # map targets to 0s and 1s
    target_mapping = {"<=50K": 0, ">50K": 1}
    df.loc[:, "income"] = df.income.map(target_mapping)

    # all the categorical features except income & kfold
    features = [x for x in df.columns if x not in ("kfold", "income")]

    # handle NaN values; converting all columns to strings is fine here
    # because they are all categorical
    for col in features:
        df.loc[:, col] = df[col].astype(str).fillna("NONE")

    # training dataset
    df_train = df[df.kfold != fold].reset_index(drop=True)

    # validation dataset
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # initialize OneHotEncoder from scikit-learn
    ohe = preprocessing.OneHotEncoder()

    # fit ohe on training + validation features
    full_data = pd.concat([df_train[features], df_valid[features]], axis=0)
    ohe.fit(full_data[features])

    # get training data using folds
    x_train = ohe.transform(df_train[features])

    # get validation data using folds
    x_valid = ohe.transform(df_valid[features])

    # initialize xgboost model
    model = xg.XGBClassifier(max_depth=7, n_estimators=200, n_jobs=-1)

    # fit model on training data
    model.fit(x_train, df_train.income.values)

    # predict the probability to get 1s, need to predict
    # probability values as we are calculating AUC
    yhat_ones = model.predict_proba(x_valid)[:, 1]

    # get roc auc score
    auc = metrics.roc_auc_score(df_valid.income.values, yhat_ones)

    fpr, tpr, threshold = metrics.roc_curve(df_valid.income.values, yhat_ones)

    # print auc at each fold
    print(f"Fold = {fold}, AUC = {auc}")
Example #12
def _one_hot(self):
    self.ohe = preprocessing.OneHotEncoder()
    self.ohe.fit(self.df[self.cat_feats].values)
    return self.ohe.transform(self.df[self.cat_feats].values)
Example #13
dataset = dataset.sort_values(by=[' Start time'])

X = dataset.iloc[:, 5:].values
Y = (dataset[' Event Name'])

# deal with nan padding
X = np.nan_to_num(X)
# min-max scale X
mmScaler = preprocessing.MinMaxScaler()
X = mmScaler.fit_transform(X)
# X = X.reshape((-1, 8, 279))  # take every eight row (8 channels) as a sample

# lb = preprocessing.LabelBinarizer()
# Y = lb.fit_transform(np.expand_dims(Y, axis=1))

encoder = preprocessing.OneHotEncoder(categories='auto')
Y = encoder.fit_transform(np.expand_dims(np.asarray(Y), axis=1)).toarray()

# Y = Y[0:-1:8]  # take every eight row as a label


# separate training and test_result set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=3)

#  Using SMOTE oversampling ######################
# X_train = X_train.reshape((len(X_train), -1))  # reshape x so that it can be resampled
# X_train, Y_train = smote.fit_sample(X_train, Y_train)
# X_train = X_train.reshape((len(X_train), 8, -1))  # reshape x back into 8 channel format

# Using Class Weighing ##########################
# classWeight = compute_class_weight('balanced', np.unique(Y), Y)
Example #14
        i[2]='1'

#transform test data
test_cat1 = le1.transform([i[0] for i in test])
test_cat2 = le2.transform([i[1] for i in test])
test_cat3 = le3.transform([i[2] for i in test])
test_cat4 = le4.transform([i[3] for i in test])
test_cat5 = le5.transform([i[4] for i in test])
test_cat6 = le6.transform([i[5] for i in test])
test_cat7 = le7.transform([i[6] for i in test])
test_cat8 = le8.transform([i[7] for i in test])
test_cat9 = le9.transform([i[8] for i in test])
test_cat = [[test_cat1[i],test_cat2[i],test_cat3[i],test_cat4[i],test_cat5[i],test_cat6[i],test_cat7[i],test_cat8[i],test_cat9[i]] for i in range(len(test_cat1))]

#create dummy vars
enc = preprocessing.OneHotEncoder(sparse=True)
enc.fit(X_cat)
X = enc.transform(X_cat)
test = enc.transform(test_cat)

#don't need stores with 0 sales
X = X[Y>0]
Y = Y[Y>0]

#log transform sales
Y = np.log(Y)

#Do some cross val testing
kf = KFold(np.shape(X)[0], n_folds=5)
i=0
rmspe=[]
Example #15
# In[148]:

test = test.dropna(axis=0)

# In[149]:

test.shape

# In[234]:

features = train['Sex']
enc = preprocessing.LabelEncoder()
enc.fit(features)
features = enc.transform(features)
ohe = preprocessing.OneHotEncoder()
encoded = ohe.fit(features.reshape(-1, 1))
features = encoded.transform(features.reshape(-1, 1)).toarray()

# In[151]:

features1 = train['Embarked']
enc = preprocessing.LabelEncoder()
enc.fit(features1)
features1 = enc.transform(features1)
ohe = preprocessing.OneHotEncoder()
encoded = ohe.fit(features1.reshape(-1, 1))
features1 = encoded.transform(features1.reshape(-1, 1)).toarray()

# In[235]:
        if 'classification' in datasets[datasetname]['probtype']:
            with open(datasets[datasetname]['filepath'], 'rb') as fl:
                df = pk.load(fl)

            # check that there are no missing values
            assert (np.all(np.logical_not(df.isna()))), 'Nan values present'
            ycols = datasets[datasetname]['targets']
            xcolsnum = list(
                set(df.select_dtypes([np.number]).columns) - set(ycols))
            xcolsnonnum = list(
                set(df.select_dtypes([object]).columns) - set(ycols))

            if len(xcolsnonnum) > 0:
                # one-hot encoding of any categorical variables
                Xnonnum = df.loc[:, xcolsnonnum].values
                ohe = pp.OneHotEncoder(sparse=False, drop='first')
                ohe.fit(Xnonnum)
                XnonnumOhe = ohe.transform(Xnonnum)

                # check that the excluded variable is the first variable
                excluded = ohe.categories_[0][0]
                idx = XnonnumOhe.sum(
                    axis=1
                ) == 0  # find all rows that don't fit in another category
                assert np.all(Xnonnum[idx] == excluded)

            # concatenate to arrive at final arrays
            xcols = xcolsnum + xcolsnonnum
            X = df.loc[:, xcolsnum].values
            y = np.ravel(df.loc[:, ycols].values)
            if len(xcolsnonnum) > 0:
Example #17
import numpy as np
from sklearn import preprocessing

# create a random 1-d array with 1,000 different categories (int)
example = np.random.randint(1000, size=1000000)

# initialize OneHotEncoder from scikit-learn
# keep sparse=False to get a dense array
ohe = preprocessing.OneHotEncoder(sparse=False)

# fit and transform data with dense one hot encoder
ohe_example = ohe.fit_transform(example.reshape(-1, 1))

# print size in bytes of the dense array
print(f"Size of dense array: {ohe_example.nbytes}")

# initialize OneHotEncoder from scikit-learn
# keep sparse=True to get a sparse matrix
ohe = preprocessing.OneHotEncoder(sparse=True)

# fit and transform data with sparse one-hot encoder
ohe_example = ohe.fit_transform(example.reshape(-1, 1))

# print size of this sparse matrix's data array
print(f"Size of sparse array (data only): {ohe_example.data.nbytes}")

full_size = (ohe_example.data.nbytes + ohe_example.indptr.nbytes +
             ohe_example.indices.nbytes)

# print full size of this sparse matrix
print(f'Full size of sparse array: {full_size}')
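# Rough arithmetic behind the difference (an illustrative estimate, not a measured
# figure): the dense array is 1,000,000 rows x ~1,000 columns x 8 bytes ~ 8 GB,
# while the sparse matrix stores only the ~1,000,000 nonzero values plus their indices.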
CalleEstado = np.array(dataframe.CalleEstado.values)
#print(CalleEstado)

subset = dataframe[["Dia", "TipoCalle", "Iluminacion", "Clima", "CalleEstado"]]
#print(subset)
valores = np.array(subset.values)
#print(valores)

diasCate = [1, 2]
TipoCalleCate = [1, 2, 3, 4, 5]
IluminacionCate = [1, 2, 3, 4]
ClimaCate = [0, 1, 2, 3, 4]
CalleEstadoCate = [0, 1, 2, 3, 4]

enc = preprocessing.OneHotEncoder(categories=[
    diasCate, TipoCalleCate, IluminacionCate, ClimaCate, CalleEstadoCate
])
fit = enc.fit(valores)

arreglo = enc.transform(valores).toarray()

OHE = pd.DataFrame({
    'DiaFinde': arreglo[:, 0],
    'DiaLaboral': arreglo[:, 1],
    "Autopista": arreglo[:, 2],
    "AutopistaDoble": arreglo[:, 3],
    "1Via": arreglo[:, 4],
    "Redondel": arreglo[:, 5],
    "Entrada": arreglo[:, 6],
    "LuzDia": arreglo[:, 7],
    "LuzNoche": arreglo[:, 8],
#type cast features
features_to_cast = ['MSSubClass']
utils.cast_to_cat(house_train, features_to_cast)

#manual feature selection
features_to_drop = ['Id', 'SalePrice']
missing_features_above_th = utils.get_features_to_drop_on_missingdata(
    house_train, 0.25)
features_to_drop.extend(missing_features_above_th)
house_train1 = utils.drop_features(house_train, features_to_drop)
house_train1.info()

#build pipeline for categorical features
categorical_pipeline = pipeline.Pipeline([
    ('imputer', impute.SimpleImputer(strategy="most_frequent")),
    ('ohe', preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

#build pipeline for numerical features
numerical_pipeline = pipeline.Pipeline([('imputer', impute.SimpleImputer()),
                                        ('scaler',
                                         preprocessing.StandardScaler())])

#build preprocessing pipeline for all features
cat_features = utils.get_non_continuous_features(house_train1)
num_features = utils.get_continuous_features(house_train1)

preprocess_pipeline = compose.ColumnTransformer([
    ('cat', categorical_pipeline, cat_features),
    ('num', numerical_pipeline, num_features)
])
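# A hedged continuation (names taken from the snippet above): fit the combined
# pipeline and obtain the transformed feature matrix for model training.
X_transformed = preprocess_pipeline.fit_transform(house_train1)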
    def Initialize(self, trainFileName, devFileName, testFileName):
        trainList = []
        trainResult = []
        self.testFeatures = []
        self.devFeatures = []
        self.trainFeatures = []
        self.train = []
        #self.dev = []
        #self.test = []
        self.devResult = []
        self.rawResult = []

        print "train feature processing..."
        with open(trainFileName) as trainFile:
            for line in trainFile:
                line = line.decode('utf-8').strip()
                if not line:
                    continue
                space = line.find(" ")
                if space < 5:
                    continue
                answer, train = line[:space].upper(), line[space + 1:]
                li, ans = self.lineProc(train, answer, True)
                trainList += li
                trainResult += ans
                self.trainFeatures.append(li)
                self.rawResult.append(self.languages[answer])

        with open(devFileName) as devFile:
            for line in devFile:
                line = line.decode('utf-8').strip()
                if not line:
                    continue
                space = line.find(" ")
                if space < 5:
                    continue
                answer, train = line[:space].upper(), line[space + 1:]
                li = self.lineProc(train, answer, False)
                self.devFeatures.append(li)
                self.devResult.append(self.languages[answer])

        with open(testFileName) as testFile:
            for line in testFile:
                if not line:
                    continue
                line = line.decode('latin-1').strip()
                test = self.lineProc(line, "", False)
                self.testFeatures.append(test)

        trainList, trainResult = self.FisherYatesShuffle(
            trainList, trainResult)
        trainResult = np.array(trainResult)
        self.trainResult = self.answerLables.fit_transform(trainResult)

        self.trainLabels = preprocessing.LabelEncoder()
        featureList = list(self.c)

        self.trainLabels.fit(featureList)
        #print self.trainLabels.classes_
        length = len(self.c)
        print "feature length:", length
        self.v = preprocessing.OneHotEncoder(n_values=length)

        trainList = np.array(trainList)
        self.train = self.trainLabels.transform(
            trainList.ravel()).reshape(*trainList.shape)

        self.train = self.v.fit_transform(self.train).toarray()
        print "train shape", self.train.shape
Example #21
test = test.drop('Trip_ID', axis=1)
train = train.drop(['Trip_ID', 'Surge_Pricing_Type'], axis=1)

for f in cat_cols:
    print(f)
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train[f].values) + list(test[f].values))
    train[f] = lbl.transform(list(train[f].values))
    test[f] = lbl.transform(list(test[f].values))

train = train.fillna(0).values
test = test.fillna(0).values

y = y - 1

ohe = preprocessing.OneHotEncoder(categorical_features=[1, 4, 5, 11],
                                  sparse=False)
ohe.fit(train)
train = ohe.transform(train)
test = ohe.transform(test)

scl = preprocessing.StandardScaler()
train = scl.fit_transform(train)
test = scl.transform(test)

y_enc = np_utils.to_categorical(y)


def nn_model():
    model = Sequential()
    print('Build model...')
def one_hot_encoding():
    encoder = preprocessing.OneHotEncoder(categories='auto')
    encoder.fit([[0, 2, 1, 12], [1, 3, 5, 3], [2, 3, 2, 12], [1, 2, 4, 3]])
    encoded_vector = encoder.transform([[2, 3, 5, 3]]).toarray()
    print("\nEncoded vector =", encoded_vector)
Example #23
cat_imputer = CategoricalImputer()
cat_imputer.fit(titanic_train['Embarked'])
print(cat_imputer.fill_)
titanic_train['Embarked'] = cat_imputer.transform(titanic_train['Embarked'])

encodable_columns = ['Sex', 'Embarked', 'Pclass']
feature_defs = [(col_name, preprocessing.LabelEncoder())
                for col_name in encodable_columns]
mapper = DataFrameMapper(feature_defs)
mapper.fit(titanic_train)
titanic_train[encodable_columns] = mapper.transform(titanic_train)

titanic_train1 = titanic_train.drop(
    ['PassengerId', 'Name', 'Cabin', 'Ticket', 'Survived'], axis=1)

one_hot_encoder = preprocessing.OneHotEncoder(
    categorical_features=np.array([0, 1, 6]))
one_hot_encoder.fit(titanic_train1)
print(one_hot_encoder.n_values_)
X_train = one_hot_encoder.transform(titanic_train1).toarray()
y_train = titanic_train[['Survived']]

dt_estimator = tree.DecisionTreeClassifier(random_state=100)
dt_grid = {'criterion': ['gini', 'entropy'], 'max_depth': [3, 4, 5, 6, 7, 8]}
grid_dt_estimator = model_selection.GridSearchCV(dt_estimator,
                                                 dt_grid,
                                                 cv=10,
                                                 refit='True',
                                                 return_train_score=True)
grid_dt_estimator.fit(X_train, y_train)

print(grid_dt_estimator.best_estimator_)
Example #24
for code_table in code_tables:
    size = len(code_table)  # how many distinct values this column has
    sortcode_table = sorted(code_table.keys())  # the column's distinct values, sorted ascending
    for key, val in enumerate(sortcode_table):
        print(key, val)
    #     code_table[val] = np.zeros(shape=size)  # create a zero vector of that length
    #     code_table[val][key] = 1

# encode each row using the per-column dictionaries
ohe_samples = []
for row in raw_samples:
    ohe_sample = np.array([], dtype=int)
    for key, val in enumerate(row):
        ohe_sample = np.hstack(
            (ohe_sample, code_tables[key][val])
        )  # horizontal concatenation
    ohe_samples.append(ohe_sample)

# one-hot encoding with scikit-learn
one = sp.OneHotEncoder(sparse=True, dtype='int')
ohe_samples = one.fit_transform(raw_samples)

print(ohe_samples)
new_samples = np.array([  # reuse the fitted encoder; if a column contains a code unseen during fit, the result will be wrong
    [7, 8, 9],
    [2, 5, 2],
])
ohe_samples2 = one.transform(new_samples)
print(ohe_samples2)

Example #25
def __init__(self):
    self.std_scaler = preprocessing.StandardScaler()
    self.oht_scaler = preprocessing.OneHotEncoder()
    self.std_scaled = False
    self.oht_scaled = False
Example #26
    # numeric
    ('numeric_variables_processing',
     pipeline.Pipeline(
         steps=[('selecting',
                 preprocessing.FunctionTransformer(
                     lambda data: data[:, numeric_data_indices])
                 ), ('scaling', preprocessing.StandardScaler(with_mean=0))])),

    # categorical
    ('categorical_variables_processing',
     pipeline.Pipeline(
         steps=[('selecting',
                 preprocessing.FunctionTransformer(
                     lambda data: data[:, categorical_data_indices])),
                ('hot_encoding',
                 preprocessing.OneHotEncoder(handle_unknown='ignore'))])),
]

#SGDRegressor

regressor = linear_model.Lasso(max_iter=2000)

estimator = pipeline.Pipeline(steps=[(
    'feature_processing',
    pipeline.FeatureUnion(
        transformer_list=transformer_list)), ('model_fitting', regressor)])

estimator.fit(train_data, train_labels)
predicted = estimator.predict(test_data)

print("RMSLE: ", rmsle(test_labels, predicted))
Example #27
import tensorflow as tf
import numpy as np
import pandas as pd
import fashion_data_import as fin
from sklearn import preprocessing

enc = preprocessing.OneHotEncoder()  # creating new encoder object

# logpath
LOG_PATH = '/home/wataru/machineLearn/kaggle/TF_practice/estimator/customEstimator/log'

# importing the fashion MNIST data from the script as numpy arrays
Xtrain, ytrain = fin.data_in('train')
Xtest, ytest = fin.data_in('test')
num_labels = np.unique(ytrain).size

# one hot matrix for labels
enc.fit(ytrain)
ytrain = enc.transform(ytrain).toarray()

enc.fit(ytest)
ytest = enc.transform(ytest).toarray()

# parameters
learning_rate = 0.001
batchSize = 500
LOGDIR = '/home/wataru/machineLearn/kaggle/TF_practice/estimator/customEstimator/log2'
num_iter = 50000


def model_fn(features, labels, mode, params):
Example #28
# Preprocess categorical labels

from sklearn import preprocessing
from sklearn.pipeline import Pipeline
import pandas as pd

raw_data = {
    'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
    'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
    'age': [42, 52, 36, 24, 73],
    'city': ['San Francisco', 'Baltimore', 'Miami', 'Douglas', 'Boston']
}
df = pd.DataFrame(raw_data, columns=['first_name', 'last_name', 'age', 'city'])
df

# Create dummy variables for every unique category in df.city
pd.get_dummies(df["city"])

# Convert strings categorical names to integers
integerized_data = preprocessing.LabelEncoder().fit_transform(df["city"])

# View data
integerized_data

# Convert integer categorical representations to OneHot encodings
preprocessing.OneHotEncoder().fit_transform(integerized_data.reshape(
    -1, 1)).toarray()
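# Since scikit-learn 0.20, OneHotEncoder also accepts string categories directly,
# so (a hedged, equivalent shortcut) the LabelEncoder step can be skipped:
preprocessing.OneHotEncoder(sparse=False).fit_transform(df[["city"]])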
Example #29
# TODO: create a LabelEncoder object and fit it to each feature in X

# 1. INSTANTIATE
# encode labels with value between 0 and n_classes-1.
le = preprocessing.LabelEncoder()

# 2/3. FIT AND TRANSFORM
# use df.apply() to apply le.fit_transform to all columns
X_2 = X.apply(le.fit_transform)
print("head:", X_2.head())
print("shape of X2 after transform:", X_2)
print("classes", le.classes_)
# TODO: create a OneHotEncoder object, and fit it to all of X

# 1. INSTANTIATE
enc = preprocessing.OneHotEncoder()

# 2. FIT
enc.fit(X_2)

print("shape fitttttt:", enc.fit(X_2))
# 3. Transform
onehotlabels = enc.transform(X_2).toarray()
print("shape:", onehotlabels.shape)

print(onehotlabels)


# as you can see, you still have the same number of rows (891),
# but many more columns, because the categorical data was expanded into one-hot columns
def getLabelEncoder(values1):
Example #30
def _one_hot_encoding(self):
    one_hot_encoders = preprocessing.OneHotEncoder()
    one_hot_encoders.fit(self.df[self.cat_feats].values)
    return one_hot_encoders.transform(self.df[self.cat_feats].values)