Example #1
from scipy.sparse import hstack
from sklearn.feature_extraction import FeatureHasher


def hash_encoder(train, test, cols_encode, target=None, n_features=10):
    """Hash-encode each column and stack the per-column outputs."""
    train_hashes, test_hashes = [], []
    for col_encode in cols_encode:
        h = FeatureHasher(n_features=n_features, input_type="string")
        h.fit(train[col_encode])  # FeatureHasher is stateless; fit() only validates params
        train_hashes.append(h.transform(train[col_encode]))
        test_hashes.append(h.transform(test[col_encode]))
    # stack horizontally so every encoded column is kept, not just the last one
    return hstack(train_hashes), hstack(test_hashes)
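A minimal usage sketch on toy data (the DataFrame contents and column name are invented for illustration; note that with input_type="string" each cell is iterated character by character, so pre-tokenized input is usually preferable):

import pandas as pd

train = pd.DataFrame({"city": ["london", "paris", "london"]})
test = pd.DataFrame({"city": ["berlin"]})
train_hash, test_hash = hash_encoder(train, test, cols_encode=["city"], n_features=8)
print(train_hash.shape)  # (3, 8) sparse matrix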
Example #2
from sklearn.feature_extraction import FeatureHasher


def feat_hash(train_data, test_data, feats):
    """Fit a FeatureHasher on the training split and transform both splits."""
    print('\n', type(train_data[5]))  # debug output kept from the original
    hashed = FeatureHasher(n_features=feats, input_type='string')
    hashed.fit(train_data)
    trans_train = hashed.transform(train_data)
    trans_test = hashed.transform(test_data)
    return trans_train, trans_test
Example #3
class FeatureHash:
    def __init__(self, max_feature_num=400, input_data_type='string'):
        self.feature_hash = FeatureHasher(n_features=max_feature_num,
                                          input_type=input_data_type)

    def get_feature_set(self, train_data):
        # tokenize on spaces first: input_type='string' expects each sample
        # to be an iterable of string tokens
        return self.feature_hash.transform(
            preprocess.tokenize_string_list(train_data, " ")).todense()

    def fit_feature_model(self, train_data):
        self.feature_hash.fit(preprocess.tokenize_string_list(train_data, " "))

    def get_feature_model(self):
        return self.feature_hash
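preprocess.tokenize_string_list is project-specific and not shown; a plausible minimal stand-in, assuming it simply splits each string on the given delimiter so every sample becomes an iterable of tokens:

def tokenize_string_list(strings, delimiter):
    # hypothetical stand-in: each sample becomes a list of string tokens,
    # which is what input_type='string' expects
    return [s.split(delimiter) for s in strings]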
Example #4
class FeatureHash(FeatureExtractionInterface):
    def __init__(self, max_feature_num=5000, input_data_type='string'):
        self.feature_hash = FeatureHasher(n_features=max_feature_num,
                                          input_type=input_data_type)

    def get_feature_set(self, train_data):
        return self.feature_hash.transform(train_data)

    def fit_feature_model(self, train_data):
        self.feature_hash.fit(train_data)

    def save_feature_model(self, filepath):
        classificationutils.save_classifier(filepath, self.feature_hash)

    def load_feature_model(self, filepath):
        return classificationutils.load_classifier(filepath)

    def get_feature_model(self):
        return self.feature_hash
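classificationutils is likewise project-specific; a common implementation of these two helpers would simply wrap joblib (hypothetical sketch, not the original module):

import joblib

def save_classifier(filepath, model):
    # hypothetical stand-in for classificationutils.save_classifier
    joblib.dump(model, filepath)

def load_classifier(filepath):
    # hypothetical stand-in for classificationutils.load_classifier
    return joblib.load(filepath)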
Example #5
import numpy as np
import pandas as pd
from sklearn.base import TransformerMixin
from sklearn.feature_extraction import FeatureHasher


class DFFeatureHasher(TransformerMixin):
    """FeatureHasher wrapper that consumes and returns pandas DataFrames."""
    def __init__(self, n_features=1048576, input_type='string'):
        self.n_features = n_features
        self.input_type = input_type
        self.hasher = None

    def fit(self, X, y=None):
        self.hasher = FeatureHasher(n_features=self.n_features,
                                    input_type=self.input_type)
        self.hasher.fit(np.array(X))
        return self

    def transform(self, X):
        # assumes a single-column DataFrame: output columns are named after X.columns[0]
        Xhasher = self.hasher.transform(np.array(X))
        Xhashed = pd.DataFrame(Xhasher.toarray(),
                               index=X.index,
                               columns=[
                                   f'{X.columns[0]}_hash_{x}'
                                   for x in range(Xhasher.shape[1])
                               ])
        return Xhashed
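A usage sketch (toy data; assumes a single-column string DataFrame, since transform() names the output columns after X.columns[0]):

import pandas as pd

df = pd.DataFrame({"color": ["red", "green", "blue"]})
hashed = DFFeatureHasher(n_features=4).fit(df).transform(df)
print(list(hashed.columns))  # ['color_hash_0', ..., 'color_hash_3']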
Example #6
import numpy as np
import pytest
from sklearn.feature_extraction import FeatureHasher


def test_hasher_set_params():
    # Test delayed input validation in fit (useful for grid search).
    hasher = FeatureHasher()
    hasher.set_params(n_features=np.inf)
    with pytest.raises(TypeError):
        hasher.fit()  # parameters are only validated when fit is called
Example #7
import os
import numpy as np
import pandas as pd
from keras.utils import np_utils
from sklearn.feature_extraction import FeatureHasher


def load_data_v1(data_path):
    # NOTE: 'duration' appears twice in the original; the later column
    # silently overwrites the earlier one in each sample dict
    attr_name = [
        'taxi_id', 'point', 'duration', 'time', 'duration', 'distance'
    ]
    # Training-set data
    train = pd.read_csv(os.path.join(data_path, 'train.txt'), header=None)
    train_set = train.values[:, [0, 1, 2, 3, 4, 5, 6]]
    dataset = train.values[:, [0, 1, 2, 3, 4, 5]]
    print(train_set[0])

    # Test-set data
    test = pd.read_csv(os.path.join(data_path, 'test.txt'), header=None)
    test_set = test.values[:, [0, 1, 2, 3, 4, 5, 6]]
    print(test_set[0])

    # Store each sample (without the label column) as a dict keyed by attribute name
    samples = list()
    for sample in dataset:
        sample_dict = dict()
        for index, attr in enumerate(sample):
            sample_dict[attr_name[index]] = attr
        samples.append(sample_dict)

    h = FeatureHasher(n_features=2048)
    h.fit(samples)

    # Convert the training set into x/y lists
    x_train = list()
    y_train = list()
    for sample in train_set:
        sample_dict = dict()
        for index, attr in enumerate(sample):
            attr = str(attr)
            if index == 6:
                y_train.append(int(attr))
                continue
            sample_dict[attr_name[index]] = attr
        x_train.append(sample_dict)

    # Convert the test set into x/y lists
    x_test = list()
    y_test = list()
    for sample in test_set:
        sample_dict = dict()
        for index, attr in enumerate(sample):
            attr = str(attr)
            if index == 6:
                y_test.append(int(attr))
                continue
            sample_dict[attr_name[index]] = attr
        x_test.append(sample_dict)

    x_train = h.transform(x_train).toarray()
    x_test = h.transform(x_test).toarray()
    print(x_train[0])
    print(x_test[0])
    print(x_train.shape)
    print(x_test.shape)

    y_train = np.asarray(y_train, dtype='int16')
    y_test = np.asarray(y_test, dtype='int16')

    y_train = np_utils.to_categorical(y_train)
    # nb_classes is assumed to be defined elsewhere in the original project
    y_test = np_utils.to_categorical(y_test, nb_classes)
    print(y_train.shape)
    print(y_test.shape)

    # the original returns x_test in place of a dev split (cf. the commented-out signature)
    # return x_train, y_train, x_dev, y_dev, x_test
    return x_train, y_train, x_test, y_test, x_test
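A side note on the default input_type='dict' used above: string values hash the pair 'key=value' with weight 1, while numeric values hash the key alone with the number as the weight. A minimal illustration:

from sklearn.feature_extraction import FeatureHasher

h = FeatureHasher(n_features=8)
X = h.transform([{'taxi_id': '7', 'distance': 3.2}])
print(X.toarray())  # one row: ±1 at hash('taxi_id=7'), ±3.2 at hash('distance')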
Example #8
# Fragment: assumes `train`, `train_set`, `attr_name`, and a FeatureHasher `h`
# are defined as in Example #7.
test = pd.read_csv("test.txt")
test_set = test.values[:, [0, 1, 2, 3, 4, 5, 6, 7, 8]]
print(test_set[0])

dataset = train.values[:, [0, 1, 2, 3, 4, 5, 6, 7]]

samples = list()

for sample in dataset:
    sample_dict = dict()
    for index, attr in enumerate(sample):
        sample_dict[attr_name[index]] = attr
    samples.append(sample_dict)

h.fit(samples)

X_train = list()
y_train = list()

X_test = list()
y_test = list()
for sample in train_set:
    sample_dict = dict()
    for index, attr in enumerate(sample):
        attr = str(attr)
        if index == 8:
            y_train.append(int(attr))
            continue
        sample_dict[attr_name[index]] = attr
    X_train.append(sample_dict)
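A plausible continuation, mirroring Example #7 (the test-set loop is not shown above, so its transform is left commented):

X_train = h.transform(X_train).toarray()
# X_test = h.transform(X_test).toarray()  # once X_test is filled the same way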
Example #9
import pandas as pd
from hashing import HashingEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

dfbgisnlp = pd.read_csv("./Final/DataPre/bgis_vendorPre_words")
dfbgiscost = pd.read_csv("./Final/Data/BGIS_Vendor_scaled1hot.csv")
#Removing Outliers
from sklearn.feature_extraction import FeatureHasher
fh = FeatureHasher(n_features=10, input_type='string')
desc = dfbgisnlp["descriptions"]
#len(dfbgisnlp)
fh.fit(desc)  # FeatureHasher is stateless; fit() only validates parameters
# note: with input_type='string', each description is iterated character by character
hashed = fh.transform(desc)
df = pd.DataFrame(hashed.toarray(),
                  columns=['fh1', 'fh2', 'fh3', 'fh4', 'fh5',
                           'fh6', 'fh7', 'fh8', 'fh9', 'fh10'])
y = dfbgiscost["Func_Burdened_Cost"]
X = dfbgisnlp 
enc = HashingEncoder(cols=['descriptions']).fit(X, y)
numeric_dataset = enc.transform(X)
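HashingEncoder here appears to be category_encoders' scikit-learn-style hashing encoder (or a local copy of it); if so, it replaces the 'descriptions' column with col_0..col_7 by default (n_components=8). A quick sanity check:

print(numeric_dataset.head())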
Example #10
plt.plot(numberFeatures.iloc[:,4])


def plotCorr(data, number):
    subData = data.iloc[number*1000:(number+1)*1000, :]
    corrY = subData.corr()
    plt.figure()
    plt.imshow(corrY)

#plotCorr(numberFeatures,1)
#plotCorr(numberFeatures,2)

#%%
from sklearn.feature_extraction import FeatureHasher
hasher = FeatureHasher(input_type='string', dtype='float')
strFeature = hasher.fit(stringFeature).transform(stringFeature).toarray()

#%% normalization
numberFeatures = numberFeatures.fillna(0)

from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(numberFeatures.iloc[:,1:])
X_scaled = scaler.transform(numberFeatures.iloc[:,1:])
import numpy as np

where_are_NaNs = np.isnan(X_scaled)
X_scaled[where_are_NaNs] = 0
numberFeatures = None
Example #11
y = pd.read_csv(y_url)
y = y.pop('status_group').values

# initialize data sets
x = drop_and_fill_na(x)
x_prob = drop_and_fill_na(x_prob)

# Fit encoders and transform x
oh_enc = OneHotEncoder(handle_unknown='ignore')
oh_enc.fit(x[cols_to_one_hot])

y_oh_enc = OneHotEncoder(handle_unknown='ignore')

# alternate_sign=False keeps all hashed feature values non-negative
funder_feat_hasher = FeatureHasher(n_features=FEATURE_HASHER_NUM_FEATURES, input_type='string', alternate_sign=False)
funder_feat_hasher.fit(x['funder'])

installer_feat_hasher = FeatureHasher(n_features=FEATURE_HASHER_NUM_FEATURES, input_type='string', alternate_sign=False)
installer_feat_hasher.fit(x['installer'])

x = transform_data(x)
x_prob = transform_data(x_prob)
y = y_oh_enc.fit_transform(y.reshape(-1,1)).toarray()

# split into train and test data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=27)
num_labels = 3
input_size = x_train.shape[1]
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
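transform_data is not shown; a plausible reconstruction consistent with the encoders fitted above (a hedged sketch only, not the original implementation):

from scipy.sparse import hstack

def transform_data(df):
    # combine the one-hot and hashed high-cardinality columns into one dense matrix
    parts = [
        oh_enc.transform(df[cols_to_one_hot]),
        funder_feat_hasher.transform(df['funder']),
        installer_feat_hasher.transform(df['installer']),
    ]
    return hstack(parts).toarray()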