from scipy.sparse import hstack
from sklearn.feature_extraction import FeatureHasher


def hash_encoder(train, test, cols_encode, target=None, n_features=10):
    """Hash-encode the given columns and stack the hashed blocks column-wise."""
    h = FeatureHasher(n_features=n_features, input_type="string")
    train_hashes, test_hashes = [], []
    for col_encode in cols_encode:
        h.fit(train[col_encode])
        train_hashes.append(h.transform(train[col_encode]))
        test_hashes.append(h.transform(test[col_encode]))
    # Stack the per-column hashes so every encoded column is returned,
    # not just the last one visited by the loop.
    train_hash = hstack(train_hashes)
    test_hash = hstack(test_hashes)
    return train_hash, test_hash
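# Usage sketch (illustrative, not from the original source): calling hash_encoder
# on toy DataFrames. The column name "city" and its values are made up. Because
# input_type="string" treats each cell as an iterable of tokens, the cells hold
# one-element lists; a bare string would be hashed character by character instead.
import pandas as pd

train_df = pd.DataFrame({"city": [["tokyo"], ["paris"], ["tokyo"], ["lima"]]})
test_df = pd.DataFrame({"city": [["paris"], ["oslo"]]})

train_hashed, test_hashed = hash_encoder(train_df, test_df, cols_encode=["city"], n_features=8)
print(train_hashed.shape)  # (4, 8) sparse matrix
print(test_hashed.shape)   # (2, 8) sparse matrix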
def feat_hash(train_data, test_data, feats):
    print('\n', type(train_data[5]))
    hashed = FeatureHasher(n_features=feats, input_type='string')
    hashed.fit(train_data)
    trans_train = hashed.transform(train_data)
    trans_test = hashed.transform(test_data)
    return trans_train, trans_test
class FeatureHash:
    def __init__(self, max_feature_num=400, input_data_type='string'):
        self.feature_hash = FeatureHasher(n_features=max_feature_num,
                                          input_type=input_data_type)

    def get_feature_set(self, train_data):
        return self.feature_hash.transform(
            preprocess.tokenize_string_list(train_data, " ")).todense()

    def fit_feature_model(self, train_data):
        self.feature_hash.fit(preprocess.tokenize_string_list(train_data, " "))

    def get_feature_model(self):
        return self.feature_hash
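# Usage sketch for the FeatureHash wrapper above (illustrative only). The real
# preprocess module is not shown here, so this assumes preprocess.tokenize_string_list
# simply splits each document on the given separator; the stand-in below exists
# only to make the sketch self-contained.
class preprocess:  # hypothetical stand-in for the real preprocess module
    @staticmethod
    def tokenize_string_list(docs, sep):
        return [doc.split(sep) for doc in docs]

fh = FeatureHash(max_feature_num=32)
docs = ["cheap flight deals", "hotel deals", "cheap hotel"]
fh.fit_feature_model(docs)
features = fh.get_feature_set(docs)
print(features.shape)  # (3, 32) dense matrix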
class FeatureHash(FeatureExtractionInterface):
    def __init__(self, max_feature_num=5000, input_data_type='string'):
        self.feature_hash = FeatureHasher(n_features=max_feature_num,
                                          input_type=input_data_type)

    def get_feature_set(self, train_data):
        return self.feature_hash.transform(train_data)

    def fit_feature_model(self, train_data):
        self.feature_hash.fit(train_data)

    def save_feature_model(self, filepath):
        classificationutils.save_classifier(filepath, self.feature_hash)

    def load_feature_model(self, filepath):
        return classificationutils.load_classifier(filepath)

    def get_feature_model(self):
        return self.feature_hash
class DFFeatureHasher(TransformerMixin):
    # FeatureHasher but for pandas DataFrames
    def __init__(self, n_features=1048576, input_type='string'):
        self.n_features = n_features
        self.input_type = input_type
        self.hasher = None

    def fit(self, X, y=None):
        self.hasher = FeatureHasher(n_features=self.n_features,
                                    input_type=self.input_type)
        self.hasher.fit(np.array(X))
        return self

    def transform(self, X):
        Xhasher = self.hasher.transform(np.array(X))
        Xhashed = pd.DataFrame(
            Xhasher.toarray(),
            index=X.index,
            columns=[f'{X.columns[0]}_hash_{x}' for x in range(Xhasher.shape[1])])
        return Xhashed
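# Usage sketch for DFFeatureHasher above (illustrative). It is written for a
# single-column string DataFrame; the column name "color" and n_features=16 are
# made-up values for the example.
import numpy as np
import pandas as pd

df = pd.DataFrame({"color": ["red", "green", "blue", "red"]})
hasher = DFFeatureHasher(n_features=16)
hashed = hasher.fit(df).transform(df)
print(hashed.shape)              # (4, 16)
print(list(hashed.columns[:3]))  # ['color_hash_0', 'color_hash_1', 'color_hash_2']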
def test_hasher_set_params():
    # Test delayed input validation in fit (useful for grid search).
    hasher = FeatureHasher()
    hasher.set_params(n_features=np.inf)
    with pytest.raises(TypeError):
        hasher.fit()
def load_data_v1(data_path):
    # NOTE: 'duration' appears twice; the second occurrence overwrites the first
    # key when each row is turned into a dict below.
    attr_name = [
        'taxi_id', 'point', 'duration', 'time', 'duration', 'distance'
    ]
    # Training set data
    train = pd.read_csv(os.path.join(data_path, 'train.txt'), header=None)
    train_set = train.values[:, [0, 1, 2, 3, 4, 5, 6]]
    dataset = train.values[:, [0, 1, 2, 3, 4, 5]]
    print(train_set[0])
    # Test set data
    test = pd.read_csv(os.path.join(data_path, 'test.txt'), header=None)
    test_set = test.values[:, [0, 1, 2, 3, 4, 5, 6]]
    print(test_set[0])
    # Drop the last column and store each row as a dict keyed by attribute name;
    # these dicts are used to fit the hasher.
    samples = list()
    for sample in dataset:
        sample_dict = dict()
        for index, attr in enumerate(sample):
            sample_dict[attr_name[index]] = attr
        samples.append(sample_dict)
    h = FeatureHasher(n_features=2048)
    h.fit(samples)
    # Convert the training data into x, y lists
    x_train = list()
    y_train = list()
    for sample in train_set:
        sample_dict = dict()
        for index, attr in enumerate(sample):
            attr = str(attr)
            if index == 6:
                y_train.append(int(attr))
                continue
            sample_dict[attr_name[index]] = attr
        x_train.append(sample_dict)
    # Convert the test data into x, y lists
    x_test = list()
    y_test = list()
    for sample in test_set:
        sample_dict = dict()
        for index, attr in enumerate(sample):
            attr = str(attr)
            if index == 6:
                y_test.append(int(attr))
                continue
            sample_dict[attr_name[index]] = attr
        x_test.append(sample_dict)
    x_train = h.transform(x_train).toarray()
    x_test = h.transform(x_test).toarray()
    print(x_train[0])
    print(x_test[0])
    print(x_train.shape)
    print(x_test.shape)
    y_train = np.asarray(y_train, dtype='int16')
    y_test = np.asarray(y_test, dtype='int16')
    y_train = np_utils.to_categorical(y_train)
    y_test = np_utils.to_categorical(y_test, nb_classes)
    print(y_train.shape)
    print(y_test.shape)
    # return x_train, y_train, x_dev, y_dev, x_test
    return x_train, y_train, x_test, y_test, x_test
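# Minimal sketch of the dict-input pattern load_data_v1 relies on (feature names
# and values below are made up). With the default input_type='dict', each
# {name: value} mapping becomes one fixed-width sparse row; string values are
# hashed as "name=value" indicator features with value 1.
from sklearn.feature_extraction import FeatureHasher

rows = [
    {"taxi_id": "t1", "point": "a", "distance": "12.5"},
    {"taxi_id": "t2", "point": "b", "distance": "3.1"},
]
h = FeatureHasher(n_features=16)
X = h.transform(rows).toarray()
print(X.shape)  # (2, 16)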
test = pd.read_csv("test.txt")
test_set = test.values[:, [0, 1, 2, 3, 4, 5, 6, 7, 8]]
print(test_set[0])
dataset = train.values[:, [0, 1, 2, 3, 4, 5, 6, 7]]
samples = list()
for sample in dataset:
    sample_dict = dict()
    for index, attr in enumerate(sample):
        sample_dict[attr_name[index]] = attr
    samples.append(sample_dict)
h.fit(samples)
X_train = list()
y_train = list()
X_test = list()
y_test = list()
for sample in train_set:
    sample_dict = dict()
    for index, attr in enumerate(sample):
        attr = str(attr)
        if index == 8:
            y_train.append(int(attr))
            continue
        sample_dict[attr_name[index]] = attr
    X_train.append(sample_dict)
import pandas as pd
from hashing import HashingEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction import FeatureHasher
from sklearn.datasets import load_boston

dfbgisnlp = pd.read_csv("./Final/DataPre/bgis_vendorPre_words")
dfbgiscost = pd.read_csv("./Final/Data/BGIS_Vendor_scaled1hot.csv")

# Removing Outliers
fh = FeatureHasher(n_features=10, input_type='string')
desc = dfbgisnlp["descriptions"]
# FeatureHasher.fit is essentially stateless, so transform the descriptions to
# obtain the hashed matrix (fit alone returns the estimator, not a matrix).
result = fh.fit(desc).transform(desc)

# Debug scrap left from development:
# resultin = []
# for item in range(len(desc)):
#     resultin.append(result.transform(item))
#     resultin.append(result.toarray())
#     print(desc)

df = pd.DataFrame(result.toarray(),
                  columns=['fh1', 'fh2', 'fh3', 'fh4', 'fh5',
                           'fh6', 'fh7', 'fh8', 'fh9', 'fh10'])

dfboston = load_boston()
bunch = load_boston()

y = dfbgiscost["Func_Burdened_Cost"]
X = dfbgisnlp
enc = HashingEncoder(cols=['descriptions']).fit(X, y)
numeric_dataset = enc.transform(X)
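# Hedged sketch of the HashingEncoder call above, assuming the local `hashing`
# module mirrors category_encoders.HashingEncoder (same cols/fit/transform API).
# The tiny DataFrame stands in for dfbgisnlp; column values are invented.
import pandas as pd
from category_encoders import HashingEncoder  # or: from hashing import HashingEncoder

X_demo = pd.DataFrame({"descriptions": ["roof repair", "hvac maintenance", "roof repair"]})
y_demo = pd.Series([120.0, 450.0, 110.0])

enc_demo = HashingEncoder(cols=["descriptions"], n_components=8).fit(X_demo, y_demo)
print(enc_demo.transform(X_demo).shape)  # (3, 8)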
plt.plot(numberFeatures.iloc[:, 4])

def plotCorr(data, number):
    subData = data.iloc[number*1000:(number+1)*1000, :]
    corrY = subData.corr()
    plt.figure()
    plt.imshow(corrY)

# plotCorr(numberFeatures, 1)
# plotCorr(numberFeatures, 2)

# %%
from sklearn.feature_extraction import FeatureHasher
hasher = FeatureHasher(input_type='string', dtype='float')
strFeature = hasher.fit(stringFeature).transform(stringFeature).toarray()

# %% normalization
numberFeatures = numberFeatures.fillna(0)
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(numberFeatures.iloc[:, 1:])
X_scaled = scaler.transform(numberFeatures.iloc[:, 1:])

import numpy as np
# a = numpy.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
# numpy.savetxt("/Users/weizhi/Downloads/kaggle competion/scared.csv", X_scaled, delimiter=",")
where_are_NaNs = np.isnan(X_scaled)
X_scaled[where_are_NaNs] = 0
# data = pd.read_csv("/Users/weizhi/Downloads/kaggle competion/scared.csv")
numberFeatures = None
y = pd.read_csv(y_url)
y = y.pop('status_group').values

# initialize data sets
x = drop_and_fill_na(x)
x_prob = drop_and_fill_na(x_prob)

# Fit encoders and transform x
oh_enc = OneHotEncoder(handle_unknown='ignore')
oh_enc.fit(x[cols_to_one_hot])
y_oh_enc = OneHotEncoder(handle_unknown='ignore')
funder_feat_hasher = FeatureHasher(n_features=FEATURE_HASHER_NUM_FEATURES,
                                   input_type='string', alternate_sign=False)
funder_feat_hasher.fit(x['funder'])
installer_feat_hasher = FeatureHasher(n_features=FEATURE_HASHER_NUM_FEATURES,
                                      input_type='string', alternate_sign=False)
installer_feat_hasher.fit(x['installer'])

x = transform_data(x)
x_prob = transform_data(x_prob)
y = y_oh_enc.fit_transform(y.reshape(-1, 1)).toarray()

# split into train and test data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=27)

num_labels = 3
input_size = x_train.shape[1]
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
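# Hedged sketch of what a transform_data-style helper for the encoders above
# might look like: one-hot the listed categorical columns, hash 'funder' and
# 'installer', and stack everything into one sparse matrix. The helper name and
# the exact column handling are assumptions; the original transform_data is not
# shown here.
from scipy.sparse import hstack

def transform_data_sketch(df, oh_enc, cols_to_one_hot, funder_hasher, installer_hasher):
    oh_part = oh_enc.transform(df[cols_to_one_hot])
    # Note: with input_type='string', passing the raw Series hashes each string
    # character by character, the same way the columns are passed in the calls above.
    funder_part = funder_hasher.transform(df['funder'])
    installer_part = installer_hasher.transform(df['installer'])
    return hstack([oh_part, funder_part, installer_part]).tocsr()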