def _make_dummies(data, variables):
    """One-hot encode the selected *variables* of *data* with DictVectorizer.

    Every value is stringified first so each column is treated as
    categorical.  Returns (encoded_matrix, fitted_vectorizer).
    """
    subset = data.loc[:, variables].astype(str)
    records = [dict(row.iteritems()) for _, row in subset.iterrows()]
    encoder = DV(sparse=False)
    encoded = encoder.fit_transform(records)
    return encoded, encoder
def pres_drug_dv():
    """Sparse per-patient bag-of-drug-ids, row-aligned with the base frame
    (~2.5 min on the 2% dataset)."""
    df_base = _load_base()
    df_base['idx'] = list(range(len(df_base)))
    print("load trans")
    df = pd.read_csv(g.FILE_PRES_OUT, usecols=['patient_id', 'drug_id'])
    print("load trans ... done")

    # One drug-count Counter per patient.
    groups = list(df.groupby('patient_id'))
    patient_list = [pid for pid, _ in groups]
    counters = [Counter(part.drug_id) for _, part in groups]
    X = DV(sparse=True).fit_transform(counters)
    print(X.shape)

    df_feat_order = pd.DataFrame({
        'patient_id': patient_list,
        'feat_idx': list(range(len(patient_list))),
    })
    df_feat_order = df_feat_order.merge(df_base, how='left', on='patient_id')
    print(df_feat_order.head())

    # Re-order the feature rows so they line up with df_base's row order.
    X_ordered = ss.lil_matrix((len(df_base), X.shape[1]))
    for _, row in df_feat_order.iterrows():
        X_ordered[row['idx'], :] = X[row['feat_idx'], :]
    X_ordered = ss.csr_matrix(X_ordered)
    print(X_ordered.shape)
    return X_ordered
def train_rfc():
    """Train and evaluate classifiers on res.csv and pickle the fitted GBC.

    Features: one-hot of every column except '0', '11', '12', with the raw
    numeric columns '0' and '11' appended.  Target: last column.  Prints the
    5-fold ROC-AUC of a RandomForest baseline and of the GradientBoosting
    model that is written to the file 'rfc'.
    """
    df = pandas.read_csv('res.csv').fillna('nan')

    vectorizer = DV(sparse=False)
    df_dict = df.drop(['0', '11', '12'], axis=1).T.to_dict().values()
    X = vectorizer.fit_transform(df_dict)
    # Append the two numeric columns untouched.
    X = np.hstack([X, df[['0', '11']]])
    # was df.ix[:, -1].as_matrix() — .iloc/.values are the non-deprecated
    # positional equivalents.
    y = df.iloc[:, -1].values

    clf = RandomForestClassifier()
    res = cross_validation.cross_val_score(clf, X, y, cv=5, scoring='roc_auc')
    print(res, res.mean())

    clf = GradientBoostingClassifier()
    clf.fit(X, y)
    # BUG FIX: the handle was opened at function entry and never closed; the
    # context manager guarantees the pickle is flushed and the file released.
    with open('rfc', 'wb') as out:
        pickle.dump(clf, out)
    res = cross_validation.cross_val_score(clf, X, y, cv=5, scoring='roc_auc')
    clf.fit(X, y)
    print(res, res.mean(), clf.feature_importances_)
def categorical_2_dummy(self, df):
    """Convert a DataFrame of categorical columns into a sparse dummy frame.

    Every cell is stringified, one-hot encoded with DictVectorizer
    (sparse output), and the resulting CSR matrix is copied column-by-column
    into a pandas sparse frame with fill value 0.

    NOTE(review): the intermediate row-dict list is very memory-hungry
    (~10 GB was observed by the author) — confirm capacity before reuse.
    """
    df = df.applymap(str)
    ch_dict = df.T.to_dict().values()  # This creates huge memory ~ 10Gb
    vec = DV(sparse=True)
    ch_array = vec.fit_transform(ch_dict)
    '''
    ch_array = ch_array.astype('float16')
    # This step kills everything, only 9862926 of 57083328046 have values
    # Calling toarray(), it will have 2byte*57083328046=106Gb
    df_after = pd.DataFrame(ch_array, dtype='float16')
    '''
    # One drawback of this, is SparseDataFrame doesn't support float32 or
    # float16, which is a shame
    '''
    # TODO: Wondering the cheapest way to penetrate pandas.DataFrame; why don't we just wait for stackOverflow.
    # TODO: simple, csr_matrix -> sparseDataFrame, without releasing toarray(), is this the most space efficient way?
    # TODO issue on github
    df_after = pd.SparseDataFrame(index=df.index, columns=vec.get_featrue_names())
    for i in np.arange(ch_array.shape[0]):
        elem = pd.SparseSeries(ch_array[i].toarray().ravel())
        df_after.loc[[2]] = [elem]  # not implemendted error
    '''
    # New method: seed the sparse frame from column 0, then densify and
    # append the remaining columns one at a time (progress every 1000 cols).
    df_after = pd.DataFrame(ch_array[:, 0].toarray().ravel()).to_sparse(0)
    for i in range(1, ch_array.shape[1]):
        df_after[i] = ch_array[:, i].toarray().ravel()
        if i % 1000 == 0:
            print('Finish: ' + str(i))
    return df_after
def sklearn_tree(frame, x, y):
    """Fit a depth-limited DecisionTreeRegressor on one-hot encoded features.

    *x* and *y* are column selectors for the feature block and the target.
    Returns the fitted tree.
    """
    encoder = DV(sparse=False)
    features = frame.ix[:, x]
    target = frame.ix[:, y]
    del frame  # release the full frame before the dense encoding allocates
    encoded = encoder.fit_transform(features.to_dict(outtype="records"))
    model = sktree.DecisionTreeRegressor(max_depth=10, min_samples_split=2000)
    return model.fit(encoded, target)
def category_transformation(train_categoric, test_categoric, labels, type='std'):
    """Encode the categorical frames according to *type* and return them.

    Supported types: 'freq' (frequency-rank index), 'std' (label encoding),
    'tgtrate' (target-rate weights), 'rank' (target-mean rank index) and
    'onehot' (DictVectorizer dummies, dropping columns with >100 levels).
    The first four mutate the frames in place; 'onehot' rebuilds them.
    """
    if type == 'freq':
        print("Encoding categories by freqency rank...")
        for col in train_categoric.columns:
            freqs = train_categoric[col].append(test_categoric[col]).value_counts()
            top = freqs[0:91].index
            train_categoric[col] = pd.match(train_categoric[col].values, top)
            test_categoric[col] = pd.match(test_categoric[col].values, top)

    if type == 'std':
        print("Encoding categories by sklearn label encoder...")
        for col in train_categoric.columns:
            enc = LabelEncoder()
            # Fit on train+test together so unseen test levels cannot occur.
            enc.fit(list(train_categoric.ix[:, col]) + list(test_categoric.ix[:, col]))
            train_categoric.ix[:, col] = enc.transform(train_categoric.ix[:, col])
            test_categoric.ix[:, col] = enc.transform(test_categoric.ix[:, col])

    if type == 'tgtrate':
        print("Encoding categories by target rate...")
        for col in train_categoric.columns:
            train_categoric[col], test_categoric[col] = category_to_prob_weight(
                train_categoric, test_categoric, col, labels)

    if type == 'rank':
        print("Encoding categories by rank transformation...")
        for col in train_categoric.columns:
            rank = pd.concat([train_categoric[col], labels],
                             axis=1).groupby(col).mean().sort_values(
                                 by='target', ascending=False)
            top = rank[0:20000].index
            train_categoric[col] = pd.match(train_categoric[col].values, top)
            test_categoric[col] = pd.match(test_categoric[col].values, top)

    if type == 'onehot':
        print("One hot... ")
        # Drop very-high-cardinality columns before dummifying.
        for col in train_categoric.columns:
            if len(np.unique(train_categoric[col])) > 100:
                train_categoric.drop(col, axis=1, inplace=True)
                test_categoric.drop(col, axis=1, inplace=True)
        train_records = train_categoric.T.to_dict().values()
        test_records = test_categoric.T.to_dict().values()
        encoder = DV(sparse=False)
        train_categoric = pd.DataFrame(encoder.fit_transform(train_records))
        test_categoric = pd.DataFrame(encoder.transform(test_records))

    return train_categoric, test_categoric
def _transfer_data_to_model(self, data, animal, total_info, logger):
    ''' Extract model-ready arrays from the raw shelter DataFrame.

    total_info is a 3-tuple: (total_breed, total_color, intake_df).
    Derives engineered columns on *data* (age, date parts, name/sex/breed/
    color indicators, intake info), drops the raw text columns, one-hot
    encodes the rest and label-encodes the outcome.

    Returns (encode_x, encode_y, vectorizer_x, le_y).

    NOTE(review): mutates the caller's *data* frame in place — confirm that
    is acceptable at all call sites.
    '''
    total_breed = total_info[0]
    total_color = total_info[1]
    intake_df = total_info[2]
    # encode y
    (encode_y, le_y) = self._encode_y(data['OutcomeType'].values,logger)
    #print encode_y
    # encode x
    #if animal in ('Dog', 'All'):
    if True:  # species-specific gating disabled; always engineer features
    #if False:
        new_age_info = self._transfer_age_infos(data['AgeuponOutcome'])
        data['EncodeAgeuponOutcome'] = new_age_info
        # Split the timestamp into calendar components.
        (year, month, weekday, hour) = self._transfer_time_infos(data['DateTime'])
        data['EncodeYear'] = year
        data['EncodeMonth'] = month
        data['EncodeWeekday'] = weekday
        data['EncodeHour'] = hour
        # Raw text columns replaced by the engineered ones below.
        drop_list = ['AnimalID', 'Name', 'DateTime', 'OutcomeType', 'OutcomeSubtype', 'AgeuponOutcome', 'SexuponOutcome', 'Breed', 'Color']
        #drop_list = ['AnimalID', 'Name', 'DateTime', 'OutcomeType', 'OutcomeSubtype', 'AgeuponOutcome', 'SexuponOutcome', 'Breed']
        #drop_list = ['AnimalID', 'Name', 'DateTime', 'OutcomeType', 'OutcomeSubtype', 'AgeuponOutcome', 'SexuponOutcome']
        data['HasName'] = self._transfer_name_infos(data['Name'])
        data['Sex'] = self._transfer_sex_infos(data['SexuponOutcome'])
        data['Intact'] = self._transfer_intact_infos(data['SexuponOutcome'])
        data['IsMix'] = self._transfer_mix_infos(data['Breed'])
        #data['NewBreed'] = self._transfer_breed_infos(data['Breed'])
        #data['Species'] = self._transfer_species_infos(data['Color'])
        #data['NewColor'] = self._transfer_color_infos(data['Color'])
        data['ColorMix'] = self._transfer_color_count_infos(data['Color'])
        # One indicator column per known breed / color token.
        for breed_type in total_breed:
            data['Breed%s' % breed_type] = self._transfer_breed_type_infos(data['Breed'], breed_type)
        for color_type in total_color:
            data['Color%s' % color_type] = self._transfer_color_type_infos(data['Color'], color_type)
        (found_location, intake_type, intake_condition) = self._transfer_intake_infos(data['AnimalID'], intake_df)
        #data['FoundLocation'] = found_location
        data['IntakeType'] = intake_type
        data['IntakeCondition'] = intake_condition
    #print np.isnan(data.any())
    #print np.isfinite(data.all())
    df = data.drop(drop_list, 1)
    #print df.isnull().sum()
    #print pd.isnull(df).any(1).nonzero()[0]
    # One dict per row, then dense one-hot encoding.
    x = df.T.to_dict().values()
    #print x
    vectorizer_x = DV(sparse=False)
    encode_x = vectorizer_x.fit_transform(x)
    #print encode_x
    return (encode_x, encode_y, vectorizer_x, le_y)
def load_data():
    """Load train/test CSVs, impute and one-hot the categorical columns, and
    return (train, test, train_inds, valid_inds) with a 75/25 index split."""
    train = pd.read_csv('../input/train.csv')
    test = pd.read_csv('../input/test.csv')
    for frame in (train, test):
        frame.drop(['v22', 'v91'], axis=1, inplace=True)

    # Impute each categorical column with its most frequent training value.
    nas = {c: impute_most_freq_value(train, c) for c in cat_cols}
    for c in cat_cols:
        train[c].fillna(nas[c], inplace=True)
    for c in cat_cols:
        test[c].fillna(nas[c], inplace=True)

    cat_train = train[cat_cols]
    cat_test = test[cat_cols]
    # Keep only the numeric columns in the main frames.
    train.drop(cat_cols, axis=1, inplace=True)
    test.drop(cat_cols, axis=1, inplace=True)
    print(cat_train.describe())

    vectorizer = DV(sparse=False)

    def _join_encoded(frame, records, fit):
        # One-hot encode the record dicts and join the block onto *frame*.
        matrix = vectorizer.fit_transform(records) if fit else vectorizer.transform(records)
        block = pd.DataFrame(matrix)
        block.columns = vectorizer.get_feature_names()
        block.index = frame.index
        return frame.join(block)

    train = _join_encoded(train, cat_train.T.to_dict().values(), fit=True)
    test = _join_encoded(test, cat_test.T.to_dict().values(), fit=False)

    # 75/25 positional split expressed as index-label lists.
    trainend = int(0.75 * len(train))
    valid_inds = list(train[trainend:].index.values)
    train_inds = list(train.loc[~train.index.isin(valid_inds)].index.values)

    train.fillna(-100, inplace=True)
    test.fillna(-100, inplace=True)
    return train, test, train_inds, valid_inds
def matrix (wordlength,length,instance):
    """Train and report an RBF-SVM 'P' topology detector on windowed protein
    sequences; features come from finaltest.csv.  Only acts when length == 21.

    NOTE(review): di maps every topology char except 'P' to 0, so the target
    here is binary P-vs-rest (unlike the 4-class mapping in the other
    `matrix` variant in this file) — confirm this is intended.
    """
    # 'P' is the positive class; i/o/L all collapse to 0.
    di = {'i': 0, 'o': 0, 'P': 1, 'L':0}
    # Integer code per amino acid; 'J' is the padding pseudo-residue.
    damino = {'A': 1,'R': 2,'D': 3,'N': 4,'C': 5,'E': 6,'Q': 7,'G': 8,'H': 9,'I': 10,'L': 11,'K': 12,'M': 13,'F': 14,'P': 15,'S': 16,'T': 17,'W': 18,'Y': 19,'V': 20,'J': 21}
    word_list = []
    word_list_w = []  # NOTE(review): never populated below, so dwtemp is empty
    toplogy_list = []
    toplogy_w = []
    tempd = ''
    z = wordpro(wordlength)  # padding width; semantics defined by wordpro()
    filein = open('prototext.txt','r')
    for line in filein:
        temp_line = line.rstrip()
        #Adding charachters in the begening and end for windows
        temporary_string = ("J" * z)+(temp_line)+("J" * z)
        for each in window(temporary_string, wordlength):
            temp = ''.join(each)
            temp_c = []
            # Integer encoding computed but never used afterwards.
            for c in temp:
                g = damino[c]
                temp_c.append(g)
            word_list.append(temp)
        # The topology annotation is on the line following each sequence.
        temporary_topology = next(filein)
        temporary_topology = temporary_topology.rstrip()
        for c in temporary_topology:
            k = di[c]
            toplogy_list.append(k)
            toplogy_w.append(c)
    #http://stackoverflow.com/questions/30522724/take-multiple-lists-into-dataframe
    #http://stackoverflow.com/questions/20970279/how-to-do-a-left-right-and-mid-of-a-string-in-a-pandas-dataframe
    dftemp = pd.DataFrame({'word_list':word_list})
    dwtemp = pd.DataFrame({'word_list':word_list_w})
    if(length == 21):
        # Features come from a pre-built CSV here, not from dftemp.
        df = pd.read_csv("finaltest.csv")
        train_dict = df.T.to_dict().values()
        #print (train_dict)
        vectorizer = DV( sparse = False )
        vec_train = vectorizer.fit_transform( train_dict )
        max_abs_scaler = preprocessing.MaxAbsScaler()
        vec_train = max_abs_scaler.fit_transform(vec_train)
        print (vectorizer.get_feature_names())
        target = np.asarray(toplogy_list)
        X_train, X_test, y_train, y_test = train_test_split(vec_train, target, test_size=0.2, random_state=0)
        estimator = svm.SVC(kernel='rbf')
        cv = ShuffleSplit(X_train.shape[0], n_iter=10, test_size=0.2, random_state=0)
        gammas = np.logspace(-6, -1, 10)
        classifier = GridSearchCV(estimator=estimator, cv=cv, param_grid=dict(gamma=gammas))
        # NOTE(review): the split is redone (test_size=0.33) and the model is
        # fit on the FULL data, so X_test overlaps the training set — the
        # classification report below is optimistically biased.
        X_train, X_test, y_train, y_test = train_test_split(vec_train, target, test_size=0.33, random_state=0)
        classifier.fit(vec_train, target)
        classifier_prediction = classifier.predict(X_test)
        print ('\nClasification report P:\n', classification_report(y_test, classifier_prediction))
        plot_classification_report(classification_report(y_test, classifier_prediction))
        plt.savefig('P_plot_classif_report.pdf', dpi=200, format='pdf', bbox_inches='tight')
        plt.close()
        joblib.dump(classifier, 'P.pkl', compress=9)
def diag_physician_specialty_description_dv():
    """Sparse per-patient bag-of-physician-specialties feature matrix.

    Joins diagnosis rows to the physician table on practitioner_id, counts
    each patient's specialty_description values, one-hot encodes the
    Counters, and re-orders the rows to match _load_base()'s row order.
    Returns a scipy CSR matrix of shape (len(df_base), n_specialties).
    """
    df_base = _load_base()
    # Positional row index used later to re-order the feature matrix.
    df_base['idx'] = list(range(len(df_base)))
    print("load trans")
    df = pd.read_csv(
        g.FILE_DIAG_OUT,
        usecols=['patient_id', 'primary_practitioner_id'],
        dtype={'primary_practitioner_id': str},
    ).rename(columns={
        'primary_practitioner_id': 'practitioner_id'})
    df_phy = pd.read_csv(
        g.FILE_PHYS,
        usecols=['specialty_description', 'practitioner_id'],
        dtype={
            'practitioner_id': str,
            'specialty_description': str})
    # Normalize missing keys on both sides so the merge can match them.
    df['practitioner_id'].fillna('NA', inplace=True)
    df_phy['practitioner_id'].fillna('NA', inplace=True)
    df_phy['specialty_description'].fillna('NA', inplace=True)
    df = df.merge(df_phy, how='left', on='practitioner_id')
    print("load trans ... done")
    patient_list = []
    df_feat = []
    # One specialty-count Counter per patient.
    for patient_id, df_part in df.groupby('patient_id'):
        df_feat.append(Counter(df_part.specialty_description))
        patient_list.append(patient_id)
    X = DV(sparse=True).fit_transform(df_feat)
    print(X.shape)
    df_feat_order = pd.DataFrame({
        'patient_id': patient_list,
        'feat_idx': list(range(len(patient_list))),
    })
    df_feat_order = df_feat_order.merge(df_base, how='left', on='patient_id')
    print(df_feat_order.head())
    # Re-ordering: copy each patient's feature row into its df_base position.
    print("re-ordering")
    sz = len(df_feat_order)
    X_ordered = ss.lil_matrix((len(df_base), X.shape[1]))
    for idx, row in df_feat_order.iterrows():
        if idx % 1000 == 0:
            # Progress heartbeat — this loop dominates the runtime.
            print(str(datetime.datetime.now()), idx, sz)
        X_ordered[row['idx'], :] = X[row['feat_idx'], :]
    print("re-ordering ... done")
    X_ordered = ss.csr_matrix(X_ordered)
    print(X_ordered.shape)
    return X_ordered
def encode_cat_test(X):
    """One-hot encode DataFrame *X* via DictVectorizer, printing diagnostics
    (shapes, head rows, vocabulary) along the way.  Returns the dense array."""
    print('\nSource data:\n')
    print(X.shape)
    print(X[:10])
    vec = DV(sparse=False)
    records = X.T.to_dict().values()
    encoded = vec.fit_transform(records)
    print('\nEncoded data:\n')
    print(encoded.shape)
    print(encoded[:10])
    print('\nVocabulary:\n')
    print(vec.vocabulary_)
    print(vec.feature_names_)
    return encoded
def simple_mod_v1():
    """Train a Gini decision tree on three mushroom features and validate it
    against m_v.csv.  Returns (model, label_encoder, dict_vectorizer)."""
    data = pd.read_csv('./mushrooms.csv')
    # BUG FIX: drop_duplicates() returns a new frame; the result used to be
    # discarded, silently keeping the duplicate rows.
    data = data.drop_duplicates()

    #STEP ONE: PREPARE DATA#
    features = ['stalk-color-above-ring', 'spore-print-color', "gill-color"]
    print(features)
    data_x = data[features]
    data_y = data['class']
    le = preprocessing.LabelEncoder()
    le.fit(data_y)
    data_y = le.transform(data_y)
    data_x_dict = data_x.to_dict(orient='records')
    v = DV(sparse=False)
    data_x_dict = v.fit_transform(data_x_dict)

    #STEP TWO: SPLIT THE DATA#
    x_train, x_test, y_train, y_test = train_test_split(data_x_dict, data_y, test_size=0.3)

    #STEP THREE: CREATE MODEL#
    print('----------- DTREE WITH GINI IMPURITY CRITERION ------------------')
    dtree_gini_mod = tree.DecisionTreeClassifier(criterion='gini')
    dtree_gini_mod.fit(x_train, y_train)
    preds_gini = dtree_gini_mod.predict(x_test)
    print_multiclass_classif_error_report(y_test, preds_gini)

    #STEP FOUR: VALIDATE MODEL#
    print(
        '----------- VALIDATE: DTREE WITH GINI IMPURITY CRITERION ------------------'
    )
    data_v = pd.read_csv('./m_v.csv')
    # Validation reuses the trained feature list and fitted encoders.  (A
    # dead 'features_v' list built from the *training* frame was removed.)
    data_x_v = data_v[features]
    data_y_v = le.transform(data_v['class'])
    data_x_dict_v = v.transform(data_x_v.to_dict(orient='records'))
    preds_gini_v = dtree_gini_mod.predict(data_x_dict_v)
    print_multiclass_classif_error_report(data_y_v, preds_gini_v)
    return (dtree_gini_mod, le, v)
def preprocessData(data):
    """Build a feature matrix (scaled numeric + one-hot 'property_type')
    and a standardized 'price' target from *data*.

    Returns (X, Y, X_scaler, Y_scaler, header, imp, vec).
    """
    # --- categorical part: only the 'property_type' column ---
    header_not_cat = list(data.columns.values)
    header_not_cat.remove('property_type')
    X_cat = data.drop(header_not_cat, axis=1)
    X_cat.fillna('NA', inplace=True)
    X_cat = X_cat.T.to_dict().values()
    vec = DV(sparse=False)
    X_cat = vec.fit_transform(X_cat)
    #print vec.get_feature_names()

    # --- numeric part: everything except property_type and price ---
    header_not_num = list(data.columns.values)
    header_not_num.remove('property_type')
    header_not_num.remove('price')
    X_num = data.drop(['property_type', 'price'], axis=1).values
    # Replace missing values with the column mean, then standardize.
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    X_num = imp.fit_transform(X_num)
    X_scaler = preprocessing.StandardScaler().fit(X_num)
    X_num = X_scaler.transform(X_num)

    X = np.hstack((X_num, X_cat))

    # --- target: standardized price column ---
    header_features = list(data.columns.values)
    header_features.remove('price')
    Y = data.drop(header_features, axis=1).price
    Y_scaler = preprocessing.StandardScaler().fit(Y)
    Y = Y_scaler.transform(Y)

    header = np.array(header_not_num + vec.get_feature_names())
    return X, Y, X_scaler, Y_scaler, header, imp, vec
def oneHotEncoding(train, numeric_cols):
    """Return *train* as a dense matrix: the raw numeric columns stacked
    with a one-hot encoding of every remaining (categorical) column."""
    numeric_block = train[numeric_cols].as_matrix()
    categorical = train.drop(numeric_cols, axis=1)
    records = categorical.T.to_dict().values()
    encoder = DV(sparse=False)
    encoded_block = encoder.fit_transform(records)
    return np.hstack((numeric_block, encoded_block))
def cleanData(traindf: pd.DataFrame, testdf: pd.DataFrame, describe=False) -> (pd.DataFrame, pd.DataFrame):
    """Drop high-cardinality columns, impute + one-hot the object columns,
    then mean-fill the remaining NAs.  Returns the transformed (train, test)."""
    for frame in (traindf, testdf):
        frame.drop(['v22', 'v91'], axis=1, inplace=True)

    # Impute each object column with its most frequent training value.
    most_freq = {c: compute_most_freq_value(traindf, c) for c in objectCols}
    for c in objectCols:
        traindf[c].fillna(most_freq[c], inplace=True)
    for c in objectCols:
        testdf[c].fillna(most_freq[c], inplace=True)

    cat_train = traindf[objectCols]
    cat_test = testdf[objectCols]
    traindf.drop(objectCols, axis=1, inplace=True)
    testdf.drop(objectCols, axis=1, inplace=True)

    encoder = DV(sparse=False)

    def _join_encoded(frame, matrix):
        # Wrap the encoded block in a DataFrame aligned to *frame*'s index.
        block = pd.DataFrame(matrix)
        block.columns = encoder.get_feature_names()
        block.index = frame.index
        return frame.join(block)

    traindf = _join_encoded(traindf, encoder.fit_transform(cat_train.T.to_dict().values()))
    testdf = _join_encoded(testdf, encoder.transform(cat_test.T.to_dict().values()))

    traindf.fillna(traindf.mean(), inplace=True)
    testdf.fillna(testdf.mean(), inplace=True)
    if describe:
        describeDataframe(traindf)
    return traindf, testdf
def pres_drug_bb_usc_code_dv():
    """Sparse per-patient bag-of-BB_USC-codes, row-aligned with the base frame."""
    df_base = _load_base()
    df_base['idx'] = list(range(len(df_base)))
    print("load trans")
    df = pd.read_csv(
        g.FILE_PRES_OUT,
        usecols=['patient_id', 'drug_id'],
        dtype={'drug_id': str})
    print("load trans ... done")
    df['drug_id'].fillna('NA', inplace=True)

    # Attach each prescription's BB_USC code via the drug reference table.
    df_drug = pd.read_csv(
        g.FILE_DRUG,
        usecols=['drug_id', 'BB_USC_code'],
        dtype={'drug_id': str, 'BB_USC_code': str})
    df_drug['drug_id'].fillna('NA', inplace=True)
    df_drug['BB_USC_code'].fillna('NA', inplace=True)
    df = df.merge(df_drug, how='left', on='drug_id')

    # One code-count Counter per patient.
    groups = list(df.groupby('patient_id'))
    patient_list = [pid for pid, _ in groups]
    counters = [Counter(part.BB_USC_code) for _, part in groups]
    X = DV(sparse=True).fit_transform(counters)
    print(X.shape)

    df_feat_order = pd.DataFrame({
        'patient_id': patient_list,
        'feat_idx': list(range(len(patient_list))),
    })
    df_feat_order = df_feat_order.merge(df_base, how='left', on='patient_id')
    print(df_feat_order.head())

    # Re-order the feature rows so they line up with df_base's row order.
    X_ordered = ss.lil_matrix((len(df_base), X.shape[1]))
    for _, row in df_feat_order.iterrows():
        X_ordered[row['idx'], :] = X[row['feat_idx'], :]
    X_ordered = ss.csr_matrix(X_ordered)
    print(X_ordered.shape)
    return X_ordered
def proc_procedure_code_dv():
    """Sparse per-patient bag-of-procedure-codes feature matrix.

    Counts each patient's procedure_code values, one-hot encodes the
    Counters, and re-orders rows to match _load_base()'s row order.
    Returns a scipy CSR matrix of shape (len(df_base), n_codes).
    """
    df_base = _load_base()
    # Positional row index used later to re-order the feature matrix.
    df_base['idx'] = list(range(len(df_base)))
    print("load trans")
    df = pd.read_csv(
        g.FILE_PROC_OUT,
        usecols=['patient_id', 'procedure_code'],
        dtype={'procedure_code': str},
    )
    # Treat a missing code as its own 'NA' category.
    df['procedure_code'].fillna('NA', inplace=True)
    print("load trans ... done")
    patient_list = []
    df_feat = []
    # One code-count Counter per patient.
    for patient_id, df_part in df.groupby('patient_id'):
        df_feat.append(Counter(df_part.procedure_code))
        patient_list.append(patient_id)
    X = DV(sparse=True).fit_transform(df_feat)
    print(X.shape)
    df_feat_order = pd.DataFrame({
        'patient_id': patient_list,
        'feat_idx': list(range(len(patient_list))),
    })
    df_feat_order = df_feat_order.merge(df_base, how='left', on='patient_id')
    print(df_feat_order.head())
    # Re-ordering: copy each patient's feature row into its df_base position.
    print("re-ordering")
    sz = len(df_feat_order)
    X_ordered = ss.lil_matrix((len(df_base), X.shape[1]))
    for idx, row in df_feat_order.iterrows():
        if idx % 1000 == 0:
            # Progress heartbeat — this loop dominates the runtime.
            print(str(datetime.datetime.now()), idx, sz)
        X_ordered[row['idx'], :] = X[row['feat_idx'], :]
    print("re-ordering ... done")
    X_ordered = ss.csr_matrix(X_ordered)
    print(X_ordered.shape)
    return X_ordered
def fitData(folds, regressor, features):
    """Cross-validate *regressor* over *folds* against the module-level
    ``price`` target.

    Numeric columns are zero-filled; object columns are 'empty'-filled and
    one-hot encoded.  Each fold's ``myScore`` result (a 1x2 row) is printed
    and stacked into the returned (n_folds, 2) array.
    """
    num_features = features.select_dtypes(exclude=['object'])
    num_features.fillna(0, inplace=True)
    obj_features = features.select_dtypes(include=['object'])
    obj_features.fillna('empty', inplace=True)

    encoder = DV(sparse=False)
    encoded_data = encoder.fit_transform(obj_features.T.to_dict().values())
    newFeatures = np.hstack([num_features, encoded_data])

    # BUG FIX: an uninitialized np.empty([1, 2]) row used to be seeded into
    # the result and deleted afterwards; collect the fold rows and stack
    # them instead (also handles an empty fold list cleanly).
    rows = []
    for trainInds, testInds in folds:
        regressor.fit(newFeatures[trainInds, :], price[trainInds])
        y_pr = regressor.predict(newFeatures[testInds, :])
        pr = myScore(price[testInds], y_pr)
        print(pr)  # was py2-only `print pr`; this form works on 2 and 3
        rows.append(pr)
    return np.vstack(rows) if rows else np.empty((0, 2))
def preprocess(self):
    """Build self.x_train / self.x_test / self.y_train / self.y_test.

    Restricts both frames to their shared columns, then min-max-scales the
    imputed numeric columns and one-hot encodes the object columns, stacking
    the two blocks side by side.  Column names are kept in self.col_names.
    """
    logging.debug("Pre-processing data...")
    # Same train and test - union of train/test features.
    x_train_cols = self.train_data.columns
    x_test_cols = self.test_data.columns
    x_intersect_cols = [x for x in x_train_cols if x in x_test_cols]
    only_train_cols = [x for x in x_train_cols if x not in x_test_cols]
    only_test_cols = [x for x in x_test_cols if x not in x_train_cols]
    logging.debug("Train columns: %s Test columns: %s",
                  len(self.train_data.columns), len(self.test_data.columns))
    logging.info("Only train: %s", only_train_cols)
    logging.info("Only test: %s", only_test_cols)
    # Keep only the columns present on both sides.
    self.test_data = self.test_data[x_intersect_cols]
    self.train_data = self.train_data[x_intersect_cols]
    logging.debug("Train columns: %s Test columns: %s",
                  len(self.train_data.columns), len(self.test_data.columns))
    # Target / id / grouping columns must not leak into the features.
    drop_cols = [
        self.problem_definition.y_column,
        self.problem_definition.id_column,
        self.problem_definition.grouping_column
    ]
    x_num_train = self.train_data.select_dtypes(
        include=['int64', 'float']).drop(drop_cols, axis=1)
    col_names_num = x_num_train.columns.values
    x_num_train = x_num_train.as_matrix()
    x_num_test = self.test_data.select_dtypes(
        include=['int64', 'float']).drop(drop_cols, axis=1).as_matrix()
    # Mean-impute missing numeric values.
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    x_num_train = imp.fit_transform(x_num_train)
    # NOTE(review): the imputer is re-FIT on the test set (fit_transform),
    # so test means are used instead of train means — confirm intended.
    x_num_test = imp.fit_transform(x_num_test)
    # # scale to <0,1>
    # max_train = np.amax(x_num_train, 0)
    # print(max_train)
    # print("IsNaN train:")
    # print(np.any(np.isnan(x_num_train)))
    min_max_scaler = preprocessing.MinMaxScaler()
    x_num_train = min_max_scaler.fit_transform(x_num_train)
    # NOTE(review): despite the comment, the scaler is re-fit on the test
    # set here rather than reusing the train fit — confirm intended.
    x_num_test = min_max_scaler.fit_transform(
        x_num_test)  # scale test by max_train
    cat_train = self.train_data.select_dtypes(include=['object'])
    # Only drop the id-ish columns that actually appear among the objects.
    drop_cols = np.intersect1d(drop_cols, cat_train.columns.values)
    cat_train = cat_train.drop(drop_cols, axis=1)
    cat_test = self.test_data.select_dtypes(include=['object'])
    cat_test = cat_test.drop(drop_cols, axis=1)
    cat_train.fillna('NA', inplace=True)
    cat_test.fillna('NA', inplace=True)
    x_cat_train = cat_train.T.to_dict().values()
    x_cat_test = cat_test.T.to_dict().values()
    # vectoring: one-hot encode, fitting on train and reusing for test.
    vectorizer = DV(sparse=False)
    vec_x_cat_train = vectorizer.fit_transform(x_cat_train)
    vec_x_cat_test = vectorizer.transform(x_cat_test)
    col_names_cat = np.asarray(vectorizer.get_feature_names())
    self.col_names = np.hstack((col_names_num, col_names_cat))
    self.x_train = np.hstack((x_num_train, vec_x_cat_train))
    self.x_test = np.hstack((x_num_test, vec_x_cat_test))
    # HACK: This should be treated in a better way!!!!
    # Label is flipped (1 - y) before training.
    self.y_train = 1. - self.train_data[self.label]
    self.y_test = 1. - self.test_data[self.label]
    logging.info('Train 0: %s', len(np.where(self.y_train < 1)[0]))
    logging.info('Train 1: %s', len(np.where(self.y_train > 0)[0]))
    logging.info('Test 0: %s', len(np.where(self.y_test < 1)[0]))
    logging.info('Test 1: %s', len(np.where(self.y_test > 0)[0]))
# Import CSV data = readCsvIntoPandasDataframe(csvfile) # extract categorical columns (cat = categorical) header_not_cat = list(data.columns.values) header_not_cat.remove('property_type') X_cat = data.drop(header_not_cat, axis=1) # convert to dict X_cat.fillna('NA', inplace=True) X_cat = X_cat.T.to_dict().values() # vectorize categorical feature vec = DV(sparse=False) X_cat = vec.fit_transform(X_cat) print vec.get_feature_names() # extract numerical columns (num = numerical) header_not_num = list(data.columns.values) header_not_num.remove('property_type') header_not_num.remove('price') X_num = data.drop(['property_type', 'price'], axis=1) X_num = X_num.values # pandas dataframe to numpy array # impute n/a value (replace it with mean value) imp = Imputer(missing_values='NaN', strategy='mean', axis=0) X_num = imp.fit_transform(X_num) # scale the data
from sklearn.feature_extraction import DictVectorizer as DV
from sklearn.cross_validation import train_test_split
from skll import kappa
import os

# Scratch script (Python 2): exercises DictVectorizer fit/transform on two
# small frames to see how unseen categories and extra columns are handled.
#outputDf = pd.DataFrame({"first" : [],"second" : []})
#outputDf = outputDf.append(pd.DataFrame({"first" : [2],"second" : [3]}))
os.chdir("/Users/swapnil/work/Kaggle/out/PLIA")
print "hello"
df1Data = {"col1" : [1,2,3],"col2" : ["swap","kals","bang"]}
df1 = pd.DataFrame(data = df1Data)
d = DV(sparse = False)
# Fit on df1's row-dicts: defines the vocabulary used below.
d1 = d.fit_transform(df1.T.to_dict().values())
#print d1
# df2 has an unseen category ("kals1") and an extra column ("col3") —
# transform() ignores features that were not seen during fit.
df2Data = {"col1" : [1,20,30,40],"col2" : ["swap","kals1","bang","nag"],"col3":[22,33,44,55]}
df2 = pd.DataFrame(data = df2Data)
d2 = d.transform(df2.T.to_dict().values())
print d2
d3 = d2[0:2,0:d2.shape[1]]  # first two encoded rows, all columns
#print d.get_feature_names()
[column for column in train.columns if column not in skip_feature_columns]): if type(train[column][0] == str): try: train[column] = train[column].apply(dollar_to_numeric) test[column] = test[column].apply(dollar_to_numeric) except ValueError: pass if type(train[column][0]) in [numpy.float64, numpy.int64]: numeric.append(column) else: categorical.append(column) print column, train[column].nunique(), type( train[column][0]), train[column][0] numeric, categorical vectorizer = DV(sparse=False) X_train_cat = vectorizer.fit_transform( train[categorical].fillna('NA').T.to_dict().values()) X_test_cat = vectorizer.transform( test[categorical].fillna('NA').T.to_dict().values()) X_train = numpy.hstack([train[numeric].fillna(-999).values, X_train_cat]) y_train = train[target] X_test = numpy.hstack([test[numeric].fillna(-999).values, X_test_cat]) test_ids = test[index]
def matrix(wordlength, length, instance):
    """Grid-search an RBF SVM on windowed protein topology data and save a
    learning-curve plot as 'rbf-word-%04d.pdf' % instance.

    Sequences (and their topology annotation on the following line) are read
    from prototext.txt; each wordlength-wide window becomes one sample and
    each topology character one 4-class target (i/o/P/L -> 1..4).

    *length* is the (odd) positional window width: columns p-k .. p .. p+k
    with k = length // 2.  GENERALIZATION: the original hard-coded one
    if-branch per odd length in 3..21, each building the same positional
    frame; this now works for any odd length.
    """
    di = {'i': 1, 'o': 2, 'P': 3, 'L': 4}
    word_list = []
    word_list_w = []   # kept for parity with the original; stays empty
    toplogy_list = []
    toplogy_w = []
    z = wordpro(wordlength)  # padding width; semantics defined by wordpro()
    # BUG FIX: the input file was never closed; use a context manager.
    with open('prototext.txt', 'r') as filein:
        for line in filein:
            sequence = line.rstrip()
            # Pad both ends with 'J' so every residue gets a full window.
            padded = ("J" * z) + sequence + ("J" * z)
            for each in window(padded, wordlength):
                word_list.append(''.join(each))
            # The topology annotation follows each sequence line.
            topology = next(filein).rstrip()
            for c in topology:
                toplogy_list.append(di[c])
                toplogy_w.append(c)

    #http://stackoverflow.com/questions/30522724/take-multiple-lists-into-dataframe
    #http://stackoverflow.com/questions/20970279/how-to-do-a-left-right-and-mid-of-a-string-in-a-pandas-dataframe
    dftemp = pd.DataFrame({'word_list': word_list})
    dwtemp = pd.DataFrame({'word_list': word_list_w})

    # One column per window position: 'p-k', ..., 'p-1', 'p', 'p+1', ..., 'p+k'
    # holding the single character at that position of each window word.
    half = length // 2
    df = pd.DataFrame({
        ('p%+d' % offset) if offset else 'p': dftemp['word_list'].str[pos]
        for pos, offset in enumerate(range(-half, half + 1))
    })

    train_dict = df.T.to_dict().values()
    #print (train_dict)
    vectorizer = DV(sparse=False)
    vec_train = vectorizer.fit_transform(train_dict)
    print(vectorizer.get_feature_names())
    target = np.asarray(toplogy_list)
    X_train, X_test, y_train, y_test = train_test_split(vec_train, target, test_size=0.2, random_state=0)
    estimator = svm.SVC(kernel='rbf')
    cv = ShuffleSplit(X_train.shape[0], n_iter=10, test_size=0.2, random_state=0)
    gammas = np.logspace(-6, -1, 10)
    classifier = GridSearchCV(estimator=estimator, cv=cv, param_grid=dict(gamma=gammas))
    classifier.fit(X_train, y_train)
    title = 'Learning Curves (SVM, rbf kernel, $\gamma=%.6f$)' % classifier.best_estimator_.gamma
    # Re-plot the learning curve with the tuned gamma.
    estimator = svm.SVC(kernel='rbf', gamma=classifier.best_estimator_.gamma)
    plot_learning_curve(estimator, title, X_train, y_train, cv=cv)
    plt.savefig('rbf-word-%04d.pdf' % instance)
    print(classifier.score(X_test, y_test))
def pre_encode(self, X):
    """Flatten an event log into one wide row per case (index encoding).

    Builds a frame with the static columns of each case, one suffixed copy
    of the dynamic columns per event number, one suffixed copy of the
    last-state columns per event number (forward-filled from the previous
    event when missing), one-hot encodes the categorical columns with a
    DictVectorizer, and appends the case length.

    Parameters
    ----------
    X : pd.DataFrame
        Event log, one row per event; must contain the configured case-id,
        event-number, static, dynamic and last-state columns.

    Returns
    -------
    pd.DataFrame
        One row per case; NA-filled when self.fillna is set.
    """
    # Static columns come from the first event of each case; make sure the
    # label and the case id travel along with them.
    if self.label_col not in self.static_cols:
        self.static_cols.append(self.label_col)
    if self.case_id_col not in self.static_cols:
        self.static_cols.append(self.case_id_col)
    data_final = X[X[self.event_nr_col] == 1][self.static_cols]

    # Dynamic columns: one suffixed copy per event number, left-joined so
    # cases with fewer events get NaNs.
    for i in range(1, self.max_events + 1):
        data_selected = X[X[self.event_nr_col] == i][[self.case_id_col] + self.dynamic_cols]
        data_selected.columns = [self.case_id_col] + [
            "%s_%s%s" % (col, self.dyn_event_marker, i) for col in self.dynamic_cols
        ]
        data_final = pd.merge(data_final, data_selected, on=self.case_id_col, how="left")

    # Last-state columns: same layout, but missing values are carried
    # forward from the previous event's column.
    for i in range(1, self.max_events + 1):
        data_selected = X[X[self.event_nr_col] == i][[self.case_id_col] + self.last_state_cols]
        data_selected.columns = [self.case_id_col] + [
            "%s_%s%s" % (col, self.last_event_marker, i) for col in self.last_state_cols
        ]
        data_final = pd.merge(data_final, data_selected, on=self.case_id_col, how="left")
        if i > 1:
            for col in self.last_state_cols:
                cur = "%s_%s%s" % (col, self.last_event_marker, i)
                prev = "%s_%s%s" % (col, self.last_event_marker, i - 1)
                missing = pd.isnull(data_final[cur])
                # BUG FIX: assign through .loc on the frame itself; the
                # original chained form data_final[cur].loc[missing] = ...
                # writes to a temporary and is not guaranteed to update
                # data_final.
                data_final.loc[missing, cur] = data_final.loc[missing, prev]

    # One-hot encode every categorical column (all event copies of the
    # dynamic ones plus the static ones).
    dynamic_cat_cols = [col for col in self.cat_cols if col in self.dynamic_cols]
    static_cat_cols = [col for col in self.cat_cols if col in self.static_cols]
    categorical_cols = [
        "%s_%s%s" % (col, self.dyn_event_marker, i)
        for i in range(1, self.max_events + 1)
        for col in dynamic_cat_cols
    ] + static_cat_cols
    cat_dict = data_final[categorical_cols].T.to_dict().values()
    vectorizer = DV(sparse=False)
    vec_cat_dict = vectorizer.fit_transform(cat_dict)
    cat_data = pd.DataFrame(vec_cat_dict, columns=vectorizer.feature_names_)
    data_final = pd.concat(
        [data_final.drop(categorical_cols, axis=1), cat_data], axis=1)

    # Append the case length (max event number per case).
    # BUG FIX: the original used groupby(...).agg({"case_length": "max"}),
    # whose dict-renaming on a SeriesGroupBy was deprecated in pandas 0.20
    # and removed in pandas 1.0; compute max() and rename explicitly.
    case_length = (
        X.groupby(self.case_id_col)[self.event_nr_col]
        .max()
        .reset_index()
        .rename(columns={self.event_nr_col: "case_length"})
    )
    data_final = pd.merge(data_final, case_length, on=self.case_id_col, how="left")

    # fill NA: zeros for numeric columns, empty string otherwise.
    if self.fillna:
        for col in data_final:
            dt = data_final[col].dtype
            if dt == int or dt == float:
                data_final[col].fillna(0, inplace=True)
            else:
                data_final[col].fillna("", inplace=True)
    return data_final
def _complex_encode(self, X):
    """Index-encode the first self.nr_events events of each case into one row.

    Like pre_encode, but bounded by self.nr_events, using right joins (keeps
    exactly the cases that reached each event), a per-row backward search to
    fill missing last-state values, and re-alignment to the column set seen
    at fit time (self.fitted_columns).

    Parameters
    ----------
    X : pd.DataFrame
        Event log, one row per event.

    Returns
    -------
    pd.DataFrame
        One row per case with self.nr_events events encoded.
    """
    # Static columns come from the first event; keep label and case id.
    if self.label_col not in self.static_cols:
        self.static_cols.append(self.label_col)
    if self.case_id_col not in self.static_cols:
        self.static_cols.append(self.case_id_col)
    data_final = X[X[self.event_nr_col] == 1][self.static_cols]

    # Dynamic columns: one suffixed copy per event number.
    for i in range(1, self.nr_events + 1):
        data_selected = X[X[self.event_nr_col] == i][[self.case_id_col] + self.dynamic_cols]
        data_selected.columns = [self.case_id_col] + [
            "%s_%s" % (col, i) for col in self.dynamic_cols
        ]
        data_final = pd.merge(data_final, data_selected, on=self.case_id_col, how="right")

    # Last-state columns: take the value at event nr_events and, where it is
    # missing, walk backwards through earlier events of the same case.
    # NOTE(review): this per-row loop is O(cases * events) with a full-frame
    # scan per step — correct but slow on large logs.
    for col in self.last_state_cols:
        data_final = pd.merge(
            data_final,
            X[X[self.event_nr_col] == self.nr_events][[self.case_id_col, col]],
            on=self.case_id_col,
            how="right")
        for idx, row in data_final.iterrows():
            current_nr_events = self.nr_events - 1
            while pd.isnull(data_final.loc[idx, col]) and current_nr_events > 0:
                data_final.loc[idx, col] = X[
                    (X[self.case_id_col] == row[self.case_id_col])
                    & (X[self.event_nr_col] == current_nr_events)].iloc[0][col]
                current_nr_events -= 1

    # One-hot encode categorical columns (all event copies of the dynamic
    # ones plus the static ones).
    dynamic_cat_cols = [col for col in self.cat_cols if col in self.dynamic_cols]
    static_cat_cols = [col for col in self.cat_cols if col in self.static_cols]
    # FIX: variable was misspelled "catecorical_cols" in the original.
    categorical_cols = [
        "%s_%s" % (col, i)
        for i in range(1, self.nr_events + 1)
        for col in dynamic_cat_cols
    ] + static_cat_cols
    cat_dict = data_final[categorical_cols].T.to_dict().values()
    vectorizer = DV(sparse=False)
    vec_cat_dict = vectorizer.fit_transform(cat_dict)
    cat_data = pd.DataFrame(vec_cat_dict, columns=vectorizer.feature_names_)
    data_final = pd.concat(
        [data_final.drop(categorical_cols, axis=1), cat_data], axis=1)

    # Align with the column set seen at fit time: add missing one-hot
    # columns as zeros and restore the fitted column order.
    if self.fitted_columns is not None:
        missing_cols = self.fitted_columns[~self.fitted_columns.isin(data_final.columns)]
        for col in missing_cols:
            data_final[col] = 0
        data_final = data_final[self.fitted_columns]
    else:
        self.fitted_columns = data_final.columns

    # fill NA: zeros for numeric columns, empty string otherwise.
    if self.fillna:
        for col in data_final:
            dt = data_final[col].dtype
            if dt == int or dt == float:
                data_final[col].fillna(0, inplace=True)
            else:
                data_final[col].fillna("", inplace=True)
    return data_final
def _cv_confusion_reports(classifier, vec_train, target, class_names,
                          raw_name_fmt, norm_name_fmt, report_name_fmt):
    """Run 10-fold CV for `classifier` and, per fold, save a raw confusion
    matrix plot, a normalized one, and a classification-report plot.

    The three "...-%04d.pdf" filename patterns receive the 1-based fold
    number.  Uses the old sklearn cross_validation.KFold API, matching the
    rest of this file.
    """
    counter = 0
    # Fold count derived from the data instead of the original hard-coded
    # 22165 samples; identical behavior when len(target) == 22165.
    cv = cross_validation.KFold(len(target), n_folds=10, shuffle=False,
                                random_state=None)
    for train_index, test_index in cv:
        counter = counter + 1
        X_tr, X_tes = vec_train[train_index], vec_train[test_index]
        y_tr, y_tes = target[train_index], target[test_index]
        clf = classifier.fit(X_tr, y_tr)
        y_pred = clf.predict(X_tes)
        cnf_matrix = confusion_matrix(y_tes, y_pred)
        np.set_printoptions(precision=2)
        print(cnf_matrix)
        # Plot non-normalized confusion matrix
        plt.figure()
        plot_confusion_matrix(cnf_matrix, classes=class_names,
                              title='Confusion matrix, without normalization')
        plt.savefig(raw_name_fmt % counter, dpi=200, format='pdf',
                    bbox_inches='tight')
        # Plot normalized confusion matrix
        plt.figure()
        plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                              title='Normalized confusion matrix')
        plt.savefig(norm_name_fmt % counter, dpi=200, format='pdf',
                    bbox_inches='tight')
        # BUG FIX: classification_report takes (y_true, y_pred); the original
        # passed them swapped, which transposes precision and recall.
        plot_classification_report(classification_report(y_tes, y_pred))
        plt.savefig(report_name_fmt % counter, dpi=200, format='pdf',
                    bbox_inches='tight')
        plt.close()


def matrix(wordlength, length, instance):
    """Train RBF SVMs on windowed protein sequences and save per-fold
    confusion-matrix / classification-report plots.

    Reads alternating sequence/topology line pairs from 'prototext.txt',
    slides a window of `wordlength` residues over each J-padded sequence,
    one-hot encodes the feature frame loaded from 'finaltest.csv', scales it,
    and evaluates both a class-weighted and an unweighted SVC with 10-fold
    cross-validation, writing PDF plots per fold.

    Parameters
    ----------
    wordlength : int
        Window size in residues (passed to wordpro() for the pad width).
    length : int
        Expected window length; only 21 is supported here.
    instance : int
        Unused in this function; kept for interface compatibility.
    """
    # Topology letters -> class codes.
    di = {'i': 0, 'o': 1, 'P': 2, 'L': 3}
    # Amino-acid letters -> integer codes ('J' is the padding residue).
    damino = {'A': 1, 'R': 2, 'D': 3, 'N': 4, 'C': 5, 'E': 6, 'Q': 7,
              'G': 8, 'H': 9, 'I': 10, 'L': 11, 'K': 12, 'M': 13, 'F': 14,
              'P': 15, 'S': 16, 'T': 17, 'W': 18, 'Y': 19, 'V': 20, 'J': 21}
    word_list = []
    toplogy_list = []
    z = wordpro(wordlength)
    # FIX: close the input file deterministically (the original leaked the
    # handle).  The file alternates sequence lines and topology lines;
    # next(filein) consumes the topology line following each sequence.
    with open('prototext.txt', 'r') as filein:
        for line in filein:
            temp_line = line.rstrip()
            # Pad both ends with 'J' so every residue gets a full window.
            temporary_string = ("J" * z) + temp_line + ("J" * z)
            for each in window(temporary_string, wordlength):
                temp = ''.join(each)
                # Map every residue through damino; the result was unused in
                # the original, but the lookup validates the alphabet by
                # raising KeyError on unexpected letters — keep that.
                [damino[c] for c in temp]
                word_list.append(temp)
            temporary_topology = next(filein).rstrip()
            for c in temporary_topology:
                toplogy_list.append(di[c])
    if length != 21:
        # FIX: the original fell through with `df` undefined (NameError)
        # for any other length; fail with an explicit message instead.
        raise ValueError("only length == 21 is supported")
    df = pd.read_csv("finaltest.csv")
    train_dict = df.T.to_dict().values()
    vectorizer = DV(sparse=False)
    vec_train = vectorizer.fit_transform(train_dict)
    # Scale each feature to [-1, 1] without shifting zeros.
    max_abs_scaler = preprocessing.MaxAbsScaler()
    vec_train = max_abs_scaler.fit_transform(vec_train)
    print(vectorizer.get_feature_names())
    target = np.asarray(toplogy_list)
    class_names = ['i', 'o', 'P', 'L']
    # Class-weighted SVM.
    _cv_confusion_reports(
        svm.SVC(kernel='rbf', gamma=0.027826, class_weight="balanced"),
        vec_train, target, class_names,
        "Plot_wnr-%04d.pdf", "Plot_nr-%04d.pdf", "Plot_cr_w-%04d.pdf")
    # Unweighted SVM (same filenames prefixed with "uw_"; the report keeps
    # the original "Plot_cr_uw" pattern).
    _cv_confusion_reports(
        svm.SVC(kernel='rbf', gamma=0.027826),
        vec_train, target, class_names,
        "uw_Plot_wnr-%04d.pdf", "uw_Plot_nr-%04d.pdf", "Plot_cr_uw-%04d.pdf")
# В предыдущей ячейке мы разделили наш датасет ещё на две части: в одной присутствуют только вещественные признаки, в другой только категориальные. Это понадобится нам для раздельной последующей обработке этих данных, а так же для сравнения качества работы тех или иных методов. # # Для использования модели регрессии требуется преобразовать категориальные признаки в вещественные. Рассмотрим основной способ преоборазования категориальных признаков в вещественные: one-hot encoding. Его идея заключается в том, что мы преобразуем категориальный признак при помощи бинарного кода: каждой категории ставим в соответствие набор из нулей и единиц. # # Посмотрим, как данный метод работает на простом наборе данных. # In[ ]: from sklearn.linear_model import LogisticRegression as LR from sklearn.feature_extraction import DictVectorizer as DV categorial_data = pd.DataFrame({'sex': ['male', 'female', 'male', 'female'], 'nationality': ['American', 'European', 'Asian', 'European']}) print('Исходные данные:\n') print(categorial_data) encoder = DV(sparse = False) encoded_data = encoder.fit_transform(categorial_data.T.to_dict().values()) print('\nЗакодированные данные:\n') print(encoded_data) # Как видно, в первые три колонки оказалась закодированна информация о стране, а во вторые две - о поле. При этом для совпадающих элементов выборки строки будут полностью совпадать. Также из примера видно, что кодирование признаков сильно увеличивает их количество, но полностью сохраняет информацию, в том числе о наличии пропущенных значений (их наличие просто становится одним из бинарных признаков в преобразованных данных). # # Теперь применим one-hot encoding к категориальным признакам из исходного датасета. Обратите внимание на общий для всех методов преобработки данных интерфейс. Функция # # encoder.fit_transform(X) # # позволяет вычислить необходимые параметры преобразования, впоследствии к новым данным можно уже применять функцию # # encoder.transform(X) #
def _encode_feature(self, splited_key, train_data, test_data, external_data,
                    logger):
    """Feature transfer and encoding.

    Derives shared features for train and test (date parts, age, name
    length, breed/color indicators, sex/intact flags), then encodes them
    either with a DictVectorizer one-hot encoding ('dv') or with per-column
    label encoding ('label'), depending on self._encode_type.

    Parameters
    ----------
    splited_key : str
        Identifier of the current split, used only for logging.
    train_data, test_data : pd.DataFrame
        Raw frames; test_data's 'ID' column is renamed to 'AnimalID'.
    external_data : unused here.
    logger : logging.Logger

    Returns
    -------
    tuple
        (train_x, train_y, test_x, model_infos) where model_infos carries
        the fitted encoders needed to decode / reapply the transformation.
    """
    # encode y
    logger.debug('splited_key[%s] encode y' % splited_key)
    (train_y, le_y) = self._encode_y(train_data['OutcomeType'].values, logger)
    (total_breed, total_color) = self._generate_combine_data(
        train_data, test_data, logger)
    # Align the id column name, then stack train and test so every derived
    # feature is computed consistently on both.
    test_data.rename(columns={'ID': 'AnimalID'}, inplace=True)
    feature_columns = test_data.columns
    # FIX: operate on explicit copies — assigning 'data_type' onto a slice
    # of train_data/test_data triggers SettingWithCopyWarning and is not
    # guaranteed to stick.
    feature_train_data = train_data[feature_columns].copy()
    feature_train_data.loc[:, 'data_type'] = 'train'
    feature_test_data = test_data[feature_columns].copy()
    feature_test_data.loc[:, 'data_type'] = 'test'
    logger.debug('feature_train_data columns %s' % str(feature_train_data.columns))
    logger.debug('feature_test_data columns %s' % str(feature_test_data.columns))
    data = pd.concat([feature_train_data, feature_test_data])
    logger.debug('feature_train_data shape %s' % str(feature_train_data.shape))
    logger.debug('feature_test_data shape %s' % str(feature_test_data.shape))
    logger.debug('data shape %s' % str(data.shape))
    logger.debug('splited_key[%s] encode x' % splited_key)
    # Date/time and age derived features.
    data['EncodeYear'] = data['DateTime'].apply(self._transfer_year_info)
    data['EncodeMonth'] = data['DateTime'].apply(self._transfer_month_info)
    data['EncodeWeekday'] = data['DateTime'].apply(self._transfer_weekday_info)
    data['EncodeHour'] = data['DateTime'].apply(self._transfer_hour_info)
    data['UnixDateTime'] = data['DateTime'].apply(self._transfer_unix_datetime_info)
    data['EncodeAgeuponOutcome'] = data['AgeuponOutcome'].apply(self._transfer_age_info)
    # Drop rows with an empty sex value.
    data = data[data['SexuponOutcome'] != '']
    data['NameLen'] = data['Name'].apply(self._transfer_name_len)
    if self._encode_type == 'dv':
        # One indicator column per known breed / color token.
        for breed_type in total_breed:
            data[breed_type] = data['Breed'].apply(
                self._transfer_breed_type_info, args=(breed_type, ))
        for color_type in total_color:
            data[color_type] = data['Color'].apply(
                self._transfer_color_type_info, args=(color_type, ))
    data['BreedMix'] = data['Breed'].apply(self._transfer_breed_mix_info)
    data['ColorCount'] = data['Color'].apply(self._transfer_color_count_info)
    data['Sex'] = data['SexuponOutcome'].apply(self._transfer_sex_info)
    data['Intact'] = data['SexuponOutcome'].apply(self._transfer_intact_info)
    logger.debug('transfer feature_train_data shape %s' % str(feature_train_data.shape))
    logger.debug('transfer feature_test_data shape %s' % str(feature_test_data.shape))
    # The raw columns are fully replaced by the derived features above.
    drop_list = [
        'AnimalID', 'Name', 'DateTime', 'AgeuponOutcome', 'SexuponOutcome'
    ]
    # FIX: pass axis as a keyword — the positional form is deprecated.
    data = data.drop(drop_list, axis=1)
    transfer_train_data = data[data['data_type'] == 'train']
    transfer_test_data = data[data['data_type'] == 'test']
    type_drop_list = ['data_type']
    transfer_train_data = transfer_train_data.drop(type_drop_list, axis=1)
    transfer_test_data = transfer_test_data.drop(type_drop_list, axis=1)
    data = data.drop(type_drop_list, axis=1)
    if self._encode_type == 'dv':
        # One-hot encoder, fitted on train+test so both share one vocabulary.
        x_all = data.T.to_dict().values()
        vectorizer_x = DV(sparse=False)
        vectorizer_x.fit(x_all)
        x1 = transfer_train_data.T.to_dict().values()
        # BUG FIX: use transform(), not fit_transform() — the original refit
        # the vectorizer on the train rows alone, silently discarding the
        # combined vocabulary fitted just above and desynchronizing the
        # train/test feature spaces.
        train_x = pd.DataFrame(vectorizer_x.transform(x1))
        x2 = transfer_test_data.T.to_dict().values()
        test_x = pd.DataFrame(vectorizer_x.transform(x2))
        model_infos = {'vectorizer_x': vectorizer_x, 'le_y': le_y}
    elif self._encode_type == 'label':
        # label encode
        col_le_dict = self._fit(data, logger)
        train_x = self._transform(transfer_train_data, col_le_dict, logger)
        test_x = self._transform(transfer_test_data, col_le_dict, logger)
        model_infos = {'col_le_dict': col_le_dict, 'le_y': le_y}
    else:
        raise ValueError("encode_type not valid, [label, dv] supported")
    logger.debug('splited_key[%s] train_x shape %s' % (splited_key, str(train_x.shape)))
    logger.debug('splited_key[%s] train_y shape %s' % (splited_key, str(train_y.shape)))
    logger.debug('splited_key[%s] test_x shape %s' % (splited_key, str(test_x.shape)))
    return (train_x, train_y, test_x, model_infos)
# activity_category one hot encoding
# Map each activity_category value (sorted) to a compact label 'a1'..'aN';
# values not seen here fall back to 'a0' via the defaultdict.
ac_cnt = train_input.activity_category.value_counts(normalize=True)
ac_cat_d = defaultdict(lambda: 'a0')
ac_cols = ac_cnt.index.values
ac_cols.sort()
for k, v in enumerate(ac_cols):
    ac_cat_d[v] = 'a' + str(k + 1)
# FIX: Python 3 print calls — the rest of this file uses print() functions,
# while this section used Python 2 print statements.
print(ac_cat_d)
train_input['activity_category'].replace(ac_cat_d, inplace=True)
# train_input['activity_category'].head()

# DictVectorizer - string to one-hot encoding
train_dict = train_input[['activity_category', 'char_10']].T.to_dict().values()
train_input = train_input.drop(['activity_category', 'char_10'], axis=1)
train_vectorizer = DV(sparse=False)
vec_train_feat = train_vectorizer.fit_transform(train_dict)
print(type(vec_train_feat))
print(vec_train_feat[0:5, :])
train_df = pd.concat([
    train_input[['outcome', 'people_id']],
    pd.DataFrame(vec_train_feat, dtype=int)
], axis=1)
print(train_df.shape)
print(train_df.head())

# test dataset - one-hot encoding
test_input = pd.read_csv('dataset/act_test_t.csv',
                         keep_default_na=True).fillna("-1")
# NOTE(review): c10_cat_d is defined elsewhere in this script; presumably the
# char_10 mapping built the same way as ac_cat_d above — confirm.
test_input['char_10'].replace(c10_cat_d, inplace=True)
def adjustResponse(resp):
    """Clamp a raw model prediction to the ordinal response range [1, 8].

    Values below 1 map to 1, values above 8 map to 8, and everything in
    between is rounded to the nearest integer (Python 3 banker's rounding).
    """
    if resp < 1:
        return 1
    elif resp > 8:
        return 8
    else:
        return int(round(resp))


os.chdir("/Users/swapnil/work/Kaggle/out/PLIA")
# FIX: Python 3 print call — this section used a Python 2 print statement
# while the rest of the file uses print().
print("Hello")
data = pd.read_csv("trans_train_sumMK.csv")
# NOTE(review): this reads the *training* csv a second time — presumably the
# test file was intended here; confirm the filename.
test = pd.read_csv("trans_train_sumMK.csv", na_values="NA")
# Vectorizer prepared for later use (not used in this section).
d = DV(sparse=True)
# Sentinel placeholder for missing values.
data = data.fillna(-9999)
test = test.fillna(-9999)
# Hold out 20% of the training data for validation.
trainData, cvData, yTrain, yCv = train_test_split(
    data, data["Response"], test_size=0.2, random_state=42)
# Remove the target and the id from the feature matrices (merged the two
# successive single-column drops into one call each).
trainData = trainData.drop(["Response", "Id"], axis=1)
cvData = cvData.drop(["Response", "Id"], axis=1)