def get_model(input_dim):
    """Build and compile a small dense regression network.

    The network is a ``Sequential`` stack of two ``Dense`` layers: a hidden
    layer with ``int(input_dim * 1.5)`` units and a single-unit output layer
    created with ``kernel_initializer='normal'``.  It is compiled with the
    Adam optimizer (learning rate ``0.0001``) and the
    ``'mean_squared_error'`` loss; ``model.summary()`` is called before the
    model is returned.

    Args:
        input_dim: Number of input features.  Used as the ``input_shape`` of
            the first layer and to size the hidden layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    logger.debug(f'The input size for DNN is:{input_dim}')
    lr = 0.0001

    model = Sequential()
    model.add(Dense(int(input_dim * 1.5), input_shape=(input_dim,)))
    model.add(Dense(1, kernel_initializer='normal'))

    # NOTE(review): ``lr=`` is kept because the rest of this file uses the
    # same keyword; newer Keras releases spell it ``learning_rate`` --
    # confirm against the installed version before changing it.
    adam = Adam(lr=lr)
    model.compile(loss='mean_squared_error', optimizer=adam)
    model.summary()
    return model
def get_feature_label_dnn(version, ensemble):
    """Load the feature/label frame for the DNN and mark label columns categorical.

    Args:
        version: Passed straight to ``get_stable_feature``.
        ensemble: When truthy, the frame is additionally passed through
            ``ensemble_feature_other_model`` together with a list of
            ``(name, path)`` pairs pointing at other models' saved outputs.

    Returns:
        The frame with its ``sex``, ``age`` and ``sex_age`` columns converted
        to the ``category`` dtype.
    """
    from code_felix.tiny.util import get_stable_feature

    frame = get_stable_feature(version)

    if ensemble:
        # NOTE(review): this list was reconstructed from a whitespace-
        # collapsed source; the three entries below are the ones that appear
        # to be active (several further entries were commented out).
        # Confirm against version control.
        other_model_outputs = [
            ('xgb_age', './sub/baseline_1.999298_3194_xgb_age_.h5'),
            ('xgb', './sub/baseline_2.606958_2666_xgb_1632_.h5'),
            ('lgb', './sub/baseline_2.61447_294_lgb_.h5'),
        ]
        frame = ensemble_feature_other_model(frame, other_model_outputs)

    # NOTE(review): assumed to run regardless of ``ensemble`` -- the nesting
    # is not recoverable from the collapsed source; confirm.
    for column in ('sex', 'age', 'sex_age'):
        frame[column] = frame[column].astype('category')

    logger.debug(f"type of sex is {frame['sex'].dtype}")
    return frame
def get_feature(hours=4):
    """Build one flattened look-back window of raw readings per report row.

    The raw input (with its ``time`` column removed when present) is sliced
    once per value ``end`` of the report index: rows labelled
    ``end - gap + 1`` through ``end`` are selected with ``.loc``, flattened
    row by row, rounded to 6 decimals and stored as one feature row.
    ``gap`` is ``3600 * hours // time_interval``.

    Output columns are named ``'<raw column>#<step>'``, with ``step`` running
    from ``0`` to ``gap - 1`` in the same order as the flattened window.

    Args:
        hours: Length of the look-back window; converted to a row count via
            ``time_interval`` from ``code_felix.feature.config``.

    Returns:
        A ``DataFrame`` indexed like the report, one column per
        (raw column, step) pair.
    """
    raw = get_raw_input()
    if 'time' in raw:
        del raw['time']
    logger.debug(raw.shape)

    report = get_report()
    logger.debug(f"The shape of the report is {report.shape}")

    from code_felix.feature.config import time_interval
    gap = 3600 * hours // time_interval

    # Step-major, column-minor: matches the row-major order of ``flatten()``.
    final_columns = [
        f'{item}#{step}' for step in range(gap) for item in raw.columns
    ]

    feature = np.zeros((len(report), len(final_columns)))
    for row, end in enumerate(report.index):
        window = raw.loc[end - gap + 1:end, :]
        feature[row] = np.round(window.values.flatten(), 6)

    return pd.DataFrame(feature, columns=final_columns, index=report.index)
def predict(self, X_test):
    """Predict with the model stored at ``self.best_model``.

    The model is loaded from ``self.best_model`` on every call via
    ``models.load_model``.

    Args:
        X_test: Input passed unchanged to the loaded model's ``predict``.

    Returns:
        Column ``0`` of the 2-D prediction array.
    """
    best = models.load_model(self.best_model)
    predictions = best.predict(X_test)
    logger.debug(f"y_test:{predictions.shape}")
    first_output = predictions[:, 0]
    return first_output
def transform(self, X, y=None):
    """Replace infinities and missing values in ``X`` with ``self.fill``.

    Positive and negative infinity are first turned into NaN so that a
    single ``fillna`` covers both cases.

    Args:
        X: A named pandas Series.
        y: Ignored.

    Returns:
        A Series with infinities and NaN replaced by ``self.fill``.
    """
    logger.debug("Try to fill %s with value %s" % (X.name, self.fill))
    infinities = [numpy.inf, -numpy.inf]
    without_inf = X.replace(infinities, numpy.nan)
    return without_inf.fillna(self.fill)
def _reduce_mem_usage(df, verbose=True): if isinstance(df, pd.Series): return df numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64'] mem = df.memory_usage() mem = mem if isinstance(mem, (int, float)) else mem.sum() start_mem = mem / 1024**2 for col in df.columns: col_type = df[col].dtypes if col_type in numerics: c_min = df[col].min() c_max = df[col].max() if str(col_type)[:3] == 'int': if c_min > np.iinfo(np.int8).min and c_max < np.iinfo( np.int8).max: df[col] = df[col].astype(np.int8) elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo( np.int16).max: df[col] = df[col].astype(np.int16) elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo( np.int32).max: df[col] = df[col].astype(np.int32) elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo( np.int64).max: df[col] = df[col].astype(np.int64) else: if c_min > np.finfo(np.float16).min and c_max < np.finfo( np.float16).max: df[col] = df[col].astype(np.float16) elif c_min > np.finfo(np.float32).min and c_max < np.finfo( np.float32).max: df[col] = df[col].astype(np.float32) else: df[col] = df[col].astype(np.float64) mem = df.memory_usage() mem = mem if isinstance(mem, (int, float)) else mem.sum() end_mem = mem / 1024**2 if verbose: logger.debug( 'Mem. usage decreased from {:7.2f} to {:7.2f} Mb ({:.1f}% reduction)' .format(start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem)) return df
def get_raw_input():
    """Load every per-sensor CSV of both months into one wide DataFrame.

    For each of the two month folders, every directory entry whose name
    contains ``'csv'`` is read (no header row) as three columns:
    ``time_sn``, ``time`` and ``val#NN``, where ``NN`` is the zero-padded
    position of the file name in ``get_file_order()``.  ``time_sn`` becomes
    the index; for month 5 it is first shifted by 518400.  The ``time``
    column is kept only for the very first file read and dropped from all
    later ones.  Files of one month are concatenated column-wise, the months
    row-wise, and the columns are finally sorted by name.

    Returns:
        The combined ``DataFrame`` indexed by ``time_sn``.

    Raises:
        KeyError: If a CSV file name is not present in ``get_file_order()``.
    """
    rootdir = [
        './input/fix_interval/生产参数记录表(固定时间间隔)-2018年4月',
        './input/fix_interval/生产参数记录表(固定时间间隔)-2018年5月',
    ]

    # The file-name -> serial-number mapping does not change between files,
    # so look it up once instead of once per CSV.
    file_sn_dict = get_file_order()

    df_month_list = []
    drop_time = False
    for cur_dir, month in zip(rootdir, [4, 5]):
        csv_paths = [
            f'{cur_dir}/{item}' for item in os.listdir(cur_dir) if 'csv' in item
        ]

        df_list = []
        for path in csv_paths:
            file_name = os.path.basename(path)
            logger.debug(f'Get {file_name} base on {path}')
            file_sn = file_sn_dict[file_name]

            # Opened with the platform default encoding, as before, but now
            # closed deterministically.
            with open(path) as handle:
                df = pd.read_csv(handle, header=None)
            df.columns = ['time_sn', 'time', f'val#{str(file_sn).rjust(2, "0")}']

            if month == 5:
                # presumably the number of samples in month 4, so that the
                # two months do not overlap -- TODO confirm
                df.time_sn = df.time_sn + 518400
            df.set_index('time_sn', inplace=True)

            # Keep ``time`` only from the first file read overall.
            # NOTE(review): nesting reconstructed from a collapsed source.
            if drop_time:
                df.drop(['time'], axis=1, inplace=True)
            drop_time = True

            df_list.append(df)

        df_month_list.append(pd.concat(df_list, axis=1))

    combined = pd.concat(df_month_list)
    combined.sort_index(axis=1, inplace=True)
    return combined
def fit(self, X_train, y_train, X_valid, y_valid):
    """Train a fresh network and checkpoint the best epoch to ``self.best_model``.

    A model is obtained from ``self.get_dnn_model(self.model_type,
    self.input_dim, self.dropout)`` and trained for up to 100 epochs with a
    batch size of 128.  Three callbacks, all monitoring ``val_loss``, are
    attached: a checkpoint that saves only the best model, early stopping
    with patience 20, and a learning-rate reduction (factor 0.2,
    patience 10).

    Args:
        X_train: Training inputs.
        y_train: Training targets.
        X_valid: Validation inputs.
        y_valid: Validation targets.

    Returns:
        None.  The best epoch and its validation loss are logged.
    """
    callbacks = [
        ModelCheckpoint(filepath=self.best_model,
                        monitor='val_loss',
                        verbose=1,
                        save_best_only=True,
                        mode='min'),
        EarlyStopping(monitor='val_loss', verbose=1, patience=20),
        ReduceLROnPlateau(monitor='val_loss',
                          factor=0.2,
                          patience=10,
                          verbose=1,
                          mode='min'),
    ]

    network = self.get_dnn_model(self.model_type, self.input_dim, self.dropout)
    history = network.fit(
        X_train,
        y_train,
        validation_data=(X_valid, y_valid),
        callbacks=callbacks,
        batch_size=128,
        epochs=100,
        verbose=1,
    )

    val_losses = np.array(history.history['val_loss'])
    best_epoch = val_losses.argmin() + 1  # epochs are reported 1-based
    best_score = val_losses.min()
    logger.debug(
        f'Best model save to:{self.best_model}, input:{X_train.shape}, bets_epoch:{best_epoch}, best_score:{best_score}'
    )
def fit(self, X, y=None):
    """Choose the fill value for series ``X`` and store it in ``self.fill``.

    The value is picked by the first matching rule:

    * the series is named ``'renewed_yorn'``  -> the string ``'null'``;
    * the dtype name contains ``'datetime'``  -> NaN (a warning is logged);
    * the dtype is ``object`` or ``category`` -> the first entry of
      ``value_counts()``, or NaN when ``value_counts()`` is empty;
    * anything else                           -> the mean, or ``0`` when the
      mean is NaN.

    Args:
        X: A named pandas Series.
        y: Ignored.

    Returns:
        ``self``.
    """
    if X.name == 'renewed_yorn':
        self.fill = 'null'
    elif 'datetime' in X.dtype.name:
        logger.warning(
            "%s will be fill missing value with None as datetime" % X.name)
        self.fill = numpy.nan
    elif X.dtype.name in ['object', 'category']:
        counts = X.value_counts()
        self.fill = numpy.nan if len(counts) == 0 else counts.index[0]
        logger.debug("Fill %s with value:%s, type:%s, count:%d" %
                     (X.name, self.fill, X.dtype, len(X.unique())))
    else:
        mean = X.mean()
        # ``numpy.math`` was only an alias of the stdlib ``math`` module and
        # no longer exists in current NumPy; ``numpy.isnan`` is the
        # supported spelling.
        self.fill = 0 if numpy.isnan(mean) else mean
        logger.debug("Fill %s with value:%s, type:%s" %
                     (X.name, self.fill, X.dtype))
    return self
def convert_missing(sample):
    """Fill missing values in every column of ``sample``.

    One ``SeriesImputer`` per column is fitted on the rows where
    ``label_del == 'train'`` and then applied to the same column of the full
    frame.

    Args:
        sample: A ``DataFrame`` that contains a ``label_del`` column.

    Returns:
        A new ``DataFrame`` in which each column is the result of that
        column's imputer ``transform``.
    """
    imputer = defaultdict(SeriesImputer)

    train_temp = sample[sample.label_del == 'train']
    logger.debug("Begin to fill the missing data base on %s/%s" %
                 (len(train_temp), len(sample)))
    logger.debug("try to get the fill value base on %d training data" %
                 len(train_temp))

    # Fit column by column.  The previous ``apply(..., reduce=False)`` used a
    # keyword that current pandas no longer accepts, and its result was
    # thrown away anyway.
    for name, column in train_temp.items():
        imputer[name].fit(column)
    del train_temp

    sample = sample.apply(lambda x: imputer[x.name].transform(x))

    logger.debug("There are %d columns already fill the missing value" %
                 len(imputer))
    logger.debug("End clean the data" + str(sample.shape))
    return sample
def learning(model, Xtrain, y, Xtest, label_name, number_of_folds=5, seed=777,
             epochs=2, batch_size=8):
    """Run K-fold training and save out-of-fold / test predictions for ensembling.

    For each fold of a shuffled ``KFold`` split, ``model.fit`` is called on
    the training part with the held-out part as validation data, the
    held-out rows are predicted (out-of-fold prediction) and the test set is
    predicted.  Test predictions are averaged over the folds.  The three
    resulting frames (train predictions, test predictions, labels) are
    handed to ``save_result_for_ensemble``.

    NOTE(review): the same ``model`` object is fitted again in every fold
    without being re-initialised, so later folds start from weights that
    have already seen their validation rows.  Left as is; confirm whether
    this is intended.

    Args:
        model: Object with Keras-style ``fit`` (returning a history with
            ``history['val_loss']``) and ``predict``.
        Xtrain: Training features (``DataFrame``); its index is reused for
            the saved train predictions and labels.
        y: Training target, one value per row of ``Xtrain``.
        Xtest: Test features (``DataFrame``); its index is reused for the
            saved test predictions.
        label_name: Column name used in all three saved frames and in the
            output name.
        number_of_folds: Number of K-fold splits.
        seed: ``random_state`` of the K-fold shuffle.
        epochs: Epochs per fold (default matches the previous hard-coded 2).
        batch_size: Batch size per fold (default matches the previous
            hard-coded 8).

    Returns:
        None.
    """
    train_index = Xtrain.index
    test_index = Xtest.index
    train_values = Xtrain.values
    test_values = Xtest.values
    # Work on a flat array so fold indices are purely positional.
    y_values = np.ravel(np.asarray(y))

    logger.debug(f'train:{Xtrain.shape}, label:{y.shape}, test:{Xtest.shape}')
    print('Model: %s' % model)

    train_predict_y = np.zeros(len(y_values))
    test_predict_y = np.zeros(test_values.shape[0])
    learn_loss = 0.

    logger.debug(f'y_train.shape:{y.shape}')

    from sklearn.model_selection import KFold
    skf = KFold(n_splits=number_of_folds, shuffle=True, random_state=seed)

    for i, (train_idx, val_idx) in enumerate(skf.split(train_values)):
        logger.debug(f'Fold#{i + 1}')
        history = model.fit(
            train_values[train_idx],
            y_values[train_idx],
            validation_data=(train_values[val_idx], y_values[val_idx]),
            batch_size=batch_size,
            epochs=epochs,
            verbose=1,
        )
        val_losses = np.array(history.history['val_loss'])
        best_epoch = val_losses.argmin() + 1
        best_score = val_losses.min()
        logger.debug(f"Fold#{i+1} arrive {best_score} at {best_epoch}")

        # ``predict`` of a single-output network returns shape (n, 1);
        # flatten it so it fits the 1-D buffers instead of failing on
        # assignment or silently broadcasting to (n, n) on addition.
        scoring = np.ravel(model.predict(train_values[val_idx]))
        train_predict_y[val_idx] = scoring

        l_score = mean_squared_error(y_values[val_idx], scoring)
        learn_loss += l_score
        logger.debug('Fold %d score: %f' % (i + 1, l_score))

        test_predict_y = test_predict_y + np.ravel(model.predict(test_values))

    test_predict_y = test_predict_y / number_of_folds

    avg_loss = round(learn_loss / number_of_folds, 6)
    print('average val log_loss: %f' % (avg_loss))
    print('training whole data for test prediction...')

    # Save result for ensemble
    train_bk = pd.DataFrame(train_predict_y,
                            index=train_index,
                            columns=[label_name])
    test_bk = pd.DataFrame(test_predict_y,
                           index=test_index,
                           columns=[label_name])
    # Raw values are used so the labels are placed positionally on
    # ``train_index``; passing a Series here would be re-aligned by label and
    # come out as NaN.
    label_bk = pd.DataFrame({label_name: y_values}, index=train_index)

    model_name = type(model).__name__
    save_result_for_ensemble(
        f'kfold_{label_name}_{model_name}_{avg_loss}',
        label_name=label_name,
        train=train_bk,
        test=test_bk,
        label=label_bk,
    )
def get_all_file(path):
    """Return the names of the entries in ``path`` that contain ``'.h5'``.

    Args:
        path: Directory to list.

    Returns:
        A list of bare entry names (not joined with ``path``), in the order
        ``os.listdir`` yields them.
    """
    logger.debug(f'Try to read file from"{path}')
    return [name for name in os.listdir(path) if '.h5' in name]
def get_train(hours_gap):
    """Return the first 144 rows of ``get_feature(hours_gap)``.

    Args:
        hours_gap: Forwarded to ``get_feature``.

    Returns:
        The sliced feature frame.
    """
    features = get_feature(hours_gap)
    train = features[:144]
    logger.debug(f'The size of train is:{train.shape}')
    return train
def get_file_order():
    """Map every name in the configured ``file_order`` to its position.

    Returns:
        A dict ``{name: position}`` with positions starting at 0.  The
        mapping, sorted by position, is logged at debug level.
    """
    from code_felix.feature.config import file_order

    file_sn = {name: position for position, name in enumerate(file_order)}
    logger.debug(sorted(file_sn.items(), key=lambda pair: pair[1]))
    return file_sn
def _build_dnn_classifier(input_dim, dropout, lr, n_classes):
    """Assemble and compile the softmax classifier used by ``train_dnn``."""
    model = Sequential()
    model.add(Dense(1200, input_shape=(input_dim,)))
    model.add(LeakyReLU(alpha=0.01))
    model.add(Dropout(dropout))
    model.add(Dense(100))
    model.add(LeakyReLU(alpha=0.01))
    model.add(BatchNormalization())
    model.add(Dropout(dropout))
    model.add(Dense(15))
    model.add(LeakyReLU(alpha=0.01))
    model.add(Dense(n_classes))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=lr))
    model.summary()
    return model


def train_dnn(dropout, lr, ensemble):
    """Train the 22-class DNN and save its predictions for ensembling.

    Rows of the feature frame whose ``sex`` is null form the test set, the
    rest the training set (further split by ``split_train``).  The network
    is trained with checkpointing on ``val_loss``; the checkpointed model is
    reloaded and used to predict both the training and the test rows.  The
    in-memory model is saved under a name containing the best score, the
    call arguments and the best epoch, and the predictions plus the
    ``sex_age`` category codes are passed to ``save_result_for_ensemble``.

    Args:
        dropout: Dropout rate of both ``Dropout`` layers.
        lr: Learning rate of the Adam optimizer.
        ensemble: Forwarded to ``get_feature_label_dnn``.

    Returns:
        None.
    """
    version = '1011'
    # Snapshot of the call arguments plus ``version``; it is embedded
    # verbatim in log lines and output names.
    args = dict(locals())
    logger.debug(f'Run train dnn:{args}')

    feature_label = get_feature_label_dnn(version, ensemble)
    test = feature_label[feature_label['sex'].isnull()]
    train = feature_label[feature_label['sex'].notnull()]
    logger.debug(f"type of sex is {feature_label['sex'].dtype}")

    X_train, X_test, y_train, y_test = split_train(train)
    input_dim = X_train.shape[1]
    logger.debug(
        f'X_train:{X_train.shape}, y_train:{y_train.shape}, score:{test.shape}, input_dim:{input_dim}'
    )

    # Width of the softmax layer; also used for the one-hot targets so that
    # both splits are encoded to this width even when one of them happens to
    # lack the highest class.
    n_classes = 22
    model = _build_dnn_classifier(input_dim, dropout, lr, n_classes)

    tmp_model = './model/checkpoint/dnn_best_tmp.hdf5'
    callbacks = [
        ModelCheckpoint(filepath=tmp_model,
                        monitor='val_loss',
                        verbose=1,
                        save_best_only=True,
                        mode='min'),
        EarlyStopping(monitor='val_loss', verbose=1, patience=100),
        ReduceLROnPlateau(monitor='val_loss',
                          factor=0.2,
                          patience=30,
                          verbose=1,
                          mode='min'),
    ]

    from keras.utils import np_utils
    logger.debug(f'y_train.shape:{y_train.shape}')
    history = model.fit(
        X_train,
        np_utils.to_categorical(y_train, n_classes),
        validation_data=(X_test, np_utils.to_categorical(y_test, n_classes)),
        callbacks=callbacks,
        batch_size=128,
        epochs=50000,
        verbose=1,
    )

    val_losses = np.array(history.history['val_loss'])
    best_epoch = val_losses.argmin() + 1  # epochs are reported 1-based
    best_score = val_losses.min()

    # Predict with the checkpointed (lowest val_loss) weights.
    classifier = models.load_model(tmp_model)
    non_feature_columns = ['sex', 'age', 'sex_age', 'device']
    pre_x = test.drop(non_feature_columns, axis=1)
    logger.debug(f'Test:{test.shape}, pre_x:{pre_x.shape}')
    logger.debug(f'pre_x.values:{pre_x.values.shape}')

    # ``predict`` returns the softmax outputs directly; computed once and
    # reused for both the submission frame and the ensemble backup.
    test_proba = classifier.predict(pre_x.values)

    # The CSV export of ``sub`` is currently disabled, so the frame is not
    # written anywhere; it is still built, which raises ``KeyError`` if any
    # of the 22 expected labels is missing from the categories.
    sub = pd.DataFrame(test_proba)
    sub.columns = train.sex_age.cat.categories
    sub['DeviceID'] = test['device'].values
    sub = sub[[
        'DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6', '1-7',
        '1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4', '2-5', '2-6',
        '2-7', '2-8', '2-9', '2-10'
    ]]

    logger.debug(f'best_score(his):{best_score} @ epoch:{best_epoch}')

    # NOTE(review): this saves ``model`` (weights after the last epoch) under
    # a name that carries the best score, not the reloaded best checkpoint
    # ``classifier`` -- confirm which one is intended.
    model_file = f'./model/checkpoint/dnn_best_{best_score}_{args}_epoch_{best_epoch}.hdf5'
    model.save(model_file, overwrite=True)

    print(
        f'=============Final train feature({len(feature_label.columns)}):\n{list(feature_label.columns)} \n {len(feature_label.columns)}'
    )

    # Save result for ensemble
    train_proba = classifier.predict(
        train.drop(non_feature_columns, axis=1).values)
    train_bk = pd.DataFrame(train_proba,
                            index=train.device,
                            columns=train.sex_age.cat.categories)
    test_bk = pd.DataFrame(test_proba,
                           index=test.device,
                           columns=test.sex_age.cat.categories)
    # ``.values`` places the codes positionally on the device index instead
    # of letting pandas re-align them by the frame's own row labels.
    label_bk = pd.DataFrame(
        {'label': train.sex_age.cat.codes.values},
        index=train.device,
    )

    from code_felix.tiny.util import save_result_for_ensemble
    save_result_for_ensemble(
        f'{round(best_score,5)}_{best_epoch}_v_{version}_dnn_{args}',
        train=train_bk,
        test=test_bk,
        label=label_bk,
    )