def load_data(train_pkl,
              test_pkl,
              label='is_y2',
              method=None,
              target_pn_ratio=None,
              seed=None):
    """Load pickled train/test frames and split each into features and label.

    parameter
    ----------
    train_pkl: path of the pickled training DataFrame
    test_pkl: path of the pickled test DataFrame
    label: name of the label column present in both frames
    method: sampling method forwarded to ``train_sampling``; falls back to
        'up' when ``target_pn_ratio`` is given and ``method`` is falsy
    target_pn_ratio: when truthy, the training frame is passed through
        ``train_sampling`` with this value as ``pn_ratio``
    seed: seed forwarded to ``train_sampling``

    return
    ----------
    (train_x, train_y, test_x, test_y) where the ``*_y`` items are the label
    column and the ``*_x`` items are the frames without that column.

    raise
    ----------
    AssertionError: if the training features contain any missing value
    """
    # NOTE: pd.read_pickle unpickles arbitrary objects; only feed it files
    # that come from a trusted source.
    train_data = pd.read_pickle(train_pkl)
    test_data = pd.read_pickle(test_pkl)
    print(
        colorize(
            'train-shape={}\t test_shape={}'.format(train_data.shape,
                                                    test_data.shape), 'blue',
            True))
    print(train_data.head())
    print(test_data.head())

    # Use the requested label column (the ratio used to be computed on a
    # hard-coded `is_y2` column regardless of `label`).
    n_pos = (train_data[label] == 1).sum()
    n_neg = (train_data[label] == 0).sum()
    # Without negative rows the ratio is unbounded; report inf instead of
    # failing on a division by zero.
    pn_ratio = n_pos / n_neg if n_neg else float('inf')
    print(colorize('naive-pn-ratio={:.4f}'.format(pn_ratio), 'blue', True))

    if target_pn_ratio:
        method = method or 'up'
        train_data = train_sampling(train_data,
                                    col=label,
                                    method=method,
                                    pn_ratio=target_pn_ratio,
                                    seed=seed)
    # The shuffle uses a fixed random_state (42); `seed` only reaches
    # train_sampling above.
    train_data = shuffle(train_data, random_state=42)
    print('shuffle:\n', train_data.head(20))
    print(colorize('train-shape={}'.format(train_data.shape), 'blue', True))

    train_y = train_data[label]
    train_x = train_data.drop(columns=[label])
    test_y = test_data[label]
    test_x = test_data.drop(columns=[label])
    assert train_x.isna().sum().sum() == 0
    return train_x, train_y, test_x, test_y
def _encoder_column(self, data, prefix, prefix_sep, dtype): if dtype is None: dtype = np.uint8 maps = self.mapping.get(prefix, {}) or self.tail_mapping.get( prefix, {}) dummy_strs = cycle([u'{prefix}{sep}{val}']) dummy_cols = [ dummy_str.format(prefix=prefix, sep=prefix_sep, val=str(v)) for dummy_str, v in zip(dummy_strs, maps.keys()) ] out_shape = (len(data), len(dummy_cols)) if isinstance(data, Series): index = data.index else: index = None data.reset_index(drop=True, inplace=True) # data :Series data2 = data.map(maps) null = data2[data2.isna()].index data2 = data2[data2.notna()] if not null.empty: print( colorize( "{} only exist in test data column '{}'".format( set(data[null].values), prefix), 'cyan', True)) row_idxs = data2.index.tolist() col_idxs = data2.values.tolist() sarr = csr_matrix((np.ones(len(row_idxs)), (row_idxs, col_idxs)), shape=out_shape, dtype=dtype) if pd.__version__ >= '0.25': out = pd.DataFrame.sparse.from_spmatrix( sarr, index=index, columns=dummy_cols) #sparse accessor, out.sparse.to_dense() # dense.astype('Sparse[int]'), dense.astype(pd.SparseDtype(int,fill_value=0)) # out.astype(pd.SparseDtype(int, fill_value=0)) else: out = pd.SparseDataFrame(sarr, index=index, columns=dummy_cols, default_fill_value=0, dtype=dtype) return out.astype(dtype) # care of row and columns not covered by sarr
train_data = train_sampling(train_data, col='is_y2', method='down', pn_ratio=0.2, seed=2019) # train_data = train_sampling(train_data, col='is_y2', method='up', pn_ratio=0.5,seed=2019) pn_ratio = sum(train_data.is_y2 == 1) / sum(train_data.is_y2 == 0) print(train_data.head()) print(val_data.head()) train_data = train_data.values np.random.shuffle(train_data) # np.random.shuffle(train_data) n_state = train_data.shape[1] - 1 n_action = 1 print(colorize('pn-ratio={}'.format(pn_ratio), 'cyan', True)) print( colorize('action_dim=%d, state_dim=%d' % (n_action, n_state), 'cyan', True)) print( colorize( 'train_shape={}, val_shape={}'.format(train_data.shape, val_data.shape), 'cyan', True)) checkpoint_queen = MinHeap(max_size=5, compare_key=operator.itemgetter(0)) logger = Logger(output_dir='../assets', output_fname='ddpg_epoch_log') config = get_session_config(frac=0.4, allow_growth=True, gpu="0") ddpg = DDPG(n_state=n_state, n_action=n_action,
# Script fragment: train `net` on the training split, validate on the
# validation split, and dump the per-epoch history to a timestamped CSV.

# Optimizer alternatives that were tried:
# Adam(lr=0.001) SGD(lr=0.001,nesterov=False)
net.optimizer = Adam(lr=0.001)

validation_pair = (val_x.values, val_y.values)
history = net.fit(train_x.values,
                  train_y.values,
                  validation_data=validation_pair,
                  epochs=200,
                  batch_size=64,
                  shuffle=True,
                  class_weight=None,
                  verbose=1)

# One row per epoch; the file name carries a month-day_hour-minute stamp.
df = pd.DataFrame(history.history)
stamp = datetime.now().strftime('%m%d_%H%M')
csv_path = '../assets/misc_' + stamp + '.csv'
df.to_csv(csv_path, index=False)
banner = 'done'.center(50, '-')
print(colorize(banner, 'green', True))

# model = Sequential()
# l1 =0
# l2 =0
# model.add(Dense(units=64, activation='relu',input_dim=train_x.shape[1],
#                 kernel_regularizer=None, #regularizers.l1_l2(l1=l1, l2=l2)
#                 kernel_initializer = 'he_normal',name='fc1'))
#
# model.add(Dense(units=64, activation=None, kernel_regularizer=None,
#                 kernel_initializer='he_normal',name='fc2'))
# model.add(BatchNormalization())
# model.add(Activation(activation='relu'))
#
# model.add(Dense(units=64, activation=None, kernel_regularizer=None,
#                 kernel_initializer='he_normal'))
def fit(self, X, y=None, cols_to_encode=None, extra_numeric_cols=None):
    """Learn the encoding / scaling rules from `X`.

    parameter
    ----------
    X: DataFrame to generate one-hot-encoder rule
    y: label column in DataFrame X if provided
    cols_to_encode: specify the columns to be encoded (the list passed in is
        not modified)
    extra_numeric_cols: if cols_to_encode is provided this param will
        not be used, otherwise the columns returned by
        ``self.get_encode_cols(X)`` plus extra_numeric_cols will be encoded.

    return
    ----------
    self, with these attributes set or updated: ``_dim``, ``drop_cols``,
    ``encode_cols``, ``double_cols``, ``mapping``, ``scaler`` and, when
    ``long_tail_preproc == 'discretize'``, ``tail_bins``, ``tail_mapping``,
    ``long_tail_cols``, ``buckets`` and ``labels``.

    raise
    ----------
    ValueError: if `y` is not a column of `X`, if ``long_tail_preproc`` is
        set to anything other than 'discretize', or if ``double_preproc``
        contains neither 'minmax' nor 'normal'.
    """
    print('fitting....')
    assert isinstance(X, DataFrame), 'X should be DataFrame object'
    columns = X.columns.tolist()
    if y is not None:
        if y not in columns:
            raise ValueError('y is not in X.columns during {}.fit'.format(
                self.__class__.__name__))
        columns.remove(y)
    self._dim = len(columns)

    # drop null cols: columns whose null ratio reaches drop_na_ratio
    nulls = X.isnull().sum(axis=0) / len(X)
    drop_null_cols = nulls[nulls >= self.drop_na_ratio].index.tolist()
    X = X.drop(columns=drop_null_cols)
    self.drop_cols.extend(drop_null_cols)
    print(
        colorize(
            'drop_null_cols({})={}'.format(len(drop_null_cols),
                                           drop_null_cols), 'blue', True))

    # get encoder columns
    if cols_to_encode is None:
        cols = list(self.get_encode_cols(X))
        cols += list(extra_numeric_cols) if extra_numeric_cols else []
    else:
        # Copy: the label is filtered out below and the caller's list must
        # not be mutated.
        cols = list(cols_to_encode)
    # Skip the label and any column already dropped for nulls; the latter no
    # longer exists in X and would raise a KeyError on selection.
    null_dropped = set(drop_null_cols)
    cols = [c for c in cols if c != y and c not in null_dropped]
    cols = sorted(set(cols), key=columns.index)

    # convert na to sentinel value
    df = X[cols].fillna(self.na_sentinel, downcast='infer').astype(str)

    # generate rules: keep columns whose number of distinct values lies in
    # (1, category_threshold], drop the others
    colvals = {}
    drop_cat_cols = []
    sentinel = str(self.na_sentinel)
    for col in cols:
        values = df[col].unique().tolist()
        if sentinel in values and not self.dummy_na:
            values.remove(sentinel)
        if 1 < len(values) <= self.category_threshold:
            colvals[col] = values
        else:
            drop_cat_cols.append(col)
    print(
        colorize(
            'drop_cat_cols({})={}'.format(len(drop_cat_cols), drop_cat_cols),
            'blue', True))
    self.drop_cols.extend(drop_cat_cols)

    self.encode_cols = sorted(colvals, key=df.columns.get_loc)
    encoded = set(self.encode_cols)
    dropped = set(self.drop_cols)
    self.double_cols = [
        col for col in columns if col not in encoded and col not in dropped
    ]
    for col in self.encode_cols:
        self.mapping[col] = OrderedDict(
            (val, i) for i, val in enumerate(colvals[col]))

    # Commented-out alternative for deriving the column groups:
    # cats = df.apply(lambda x: x.unique().__len__(), axis=0)
    # subs = 0 if self.dummy_na else df.apply(lambda x: str(self.na_sentinel) in x.values)
    # cats -= subs
    # self.drop_cols = cats[(cats>=self.category_threshold)|(cats<=1)].index.tolist()
    # self.encode_cols = cats[~cats.index.isin(self.drop_cols)].index.tolist() # cats.index.difference(drop_cols), turns changed
    # self.double_cols = [col for col in columns if col not in self.encode_cols and col not in self.drop_cols]

    # long-tail-distribution(in double cols)
    if self.long_tail_preproc is not None:
        skews = X[self.double_cols].skew(skipna=True)
        thres = self.kwargs.get('skew_threshold', 5)
        lt_cols = skews[abs(skews) > thres].index.tolist()
        print(
            colorize('long-tail-cols({})={}'.format(len(lt_cols), lt_cols),
                     'blue', True))
        if self.long_tail_preproc == 'discretize':
            drop_tail_cols = []
            self.tail_bins = {}
            self.tail_mapping = {}
            self.long_tail_cols = []
            self.buckets = self.kwargs.get('buckets', 5)
            # bucket labels 'a', 'b', 'c', ...
            self.labels = [chr(ord('a') + i) for i in range(self.buckets)]
            for col in lt_cols:
                if X[col].unique().size < self.buckets:
                    drop_tail_cols.append(col)
                    continue
                _, bins = pd.qcut(X[col],
                                  q=self.buckets,
                                  labels=None,
                                  retbins=True,
                                  duplicates='drop')
                if bins.size < 3:  # at least 2 bins
                    drop_tail_cols.append(col)
                    continue
                self.tail_bins[col] = bins.tolist()
                self.tail_mapping[col] = OrderedDict(
                    (chr(ord('a') + i), i) for i in range(bins.size - 1))
                # A dedicated 'null' category is added only when more than
                # 1% of the rows are missing.
                if (X[col].isna().sum() / len(X)) > 0.01:
                    self.tail_mapping[col]['null'] = bins.size - 1
                self.double_cols.remove(col)
                self.long_tail_cols.append(col)
            self.drop_cols.extend(drop_tail_cols)
            self.encode_cols.extend(self.long_tail_cols)
            for col in drop_tail_cols:
                self.double_cols.remove(col)
            print(
                colorize(
                    'drop_tail_cols({})={}'.format(len(drop_tail_cols),
                                                   drop_tail_cols), 'blue',
                    True))
        else:
            # TODO(yuanyuqing163): implement boxcox transformation
            raise ValueError("boxcox transformation hasn't implemented yet")

    # Statistics of the remaining numeric columns, used by transform().
    self.scaler['mean'] = X[self.double_cols].mean()
    self.scaler['std'] = X[self.double_cols].std()
    if 'minmax' in self.double_preproc:
        self.scaler['min'] = X[self.double_cols].min()
        self.scaler['max'] = X[self.double_cols].max()
    elif 'normal' not in self.double_preproc:
        raise ValueError('double_process_type = {} not supported yet'.format(
            self.double_preproc))
    return self
def transform(self, X, y=None, dtype=None, inplace=False):
    """Apply the rules learnt by ``fit`` to `X`.

    parameter
    -----------
    X: DataFrame with the same feature columns that were seen by ``fit``
    y: name of the label column in X; when given, that column is appended
        unchanged as the last column of the result
    dtype: specifies the dtype of encoded value
    inplace: when False (default) a copy of X is modified, otherwise X itself
        is modified (columns dropped / filled / scaled)

    return
    -----------
    DataFrame made of the scaled ``double_cols``, followed by the one-hot
    columns of every column in ``encode_cols`` and, if `y` is given, the
    label column.

    raise
    -----------
    ValueError: if `y` is not in X, if X lacks a fitted column, or if
        ``long_tail_preproc`` is 'boxcox' (not implemented).
    AssertionError: if X is not a DataFrame or its number of feature columns
        differs from the one seen by ``fit``.
    """
    print('transform....')
    assert isinstance(X, DataFrame), 'X shoule be DataFrame object'
    columns = X.columns.tolist()
    target_df = []
    if y is not None:
        if y not in columns:
            raise ValueError("'y label {}' not in X".format(y))
        columns.remove(y)
        target_df = [X.loc[:, [y]]]
    assert self._dim == len(columns)
    diff_cols = set(self.encode_cols + self.double_cols).difference(columns)
    if len(diff_cols) > 0:
        raise ValueError(
            "X not includes encoded columns '{}'".format(diff_cols))

    if not inplace:
        X = X.copy()  # X=X.copy(deep=True)
    gc.collect()
    X.drop(self.drop_cols, axis=1, inplace=True)

    # Numeric columns: fill missing values with the fitted mean, then scale.
    X[self.double_cols] = X[self.double_cols].fillna(self.scaler['mean'])
    if 'normal' in self.double_preproc:
        X[self.double_cols] = (X[self.double_cols] - self.scaler['mean']) / (
            self.scaler['std'] + self.epsilon)
    # TODO:truncate interval [min,max]
    if 'minmax' in self.double_preproc:
        lbound = self.kwargs.get('lbound', 0)
        # The upper bound used to be read from the misspelt key 'hbould'
        # only; honour 'hbound' and keep the old key as a fallback.
        hbound = self.kwargs.get('hbound', self.kwargs.get('hbould', 1))
        X[self.double_cols] = lbound + (
            X[self.double_cols] - self.scaler['min']) / (
                self.scaler['max'] - self.scaler['min']) * (hbound - lbound)

    # long_tail_feature
    if self.long_tail_preproc == 'discretize':
        print('long_tail_discretize.....')
        for col in self.long_tail_cols:
            bins = self.tail_bins[col]
            buckets = len(bins) - 1
            # searchsorted yields 0..buckets+1; positions outside the fitted
            # range are folded into the first / last bucket label.
            idx = list(range(buckets + 2))
            val = [
                self.labels[0], *self.labels[:buckets],
                self.labels[buckets - 1]
            ]
            idx2val = dict(zip(idx, val))
            X[col] = X[col].map(
                lambda x, bins=bins: np.searchsorted(bins, x),
                na_action='ignore')
            X[col] = X[col].map(idx2val)
            if 'null' in self.tail_mapping[col]:
                X[col] = X[col].fillna('null')
            else:
                print(
                    colorize(
                        "'null' not long tail in '{}'=#{}".format(
                            col, X[col].isna().sum()), 'cyan', True))
    elif self.long_tail_preproc == 'boxcox':
        # The message previously had no placeholder, so the offending value
        # was silently dropped by format().
        raise ValueError("unsupported long_tail_preproc type '{}'".format(
            self.long_tail_preproc))

    # Must mirror the conversion done in fit() so values match the mapping.
    data_to_encode = X[self.encode_cols].fillna(
        self.na_sentinel, downcast='infer').astype(str)
    with_dummies = [X[self.double_cols]]
    prefix = self.encode_cols
    prefix_sep = cycle(['_'])
    print('encoding....')
    # DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
    for (_, series), pre, sep in zip(data_to_encode.items(), prefix,
                                     prefix_sep):
        dummy = self._encoder_column(series, pre, sep, dtype=dtype)
        with_dummies.append(dummy)
    result = pd.concat(with_dummies + target_df, axis=1)
    return result