def get_train_test_dp_ids(dataset_name):
    db_name = None
    if 'acquirevaluedshopperschallenge' in dataset_name:
        db_name = 'acquirevaluedshopperschallenge'
    elif 'homecreditdefaultrisk' in dataset_name:
        db_name = 'homecreditdefaultrisk'
    elif 'kddcup2014' in dataset_name:
        db_name = 'kddcup2014'

    if db_name is not None:
        db_info = get_db_info(db_name)
        train_dp_ids = db_info['train_dp_ids']
        test_dp_ids = db_info['test_dp_ids']
    else:
        ds_info = get_ds_info(dataset_name)
        n_datapoints = ds_info['meta']['n_datapoints']
        train_dp_ids = np.arange(n_datapoints)
        test_dp_ids = None

    return train_dp_ids, test_dp_ids
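# Minimal usage sketch, assuming this module is run directly: the three competition
# databases return their predefined train/test split, any other single-table dataset
# returns (all row ids, None). The dataset name below is only a placeholder.
if __name__ == '__main__':
    train_ids, test_ids = get_train_test_dp_ids('homecreditdefaultrisk_main_table')
    print(f'{len(train_ids)} train datapoints')
    if test_ids is not None:
        print(f'{len(test_ids)} test datapoints')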
def __init__(self, dataset_name=None, datapoint_ids=None, encoders=None):
    self.ds_name = dataset_name
    self.datapoint_ids = datapoint_ids
    self.encoders = encoders
    self.ds_info = get_ds_info(dataset_name)
    raw_data_path = os.path.join(data_root, self.ds_info['processed']['local_path'])

    # The three relational-database competitions ship with a named index column; all other
    # datasets are headerless CSVs whose column names come from the ds_info metadata.
    if 'acquirevaluedshopperschallenge' in dataset_name:
        self.raw_data = pd.read_csv(raw_data_path)
        self.raw_data.set_index('id', inplace=True)
    elif 'homecreditdefaultrisk' in dataset_name:
        self.raw_data = pd.read_csv(raw_data_path)
        self.raw_data.set_index('SK_ID_CURR', inplace=True)
    elif 'kddcup2014' in dataset_name:
        self.raw_data = pd.read_csv(raw_data_path)
        self.raw_data.set_index('projectid', inplace=True)
    else:
        col_names = [c['name'] for c in self.ds_info['meta']['columns']]
        self.raw_data = pd.read_csv(raw_data_path, header=None, names=col_names)

    if datapoint_ids is not None:
        self.raw_data = self.raw_data.loc[datapoint_ids]

    if self.ds_info['processed']['task'] == 'regression':
        targets = np.array(self.raw_data['TARGET']).astype(float)
        self.targets = torch.Tensor(targets)
    else:
        # Map the various spellings of the negative class to 0, missing values to NaN,
        # and everything else to the positive class 1.
        def tfm(x):
            if x in ['0', '-1', '0.0', 'no', 'No', 'neg', 'n', 'N', 'False', 'NRB', ' <=50K']:
                return 0
            elif x == 'nan':
                return np.nan
            else:
                return 1

        targets = self.raw_data['TARGET'].astype(str).transform(tfm)
        targets = np.array(targets).astype(float)
        self.targets = torch.LongTensor(targets)

    self.raw_data = self.raw_data[[i for i in self.raw_data.columns if i != 'TARGET']]
    self.columns = self.ds_info['meta']['columns'][1:]  # Omitting the target column
    self.cat_feat_origin_cards = None
    self.cont_feat_origin = None
    self.feature_encoders = None
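# Minimal usage sketch (kept as comments, since this __init__ sits inside the dataset class,
# assumed to be the TabularDataset referenced elsewhere in the repo). The encoder names
# mirror the defaults in get_kwargs and the dataset name is a placeholder:
#
#   train_ids, test_ids = get_train_test_dp_ids('homecreditdefaultrisk_main_table')
#   train_ds = TabularDataset('homecreditdefaultrisk_main_table',
#                             datapoint_ids=train_ids,
#                             encoders=dict(CATEGORICAL='CategoricalOrdinalEnc',
#                                           SCALAR='ScalarRobustScalerEnc'))
#   print(train_ds.raw_data.shape, train_ds.targets.shape)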
def __init__(self, writer, dataset_name, n_cont_features, cat_feat_origin_cards, max_emb_dim,
             activation_class_name, activation_class_kwargs, norm_class_name, norm_class_kwargs,
             p_dropout, one_hot_embeddings, drop_whole_embeddings,
             loss_class_name=None, loss_class_kwargs=None, n_out=None):
    super().__init__()
    self.writer = writer
    if dataset_name is not None:
        # Output width is derived from the dataset's task, so n_out must not also be given.
        assert n_out is None
        self.ds_info = get_ds_info(dataset_name)
        task = self.ds_info['processed']['task']
        if task == 'binary classification':
            self.n_out = 2
        elif task == 'multiclass classification':
            raise NotImplementedError  # todo
        elif task == 'regression':
            self.n_out = 1
        self.act_on_output = False
    else:
        # No dataset given: the caller specifies the output width explicitly.
        assert n_out is not None
        self.n_out = n_out
        self.act_on_output = True

    self.n_cont_features = n_cont_features
    self.cat_feat_origin_cards = cat_feat_origin_cards
    self.p_dropout = p_dropout
    self.drop_whole_embeddings = drop_whole_embeddings
    self.one_hot_embeddings = one_hot_embeddings
    self.act_class = activations.__dict__[activation_class_name]
    self.act_class_kwargs = activation_class_kwargs
    self.norm_class = nn.__dict__[norm_class_name]
    self.norm_class_kwargs = norm_class_kwargs
    self.loss_class_name = loss_class_name
    self.loss_class_kwargs = loss_class_kwargs

    # One embedding initializer per categorical feature; the total input width is the sum
    # of the embedding dims plus the number of continuous features.
    self.cat_initializers = nn.ModuleDict()
    if isinstance(self.cat_feat_origin_cards, list):
        for col_name, card in self.cat_feat_origin_cards:
            self.cat_initializers[col_name] = EmbeddingInitializer(
                card, max_emb_dim, p_dropout,
                drop_whole_embeddings=drop_whole_embeddings,
                one_hot=one_hot_embeddings)
        self.init_feat_dim = sum(i.emb_dim for i in self.cat_initializers.values()) + self.n_cont_features
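# Minimal construction sketch (kept as comments; TabModel is a placeholder name for whatever
# concrete class owns this __init__, and the cardinalities below are illustrative).
# cat_feat_origin_cards is a list of (column_name, cardinality) pairs, one per encoded
# categorical feature:
#
#   model = TabModel(writer=None,
#                    dataset_name='homecreditdefaultrisk_main_table',
#                    n_cont_features=12,
#                    cat_feat_origin_cards=[('NAME_CONTRACT_TYPE', 3), ('CODE_GENDER', 4)],
#                    max_emb_dim=32, activation_class_name='SELU', activation_class_kwargs={},
#                    norm_class_name='BatchNorm1d', norm_class_kwargs={},
#                    p_dropout=0.0, one_hot_embeddings=True, drop_whole_embeddings=False,
#                    loss_class_name='CrossEntropyLoss', loss_class_kwargs={})
#
# model.init_feat_dim is then the summed embedding dims plus n_cont_features.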
def get_kwargs(ds_name):
    ds_info = get_ds_info(ds_name)
    n_datapoints = ds_info['meta']['n_datapoints']
    n_columns = len(ds_info['meta']['columns'])
    weight_decay = 0.01

    ######################
    # Basic kwargs
    epochs = 500
    max_batch = 1024
    batch_size = min(n_datapoints // 30, max_batch)
    if batch_size == max_batch:
        # Shrink the batch size for wide tables.
        batch_size = int(batch_size // max(1, math.log10(n_columns) - 1))
    kwargs = dict(seed=1234,
                  debug_network=False,
                  encoders=dict(CATEGORICAL='CategoricalOrdinalEnc',
                                SCALAR='ScalarRobustScalerEnc',
                                DATETIME='DatetimeScalarEnc',
                                LATLONG='LatLongScalarEnc',
                                TEXT='TextSummaryScalarEnc'),
                  early_stopping_patience=epochs,
                  early_stopping_metric='loss',
                  max_nodes_per_graph=False,
                  train_fraction_to_use=1.0,
                  dataset_name=ds_name,
                  device='cuda',
                  find_lr=False,
                  epochs=epochs,
                  batch_size=batch_size,
                  num_workers=8)

    # LR Schedule
    kwargs.update(
        lr_scheduler_class_name='StepLR',
        lr_scheduler_kwargs=dict(step_size=1, gamma=1.0),
    )

    # Optimizer
    kwargs.update(
        optimizer_class_name='AdamW',
        optimizer_kwargs=dict(
            lr=5e-4,
            weight_decay=weight_decay,
        ),
        wd_bias=False,
        wd_embed=False,
        wd_bn=False,
    )

    # Sampler
    sampler_class_name = 'RandomSampler'
    sampler_class_kwargs = {}
    kwargs.update(sampler_class_name=sampler_class_name, sampler_class_kwargs=sampler_class_kwargs)

    # Model (model_class_name is expected to be defined at module scope)
    kwargs.update(model_class_name=model_class_name, model_kwargs=dict())
    kwargs['model_kwargs'].update(max_emb_dim=32,
                                  p_dropout=0.0,
                                  one_hot_embeddings=True,
                                  drop_whole_embeddings=False,
                                  activation_class_name='SELU',
                                  activation_class_kwargs={},
                                  norm_class_name='BatchNorm1d',
                                  norm_class_kwargs={},
                                  loss_class_name='CrossEntropyLoss',
                                  loss_class_kwargs={})

    return kwargs
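# Minimal usage sketch, assuming model_class_name is defined at module scope; the dataset
# name is a placeholder and the learning-rate override is only an example.
if __name__ == '__main__':
    kwargs = get_kwargs('homecreditdefaultrisk_main_table')
    kwargs['optimizer_kwargs']['lr'] = 1e-3  # example override before launching training
    print(kwargs['batch_size'], kwargs['model_kwargs']['max_emb_dim'])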
    print('Already made new dataset. Moving on.')
else:
    print(f'Appending acts to original dataset and saving to {new_ds_file}')
    orig_dataset_name = f'{db_name}_main_table'
    train_dp_ids, test_dp_ids = get_train_test_dp_ids(orig_dataset_name)
    dp_ids = np.concatenate([train_dp_ids, test_dp_ids]) if test_dp_ids is not None else train_dp_ids
    orig_dataset = TabularDataset(orig_dataset_name, dp_ids, encoders=None)
    orig_data = orig_dataset.raw_data
    orig_dataset_ds_info = get_ds_info(orig_dataset_name)
    orig_dataset_ds_info['processed']['local_path'] = new_ds_file

    # Align the activations with the original rows and prefix their columns with the model name.
    acts = acts.set_index(orig_data.index)
    act_cols = [f'{model_name}_act{i}' for i in acts.columns]
    acts = acts.rename(columns={i: f'{model_name}_act{i}' for i in acts.columns})
    targets = orig_dataset.targets.numpy()
    targets = pd.DataFrame({
        'TARGET': [i if i in [0, 1] else np.nan for i in targets]