import math
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn

# Project-specific helpers used throughout these snippets (get_db_info, get_ds_info,
# data_root, activations, EmbeddingInitializer) are assumed to come from this
# repository's own utility modules and are not shown here.


def get_train_test_dp_ids(dataset_name):
    db_name = None
    if 'acquirevaluedshopperschallenge' in dataset_name:
        db_name = 'acquirevaluedshopperschallenge'
    elif 'homecreditdefaultrisk' in dataset_name:
        db_name = 'homecreditdefaultrisk'
    elif 'kddcup2014' in dataset_name:
        db_name = 'kddcup2014'
    if db_name is not None:
        db_info = get_db_info(db_name)
        train_dp_ids = db_info['train_dp_ids']
        test_dp_ids = db_info['test_dp_ids']
    else:
        ds_info = get_ds_info(dataset_name)
        n_datapoints = ds_info['meta']['n_datapoints']
        train_dp_ids = np.arange(n_datapoints)
        test_dp_ids = None

    return train_dp_ids, test_dp_ids
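
# Usage sketch for the helper above: the dataset name is illustrative, and test_ids comes
# back as None for the single-table datasets handled by the else branch.
train_ids, test_ids = get_train_test_dp_ids('homecreditdefaultrisk_main_table')
print(len(train_ids), None if test_ids is None else len(test_ids))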


class TabularDataset:  # class name taken from its use further down this page; presumably a torch Dataset subclass in the original
    def __init__(self, dataset_name=None, datapoint_ids=None, encoders=None):
        self.ds_name = dataset_name
        self.datapoint_ids = datapoint_ids
        self.encoders = encoders
        self.ds_info = get_ds_info(dataset_name)
        raw_data_path = os.path.join(data_root, self.ds_info['processed']['local_path'])
        if 'acquirevaluedshopperschallenge' in dataset_name:
            self.raw_data = pd.read_csv(raw_data_path)
            self.raw_data.set_index('id', inplace=True)
        elif 'homecreditdefaultrisk' in dataset_name:
            self.raw_data = pd.read_csv(raw_data_path)
            self.raw_data.set_index('SK_ID_CURR', inplace=True)
        elif 'kddcup2014' in dataset_name:
            self.raw_data = pd.read_csv(raw_data_path)
            self.raw_data.set_index('projectid', inplace=True)
        else:
            col_names = [c['name'] for c in self.ds_info['meta']['columns']]
            self.raw_data = pd.read_csv(raw_data_path, header=None, names=col_names)

        if datapoint_ids is not None:
            self.raw_data = self.raw_data.loc[datapoint_ids]
        if self.ds_info['processed']['task'] == 'regression':
            targets = np.array(self.raw_data['TARGET']).astype(float)  # np.float was removed from NumPy
            self.targets = torch.Tensor(targets)
        else:
            def tfm(x):
                if x in ['0', '-1', '0.0', 'no', 'No', 'neg', 'n', 'N', 'False', 'NRB', ' <=50K']:
                    return 0
                elif x == 'nan':
                    return np.nan  # pd.np was removed from pandas; use numpy directly
                else:
                    return 1

            targets = self.raw_data['TARGET'].astype(str).transform(tfm)
            targets = np.array(targets).astype(float)
            self.targets = torch.LongTensor(targets)
        self.raw_data = self.raw_data[[i for i in self.raw_data.columns if i != 'TARGET']]

        self.columns = self.ds_info['meta']['columns'][1:]  # Omitting the target column
        self.cat_feat_origin_cards = None
        self.cont_feat_origin = None
        self.feature_encoders = None
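
# Continuing the sketch above: build the training split with the ids returned by
# get_train_test_dp_ids (paths and metadata are assumed to be resolvable via get_ds_info).
train_ds = TabularDataset('homecreditdefaultrisk_main_table', datapoint_ids=train_ids, encoders=None)
print(train_ds.raw_data.shape, train_ds.targets.shape)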
Example #3
class TabModelBase(nn.Module):  # class name assumed for this excerpt; an nn.Module subclass is implied by super().__init__() and nn.ModuleDict below
    def __init__(self, writer, dataset_name, n_cont_features, cat_feat_origin_cards, max_emb_dim,
                 activation_class_name, activation_class_kwargs, norm_class_name, norm_class_kwargs, p_dropout,
                 one_hot_embeddings, drop_whole_embeddings, loss_class_name=None, loss_class_kwargs=None,
                 n_out=None):
        super().__init__()
        self.writer = writer
        if dataset_name is not None:
            assert n_out is None
            self.ds_info = get_ds_info(dataset_name)
            task = self.ds_info['processed']['task']
            if task == 'binary classification':
                self.n_out = 2
            elif task == 'multiclass classification':
                raise NotImplementedError  # todo
            elif task == 'regression':
                self.n_out = 1
            self.act_on_output = False
        else:
            assert n_out is not None
            self.n_out = n_out
            self.act_on_output = True
        self.n_cont_features = n_cont_features
        self.cat_feat_origin_cards = cat_feat_origin_cards

        self.p_dropout = p_dropout
        self.drop_whole_embeddings = drop_whole_embeddings
        self.one_hot_embeddings = one_hot_embeddings
        self.act_class = activations.__dict__[activation_class_name]
        self.act_class_kwargs = activation_class_kwargs
        self.norm_class = nn.__dict__[norm_class_name]
        self.norm_class_kwargs = norm_class_kwargs
        self.loss_class_name = loss_class_name
        self.loss_class_kwargs = loss_class_kwargs
        self.cat_initializers = nn.ModuleDict()
        if isinstance(self.cat_feat_origin_cards, list):
            for col_name, card in self.cat_feat_origin_cards:
                self.cat_initializers[col_name] = EmbeddingInitializer(card, max_emb_dim, p_dropout,
                                                                       drop_whole_embeddings=drop_whole_embeddings,
                                                                       one_hot=one_hot_embeddings)
            self.init_feat_dim = sum(i.emb_dim for i in self.cat_initializers.values()) + self.n_cont_features
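

# EmbeddingInitializer is project-specific and not shown on this page. Below is only a
# minimal, hypothetical sketch of a module with a compatible interface (the constructor
# arguments and the emb_dim attribute used above); the real implementation may differ.
class SketchEmbeddingInitializer(nn.Module):
    def __init__(self, num_categories, max_emb_dim, p_dropout, drop_whole_embeddings=False, one_hot=False):
        super().__init__()
        self.num_categories = num_categories
        self.one_hot = one_hot
        self.drop_whole_embeddings = drop_whole_embeddings
        # Assumed heuristic: one-hot output uses the full cardinality, otherwise cap the width at max_emb_dim.
        self.emb_dim = num_categories if one_hot else min(max_emb_dim, (num_categories + 1) // 2)
        self.embed = nn.Embedding(num_categories, self.emb_dim)
        self.dropout = nn.Dropout(p_dropout)

    def forward(self, idx):
        # idx: LongTensor of category indices, shape (batch,)
        if self.one_hot:
            out = nn.functional.one_hot(idx, num_classes=self.num_categories).float()
        else:
            out = self.embed(idx)
        return self.dropout(out)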
def get_kwargs(ds_name):
    ds_info = get_ds_info(ds_name)
    n_datapoints = ds_info['meta']['n_datapoints']
    n_columns = len(ds_info['meta']['columns'])

    weight_decay = 0.01

    ######################
    # Basic kwargs
    epochs = 500
    max_batch = 1024
    batch_size = min(n_datapoints // 30, max_batch)
    if batch_size == max_batch:
        # Scale the batch size down for wide tables (the original expression discarded its result).
        batch_size = int(batch_size // max(1, math.log10(n_columns) - 1))
    kwargs = dict(seed=1234,
                  debug_network=False,
                  encoders=dict(CATEGORICAL='CategoricalOrdinalEnc',
                                SCALAR='ScalarRobustScalerEnc',
                                DATETIME='DatetimeScalarEnc',
                                LATLONG='LatLongScalarEnc',
                                TEXT='TextSummaryScalarEnc'),
                  early_stopping_patience=epochs,
                  early_stopping_metric='loss',
                  max_nodes_per_graph=False,
                  train_fraction_to_use=1.0,
                  dataset_name=ds_name,
                  device='cuda',
                  find_lr=False,
                  epochs=epochs,
                  batch_size=batch_size,
                  num_workers=8)
    # LR Schedule
    kwargs.update(
        lr_scheduler_class_name='StepLR',
        lr_scheduler_kwargs=dict(step_size=1, gamma=1.0),
    )
    # Optimizer
    kwargs.update(
        optimizer_class_name='AdamW',
        optimizer_kwargs=dict(
            lr=5e-4,
            weight_decay=weight_decay,
        ),
        wd_bias=False,
        wd_embed=False,
        wd_bn=False,
    )
    # Sampler
    sampler_class_name = 'RandomSampler'
    sampler_class_kwargs = {}
    kwargs.update(sampler_class_name=sampler_class_name,
                  sampler_class_kwargs=sampler_class_kwargs)
    # Model (model_class_name is expected to be defined at module scope in the original file)
    kwargs.update(model_class_name=model_class_name, model_kwargs=dict())
    kwargs['model_kwargs'].update(max_emb_dim=32,
                                  p_dropout=0.0,
                                  one_hot_embeddings=True,
                                  drop_whole_embeddings=False,
                                  activation_class_name='SELU',
                                  activation_class_kwargs={},
                                  norm_class_name='BatchNorm1d',
                                  norm_class_kwargs={},
                                  loss_class_name='CrossEntropyLoss',
                                  loss_class_kwargs={})

    return kwargs
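
# Usage sketch: get_kwargs returns a plain dict, so individual hyperparameters can be
# overridden before it is handed to the training code (the dataset name is illustrative,
# and model_class_name must already be defined at module scope as noted above).
kwargs = get_kwargs('homecreditdefaultrisk_main_table')
kwargs['optimizer_kwargs']['lr'] = 1e-3       # e.g. try a larger learning rate
kwargs['model_kwargs']['p_dropout'] = 0.1     # and a bit of dropout
print(kwargs['batch_size'], kwargs['epochs'])
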
                        # (excerpt from a larger script: the enclosing conditional and loop, and the
                        # acts / model_name / db_name / new_ds_file variables, are defined above this fragment)
                        print('Already made new dataset. Moving on.')
                    else:
                        print(
                            f'Appending acts to original dataset and saving to {new_ds_file}'
                        )
                        orig_dataset_name = f'{db_name}_main_table'
                        train_dp_ids, test_dp_ids = get_train_test_dp_ids(
                            orig_dataset_name)
                        dp_ids = np.concatenate([
                            train_dp_ids, test_dp_ids
                        ]) if test_dp_ids is not None else train_dp_ids
                        orig_dataset = TabularDataset(orig_dataset_name,
                                                      dp_ids,
                                                      encoders=None)
                        orig_data = orig_dataset.raw_data
                        orig_dataset_ds_info = get_ds_info(orig_dataset_name)
                        orig_dataset_ds_info['processed']['local_path'] = new_ds_file

                        acts = acts.set_index(orig_data.index)
                        act_cols = [
                            f'{model_name}_act{i}' for i in acts.columns
                        ]
                        acts = acts.rename(columns={
                            i: f'{model_name}_act{i}'
                            for i in acts.columns
                        })
                        targets = orig_dataset.targets.numpy()
                        targets = pd.DataFrame({
                            'TARGET':
                            [i if i in [0, 1] else np.nan for i in targets]