def get_loaders(self, db_name, encoders, batch_size, num_workers):
    db_info = get_db_info(db_name)
    max_nodes_per_graph = None
    _ = get_db_container(db_name)
    train_data, val_data, test_data = get_train_val_test_datasets(
        dataset_name=db_name,
        train_test_split='use_full_train',
        encoders=encoders)
    train_loader = get_dataloader(dataset=train_data,
                                  batch_size=batch_size,
                                  sampler_class_name='SequentialSampler',
                                  num_workers=num_workers,
                                  max_nodes_per_graph=max_nodes_per_graph)
    val_loader = get_dataloader(dataset=val_data,
                                batch_size=batch_size,
                                sampler_class_name='SequentialSampler',
                                num_workers=num_workers,
                                max_nodes_per_graph=max_nodes_per_graph)
    test_loader = get_dataloader(dataset=test_data,
                                 batch_size=batch_size,
                                 sampler_class_name='SequentialSampler',
                                 num_workers=num_workers,
                                 max_nodes_per_graph=max_nodes_per_graph)
    loaders = {
        'train': train_loader,
        'val': val_loader,
        'test': test_loader
    }
    return db_info, loaders
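
# Hedged usage sketch (not from the original source): `experiment` stands in for whatever object
# defines get_loaders above; the db name and encoder names mirror those used elsewhere in this file,
# and `show_one_batch_per_split` is an illustrative name.
def show_one_batch_per_split(experiment):
    db_info, loaders = experiment.get_loaders(db_name='homecreditdefaultrisk',
                                              encoders=dict(CATEGORICAL='CategoricalOrdinalEnc',
                                                            SCALAR='ScalarRobustScalerEnc',
                                                            DATETIME='DatetimeScalarEnc',
                                                            LATLONG='LatLongScalarEnc',
                                                            TEXT='TextSummaryScalarEnc'),
                                              batch_size=256,
                                              num_workers=0)
    for split, loader in loaders.items():
        bdgl, features, label = next(iter(loader))  # batched DGL graph, feature dict, labels
        print(split, len(label))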
def create_datapoint_from_database(db_name, base_query, target_dir, dp_id):
    db_info = get_db_info(db_name)

    # Get graph from database
    driver = get_neo4j_db_driver(db_name)
    with driver.session() as session:
        query = base_query.format(dp_id)
        result = session.run(query)
        g = result.graph()

    # Construct DGLGraph for each neo4j graph, and batch them
    # Also collect the features and labels from each graph
    label_node_type, label_feature_name = db_info['label_feature'].split('.')
    features = {}
    for node_type in db_info['node_types_and_features'].keys():
        features[node_type] = {}
        for feature_name in db_info['node_types_and_features'][node_type].keys():
            # Making sure not to include the label value among the training features
            if not (node_type == label_node_type
                    and feature_name == label_feature_name):
                features[node_type][feature_name] = []

    neo4j_id_to_graph_idx = {node.id: idx for idx, node in enumerate(g.nodes)}
    node_types = [None] * len(g.nodes)
    for node in g.nodes:
        node_type = tuple(node.labels)[0]
        node_idx = neo4j_id_to_graph_idx[node.id]
        node_types[node_idx] = db_info['node_type_to_int'][node_type]
        for feature_name, feature_values in features[node_type].items():
            # Dealing with latlongs
            if db_info['node_types_and_features'][node_type][feature_name][
                    'type'] == 'LATLONG':
                lat_name, lon_name = feature_name.split('+++')
                value = (node.get(lat_name), node.get(lon_name))
            else:
                value = node.get(feature_name)
            # neotime doesn't pickle well
            if isinstance(value, (neotime.Date, neotime.DateTime)):
                value = value.to_native()
            feature_values.append(value)
        if node_type == label_node_type:
            label = node.get(label_feature_name)

    edge_list = []
    edge_types = []
    for rel in g.relationships:
        start_node_idx = neo4j_id_to_graph_idx[rel.start_node.id]
        end_node_idx = neo4j_id_to_graph_idx[rel.end_node.id]
        edge_list.append((start_node_idx, end_node_idx))
        edge_types.append(db_info['edge_type_to_int'][rel.type])

    with open(os.path.join(target_dir, str(dp_id)), 'wb') as f:
        dp_tuple = (edge_list, node_types, edge_types, features, label)
        pickle.dump(dp_tuple, f)
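
# Hedged sketch (not in the original): reading one pickled datapoint back, mirroring the tuple
# order written by create_datapoint_from_database above. `load_datapoint` is an illustrative name.
import os
import pickle

def load_datapoint(target_dir, dp_id):
    with open(os.path.join(target_dir, str(dp_id)), 'rb') as f:
        edge_list, node_types, edge_types, features, label = pickle.load(f)
    return edge_list, node_types, edge_types, features, label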
    def test_memorize_minibatch(self):
        for db_name in self.db_names:
            db_info = get_db_info(db_name)
            train_data, val_data, _ = get_train_val_test_datasets(
                dataset_name=db_name,
                train_test_split='use_full_train',
                encoders=dict(CATEGORICAL='CategoricalOrdinalEnc',
                              SCALAR='ScalarRobustScalerEnc',
                              DATETIME='DatetimeScalarEnc',
                              LATLONG='LatLongScalarEnc',
                              TEXT='TextSummaryScalarEnc'),
            )
            train_loader = get_dataloader(
                dataset=train_data,
                batch_size=256,
                sampler_class_name='SequentialSampler',
                num_workers=0,
                max_nodes_per_graph=False)

            writer = DummyWriter()
            model = GCN(writer,
                        db_info=db_info,
                        hidden_dim=256,
                        n_init_layers=3,
                        activation_class_name='SELU',
                        activation_class_kwargs={},
                        loss_class_kwargs={},
                        loss_class_name='CrossEntropyLoss',
                        p_dropout=0.0,
                        drop_whole_embeddings=True,
                        n_layers=3,
                        readout_class_name='AvgPooling',
                        readout_kwargs={})
            if torch.cuda.is_available():
                model.cuda()
                model.device = torch.device('cuda:0')
            else:
                model.device = torch.device('cpu')
            model.train()
            optimizer = AdamW(model.parameters(), lr=0.001, weight_decay=0.0)

            bdgl, features, label = next(iter(train_loader))
            recursive_to((bdgl, features, label), model.device)
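            # recursive_to is assumed to move the batched graph, feature dict, and label onto model.device in place
            # for/else below: the else branch runs only if the loop never breaks, i.e. the loss never dropped below 1e-4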
            for _ in tqdm(range(200)):
                optimizer.zero_grad()
                output = model(bdgl, features)
                loss = model.loss_fxn(output, label)
                if loss < 1e-4:
                    break
                loss.backward()
                optimizer.step()
            else:
                tqdm.write(f'Loss: {loss}')
                self.fail("Didn't memorize minibatch")
Example #4
    def __init__(self, db_name=None, datapoint_ids=None, encoders=None):
        self.db_name = db_name
        self.datapoint_ids = datapoint_ids
        self.data_dir = os.path.join(data_root, self.db_name, 'preprocessed_datapoints')
        os.makedirs(self.data_dir, exist_ok=True)

        self.db_info = get_db_info(self.db_name)

        # Sanity check: every preprocessed datapoint should already be present on disk
        expected_n_dps = self.db_info['task']['n_train'] + self.db_info['task']['n_test']
        all_dps_present = len(os.listdir(self.data_dir)) == expected_n_dps
        assert all_dps_present, f'expected {expected_n_dps} datapoints in {self.data_dir}'

        self.feature_encoders = {}
        for node_type, features in self.db_info['node_types_and_features'].items():
            self.feature_encoders[node_type] = dict()
            for feature_name, feature_info in features.items():
                if feature_info['type'] == 'CATEGORICAL':
                    enc = CategoricalOrdinalEnc(feature_info['sorted_values'])
                elif feature_info['type'] == 'SCALAR':
                    s_enc = encoders['SCALAR']
                    if s_enc == 'ScalarRobustScalerEnc':
                        enc = ScalarRobustScalerEnc(feature_info['RobustScaler_center_'],
                                                    feature_info['RobustScaler_scale_'])
                    elif s_enc == 'ScalarPowerTransformerEnc':
                        enc = ScalarPowerTransformerEnc(feature_info['PowerTransformer_lambdas_'],
                                                        feature_info['PowerTransformer_scale_'],
                                                        feature_info['PowerTransformer_mean_'],
                                                        feature_info['PowerTransformer_var_'],
                                                        feature_info['PowerTransformer_n_samples_seen_'])
                    elif s_enc == 'ScalarQuantileTransformerEnc':
                        enc = ScalarQuantileTransformerEnc(feature_info['QuantileTransformer_n_quantiles_'],
                                                           feature_info['QuantileTransformer_quantiles_'],
                                                           feature_info['QuantileTransformer_references_'])
                    elif s_enc == 'ScalarQuantileOrdinalEnc':
                        enc = ScalarQuantileOrdinalEnc(feature_info['KBinsDiscretizer_n_bins_'],
                                                       feature_info['KBinsDiscretizer_bin_edges_'])
                    else:
                        raise ValueError(f'scalar encoder {s_enc} not recognized')
                elif feature_info['type'] == 'DATETIME':
                    enc = DatetimeScalarEnc()
                elif feature_info['type'] == 'LATLONG':
                    enc = LatLongScalarEnc()
                elif feature_info['type'] == 'TEXT':
                    t_enc = encoders['TEXT']
                    if t_enc == 'TfidfEnc':
                        enc = TfidfEnc(feature_info['Tfidf_vocabulary_'],
                                       feature_info['Tfidf_idf_'])
                    elif t_enc == 'TextSummaryScalarEnc':
                        enc = TextSummaryScalarEnc(feature_info['RobustScaler_center_'],
                                                   feature_info['RobustScaler_scale_'])
                    else:
                        raise ValueError(f'text encoder {t_enc} not recognized')
                self.feature_encoders[node_type][feature_name] = enc
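
# Hedged, illustrative helper (not from the original): inspecting the encoders built in __init__
# above. enc.cat_cards and enc.cont_dim are assumed attributes; GNNModelBase below reads them the
# same way when sizing its per-node-type initializers.
def summarize_feature_encoders(dataset):
    for node_type, encs in dataset.feature_encoders.items():
        for feature_name, enc in encs.items():
            print(node_type, feature_name, type(enc).__name__, enc.cat_cards, enc.cont_dim)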
def get_train_test_dp_ids(dataset_name):
    db_name = None
    if 'acquirevaluedshopperschallenge' in dataset_name:
        db_name = 'acquirevaluedshopperschallenge'
    elif 'homecreditdefaultrisk' in dataset_name:
        db_name = 'homecreditdefaultrisk'
    elif 'kddcup2014' in dataset_name:
        db_name = 'kddcup2014'
    if db_name is not None:
        db_info = get_db_info(db_name)
        train_dp_ids = db_info['train_dp_ids']
        test_dp_ids = db_info['test_dp_ids']
    else:
        ds_info = get_ds_info(dataset_name)
        n_datapoints = ds_info['meta']['n_datapoints']
        train_dp_ids = np.arange(n_datapoints)
        test_dp_ids = None

    return train_dp_ids, test_dp_ids
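
# Hedged example call (not in the original): for the three relational databases the ids come from
# db_info; for any other dataset name, train ids are an arange over n_datapoints and test ids are None.
train_dp_ids, test_dp_ids = get_train_test_dp_ids('homecreditdefaultrisk')
print(len(train_dp_ids), None if test_dp_ids is None else len(test_dp_ids))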
Example #6
def setUp(self):
    self.db_info = get_db_info(self.db_name)
    batch_size = 1
    num_workers = 0
    max_nodes_per_graph = 100000
    _ = get_db_container(self.db_name)
    train_data, val_data, test_data = get_train_val_test_datasets(
        dataset_name=self.db_name,
        train_test_split='use_full_train',
        encoders=dict(CATEGORICAL='CategoricalOrdinalEnc',
                      SCALAR='ScalarRobustScalerEnc',
                      DATETIME='DatetimeScalarEnc',
                      LATLONG='LatLongScalarEnc',
                      TEXT='TextSummaryScalarEnc'),
    )
    train_loader = get_dataloader(
        dataset=train_data,
        batch_size=batch_size,
        sampler_class_name='SequentialSampler',
        num_workers=num_workers,
        max_nodes_per_graph=max_nodes_per_graph)
    val_loader = get_dataloader(
        dataset=val_data,
        batch_size=batch_size,
        sampler_class_name='SequentialSampler',
        num_workers=num_workers,
        max_nodes_per_graph=max_nodes_per_graph)
    test_loader = get_dataloader(
        dataset=test_data,
        batch_size=batch_size,
        sampler_class_name='SequentialSampler',
        num_workers=num_workers,
        max_nodes_per_graph=max_nodes_per_graph)
    self.loaders = {
        'train': train_loader,
        'val': val_loader,
        'test': test_loader
    }
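
# Hedged sketch of a test that could follow this setUp (not in the original excerpt): it only
# checks that each loader yields a (batched graph, features, label) triple with batch size 1.
def test_loaders_yield_one_batch(self):
    for split, loader in self.loaders.items():
        bdgl, features, label = next(iter(loader))
        self.assertEqual(len(label), 1)  # batch_size is 1 in setUp above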
Example #7
import os

import numpy as np
import pandas as pd

from __init__ import data_root
from data.utils import get_db_info, set_entity_variable_types, compute_and_save_dfs_features

db_name = 'homecreditdefaultrisk'
dp_limit = None
max_depth = 2
n_jobs = 1
chunk_size = 1000

if __name__ == '__main__':
    print('Loading data')
    data_dir = os.path.join(data_root, 'raw_data', db_name)
    db_info = get_db_info(db_name)

    app_train = pd.read_csv(os.path.join(data_dir, 'application_train.csv'), nrows=dp_limit, index_col=False)
    app_test = pd.read_csv(os.path.join(data_dir, 'application_test.csv'), nrows=dp_limit, index_col=False)
    bureau = pd.read_csv(os.path.join(data_dir, 'bureau.csv'), nrows=dp_limit, index_col=False)
    bureau_balance = pd.read_csv(os.path.join(data_dir, 'bureau_balance.csv'), nrows=dp_limit, index_col=False)
    cash = pd.read_csv(os.path.join(data_dir, 'POS_CASH_balance.csv'), nrows=dp_limit, index_col=False)
    credit = pd.read_csv(os.path.join(data_dir, 'credit_card_balance.csv'), nrows=dp_limit, index_col=False)
    previous = pd.read_csv(os.path.join(data_dir, 'previous_application.csv'), nrows=dp_limit, index_col=False)
    installments = pd.read_csv(os.path.join(data_dir, 'installments_payments.csv'), nrows=dp_limit, index_col=False)

    app_train['TARGET'] = app_train['TARGET'].astype(np.float64)
    app_test["TARGET"] = np.nan
    app = app_train.append(app_test, ignore_index=True)

    # Make entity set
Example #8
    def __init__(self, writer, dataset_name, feature_encoders, hidden_dim, init_model_class_name, init_model_kwargs,
                 n_layers, activation_class_name, activation_class_kwargs, norm_class_name, norm_class_kwargs,
                 loss_class_kwargs, loss_class_name, p_dropout, readout_class_name, readout_kwargs, fcout_layer_sizes):
        super(GNNModelBase, self).__init__()
        self.writer = writer
        self.db_info = get_db_info(dataset_name)
        self.n_out = self.db_info['task']['n_classes']
        self.feature_encoders = feature_encoders
        self.init_model_class = tab_models.__dict__[init_model_class_name]
        self.init_model_kwargs = init_model_kwargs
        self.hidden_dim = hidden_dim
        self.p_dropout = p_dropout
        self.n_layers = n_layers
        if loss_class_kwargs.get('weight', None):
            loss_class_kwargs['weight'] = torch.Tensor(loss_class_kwargs['weight'])
        self.act_class = activations.__dict__[activation_class_name]
        self.act_class_kwargs = activation_class_kwargs
        self.norm_class = nn.__dict__[norm_class_name]
        self.norm_class_kwargs = norm_class_kwargs
        self.loss_fxn = losses.__dict__[loss_class_name](self, **loss_class_kwargs)

        # Create self.initializers for use in self.init_batch
        self.node_initializers = nn.ModuleDict()
        self.node_init_info = {}
        for node_type, features in self.db_info['node_types_and_features'].items():
            cat_feat_origin_cards = []
            cont_feat_origin = []
            for feature_name, feature_info in features.items():
                if '{}.{}'.format(node_type, feature_name) != self.db_info['label_feature']:
                    enc = self.feature_encoders[node_type][feature_name]
                    cat_feat_origin_cards += [(f'{feature_name}_{i}', card) for i, card in enumerate(enc.cat_cards)]
                    cont_feat_origin += [feature_name] * enc.cont_dim
            self.node_init_info[node_type] = {
                'cat_feat_origin_cards': cat_feat_origin_cards,
                'cont_feat_origin': cont_feat_origin,
            }
            self.node_initializers[node_type] = self.init_model_class(writer=writer,
                                                                      dataset_name=None,
                                                                      n_cont_features=len(cont_feat_origin),
                                                                      cat_feat_origin_cards=cat_feat_origin_cards,
                                                                      n_out=hidden_dim,
                                                                      **self.init_model_kwargs)

        # Create readout function
        self.readout = readouts.__dict__[readout_class_name](hidden_dim=hidden_dim, **readout_kwargs)

        # Create MLP "fcout" to produce output of model from output of readout
        if all(isinstance(s, float) for s in fcout_layer_sizes):
            fcout_layer_sizes = [int(self.hidden_dim * s) for s in fcout_layer_sizes]
        assert all(isinstance(s, int) for s in fcout_layer_sizes)
        self.layer_sizes = fcout_layer_sizes
        fcout_layers = []
        prev_layer_size = self.hidden_dim
        for layer_size in self.layer_sizes:
            fcout_layers.append(nn.Linear(prev_layer_size, layer_size))
            fcout_layers.append(self.get_act())
            fcout_layers.append(self.get_norm(layer_size))
            fcout_layers.append(nn.Dropout(self.p_dropout))
            prev_layer_size = layer_size
        fcout_layers.append(nn.Linear(prev_layer_size, self.n_out))
        self.fcout = nn.Sequential(*fcout_layers)
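
    # Hedged sketch of the helper methods referenced above (get_act, get_norm) but not shown in this
    # excerpt; they are assumed to simply instantiate the configured activation and norm classes.
    def get_act(self):
        return self.act_class(**self.act_class_kwargs)

    def get_norm(self, num_feats):
        return self.norm_class(num_feats, **self.norm_class_kwargs)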