def get_loaders(self, db_name, encoders, batch_size, num_workers):
    db_info = get_db_info(db_name)
    max_nodes_per_graph = None
    # Called for its side effect of making sure the database's container is available
    _ = get_db_container(db_name)
    train_data, val_data, test_data = get_train_val_test_datasets(dataset_name=db_name,
                                                                  train_test_split='use_full_train',
                                                                  encoders=encoders)
    train_loader = get_dataloader(dataset=train_data,
                                  batch_size=batch_size,
                                  sampler_class_name='SequentialSampler',
                                  num_workers=num_workers,
                                  max_nodes_per_graph=max_nodes_per_graph)
    val_loader = get_dataloader(dataset=val_data,
                                batch_size=batch_size,
                                sampler_class_name='SequentialSampler',
                                num_workers=num_workers,
                                max_nodes_per_graph=max_nodes_per_graph)
    test_loader = get_dataloader(dataset=test_data,
                                 batch_size=batch_size,
                                 sampler_class_name='SequentialSampler',
                                 num_workers=num_workers,
                                 max_nodes_per_graph=max_nodes_per_graph)
    loaders = {'train': train_loader,
               'val': val_loader,
               'test': test_loader}
    return db_info, loaders
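# Example usage (illustrative sketch, not part of the original file): assumes get_loaders is
# defined on a test or experiment class, and that the db name and encoder choices below are
# reasonable defaults for one of the Kaggle databases.
def _example_iterate_loaders(self):
    encoders = dict(CATEGORICAL='CategoricalOrdinalEnc',
                    SCALAR='ScalarRobustScalerEnc',
                    DATETIME='DatetimeScalarEnc',
                    LATLONG='LatLongScalarEnc',
                    TEXT='TextSummaryScalarEnc')
    db_info, loaders = self.get_loaders(db_name='homecreditdefaultrisk',
                                        encoders=encoders,
                                        batch_size=256,
                                        num_workers=0)
    # Each loader yields (batched DGL graph, features, label) tuples
    bdgl, features, label = next(iter(loaders['train']))
    return db_info, bdgl, features, label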
def create_datapoint_from_database(db_name, base_query, target_dir, dp_id):
    db_info = get_db_info(db_name)

    # Get graph from database
    driver = get_neo4j_db_driver(db_name)
    with driver.session() as session:
        query = base_query.format(dp_id)
        result = session.run(query)
        g = result.graph()

    # Collect node types, edge types, features, and the label from the neo4j graph
    label_node_type, label_feature_name = db_info['label_feature'].split('.')
    features = {}
    for node_type in db_info['node_types_and_features'].keys():
        features[node_type] = {}
        for feature_name in db_info['node_types_and_features'][node_type].keys():
            # Making sure not to include the label value among the training features
            if not (node_type == label_node_type and feature_name == label_feature_name):
                features[node_type][feature_name] = []

    neo4j_id_to_graph_idx = {node.id: idx for idx, node in enumerate(g.nodes)}
    node_types = [None] * len(g.nodes)
    for node in g.nodes:
        node_type = tuple(node.labels)[0]
        node_idx = neo4j_id_to_graph_idx[node.id]
        node_types[node_idx] = db_info['node_type_to_int'][node_type]
        for feature_name, feature_values in features[node_type].items():
            # Dealing with latlongs
            if db_info['node_types_and_features'][node_type][feature_name]['type'] == 'LATLONG':
                lat_name, lon_name = feature_name.split('+++')
                value = (node.get(lat_name), node.get(lon_name))
            else:
                value = node.get(feature_name)
            # neotime doesn't pickle well
            if isinstance(value, (neotime.Date, neotime.DateTime)):
                value = value.to_native()
            feature_values.append(value)
        if node_type == label_node_type:
            label = node.get(label_feature_name)

    edge_list = []
    edge_types = []
    for rel in g.relationships:
        start_node_idx = neo4j_id_to_graph_idx[rel.start_node.id]
        end_node_idx = neo4j_id_to_graph_idx[rel.end_node.id]
        edge_list.append((start_node_idx, end_node_idx))
        edge_types.append(db_info['edge_type_to_int'][rel.type])

    with open(os.path.join(target_dir, str(dp_id)), 'wb') as f:
        dp_tuple = (edge_list, node_types, edge_types, features, label)
        pickle.dump(dp_tuple, f)
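# Example usage (illustrative sketch, not part of the original file): pickle a handful of
# datapoints. The Cypher query and node label below are placeholders, not the project's
# actual per-datapoint query; the target directory mirrors the layout used elsewhere.
def _example_create_datapoints():
    db_name = 'homecreditdefaultrisk'
    base_query = 'MATCH g = (a:Application {{SK_ID_CURR: {}}})-[*0..2]-() RETURN g'
    target_dir = os.path.join(data_root, db_name, 'preprocessed_datapoints')
    os.makedirs(target_dir, exist_ok=True)
    for dp_id in get_db_info(db_name)['train_dp_ids'][:10]:
        create_datapoint_from_database(db_name, base_query, target_dir, dp_id)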
def test_memorize_minibatch(self):
    for db_name in self.db_names:
        db_info = get_db_info(db_name)
        train_data, val_data, _ = get_train_val_test_datasets(dataset_name=db_name,
                                                              train_test_split='use_full_train',
                                                              encoders=dict(CATEGORICAL='CategoricalOrdinalEnc',
                                                                            SCALAR='ScalarRobustScalerEnc',
                                                                            DATETIME='DatetimeScalarEnc',
                                                                            LATLONG='LatLongScalarEnc',
                                                                            TEXT='TextSummaryScalarEnc'))
        train_loader = get_dataloader(dataset=train_data,
                                      batch_size=256,
                                      sampler_class_name='SequentialSampler',
                                      num_workers=0,
                                      max_nodes_per_graph=False)

        writer = DummyWriter()
        model = GCN(writer,
                    db_info=db_info,
                    hidden_dim=256,
                    n_init_layers=3,
                    activation_class_name='SELU',
                    activation_class_kwargs={},
                    loss_class_kwargs={},
                    loss_class_name='CrossEntropyLoss',
                    p_dropout=0.0,
                    drop_whole_embeddings=True,
                    n_layers=3,
                    readout_class_name='AvgPooling',
                    readout_kwargs={})
        if torch.cuda.is_available():
            model.cuda()
            model.device = torch.device('cuda:0')
        else:
            model.device = torch.device('cpu')
        model.train()
        optimizer = AdamW(model.parameters(), lr=0.001, weight_decay=0.0)

        # Sanity check: the model should be able to drive the loss on a single minibatch to
        # near zero. The for/else fails the test only if the loop never breaks.
        bdgl, features, label = next(iter(train_loader))
        recursive_to((bdgl, features, label), model.device)
        for _ in tqdm(range(200)):
            optimizer.zero_grad()
            output = model(bdgl, features)
            loss = model.loss_fxn(output, label)
            if loss < 1e-4:
                break
            loss.backward()
            optimizer.step()
        else:
            tqdm.write(f'Loss: {loss}')
            self.fail("Didn't memorize minibatch")
def __init__(self, db_name=None, datapoint_ids=None, encoders=None):
    self.db_name = db_name
    self.datapoint_ids = datapoint_ids
    self.data_dir = os.path.join(data_root, self.db_name, 'preprocessed_datapoints')
    os.makedirs(self.data_dir, exist_ok=True)
    self.db_info = get_db_info(self.db_name)

    # Make sure all preprocessed datapoints are already on disk; this class does not build them
    n_expected_dps = self.db_info['task']['n_train'] + self.db_info['task']['n_test']
    all_dps_present = len(os.listdir(self.data_dir)) == n_expected_dps
    assert all_dps_present, f'Expected {n_expected_dps} datapoints in {self.data_dir}'

    # Instantiate one encoder per feature, using the statistics stored in db_info
    self.feature_encoders = {}
    for node_type, features in self.db_info['node_types_and_features'].items():
        self.feature_encoders[node_type] = dict()
        for feature_name, feature_info in features.items():
            if feature_info['type'] == 'CATEGORICAL':
                enc = CategoricalOrdinalEnc(feature_info['sorted_values'])
            elif feature_info['type'] == 'SCALAR':
                s_enc = encoders['SCALAR']
                if s_enc == 'ScalarRobustScalerEnc':
                    enc = ScalarRobustScalerEnc(feature_info['RobustScaler_center_'],
                                                feature_info['RobustScaler_scale_'])
                elif s_enc == 'ScalarPowerTransformerEnc':
                    enc = ScalarPowerTransformerEnc(feature_info['PowerTransformer_lambdas_'],
                                                    feature_info['PowerTransformer_scale_'],
                                                    feature_info['PowerTransformer_mean_'],
                                                    feature_info['PowerTransformer_var_'],
                                                    feature_info['PowerTransformer_n_samples_seen_'])
                elif s_enc == 'ScalarQuantileTransformerEnc':
                    enc = ScalarQuantileTransformerEnc(feature_info['QuantileTransformer_n_quantiles_'],
                                                       feature_info['QuantileTransformer_quantiles_'],
                                                       feature_info['QuantileTransformer_references_'])
                elif s_enc == 'ScalarQuantileOrdinalEnc':
                    enc = ScalarQuantileOrdinalEnc(feature_info['KBinsDiscretizer_n_bins_'],
                                                   feature_info['KBinsDiscretizer_bin_edges_'])
                else:
                    raise ValueError(f'scalar encoder {s_enc} not recognized')
            elif feature_info['type'] == 'DATETIME':
                enc = DatetimeScalarEnc()
            elif feature_info['type'] == 'LATLONG':
                enc = LatLongScalarEnc()
            elif feature_info['type'] == 'TEXT':
                t_enc = encoders['TEXT']
                if t_enc == 'TfidfEnc':
                    enc = TfidfEnc(feature_info['Tfidf_vocabulary_'],
                                   feature_info['Tfidf_idf_'])
                elif t_enc == 'TextSummaryScalarEnc':
                    enc = TextSummaryScalarEnc(feature_info['RobustScaler_center_'],
                                               feature_info['RobustScaler_scale_'])
                else:
                    raise ValueError(f'text encoder {t_enc} not recognized')
            self.feature_encoders[node_type][feature_name] = enc
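# Usage note (sketch, not part of the original file): in the tests elsewhere in this repo the
# dataset is built via get_train_val_test_datasets rather than by calling this __init__
# directly; the encoders dict below just selects which SCALAR/TEXT encoder variants get
# instantiated above. The dataset name is an assumption.
def _example_build_datasets():
    encoders = dict(CATEGORICAL='CategoricalOrdinalEnc',
                    SCALAR='ScalarQuantileOrdinalEnc',  # any of the four scalar encoders above
                    DATETIME='DatetimeScalarEnc',
                    LATLONG='LatLongScalarEnc',
                    TEXT='TfidfEnc')
    return get_train_val_test_datasets(dataset_name='kddcup2014',
                                       train_test_split='use_full_train',
                                       encoders=encoders)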
def get_train_test_dp_ids(dataset_name):
    db_name = None
    if 'acquirevaluedshopperschallenge' in dataset_name:
        db_name = 'acquirevaluedshopperschallenge'
    elif 'homecreditdefaultrisk' in dataset_name:
        db_name = 'homecreditdefaultrisk'
    elif 'kddcup2014' in dataset_name:
        db_name = 'kddcup2014'
    if db_name is not None:
        db_info = get_db_info(db_name)
        train_dp_ids = db_info['train_dp_ids']
        test_dp_ids = db_info['test_dp_ids']
    else:
        ds_info = get_ds_info(dataset_name)
        n_datapoints = ds_info['meta']['n_datapoints']
        train_dp_ids = np.arange(n_datapoints)
        test_dp_ids = None
    return train_dp_ids, test_dp_ids
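# Example usage (sketch, not part of the original file): the three Kaggle databases return
# the precomputed id splits from db_info; any other dataset name falls back to train ids
# 0..n_datapoints-1 with test_dp_ids set to None.
def _example_split_ids():
    train_dp_ids, test_dp_ids = get_train_test_dp_ids('acquirevaluedshopperschallenge')
    return len(train_dp_ids), None if test_dp_ids is None else len(test_dp_ids)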
def setUp(self):
    self.db_info = get_db_info(self.db_name)
    batch_size = 1
    num_workers = 0
    max_nodes_per_graph = 100000
    _ = get_db_container(self.db_name)
    train_data, val_data, test_data = get_train_val_test_datasets(dataset_name=self.db_name,
                                                                  train_test_split='use_full_train',
                                                                  encoders=dict(CATEGORICAL='CategoricalOrdinalEnc',
                                                                                SCALAR='ScalarRobustScalerEnc',
                                                                                DATETIME='DatetimeScalarEnc',
                                                                                LATLONG='LatLongScalarEnc',
                                                                                TEXT='TextSummaryScalarEnc'))
    train_loader = get_dataloader(dataset=train_data,
                                  batch_size=batch_size,
                                  sampler_class_name='SequentialSampler',
                                  num_workers=num_workers,
                                  max_nodes_per_graph=max_nodes_per_graph)
    val_loader = get_dataloader(dataset=val_data,
                                batch_size=batch_size,
                                sampler_class_name='SequentialSampler',
                                num_workers=num_workers,
                                max_nodes_per_graph=max_nodes_per_graph)
    test_loader = get_dataloader(dataset=test_data,
                                 batch_size=batch_size,
                                 sampler_class_name='SequentialSampler',
                                 num_workers=num_workers,
                                 max_nodes_per_graph=max_nodes_per_graph)
    self.loaders = {'train': train_loader,
                    'val': val_loader,
                    'test': test_loader}
import os

import numpy as np
import pandas as pd

from __init__ import data_root
from data.utils import get_db_info, set_entity_variable_types, compute_and_save_dfs_features

db_name = 'homecreditdefaultrisk'
dp_limit = None
max_depth = 2
n_jobs = 1
chunk_size = 1000

if __name__ == '__main__':
    print('Loading data')
    data_dir = os.path.join(data_root, 'raw_data', db_name)
    db_info = get_db_info(db_name)
    app_train = pd.read_csv(os.path.join(data_dir, 'application_train.csv'), nrows=dp_limit, index_col=False)
    app_test = pd.read_csv(os.path.join(data_dir, 'application_test.csv'), nrows=dp_limit, index_col=False)
    bureau = pd.read_csv(os.path.join(data_dir, 'bureau.csv'), nrows=dp_limit, index_col=False)
    bureau_balance = pd.read_csv(os.path.join(data_dir, 'bureau_balance.csv'), nrows=dp_limit, index_col=False)
    cash = pd.read_csv(os.path.join(data_dir, 'POS_CASH_balance.csv'), nrows=dp_limit, index_col=False)
    credit = pd.read_csv(os.path.join(data_dir, 'credit_card_balance.csv'), nrows=dp_limit, index_col=False)
    previous = pd.read_csv(os.path.join(data_dir, 'previous_application.csv'), nrows=dp_limit, index_col=False)
    installments = pd.read_csv(os.path.join(data_dir, 'installments_payments.csv'), nrows=dp_limit, index_col=False)

    # Combine train and test applications; test rows get NaN targets
    app_train['TARGET'] = app_train['TARGET'].astype(np.float64)
    app_test['TARGET'] = np.nan
    app = pd.concat([app_train, app_test], ignore_index=True)

    # Make entity set
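    # The entity-set step presumably builds a featuretools EntitySet from the dataframes above
    # before set_entity_variable_types and compute_and_save_dfs_features (with max_depth, n_jobs,
    # chunk_size) are applied. A minimal sketch, assuming the pre-1.0 featuretools API; entity
    # names and the single relationship shown are illustrative, not the project's actual setup:
    #
    #     import featuretools as ft
    #     es = ft.EntitySet(id=db_name)
    #     es = es.entity_from_dataframe(entity_id='app', dataframe=app, index='SK_ID_CURR')
    #     es = es.entity_from_dataframe(entity_id='bureau', dataframe=bureau, index='SK_ID_BUREAU')
    #     es = es.add_relationship(ft.Relationship(es['app']['SK_ID_CURR'],
    #                                              es['bureau']['SK_ID_CURR']))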
def __init__(self, writer, dataset_name, feature_encoders, hidden_dim, init_model_class_name, init_model_kwargs,
             n_layers, activation_class_name, activation_class_kwargs, norm_class_name, norm_class_kwargs,
             loss_class_kwargs, loss_class_name, p_dropout, readout_class_name, readout_kwargs, fcout_layer_sizes):
    super(GNNModelBase, self).__init__()
    self.writer = writer
    self.db_info = get_db_info(dataset_name)
    self.n_out = self.db_info['task']['n_classes']
    self.feature_encoders = feature_encoders
    self.init_model_class = tab_models.__dict__[init_model_class_name]
    self.init_model_kwargs = init_model_kwargs
    self.hidden_dim = hidden_dim
    self.p_dropout = p_dropout
    self.n_layers = n_layers
    if loss_class_kwargs.get('weight', None):
        loss_class_kwargs['weight'] = torch.Tensor(loss_class_kwargs['weight'])
    self.act_class = activations.__dict__[activation_class_name]
    self.act_class_kwargs = activation_class_kwargs
    self.norm_class = nn.__dict__[norm_class_name]
    self.norm_class_kwargs = norm_class_kwargs
    self.loss_fxn = losses.__dict__[loss_class_name](self, **loss_class_kwargs)

    # Create self.initializers for use in self.init_batch
    self.node_initializers = nn.ModuleDict()
    self.node_init_info = {}
    for node_type, features in self.db_info['node_types_and_features'].items():
        cat_feat_origin_cards = []
        cont_feat_origin = []
        for feature_name, feature_info in features.items():
            if '{}.{}'.format(node_type, feature_name) != self.db_info['label_feature']:
                enc = self.feature_encoders[node_type][feature_name]
                cat_feat_origin_cards += [(f'{feature_name}_{i}', card) for i, card in enumerate(enc.cat_cards)]
                cont_feat_origin += [feature_name] * enc.cont_dim
        self.node_init_info[node_type] = {
            'cat_feat_origin_cards': cat_feat_origin_cards,
            'cont_feat_origin': cont_feat_origin,
        }
        self.node_initializers[node_type] = self.init_model_class(writer=writer,
                                                                  dataset_name=None,
                                                                  n_cont_features=len(cont_feat_origin),
                                                                  cat_feat_origin_cards=cat_feat_origin_cards,
                                                                  n_out=hidden_dim,
                                                                  **self.init_model_kwargs)

    # Create readout function
    self.readout = readouts.__dict__[readout_class_name](hidden_dim=hidden_dim, **readout_kwargs)

    # Create MLP "fcout" to produce output of model from output of readout
    if all(isinstance(s, float) for s in fcout_layer_sizes):
        fcout_layer_sizes = [int(self.hidden_dim * s) for s in fcout_layer_sizes]
    assert all(isinstance(s, int) for s in fcout_layer_sizes)
    self.layer_sizes = fcout_layer_sizes
    fcout_layers = []
    prev_layer_size = self.hidden_dim
    for layer_size in self.layer_sizes:
        fcout_layers.append(nn.Linear(prev_layer_size, layer_size))
        fcout_layers.append(self.get_act())
        fcout_layers.append(self.get_norm(layer_size))
        fcout_layers.append(nn.Dropout(self.p_dropout))
        prev_layer_size = layer_size
    fcout_layers.append(nn.Linear(prev_layer_size, self.n_out))
    self.fcout = nn.Sequential(*fcout_layers)
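# Note on fcout_layer_sizes (a worked example, not from the original file): entries given as
# floats are interpreted as multiples of hidden_dim. For instance, with hidden_dim=256,
# n_classes=2, and fcout_layer_sizes=[1.0, 0.5], the layer sizes become [256, 128] and fcout is
#     Linear(256, 256) -> act -> norm -> Dropout -> Linear(256, 128) -> act -> norm -> Dropout -> Linear(128, 2)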