def get_loaders(self, db_name, encoders, batch_size, num_workers):
    # Build train/val/test dataloaders for db_name, making sure the
    # database container is running first
    db_info = get_db_info(db_name)
    max_nodes_per_graph = None
    _ = get_db_container(db_name)  # start/attach the DB container; return value unused here
    train_data, val_data, test_data = get_train_val_test_datasets(
        dataset_name=db_name,
        train_test_split='use_full_train',
        encoders=encoders)
    train_loader = get_dataloader(dataset=train_data,
                                  batch_size=batch_size,
                                  sampler_class_name='SequentialSampler',
                                  num_workers=num_workers,
                                  max_nodes_per_graph=max_nodes_per_graph)
    val_loader = get_dataloader(dataset=val_data,
                                batch_size=batch_size,
                                sampler_class_name='SequentialSampler',
                                num_workers=num_workers,
                                max_nodes_per_graph=max_nodes_per_graph)
    test_loader = get_dataloader(dataset=test_data,
                                 batch_size=batch_size,
                                 sampler_class_name='SequentialSampler',
                                 num_workers=num_workers,
                                 max_nodes_per_graph=max_nodes_per_graph)
    loaders = {
        'train': train_loader,
        'val': val_loader,
        'test': test_loader
    }
    return db_info, loaders
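# Hypothetical usage sketch for get_loaders above: `runner` stands in for an
# instance of the enclosing class, and the encoders dict mirrors the one used
# in setUp further down.
encoders = dict(CATEGORICAL='CategoricalOrdinalEnc',
                SCALAR='ScalarRobustScalerEnc',
                DATETIME='DatetimeScalarEnc',
                LATLONG='LatLongScalarEnc',
                TEXT='TextSummaryScalarEnc')
db_info, loaders = runner.get_loaders(db_name='kddcup2014',
                                      encoders=encoders,
                                      batch_size=256,
                                      num_workers=8)
for batch in loaders['train']:
    pass  # feed batches to the model here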
def build_database_from_kaggle_files():
    # Unzip the raw Kaggle archives and load them into Neo4j
    # (`data_root` and `db_name` are module-level globals in the source file)
    local_data_dir = os.path.join(data_root, 'raw_data', db_name)
    subprocess.run(
        'unzip \'{dir}/*.zip\' -d {dir} '.format(dir=local_data_dir),
        shell=True)

    container_id = get_db_container(db_name)
    cmd = 'docker exec -i {} cypher-shell < {}'.format(
        container_id,
        os.path.join(data_root, db_name,
                     'homecreditdefaultrisk_neo4j_loader.cypher'))
    print(cmd)
    subprocess.run(cmd, shell=True)
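# A shell-free variant of the docker exec call above (a sketch under the same
# assumptions: the `container_id` and loader path as computed in the function):
loader_path = os.path.join(data_root, db_name,
                           'homecreditdefaultrisk_neo4j_loader.cypher')
with open(loader_path) as loader_file:
    subprocess.run(['docker', 'exec', '-i', container_id, 'cypher-shell'],
                   stdin=loader_file, check=True)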
def build_database_from_kaggle_files():
    # Fix errors in essays.csv file
    local_essays_dir = os.path.join(data_root, 'raw_data', db_name)
    local_essays_path = os.path.join(local_essays_dir, 'essays.csv')
    subprocess.run('unzip \'{dir}/*.zip\' -d {dir} '.format(dir=local_essays_dir), shell=True)
    # Strip the mismatched escaped quotes that break CSV parsing:
    subprocess.run('sed -i \'s/\\\\\\\\""//g\' {}'.format(local_essays_path), shell=True)  # drop \\""
    subprocess.run('sed -i \'s/\\\\""//g\' {}'.format(local_essays_path), shell=True)  # drop \""
    subprocess.run('sed -i \'s/\\\\",/",/g\' {}'.format(local_essays_path), shell=True)  # \", -> ",
    subprocess.run('sed -i \'s/\\\\"$/"/g\' {}'.format(local_essays_path), shell=True)  # trailing \" -> "

    container_id = get_db_container(db_name)
    cmd = 'docker exec -i {} cypher-shell < {}'.format(container_id,
                                                       os.path.join(data_root, db_name,
                                                                    'kddcup2014_neo4j_loader.cypher'))
    print(cmd)
    subprocess.run(cmd, shell=True)
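# Pure-Python equivalent of the four sed substitutions above (a sketch, useful
# where GNU sed's -i flag is unavailable); it applies the same rules in the
# same order:
import re

def fix_essays_csv(path):
    with open(path, encoding='utf-8') as f:
        text = f.read()
    text = text.replace('\\\\""', '')  # drop \\""
    text = text.replace('\\""', '')  # drop \""
    text = text.replace('\\",', '",')  # \", -> ",
    text = re.sub(r'\\"$', '"', text, flags=re.MULTILINE)  # trailing \" -> "
    with open(path, 'w', encoding='utf-8') as f:
        f.write(text)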
Example #4
def build_database_from_kaggle_files():
    # Add unique ids to the transactions.csv file
    local_transactions_csv_path = os.path.join(data_root, 'raw_data', db_name,
                                               'transactions.csv')
    local_temp_filepath = os.path.join(data_root, 'raw_data', db_name,
                                       'temp.csv')
    # Rewrite the header: rename the original leading `id` column to `history`
    # and make room for the new unique `id` column added below
    subprocess.run('head -n 1 {} | sed "s|^id,|id,history,|g" > {}'.format(
        local_transactions_csv_path, local_temp_filepath),
                   shell=True)
    # Prefix every data row with its row number as the new unique id
    # (pv only displays progress while the large file streams through)
    subprocess.run(
        'tail -n +2 {} | pv | awk \'{{printf("%d,%s\\n", NR, $0)}}\' >> {}'.
        format(local_transactions_csv_path, local_temp_filepath),
        shell=True)

    # Load data into database
    container_id = get_db_container(db_name)
    cmd = 'docker exec -i {} cypher-shell < {}'.format(
        container_id,
        os.path.join(data_root, db_name,
                     '{}_neo4j_loader.cypher'.format(db_name)))
    print(cmd)
    subprocess.run(cmd, shell=True)
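# Pure-Python equivalent of the head/sed + tail/awk pipeline above (a sketch;
# `src_path` and `dst_path` stand in for transactions.csv and temp.csv, and
# the progress display that pv provided is dropped):
def add_row_ids(src_path, dst_path):
    with open(src_path, encoding='utf-8') as src, \
         open(dst_path, 'w', encoding='utf-8') as dst:
        header = next(src)
        # Rename the original leading `id` column to `history` and make room
        # for the new row-number `id` column
        dst.write(header.replace('id,', 'id,history,', 1))
        for row_number, line in enumerate(src, start=1):
            dst.write('{},{}'.format(row_number, line))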
Example #5
def setUp(self):
    self.db_info = get_db_info(self.db_name)
    batch_size = 1
    num_workers = 0
    max_nodes_per_graph = 100000
    _ = get_db_container(self.db_name)
    train_data, val_data, test_data = get_train_val_test_datasets(
        dataset_name=self.db_name,
        train_test_split='use_full_train',
        encoders=dict(CATEGORICAL='CategoricalOrdinalEnc',
                      SCALAR='ScalarRobustScalerEnc',
                      DATETIME='DatetimeScalarEnc',
                      LATLONG='LatLongScalarEnc',
                      TEXT='TextSummaryScalarEnc'),
    )
    train_loader = get_dataloader(
        dataset=train_data,
        batch_size=batch_size,
        sampler_class_name='SequentialSampler',
        num_workers=num_workers,
        max_nodes_per_graph=max_nodes_per_graph)
    val_loader = get_dataloader(
        dataset=val_data,
        batch_size=batch_size,
        sampler_class_name='SequentialSampler',
        num_workers=num_workers,
        max_nodes_per_graph=max_nodes_per_graph)
    test_loader = get_dataloader(
        dataset=test_data,
        batch_size=batch_size,
        sampler_class_name='SequentialSampler',
        num_workers=num_workers,
        max_nodes_per_graph=max_nodes_per_graph)
    self.loaders = {
        'train': train_loader,
        'val': val_loader,
        'test': test_loader
    }
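# A minimal smoke-test sketch to pair with setUp above (the method name is
# hypothetical; it assumes each loader yields at least one batch):
def test_loaders_yield_batches(self):
    for split, loader in self.loaders.items():
        batch = next(iter(loader))
        self.assertIsNotNone(batch, '{} loader yielded nothing'.format(split))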
from tqdm import tqdm

from data.utils import get_db_container
# Assumed: get_train_val_test_datasets (used below) lives in the same module
from data.utils import get_train_val_test_datasets

db_names = ('acquirevaluedshopperschallenge', 'homecreditdefaultrisk',
            'kddcup2014')

if __name__ == '__main__':
    while True:
        inp = input('Re-extract dataset info? (y/n): ')
        if inp in ['y', 'n']:
            break

    for db_name in db_names:
        print(f'Doing {db_name}')
        _ = get_db_container(db_name)
        train_dataset, val_dataset, test_dataset = get_train_val_test_datasets(
            db_name, 'use_full_train')
        datasets = {
            'train': train_dataset,
            'val': val_dataset,
            'test': test_dataset
        }
        df_graph_info_path = f'./experiments/{db_name}_df_graph_info.pkl'
        df_node_info_path = f'./experiments/{db_name}_df_node_info.pkl'

        if inp == 'y':
            n_nodes = []
            n_edges = []
            n_in_edges = []
            n_out_edges = []