def get_loaders(self, db_name, encoders, batch_size, num_workers):
    db_info = get_db_info(db_name)
    max_nodes_per_graph = None
    # Make sure the database's Docker container is running before building datasets
    _ = get_db_container(db_name)
    train_data, val_data, test_data = get_train_val_test_datasets(
        dataset_name=db_name,
        train_test_split='use_full_train',
        encoders=encoders)
    train_loader = get_dataloader(dataset=train_data,
                                  batch_size=batch_size,
                                  sampler_class_name='SequentialSampler',
                                  num_workers=num_workers,
                                  max_nodes_per_graph=max_nodes_per_graph)
    val_loader = get_dataloader(dataset=val_data,
                                batch_size=batch_size,
                                sampler_class_name='SequentialSampler',
                                num_workers=num_workers,
                                max_nodes_per_graph=max_nodes_per_graph)
    test_loader = get_dataloader(dataset=test_data,
                                 batch_size=batch_size,
                                 sampler_class_name='SequentialSampler',
                                 num_workers=num_workers,
                                 max_nodes_per_graph=max_nodes_per_graph)
    loaders = {'train': train_loader, 'val': val_loader, 'test': test_loader}
    return db_info, loaders
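# Example use of get_loaders -- a sketch only. The encoder names mirror those
# used in the test setUp later in this section; the object exposing
# get_loaders (here called `runner`) is hypothetical, and the loaders are
# assumed to iterate like PyTorch DataLoaders.
db_info, loaders = runner.get_loaders(
    db_name='kddcup2014',
    encoders=dict(CATEGORICAL='CategoricalOrdinalEnc',
                  SCALAR='ScalarRobustScalerEnc',
                  DATETIME='DatetimeScalarEnc',
                  LATLONG='LatLongScalarEnc',
                  TEXT='TextSummaryScalarEnc'),
    batch_size=256,
    num_workers=0)
for split, loader in loaders.items():
    first_batch = next(iter(loader))  # pull one batch from each split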
def build_database_from_kaggle_files():
    # Unzip the raw Kaggle files and load them into Neo4j
    local_data_dir = os.path.join(data_root, 'raw_data', db_name)
    subprocess.run('unzip \'{dir}/*.zip\' -d {dir}'.format(dir=local_data_dir),
                   shell=True)
    container_id = get_db_container(db_name)
    cmd = 'docker exec -i {} cypher-shell < {}'.format(
        container_id,
        os.path.join(data_root, db_name,
                     'homecreditdefaultrisk_neo4j_loader.cypher'))
    print(cmd)
    subprocess.run(cmd, shell=True)
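# All three build_database_from_kaggle_files variants in this section end with
# the same docker-exec tail. As a sketch (run_cypher_loader is a hypothetical
# helper, not part of the codebase), that shared tail could be factored out as:
import os
import subprocess

def run_cypher_loader(container_id, cypher_path):
    # Pipe the Cypher loader script into cypher-shell inside the Neo4j container
    cmd = 'docker exec -i {} cypher-shell < {}'.format(container_id, cypher_path)
    print(cmd)
    subprocess.run(cmd, shell=True)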
def build_database_from_kaggle_files():
    # Fix errors in essays.csv file: the raw dump contains stray
    # backslash-escaped quotes that break CSV parsing
    local_essays_dir = os.path.join(data_root, 'raw_data', db_name)
    local_essays_path = os.path.join(local_essays_dir, 'essays.csv')
    subprocess.run('unzip \'{dir}/*.zip\' -d {dir}'.format(dir=local_essays_dir),
                   shell=True)
    # Delete literal \\"" sequences (two backslashes, two quotes)
    subprocess.run('sed -i \'s/\\\\\\\\""//g\' {}'.format(local_essays_path),
                   shell=True)
    # Delete literal \"" sequences
    subprocess.run('sed -i \'s/\\\\""//g\' {}'.format(local_essays_path),
                   shell=True)
    # Replace \", with ",
    subprocess.run('sed -i \'s/\\\\",/",/g\' {}'.format(local_essays_path),
                   shell=True)
    # Replace a trailing \" at end of line with "
    subprocess.run('sed -i \'s/\\\\"$/"/g\' {}'.format(local_essays_path),
                   shell=True)
    container_id = get_db_container(db_name)
    cmd = 'docker exec -i {} cypher-shell < {}'.format(
        container_id,
        os.path.join(data_root, db_name, 'kddcup2014_neo4j_loader.cypher'))
    print(cmd)
    subprocess.run(cmd, shell=True)
def build_database_from_kaggle_files():
    # Add unique ids to the transactions.csv file
    local_transactions_csv_path = os.path.join(data_root, 'raw_data', db_name,
                                               'transactions.csv')
    local_temp_filepath = os.path.join(data_root, 'raw_data', db_name,
                                       'temp.csv')
    # Header: prepend a new 'id' column and rename the old leading 'id'
    # column (the customer id) to 'history'
    subprocess.run('head -n 1 {} | sed "s|^id,|id,history,|g" > {}'.format(
        local_transactions_csv_path, local_temp_filepath),
                   shell=True)
    # Data rows: prepend the row number as the unique id (pv shows progress)
    subprocess.run(
        'tail -n +2 {} | pv | awk \'{{printf("%d,%s\\n", NR, $0)}}\' >> {}'.format(
            local_transactions_csv_path, local_temp_filepath),
        shell=True)
    # Load data into database
    container_id = get_db_container(db_name)
    cmd = 'docker exec -i {} cypher-shell < {}'.format(
        container_id,
        os.path.join(data_root, db_name,
                     '{}_neo4j_loader.cypher'.format(db_name)))
    print(cmd)
    subprocess.run(cmd, shell=True)
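# For readability, a rough pure-Python equivalent of the head/sed/awk pipeline
# above (a sketch only; the shell version is what actually runs, and unlike
# awk this re-serializes fields through the csv module):
import csv

def prepend_row_ids(src_path, dst_path):
    with open(src_path, newline='') as src, open(dst_path, 'w', newline='') as dst:
        reader = csv.reader(src)
        writer = csv.writer(dst)
        header = next(reader)
        # New unique 'id' column; the old leading customer-id column becomes 'history'
        writer.writerow(['id', 'history'] + header[1:])
        for row_number, row in enumerate(reader, start=1):
            writer.writerow([row_number] + row)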
def setUp(self):
    self.db_info = get_db_info(self.db_name)
    batch_size = 1
    num_workers = 0
    max_nodes_per_graph = 100000
    _ = get_db_container(self.db_name)
    train_data, val_data, test_data = get_train_val_test_datasets(
        dataset_name=self.db_name,
        train_test_split='use_full_train',
        encoders=dict(CATEGORICAL='CategoricalOrdinalEnc',
                      SCALAR='ScalarRobustScalerEnc',
                      DATETIME='DatetimeScalarEnc',
                      LATLONG='LatLongScalarEnc',
                      TEXT='TextSummaryScalarEnc'),
    )
    train_loader = get_dataloader(dataset=train_data,
                                  batch_size=batch_size,
                                  sampler_class_name='SequentialSampler',
                                  num_workers=num_workers,
                                  max_nodes_per_graph=max_nodes_per_graph)
    val_loader = get_dataloader(dataset=val_data,
                                batch_size=batch_size,
                                sampler_class_name='SequentialSampler',
                                num_workers=num_workers,
                                max_nodes_per_graph=max_nodes_per_graph)
    test_loader = get_dataloader(dataset=test_data,
                                 batch_size=batch_size,
                                 sampler_class_name='SequentialSampler',
                                 num_workers=num_workers,
                                 max_nodes_per_graph=max_nodes_per_graph)
    self.loaders = {'train': train_loader, 'val': val_loader, 'test': test_loader}
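# A minimal example of a test method that could follow this setUp
# (hypothetical; the suite's real assertions live elsewhere):
def test_loaders_yield_batches(self):
    for split, loader in self.loaders.items():
        batch = next(iter(loader))
        self.assertIsNotNone(batch, msg=f'{split} loader produced no batch')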
from tqdm import tqdm

# get_train_val_test_datasets is assumed to live alongside get_db_container
# in data.utils, as it does where the loaders above use it
from data.utils import get_db_container, get_train_val_test_datasets

db_names = ('acquirevaluedshopperschallenge',
            'homecreditdefaultrisk',
            'kddcup2014')

if __name__ == '__main__':
    while True:
        inp = input('Re-extract dataset info? (y/n): ')
        if inp in ['y', 'n']:
            break
    for db_name in db_names:
        print(f'Doing {db_name}')
        _ = get_db_container(db_name)
        train_dataset, val_dataset, test_dataset = get_train_val_test_datasets(
            db_name, 'use_full_train')
        datasets = {'train': train_dataset,
                    'val': val_dataset,
                    'test': test_dataset}
        df_graph_info_path = f'./experiments/{db_name}_df_graph_info.pkl'
        df_node_info_path = f'./experiments/{db_name}_df_node_info.pkl'
        if inp == 'y':
            n_nodes = []
            n_edges = []
            n_in_edges = []
            n_out_edges = []