def load_dataset(args, df):
    """Construct a ``MoleculeCSVDataset`` from ``df``.

    Graphs are built with ``smiles_to_bigraph`` with ``add_self_loop=True``
    bound in, so self loops are always requested.

    Parameters
    ----------
    args : dict
        Configuration. The keys read here are ``'node_featurizer'``,
        ``'edge_featurizer'``, ``'smiles_column'``, ``'result_path'``,
        ``'task_names'`` and ``'num_workers'``.
    df
        Forwarded unchanged as the ``df`` argument of ``MoleculeCSVDataset``
        (presumably a pandas DataFrame -- confirm against the caller).

    Returns
    -------
    MoleculeCSVDataset
        The constructed dataset. Its cache file path is
        ``args['result_path'] + '/graph.bin'``.
    """
    # Bind the self-loop option once so the dataset receives a one-argument
    # style constructor.
    graph_builder = partial(smiles_to_bigraph, add_self_loop=True)
    return MoleculeCSVDataset(
        df=df,
        smiles_to_graph=graph_builder,
        node_featurizer=args['node_featurizer'],
        edge_featurizer=args['edge_featurizer'],
        smiles_column=args['smiles_column'],
        cache_file_path=args['result_path'] + '/graph.bin',
        task_names=args['task_names'],
        n_jobs=args['num_workers'],
    )
def load_dataset(args, df):
    """Construct a ``MoleculeCSVDataset`` from ``df``.

    Self loops are requested from ``smiles_to_bigraph`` only when
    ``args['model']`` is one of the four ``gin_supervised_*`` names listed
    in the body; for any other model they are not.

    Parameters
    ----------
    args : dict
        Configuration. The keys read here are ``'model'``,
        ``'node_featurizer'``, ``'edge_featurizer'``, ``'smiles_column'``,
        ``'result_path'`` and ``'task_names'``.
    df
        Forwarded unchanged as the ``df`` argument of ``MoleculeCSVDataset``
        (presumably a pandas DataFrame -- confirm against the caller).

    Returns
    -------
    MoleculeCSVDataset
        The constructed dataset. Its cache file path is
        ``args['result_path'] + '/graph.bin'``.
    """
    # The membership test already yields the boolean that is needed.
    self_loop = args['model'] in [
        'gin_supervised_contextpred',
        'gin_supervised_infomax',
        'gin_supervised_edgepred',
        'gin_supervised_masking',
    ]
    graph_builder = partial(smiles_to_bigraph, add_self_loop=self_loop)
    return MoleculeCSVDataset(
        df=df,
        smiles_to_graph=graph_builder,
        node_featurizer=args['node_featurizer'],
        edge_featurizer=args['edge_featurizer'],
        smiles_column=args['smiles_column'],
        cache_file_path=args['result_path'] + '/graph.bin',
        task_names=args['task_names'],
    )
# Use the first CUDA device when one is available, otherwise fall back to CPU.
args['device'] = torch.device(
    'cuda:0' if torch.cuda.is_available() else 'cpu')

# Task names are given as one comma-separated string; None is left as is.
if args['task_names'] is not None:
    args['task_names'] = args['task_names'].split(',')

args['node_featurizer'] = CanonicalAtomFeaturizer()
df = pd.read_csv(args['csv_path'])

# The result directory must exist before the dataset is built, because the
# dataset's cache file path points inside it.
mkdir_p(args['result_path'])
dataset = MoleculeCSVDataset(
    df=df,
    smiles_to_graph=smiles_to_bigraph,
    node_featurizer=args['node_featurizer'],
    edge_featurizer=None,
    smiles_column=args['smiles_column'],
    cache_file_path=args['result_path'] + '/graph.bin',
    task_names=args['task_names'])
args['n_tasks'] = dataset.n_tasks
train_set, val_set, test_set = split_dataset(args, dataset)

if args['num_evals'] is not None:
    # Validate with an explicit raise instead of ``assert``: assert
    # statements are removed when Python runs with -O, which would let a
    # non-positive trial count through unchecked.
    if args['num_evals'] <= 0:
        raise ValueError(
            'Expect the number of hyperparameter search trials to '
            'be greater than 0, got {:d}'.format(args['num_evals']))
    print('Start hyperparameter search with Bayesian '
          'optimization for {:d} trials'.format(args['num_evals']))
    trial_path = bayesian_optimization(args, train_set, val_set, test_set)
else:
    print('Use the manually specified hyperparameters')
    exp_config = get_configure(args['model'])
'all the columns except for the smiles_column in the CSV file. ' '(default: None)') args = parser.parse_args().__dict__ args['exp_name'] = '_'.join([args['model'], args['mode']]) if args['tasks'] is not None: args['tasks'] = args['tasks'].split(',') args.update(configs[args['exp_name']]) # Setup for experiments mkdir_p(args['result_path']) node_featurizer = atom_featurizer edge_featurizer = CanonicalBondFeaturizer(bond_data_field='he', self_loop=True) df = pd.read_csv(args['csv_path']) dataset = MoleculeCSVDataset( df, partial(smiles_to_bigraph, add_self_loop=True), node_featurizer=node_featurizer, edge_featurizer=edge_featurizer, smiles_column=args['smiles_column'], cache_file_path=args['result_path'] + '/graph.bin', task_names=args['tasks']) args['tasks'] = dataset.task_names args = setup(args) train_set, val_set, test_set = RandomSplitter.train_val_test_split( dataset, frac_train=0.8, frac_val=0.1, frac_test=0.1, random_state=0) main(args, node_featurizer, edge_featurizer, train_set, val_set, test_set)