def split(input_sharded, output_root, info_csv, shuffle_buffer):
    """Split a sharded ensemble dataset into train/val/test by protein.

    Proteins are partitioned 60/20/20 so that no protein appears in more
    than one subset, then the ensembles belonging to each protein group are
    filtered into three new sharded datasets next to `output_root`.

    Args:
        input_sharded: Sharded dataset keyed by 'ensemble'.
        output_root: Path template used to derive the output shard names.
        info_csv: CSV with per-ensemble metadata; must contain 'ligand',
            'active_struc', 'inactive_struc', 'label', and 'protein' columns.
        shuffle_buffer: Buffer size forwarded to `sho.filter_sharded`.

    Raises:
        RuntimeError: If the input is not sharded by ensemble.
    """
    if input_sharded.get_keys() != ['ensemble']:
        raise RuntimeError('Can only apply to sharded by ensemble.')
    info = pd.read_csv(info_csv)
    # Reconstruct the ensemble name the same way the sharded data names it:
    # <ligand>__<active pdb>__<inactive pdb>.
    info['ensemble'] = info.apply(
        lambda x: x['ligand'] + '__' + x['active_struc'].split('_')[2] +
        '__' + x['inactive_struc'].split('_')[2],
        axis=1)
    info = info.set_index('ensemble')
    # Remove duplicate ensembles.
    info = info[~info.index.duplicated()]

    ensembles = input_sharded.get_names()['ensemble']
    # Validation only: .loc raises KeyError if any ensemble in the sharded
    # input is missing from the metadata. (The previous version also derived
    # unused `active`/`inactive` subsets here; that dead code is removed.)
    _ = info.loc[ensembles]

    # Split by protein.
    proteins = info['protein'].unique()
    i_test, i_val, i_train = splits.random_split(len(proteins), 0.6, 0.2, 0.2)
    p_train = proteins[i_train]
    p_val = proteins[i_val]
    p_test = proteins[i_test]
    logger.info(f'Train proteins: {p_train}')
    logger.info(f'Val proteins: {p_val}')
    logger.info(f'Test proteins: {p_test}')

    # Map protein groups back to ensemble names.
    train = info[info['protein'].isin(p_train)].index.tolist()
    val = info[info['protein'].isin(p_val)].index.tolist()
    test = info[info['protein'].isin(p_test)].index.tolist()
    logger.info(f'{len(train)} train examples, {len(val)} val examples, '
                f'{len(test)} test examples.')

    keys = input_sharded.get_keys()
    prefix = sh.get_prefix(output_root)
    num_shards = sh.get_num_shards(output_root)
    train_sharded = sh.Sharded(f'{prefix}_train@{num_shards}', keys)
    val_sharded = sh.Sharded(f'{prefix}_val@{num_shards}', keys)
    test_sharded = sh.Sharded(f'{prefix}_test@{num_shards}', keys)

    # Write each subset by filtering the input against its ensemble list.
    train_filter_fn = filters.form_filter_against_list(train, 'ensemble')
    val_filter_fn = filters.form_filter_against_list(val, 'ensemble')
    test_filter_fn = filters.form_filter_against_list(test, 'ensemble')

    sho.filter_sharded(
        input_sharded, train_sharded, train_filter_fn, shuffle_buffer)
    sho.filter_sharded(
        input_sharded, val_sharded, val_filter_fn, shuffle_buffer)
    sho.filter_sharded(
        input_sharded, test_sharded, test_filter_fn, shuffle_buffer)
def split_targets_by_year(targets_df, test_years, train_years=None, val_years=None,
                          val_size=0.1, shuffle=True, random_seed=None):
    """Split targets into train/val/test sets by release year.

    Targets released during ``test_years`` form the test set; ``test_years``
    must not be None. If both ``train_years`` and ``val_years`` are given,
    they select the train and val sets directly. If exactly one is given,
    the remaining targets released before ``min(test_years)`` fill the other
    set. If neither is given, all targets released before ``min(test_years)``
    are split randomly, with ``val_size`` (a float in [0.0, 1.0]) giving the
    proportion assigned to validation.
    """
    assert test_years is not None

    def _by_year(frame, years):
        # Targets whose release year falls in `years`.
        return frame[frame.year.isin(years)].target.values

    targets_test = _by_year(targets_df, test_years)

    # Both year lists supplied: select each set directly from the full frame.
    if train_years is not None and val_years is not None:
        return (_by_year(targets_df, train_years),
                _by_year(targets_df, val_years),
                targets_test)

    # Everything released before the earliest test year is train/val material.
    cutoff = min(test_years)
    earlier = targets_df[targets_df.year < cutoff].reset_index(drop=True)

    if train_years is None and val_years is None:
        # Random split of the pre-cutoff targets; no test portion drawn here.
        _, val_indices, train_indices = sp.random_split(
            len(earlier), train_split=None, vali_split=val_size,
            test_split=0, shuffle=shuffle, random_seed=random_seed)
        targets_train = earlier.target.values[train_indices]
        targets_val = earlier.target.values[val_indices]
    elif train_years is not None:
        in_train = earlier.year.isin(train_years)
        targets_train = earlier[in_train].target.values
        targets_val = earlier[~in_train].target.values
    else:
        # Only val_years supplied.
        in_val = earlier.year.isin(val_years)
        targets_val = earlier[in_val].target.values
        targets_train = earlier[~in_val].target.values

    return targets_train, targets_val, targets_test
def split_targets_random(targets_df, train_size=None, val_size=0.1, test_size=0.1,
                         shuffle=True, random_seed=None):
    """Randomly split targets into train/val/test sets.

    Args:
        targets_df: DataFrame with a 'target' column.
        train_size: Train proportion, or None to use the remainder.
        val_size: Validation proportion.
        test_size: Test proportion.
        shuffle: Whether to shuffle before splitting.
        random_seed: Seed for reproducible splits.

    Returns:
        Tuple (targets_train, targets_val, targets_test) of target arrays.
    """
    test_indices, val_indices, train_indices = sp.random_split(
        len(targets_df), train_split=train_size, vali_split=val_size,
        test_split=test_size, shuffle=shuffle, random_seed=random_seed)
    all_targets = targets_df.target.values
    targets_train = all_targets[train_indices]
    targets_val = all_targets[val_indices]
    # Bug fix: this previously indexed with val_indices, making the test set
    # a duplicate of the validation set (and dropping the real test targets).
    targets_test = all_targets[test_indices]
    return targets_train, targets_val, targets_test
def generate_split(excl_uncharacterized=True, excl_rdkitfails=True,
                   out_dir_name='.', seed=42):
    """Generate and save a random train/valid/test split for QM9.

    Args:
        excl_uncharacterized: Exclude the uncharacterized molecules.
        excl_rdkitfails: Exclude molecules RDKit fails to process.
        out_dir_name: Directory in which the index files are written.
        seed: Random seed for the split.
    """
    num_molecules = 133885  # Total number of molecules in QM9.

    # Build the list of molecule indices to exclude from the split.
    if excl_uncharacterized and not excl_rdkitfails:
        unc_file = '../../data/qm9/raw/uncharacterized.txt'
        with open(unc_file, 'r') as f:
            # Skip the header (first 9 lines) and footer (last 2 lines).
            exclude = [int(x.split()[0]) for x in f.read().split('\n')[9:-2]]
        assert len(exclude) == 3054
    elif excl_uncharacterized and excl_rdkitfails:
        exclude = np.loadtxt('../../data/qm9/splits/excl.dat', dtype=int).tolist()
    elif excl_rdkitfails and not excl_uncharacterized:
        # Bug fix: this condition previously read `ecl_rdkitfails` (a typo),
        # which raised NameError whenever the branch was evaluated.
        print('Excluding only RDKit fails is not implemented.')
        return
    else:
        exclude = []

    # Define indices to split the data set.
    test_indices, vali_indices, train_indices = splits.random_split(
        num_molecules, vali_split=0.1, test_split=0.1, random_seed=seed,
        exclude=exclude)
    print(
        'Training: %i molecules. Validation: %i molecules. Test: %i molecules.'
        % (len(train_indices), len(vali_indices), len(test_indices)))

    # Save the (sorted) indices for the split.
    np.savetxt(out_dir_name + '/indices_test.dat', np.sort(test_indices),
               fmt='%1d')
    np.savetxt(out_dir_name + '/indices_valid.dat', np.sort(vali_indices),
               fmt='%1d')
    np.savetxt(out_dir_name + '/indices_train.dat', np.sort(train_indices),
               fmt='%1d')
    return
def convert_sdfcsv_to_npz(in_dir_name, out_dir_name, split_indices=None,
                          datatypes=None):
    """Convert a CSV/SDF data set to compressed npz train/valid/test sets.

    Args:
        in_dir_name (str): Name of the input directory.
        out_dir_name (str): Name of the output directory.
        split_indices (list): List of int lists
            [test_indices, vali_indices, train_indices].
        datatypes: Forwarded to MoleculesDataset.write_compressed.

    Returns:
        ds (MoleculesDataset): The internal data set with all processed
            information.
    """
    seed = 42

    # Input file locations.
    csv_file = in_dir_name + '/gdb9_with_cv_atom.csv'
    sdf_file = in_dir_name + '/gdb9.sdf'
    unc_file = in_dir_name + '/uncharacterized.txt'

    # Build the internal data set from the raw CSV/SDF pair.
    ds = MoleculesDataset(csv_file, sdf_file)

    # Molecules flagged as uncharacterized are excluded from random splits.
    # Skip the header (first 9 lines) and footer (last 2 lines) of the file.
    with open(unc_file, 'r') as f:
        exclude = [int(x.split()[0]) for x in f.read().split('\n')[9:-2]]
    assert len(exclude) == 3054

    # Use the provided split when given; otherwise draw a random one.
    if split_indices is not None:
        test_indices, vali_indices, train_indices = split_indices
    else:
        test_indices, vali_indices, train_indices = splits.random_split(
            len(ds), vali_split=0.1, test_split=0.1, random_seed=seed,
            exclude=exclude)
    print(
        'Training: %i molecules. Validation: %i molecules. Test: %i molecules.'
        % (len(train_indices), len(vali_indices), len(test_indices)))

    # Ensure the output directory exists.
    try:
        os.mkdir(out_dir_name)
    except FileExistsError:
        pass

    # Persist the index lists for the split.
    np.savetxt(out_dir_name + '/indices_test.dat', test_indices, fmt='%1d')
    np.savetxt(out_dir_name + '/indices_valid.dat', vali_indices, fmt='%1d')
    np.savetxt(out_dir_name + '/indices_train.dat', train_indices, fmt='%1d')

    # Write each non-empty subset as a compressed numpy archive.
    subsets = [(out_dir_name + '/test.npz', test_indices),
               (out_dir_name + '/valid.npz', vali_indices),
               (out_dir_name + '/train.npz', train_indices)]
    for file_name, indices in subsets:
        if len(indices) > 0:
            ds.write_compressed(file_name, indices=indices,
                                datatypes=datatypes)

    return ds