Example #1
def split(input_sharded, output_root, info_csv, shuffle_buffer):
    """Split by protein."""
    if input_sharded.get_keys() != ['ensemble']:
        raise RuntimeError('Can only apply to sharded by ensemble.')

    info = pd.read_csv(info_csv)
    info['ensemble'] = info.apply(
        lambda x: x['ligand'] + '__' + x['active_struc'].split('_')[
            2] + '__' + x['inactive_struc'].split('_')[2],
        axis=1)
    info = info.set_index('ensemble')
    # Remove duplicate ensembles.
    info = info[~info.index.duplicated()]

    ensembles = input_sharded.get_names()['ensemble']
    in_use = info.loc[ensembles]
    active = in_use[in_use['label'] == 'A']      # note: not used below
    inactive = in_use[in_use['label'] == 'I']    # note: not used below

    # Split by protein.
    proteins = info['protein'].unique()
    i_test, i_val, i_train = splits.random_split(len(proteins), 0.6, 0.2, 0.2)
    p_train = proteins[i_train]
    p_val = proteins[i_val]
    p_test = proteins[i_test]
    logger.info(f'Train proteins: {p_train}')
    logger.info(f'Val proteins: {p_val}')
    logger.info(f'Test proteins: {p_test}')

    train = info[info['protein'].isin(p_train)].index.tolist()
    val = info[info['protein'].isin(p_val)].index.tolist()
    test = info[info['protein'].isin(p_test)].index.tolist()

    logger.info(f'{len(train)} train examples, {len(val)} val examples, '
                f'{len(test)} test examples.')

    keys = input_sharded.get_keys()
    prefix = sh.get_prefix(output_root)
    num_shards = sh.get_num_shards(output_root)
    train_sharded = sh.Sharded(f'{prefix}_train@{num_shards}', keys)
    val_sharded = sh.Sharded(f'{prefix}_val@{num_shards}', keys)
    test_sharded = sh.Sharded(f'{prefix}_test@{num_shards}', keys)

    train_filter_fn = filters.form_filter_against_list(train, 'ensemble')
    val_filter_fn = filters.form_filter_against_list(val, 'ensemble')
    test_filter_fn = filters.form_filter_against_list(test, 'ensemble')

    sho.filter_sharded(input_sharded, train_sharded, train_filter_fn,
                       shuffle_buffer)
    sho.filter_sharded(input_sharded, val_sharded, val_filter_fn,
                       shuffle_buffer)
    sho.filter_sharded(input_sharded, test_sharded, test_filter_fn,
                       shuffle_buffer)
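
A minimal call sketch for split(), assuming the module aliases used in the
function body (pd, sh, sho, filters, splits, logger) are already imported; the
shard paths, shard count, and CSV name below are hypothetical.

# Open an ensemble-keyed sharded input, mirroring the sh.Sharded(name, keys)
# construction used for the output shards inside split().
input_sharded = sh.Sharded('ensembles@32', ['ensemble'])  # hypothetical path

split(input_sharded,
      output_root='ensembles_split@32',  # parsed by sh.get_prefix / sh.get_num_shards
      info_csv='ensemble_info.csv',      # needs ligand, active_struc, inactive_struc, label, protein columns
      shuffle_buffer=10)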
Example #2
def split_targets_by_year(targets_df, test_years, train_years=None,
                          val_years=None, val_size=0.1, shuffle=True,
                          random_seed=None):
    """
    Split targets for train/val/test based on target released year. All
    targets released during <train_years>/<val_years>/<test_years> are included
    in the train/val/test sets respectively. <test_years> cannot be None;
    otherwise, it will throw an assertion.

    If either <train_years> or <val_years> is None, used the remaining targets
    prior to <test_years> for the other set.

    If both <train_years> and <val_years> are None, all targets prior to the min
    of <test_years> are split randomly as train and val sets, using <val_size>
    as the ratio. <val_size> is a float between 0.0 and 1.0 and represent the
    proportion of the train/val targets to include in the val split.
    """
    # Targets released during <test_years> form the test set; targets released
    # before min(<test_years>) are available for training/validation
    assert test_years is not None
    targets_test = targets_df[targets_df.year.isin(test_years)].target.values

    if train_years is not None and val_years is not None:
        targets_train = targets_df[targets_df.year.isin(train_years)].target.values
        targets_val = targets_df[targets_df.year.isin(val_years)].target.values
        return targets_train, targets_val, targets_test

    test_year_start = min(test_years)
    targets_train_val = targets_df[
        targets_df.year < test_year_start].reset_index(drop=True)

    if train_years is None and val_years is None:
        _, val_indices, train_indices = sp.random_split(
            len(targets_train_val), train_split=None, vali_split=val_size,
            test_split=0, shuffle=shuffle, random_seed=random_seed)
        targets_train = targets_train_val.target.values[train_indices]
        targets_val = targets_train_val.target.values[val_indices]

    elif train_years is not None:
        targets_train = targets_train_val[
            targets_train_val.year.isin(train_years)].target.values
        targets_val = targets_train_val[
            ~targets_train_val.year.isin(train_years)].target.values

    elif val_years is not None:
        targets_val = targets_train_val[
            targets_train_val.year.isin(val_years)].target.values
        targets_train = targets_train_val[
            ~targets_train_val.year.isin(val_years)].target.values

    return targets_train, targets_val, targets_test
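
A short sketch of the two main calling modes of split_targets_by_year(), using
a hypothetical toy DataFrame that contains only the target and year columns
the function reads.

import pandas as pd

targets_df = pd.DataFrame({
    'target': ['T01', 'T02', 'T03', 'T04', 'T05', 'T06'],
    'year':   [2015,  2016,  2017,  2018,  2019,  2020],
})

# Mode 1: explicit years for every set.
train, val, test = split_targets_by_year(
    targets_df, test_years=[2019, 2020],
    train_years=[2015, 2016, 2017], val_years=[2018])

# Mode 2: only test years given; targets released before 2019 are split
# randomly into train/val, with val_size as the validation ratio.
train, val, test = split_targets_by_year(
    targets_df, test_years=[2019, 2020], val_size=0.25, random_seed=0)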
Example #3
def split_targets_random(targets_df, train_size=None, val_size=0.1,
                         test_size=0.1, shuffle=True, random_seed=None):
    """
    Randomly split targets for train/val/test.
    """
    test_indices, val_indices, train_indices = sp.random_split(
        len(targets_df), train_split=train_size, vali_split=val_size,
        test_split=test_size, shuffle=shuffle, random_seed=random_seed)

    all_targets = targets_df.target.values
    targets_train = all_targets[train_indices]
    targets_val = all_targets[val_indices]
    targets_test = all_targets[test_indices]
    return targets_train, targets_val, targets_test
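
A minimal usage sketch with a hypothetical ten-row DataFrame. Note that
sp.random_split returns indices in (test, val, train) order, which the
function unpacks accordingly.

import pandas as pd

targets_df = pd.DataFrame({'target': [f'T{i:02d}' for i in range(10)]})

# 80/10/10 random split; a fixed seed makes the split reproducible.
train, val, test = split_targets_random(
    targets_df, val_size=0.1, test_size=0.1, shuffle=True, random_seed=42)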
Example #4
def generate_split(excl_uncharacterized=True,
                   excl_rdkitfails=True,
                   out_dir_name='.',
                   seed=42):
    """Generate and save a random train/validation/test split for the QM9 data set."""

    # Total number of molecules in the QM9 data set
    num_molecules = 133885

    # Load the list of molecules to ignore
    if excl_uncharacterized and not excl_rdkitfails:
        unc_file = '../../data/qm9/raw/uncharacterized.txt'
        with open(unc_file, 'r') as f:
            exclude = [int(x.split()[0]) for x in f.read().split('\n')[9:-2]]
        assert len(exclude) == 3054
    elif excl_uncharacterized and excl_rdkitfails:
        exclude = np.loadtxt('../../data/qm9/splits/excl.dat',
                             dtype=int).tolist()
    elif excl_rdkitfails and not excl_uncharacterized:
        print('Excluding only RDKit fails is not implemented.')
        return
    else:
        exclude = []

    # Define indices to split the data set
    test_indices, vali_indices, train_indices = splits.random_split(
        num_molecules,
        vali_split=0.1,
        test_split=0.1,
        random_seed=seed,
        exclude=exclude)
    print(
        'Training: %i molecules. Validation: %i molecules. Test: %i molecules.'
        % (len(train_indices), len(vali_indices), len(test_indices)))

    # Save the indices for the split
    np.savetxt(out_dir_name + '/indices_test.dat',
               np.sort(test_indices),
               fmt='%1d')
    np.savetxt(out_dir_name + '/indices_valid.dat',
               np.sort(vali_indices),
               fmt='%1d')
    np.savetxt(out_dir_name + '/indices_train.dat',
               np.sort(train_indices),
               fmt='%1d')

    return
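
A hypothetical invocation of generate_split(); the relative QM9 data paths
hard-coded in the function must exist, and the output directory is created up
front because the function does not create it itself.

import os

out_dir = 'qm9_split'  # hypothetical output directory
os.makedirs(out_dir, exist_ok=True)

# Writes indices_test.dat, indices_valid.dat and indices_train.dat into out_dir.
generate_split(excl_uncharacterized=True,
               excl_rdkitfails=False,
               out_dir_name=out_dir,
               seed=42)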
Example #5
def convert_sdfcsv_to_npz(in_dir_name,
                          out_dir_name,
                          split_indices=None,
                          datatypes=None):
    """Converts a data set given as CSV list and SDF coordinates to npz train/validation/test sets.
        
    Args:
        in_dir_name (str): NAme of the input directory.
        out_dir_name (Str): Name of the output directory.
        split_indices (list): List of int lists [test_indices, vali_indices, train_indices]

    Returns:
        ds (MoleculesDataset): The internal data set with all processed information.
        
    """

    seed = 42

    csv_file = in_dir_name + '/gdb9_with_cv_atom.csv'
    sdf_file = in_dir_name + '/gdb9.sdf'
    unc_file = in_dir_name + '/uncharacterized.txt'

    # Create the internal data set
    ds = MoleculesDataset(csv_file, sdf_file)

    # Load the list of molecules to ignore
    with open(unc_file, 'r') as f:
        exclude = [int(x.split()[0]) for x in f.read().split('\n')[9:-2]]
    assert len(exclude) == 3054

    # Define indices to split the data set
    if split_indices is None:
        test_indices, vali_indices, train_indices = splits.random_split(
            len(ds),
            vali_split=0.1,
            test_split=0.1,
            random_seed=seed,
            exclude=exclude)
    else:
        test_indices, vali_indices, train_indices = split_indices
    print(
        'Training: %i molecules. Validation: %i molecules. Test: %i molecules.'
        % (len(train_indices), len(vali_indices), len(test_indices)))

    # Make a directory
    try:
        os.mkdir(out_dir_name)
    except FileExistsError:
        pass

    # Save the indices for the split
    np.savetxt(out_dir_name + '/indices_test.dat', test_indices, fmt='%1d')
    np.savetxt(out_dir_name + '/indices_valid.dat', vali_indices, fmt='%1d')
    np.savetxt(out_dir_name + '/indices_train.dat', train_indices, fmt='%1d')

    # Save the data sets as compressed numpy files
    test_file_name = out_dir_name + '/test.npz'
    vali_file_name = out_dir_name + '/valid.npz'
    train_file_name = out_dir_name + '/train.npz'
    if len(test_indices) > 0:
        ds.write_compressed(test_file_name,
                            indices=test_indices,
                            datatypes=datatypes)
    if len(vali_indices) > 0:
        ds.write_compressed(vali_file_name,
                            indices=vali_indices,
                            datatypes=datatypes)
    if len(train_indices) > 0:
        ds.write_compressed(train_file_name,
                            indices=train_indices,
                            datatypes=datatypes)

    return ds
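
A hypothetical call to convert_sdfcsv_to_npz(); in_dir_name must contain the
gdb9_with_cv_atom.csv, gdb9.sdf and uncharacterized.txt files the function
reads, and a precomputed split can be passed in to bypass the seeded random
one.

ds = convert_sdfcsv_to_npz('qm9_raw', 'qm9_npz')  # directory names are hypothetical

# Reusing a precomputed split instead of the seeded random split:
# precomputed = [test_indices, vali_indices, train_indices]
# ds = convert_sdfcsv_to_npz('qm9_raw', 'qm9_npz', split_indices=precomputed)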