def split_indices(all_indices: List[int],
                  num_folds: int,
                  scaffold: bool = False,
                  data: 'MoleculeDataset' = None,
                  shuffle: bool = True) -> List[np.ndarray]:
    """
    Splits indices into num_folds folds, either randomly or grouped by scaffold.

    :param all_indices: The indices to split (typically range(len(data))).
    :param num_folds: Number of folds to produce.
    :param scaffold: If True, keep all molecules sharing a scaffold in the same
                     fold (requires data); otherwise split randomly.
    :param data: A MoleculeDataset, required when scaffold=True.
                 (Annotation is a forward reference so this module loads even
                 if MoleculeDataset is imported lazily.)
    :param shuffle: Whether to shuffle indices (random split) or fold order
                    (scaffold split).
    :return: A list of num_folds numpy arrays of indices.
    """
    num_data = len(all_indices)
    if scaffold:
        # Group molecule indices by scaffold; process largest groups first so
        # greedy balancing keeps fold sizes as even as possible.
        scaffold_to_indices = scaffold_to_smiles(data.mols(), use_indices=True)
        index_sets = sorted(scaffold_to_indices.values(), key=len, reverse=True)
        fold_indices = [[] for _ in range(num_folds)]
        for index_set in index_sets:
            # Add the whole scaffold group to the currently smallest fold.
            fold_sizes = [len(fold) for fold in fold_indices]
            smallest = fold_sizes.index(min(fold_sizes))
            fold_indices[smallest] += index_set
        # Bug fix: convert to numpy arrays so both branches return the same
        # type (previously this branch returned plain lists while the random
        # branch returned np.ndarrays).
        fold_indices = [np.array(fold) for fold in fold_indices]
        if shuffle:
            random.shuffle(fold_indices)
    else:  # random split
        if shuffle:
            random.shuffle(all_indices)
        fold_indices = []
        for i in range(num_folds):
            begin, end = int(i * num_data / num_folds), int((i + 1) * num_data / num_folds)
            fold_indices.append(np.array(all_indices[begin:end]))
    return fold_indices
def split_indices(all_indices: List[int],
                  num_folds: int,
                  data: 'MoleculeDataset',
                  shuffle: bool = True) -> List[List[int]]:
    """
    Splits indices into num_folds scaffold-grouped folds of balanced size.

    NOTE(review): this redefines the split_indices above with a different
    signature — the later definition wins at import time; confirm which one
    callers expect.

    :param all_indices: The indices to split (unused beyond the interface;
                        indices come from the scaffold grouping of data).
    :param num_folds: Number of folds to produce.
    :param data: A MoleculeDataset whose molecules are grouped by scaffold.
    :param shuffle: Whether to shuffle the order of the folds.
    :return: A list of num_folds lists of indices, each fold containing whole
             scaffold groups.
    """
    # Bug fix: removed leftover debug print statements that dumped the entire
    # scaffold mapping to stdout, and the unused num_data local.
    scaffold_to_indices = scaffold_to_smiles(data.mols(flatten=True), use_indices=True)
    # Largest scaffold groups first so greedy balancing evens out fold sizes.
    index_sets = sorted(scaffold_to_indices.values(), key=len, reverse=True)
    fold_indices = [[] for _ in range(num_folds)]
    for index_set in index_sets:
        # Add the whole scaffold group to the currently smallest fold.
        fold_sizes = [len(fold) for fold in fold_indices]
        fold_indices[fold_sizes.index(min(fold_sizes))] += index_set
    if shuffle:
        random.shuffle(fold_indices)
    return fold_indices
def scaffold_similarity(smiles_1: List[str], smiles_2: List[str]):
    """
    Determines the similarity between the scaffolds of two lists of smiles strings.

    Computes scaffold/molecule overlap statistics between the two datasets and
    prints a report to stdout; returns nothing.

    :param smiles_1: A list of smiles strings.
    :param smiles_2: A list of smiles strings.
    """
    # Get scaffolds: each call maps scaffold -> set of smiles with that scaffold
    scaffold_to_smiles_1 = scaffold_to_smiles(smiles_1)
    scaffold_to_smiles_2 = scaffold_to_smiles(smiles_2)

    scaffolds_1, smiles_sets_1 = zip(*scaffold_to_smiles_1.items())
    scaffolds_2, smiles_sets_2 = zip(*scaffold_to_smiles_2.items())

    # Invert the mapping: smiles -> scaffold (a smiles has exactly one scaffold,
    # so entries shared between the two datasets simply overwrite consistently)
    smiles_to_scaffold = {
        smiles: scaffold
        for scaffold, smiles_set in scaffold_to_smiles_1.items()
        for smiles in smiles_set
    }
    smiles_to_scaffold.update({
        smiles: scaffold
        for scaffold, smiles_set in scaffold_to_smiles_2.items()
        for smiles in smiles_set
    })

    # Determine similarity
    # NOTE: from here on scaffolds_1/2 and smiles_1/2 are rebound to sets,
    # shadowing the tuples/lists above.
    scaffolds_1, scaffolds_2 = set(scaffolds_1), set(scaffolds_2)
    smiles_1, smiles_2 = set(smiles_1), set(smiles_2)
    all_scaffolds = scaffolds_1 | scaffolds_2
    all_smiles = smiles_1 | smiles_2

    scaffolds_intersection = scaffolds_1 & scaffolds_2
    # smiles_intersection is smiles with a scaffold that appears in both datasets
    smiles_intersection = {
        smiles
        for smiles in all_smiles
        if smiles_to_scaffold[smiles] in scaffolds_intersection
    }
    # Cross-membership: molecules of one dataset whose scaffold occurs in the other
    smiles_in_1_with_scaffold_in_2 = {
        smiles
        for smiles in smiles_1 if smiles_to_scaffold[smiles] in scaffolds_2
    }
    smiles_in_2_with_scaffold_in_1 = {
        smiles
        for smiles in smiles_2 if smiles_to_scaffold[smiles] in scaffolds_1
    }

    # Molecules per scaffold, for the size-distribution statistics below
    sizes_1 = np.array([len(smiles_set) for smiles_set in smiles_sets_1])
    sizes_2 = np.array([len(smiles_set) for smiles_set in smiles_sets_2])

    # Print results
    print()
    print(f'Number of molecules = {len(all_smiles):,}')
    print(f'Number of scaffolds = {len(all_scaffolds):,}')
    print()
    print(
        f'Number of scaffolds in both datasets = {len(scaffolds_intersection):,}'
    )
    print(
        f'Scaffold intersection over union = {len(scaffolds_intersection) / len(all_scaffolds):.4f}'
    )
    print()
    print(
        f'Number of molecules with scaffold in both datasets = {len(smiles_intersection):,}'
    )
    print(
        f'Molecule intersection over union = {len(smiles_intersection) / len(all_smiles):.4f}'
    )
    print()
    print(f'Number of molecules in dataset 1 = {np.sum(sizes_1):,}')
    print(f'Number of scaffolds in dataset 1 = {len(scaffolds_1):,}')
    print()
    print(f'Number of molecules in dataset 2 = {np.sum(sizes_2):,}')
    print(f'Number of scaffolds in dataset 2 = {len(scaffolds_2):,}')
    print()
    print(
        f'Percent of scaffolds in dataset 1 which are also in dataset 2 = {100 * len(scaffolds_intersection) / len(scaffolds_1):.2f}%'
    )
    print(
        f'Percent of scaffolds in dataset 2 which are also in dataset 1 = {100 * len(scaffolds_intersection) / len(scaffolds_2):.2f}%'
    )
    print()
    print(
        f'Number of molecules in dataset 1 with scaffolds in dataset 2 = {len(smiles_in_1_with_scaffold_in_2):,}'
    )
    print(
        f'Percent of molecules in dataset 1 with scaffolds in dataset 2 = {100 * len(smiles_in_1_with_scaffold_in_2) / len(smiles_1):.2f}%'
    )
    print()
    print(
        f'Number of molecules in dataset 2 with scaffolds in dataset 1 = {len(smiles_in_2_with_scaffold_in_1):,}'
    )
    print(
        f'Percent of molecules in dataset 2 with scaffolds in dataset 1 = {100 * len(smiles_in_2_with_scaffold_in_1) / len(smiles_2):.2f}%'
    )
    print()
    print(
        f'Average number of molecules per scaffold in dataset 1 = {np.mean(sizes_1):.4f} +/- {np.std(sizes_1):.4f}'
    )
    print('Percentiles for molecules per scaffold in dataset 1')
    print(' | '.join([
        f'{i}% = {int(np.percentile(sizes_1, i)):,}' for i in range(0, 101, 10)
    ]))
    print()
    print(
        f'Average number of molecules per scaffold in dataset 2 = {np.mean(sizes_2):.4f} +/- {np.std(sizes_2):.4f}'
    )
    print('Percentiles for molecules per scaffold in dataset 2')
    print(' | '.join([
        f'{i}% = {int(np.percentile(sizes_2, i)):,}' for i in range(0, 101, 10)
    ]))
def scaffold_split_num_pos(data_path: str, max_scaffold_size_in_test: int,
                           num_pos_in_test: int, percent_neg_in_test: float,
                           save_dir: str):
    """
    Performs a scaffold-based train/test split of a binary-activity dataset,
    targeting a fixed number of positives and a fixed share of the negatives
    in the test set, then writes train.csv and test.csv to save_dir.

    :param data_path: Path to a CSV with 'smiles' and 'activity' (0/1) columns.
    :param max_scaffold_size_in_test: Scaffolds with at least this many
                                      molecules always go to train.
    :param num_pos_in_test: Target number of positive molecules in test.
    :param percent_neg_in_test: Target fraction of the dataset's negatives
                                to place in test.
    :param save_dir: Directory where train.csv and test.csv are written.
    :raises ValueError: If any activity value is not 0 or 1.
    """
    # Load data
    data = pd.read_csv(data_path)
    mols = [Chem.MolFromSmiles(smiles) for smiles in data['smiles']]

    # Determine scaffolds; sort each index set so iteration order is reproducible
    scaffold_to_indices: Dict[str, Set[int]] = scaffold_to_smiles(mols, use_indices=True)
    scaffold_to_indices = {
        scaffold: sorted(indices)
        for scaffold, indices in scaffold_to_indices.items()
    }

    # Split scaffolds into those with all positive, all negative, or mixed activity
    pos_scaffolds, mix_scaffolds, neg_scaffolds = [], [], []
    for scaffold, indices in scaffold_to_indices.items():
        activities = {data.iloc[index]['activity'] for index in indices}
        if activities == {1}:
            pos_scaffolds.append(scaffold)
        elif activities == {0}:
            neg_scaffolds.append(scaffold)
        elif activities == {0, 1}:
            mix_scaffolds.append(scaffold)
        else:
            raise ValueError(
                f'Found activities "{activities}" but should only be 0 or 1')

    # Reproducibility: fixed seed, and deterministic ordering before any shuffle
    random.seed(0)
    pos_scaffolds, mix_scaffolds, neg_scaffolds = sorted(
        pos_scaffolds), sorted(mix_scaffolds), sorted(neg_scaffolds)

    # Get small scaffolds (only these are eligible for the test set)
    small_pos_scaffolds = [
        scaffold for scaffold in pos_scaffolds
        if len(scaffold_to_indices[scaffold]) < max_scaffold_size_in_test
    ]
    small_mix_scaffolds = [
        scaffold for scaffold in mix_scaffolds
        if len(scaffold_to_indices[scaffold]) < max_scaffold_size_in_test
    ]
    small_neg_scaffolds = [
        scaffold for scaffold in neg_scaffolds
        if len(scaffold_to_indices[scaffold]) < max_scaffold_size_in_test
    ]

    # Put all big scaffolds in train
    train_scaffolds = sorted(
        set.union(
            set(pos_scaffolds) - set(small_pos_scaffolds),
            set(mix_scaffolds) - set(small_mix_scaffolds),
            set(neg_scaffolds) - set(small_neg_scaffolds)))
    test_scaffolds = []

    # Mixed scaffolds (half in train, half in test)
    random.shuffle(small_mix_scaffolds)
    half = len(small_mix_scaffolds) // 2
    train_scaffolds += small_mix_scaffolds[:half]
    test_scaffolds += small_mix_scaffolds[half:]

    # Positive scaffolds (put in test until hit num_pos_in_test, rest in train)
    random.shuffle(small_pos_scaffolds)
    test_indices = sum(
        (scaffold_to_indices[scaffold] for scaffold in test_scaffolds), [])
    # Positives already in test from the mixed scaffolds
    num_pos = sum(data.iloc[test_indices]['activity'])
    for scaffold in small_pos_scaffolds:
        scaffold_size = len(scaffold_to_indices[scaffold])
        # Only add a scaffold if it fits entirely within the remaining budget
        if num_pos < num_pos_in_test and scaffold_size <= (num_pos_in_test - num_pos):
            test_scaffolds.append(scaffold)
            num_pos += scaffold_size
        else:
            train_scaffolds.append(scaffold)

    # Negative scaffolds (put in test until hit percent_neg_in_test, rest in train)
    random.shuffle(small_neg_scaffolds)
    test_indices = sum(
        (scaffold_to_indices[scaffold] for scaffold in test_scaffolds), [])
    num_neg_in_test = int(percent_neg_in_test * sum(data['activity'] == 0))
    # BUG FIX: previously this summed 'activity' (i.e. counted positives);
    # the negative budget must start from the negatives already in test.
    num_neg = sum(data.iloc[test_indices]['activity'] == 0)
    for scaffold in small_neg_scaffolds:
        scaffold_size = len(scaffold_to_indices[scaffold])
        if num_neg < num_neg_in_test and scaffold_size <= (num_neg_in_test - num_neg):
            test_scaffolds.append(scaffold)
            num_neg += scaffold_size
        else:
            train_scaffolds.append(scaffold)

    # Get indices
    train_indices = sum(
        (scaffold_to_indices[scaffold] for scaffold in train_scaffolds), [])
    test_indices = sum(
        (scaffold_to_indices[scaffold] for scaffold in test_scaffolds), [])

    # Checks: scaffolds and indices partition the dataset exactly
    train_scaffolds_set, test_scaffolds_set = set(train_scaffolds), set(
        test_scaffolds)
    assert len(train_scaffolds_set & test_scaffolds_set) == 0
    assert set.union(train_scaffolds_set,
                     test_scaffolds_set) == set(scaffold_to_indices.keys())
    train_indices_set, test_indices_set = set(train_indices), set(test_indices)
    assert len(train_indices_set & test_indices_set) == 0
    assert set.union(train_indices_set,
                     test_indices_set) == set(range(len(data)))

    # Shuffle test so scaffold groups are not contiguous in the output
    random.shuffle(test_indices)

    # Split data
    train, test = data.iloc[train_indices], data.iloc[test_indices]

    # Print statistics
    print('train')
    print(train['activity'].value_counts())
    print('test')
    print(test['activity'].value_counts())

    # Save splits
    makedirs(save_dir)
    train.to_csv(os.path.join(save_dir, 'train.csv'), index=False)
    test.to_csv(os.path.join(save_dir, 'test.csv'), index=False)