def _count_conn_twos(atom_df, molname):
    """Return how many entries equal to 2 occur in the ``conn`` rows of *molname*.

    Each ``conn`` cell is iterated element by element, so every row is
    flattened before counting.
    """
    conn_rows = atom_df.loc[atom_df.molecule_name == molname]["conn"]
    return [bond for conn in conn_rows.values for bond in conn].count(2)


def test_scheme_C2():
    """Exercise ``select_molecules_C2`` as a learning curve.

    The selection is grown by ``num`` molecules per iteration rather than
    tested for a single pick.  After every iteration the test checks that:

    * the train/test split has the expected sizes,
    * molecule names are unique within each split and disjoint across them,
    * every training molecule has at least as many ``conn`` entries equal
      to 2 as every test molecule.
      NOTE(review): 2 presumably encodes a double bond -- confirm.
    """
    num = 3
    atom_df = pd.read_pickle('tests/test_mols/atoms.pkl')
    pair_df = pd.read_pickle('tests/test_mols/pairs.pkl')
    mol_df, graphs = graphin.make_graph_df(atom_df, pair_df)

    total = len(graphs)
    counter = 0
    chosen = []

    # Iterations continue until there are not enough graphs left to select.
    # Each iteration we want (num * n) graphs in train and
    # (total - num * n) graphs in test.  The previous selection is passed
    # back in so that `chosen` only ever grows.
    while len(chosen) < total - num:
        counter += 1
        chosen = select_molecules_C2(mol_df, atom_df, pair_df,
                                     prev_chosen=chosen, num=num)
        train_graphs, train_mol_df, test_graphs, test_mol_df = get_split(
            chosen, mol_df, graphs)

        assert len(train_graphs) == num * counter
        assert len(test_graphs) == total - (num * counter)
        assert len(train_mol_df.molecule_name.unique()) == len(
            train_mol_df.molecule_name)
        assert len(test_mol_df.molecule_name.unique()) == len(
            test_mol_df.molecule_name)
        assert counter < total / num

        test_names = test_mol_df.molecule_name.unique()
        for molname in train_mol_df.molecule_name.unique():
            assert molname not in test_names

        # Names are unique within each split (asserted above), so keying the
        # counts by molecule name loses nothing.
        train_counts = {name: _count_conn_twos(atom_df, name)
                        for name in train_mol_df.molecule_name}
        test_counts = {name: _count_conn_twos(atom_df, name)
                       for name in test_mol_df.molecule_name}

        # Compare every train molecule against every *test* molecule.  The
        # earlier version looked up the train molecule twice, which made
        # this check trivially true.
        for train_name, train_count in train_counts.items():
            for test_name, test_count in test_counts.items():
                assert train_count >= test_count, (
                    "train molecule {} has {} conn entries equal to 2, "
                    "fewer than test molecule {} with {}".format(
                        train_name, train_count, test_name, test_count))
def _count_non_hcnof_atoms(atom_df, molname):
    """Return the number of atoms of *molname* whose ``typestr`` is not H, C, N, O or F."""
    typestrs = atom_df.loc[atom_df.molecule_name == molname]["typestr"]
    return sum(1 for typestr in typestrs.values
               if typestr not in ['H', 'C', 'N', 'O', 'F'])


def test_scheme_D6():
    """Exercise ``select_molecules_D6`` as a learning curve.

    The selection is grown by ``num`` molecules per iteration.  After every
    iteration the test checks that:

    * the train/test split has the expected sizes,
    * molecule names are unique within each split and disjoint across them,
    * every training molecule contains at least as many atoms whose
      ``typestr`` is outside H/C/N/O/F as every test molecule.
    """
    num = 3
    atom_df = pd.read_pickle('tests/test_mols/atoms.pkl')
    pair_df = pd.read_pickle('tests/test_mols/pairs.pkl')
    mol_df, graphs = graphin.make_graph_df(atom_df, pair_df)

    total = len(graphs)
    counter = 0
    chosen = []

    # Iterations continue until there are not enough graphs left to select.
    # Each iteration we want (num * n) graphs in train and
    # (total - num * n) graphs in test.  The previous selection is passed
    # back in so that `chosen` only ever grows.
    while len(chosen) < total - num:
        counter += 1
        chosen = select_molecules_D6(mol_df, atom_df, pair_df,
                                     prev_chosen=chosen, num=num)
        train_graphs, train_mol_df, test_graphs, test_mol_df = get_split(
            chosen, mol_df, graphs)

        assert len(train_graphs) == num * counter
        assert len(test_graphs) == total - (num * counter)
        assert len(train_mol_df.molecule_name.unique()) == len(
            train_mol_df.molecule_name)
        assert len(test_mol_df.molecule_name.unique()) == len(
            test_mol_df.molecule_name)
        assert counter < total / num

        # Same train/test disjointness check the other scheme tests perform.
        test_names = test_mol_df.molecule_name.unique()
        for molname in train_mol_df.molecule_name.unique():
            assert molname not in test_names

        # Names are unique within each split (asserted above), so keying the
        # counts by molecule name loses nothing.
        train_counts = {name: _count_non_hcnof_atoms(atom_df, name)
                        for name in train_mol_df.molecule_name}
        test_counts = {name: _count_non_hcnof_atoms(atom_df, name)
                       for name in test_mol_df.molecule_name}

        for train_name, train_count in train_counts.items():
            for test_name, test_count in test_counts.items():
                assert train_count >= test_count, (
                    "train molecule {} has {} non-HCNOF atoms, fewer than "
                    "test molecule {} with {}".format(
                        train_name, train_count, test_name, test_count))
def test_scheme_B2():
    """Run ``select_molecules_B2`` as a learning curve and check each split.

    Rather than testing a single pick, the chosen set is grown ``num``
    molecules at a time.  On every round the split sizes, the uniqueness and
    disjointness of molecule names, and the node-count ordering (no training
    graph has more nodes than any test graph) are asserted.
    """
    num = 3
    atom_df = pd.read_pickle('tests/test_mols/atoms.pkl')
    pair_df = pd.read_pickle('tests/test_mols/pairs.pkl')
    mol_df, graphs = graphin.make_graph_df(atom_df, pair_df)
    total = len(graphs)

    chosen = []
    counter = 0
    # Keep selecting until fewer than `num` graphs would remain available.
    # Round n should leave num * n graphs in train and the rest in test; the
    # previous selection is fed back in so `chosen` grows each round.
    while len(chosen) < len(graphs) - num:
        counter += 1
        chosen = select_molecules_B2(
            mol_df, atom_df, pair_df, prev_chosen=chosen, num=num)
        train_graphs, train_mol_df, test_graphs, test_mol_df = get_split(
            chosen, mol_df, graphs)

        # Split sizes follow the learning-curve schedule.
        assert len(train_graphs) == num * counter
        assert len(test_graphs) == total - (num * counter)

        # No molecule name is repeated inside either split.
        train_names = train_mol_df.molecule_name
        test_names = test_mol_df.molecule_name
        assert len(train_names.unique()) == len(train_names)
        assert len(test_names.unique()) == len(test_names)

        assert counter < total / num

        # No molecule appears on both sides of the split.
        unique_test_names = test_names.unique()
        for name in train_names.unique():
            assert name not in unique_test_names

        # Every training graph is no larger than every test graph.
        test_sizes = [graph.number_of_nodes() for graph in test_graphs]
        for train_graph in train_graphs:
            train_size = train_graph.number_of_nodes()
            for test_size in test_sizes:
                assert train_size <= test_size
def test_scheme_I7():
    """Exercise ``select_molecules_I7`` as a learning curve.

    Before each selection ``atom_df`` is passed through
    ``add_randomised_FEPs`` and ``add_randomised_FEP_vars``, then ``num``
    further molecules are selected.  After every iteration the split sizes
    and the uniqueness and disjointness of molecule names are checked.  Once
    the loop ends, the number of iterations and the final size of the
    selection are checked.
    """
    num = 3
    atom_df = pd.read_pickle('tests/test_mols/atoms.pkl')
    pair_df = pd.read_pickle('tests/test_mols/pairs.pkl')
    mol_df, graphs = graphin.make_graph_df(atom_df, pair_df)

    total = len(graphs)
    counter = 0
    chosen = []

    while len(chosen) < total - num:
        atom_df = add_randomised_FEPs(atom_df)
        atom_df = add_randomised_FEP_vars(atom_df)

        counter += 1
        chosen = select_molecules_I7(mol_df, atom_df, pair_df,
                                     prev_chosen=chosen, num=num)
        train_graphs, train_mol_df, test_graphs, test_mol_df = get_split(
            chosen, mol_df, graphs)

        # A real message instead of `print(...)`, whose return value (None)
        # was previously used as the assertion message.
        assert len(train_graphs) == num * counter, (
            "expected {} training graphs, got {} (num={}, counter={})".format(
                num * counter, len(train_graphs), num, counter))
        assert len(test_graphs) == total - (num * counter)
        assert len(train_mol_df.molecule_name.unique()) == len(
            train_mol_df.molecule_name)
        assert len(test_mol_df.molecule_name.unique()) == len(
            test_mol_df.molecule_name)
        assert counter < total / num

        test_names = test_mol_df.molecule_name.unique()
        for molname in train_mol_df.molecule_name.unique():
            assert molname not in test_names

    # NOTE(review): the expected iteration count is presumably pinned to the
    # size of the pickled fixture -- confirm if the fixture changes.
    assert counter == 2
    # The selection should stop at the largest multiple of `num` that fits.
    assert len(chosen) == total - total % num