def data_prepare_AttentiveFP(task_name, raw_filename, smiles_field, cano_field='cano_smiles'):
    '''
    Canonicalise the SMILES column of a csv file and cache the feature file.

    INPUT
        task_name: user-defined name for the training project (not used by
            this function; kept so existing callers keep working)
        raw_filename: a csv file containing smiles and task values of compounds
        smiles_field: name of the csv column that holds the SMILES strings
        cano_field: name of the column that receives the canonical SMILES
    OUTPUT
        list with the atom count of every successfully parsed molecule, in
        the order the molecules appear in the csv file
    SIDE EFFECTS
        - writes the rows that could be parsed, plus the canonical SMILES
          column, to raw_filename with '.csv' replaced by '_processed.csv'
        - calls save_smiles_dicts(...) for raw_filename with '.csv' replaced
          by '.pickle', unless a file of that name already exists
    '''
    # Molecules with this many atoms or more are excluded from the feature file.
    max_atom_num = 151

    feature_filename = raw_filename.replace('.csv', '.pickle')
    output_filename = raw_filename.replace('.csv', '') + '_processed.csv'

    print('============== Loading the raw file =====================')
    smiles_tasks_df = pd.read_csv(raw_filename)
    smilesList = smiles_tasks_df[smiles_field].values
    print("number of all smiles: ", len(smilesList))

    atom_num_dist = []
    canonical_smiles_list = []
    keep_mask = []  # one bool per csv row: True when the SMILES was parsed
    for smiles in smilesList:
        canonical = None
        try:
            # MolFromSmiles returns None for an invalid SMILES and raises for
            # non-string input (e.g. NaN from an empty csv cell).
            mol = Chem.MolFromSmiles(smiles)
            if mol is not None:
                atom_num = len(mol.GetAtoms())
                canonical = Chem.MolToSmiles(mol, isomericSmiles=True)
        except Exception:
            canonical = None
        if canonical is None:
            print('not successfully processed smiles: ', smiles)
            keep_mask.append(False)
            continue
        # Append only after every step succeeded so the lists stay aligned.
        keep_mask.append(True)
        atom_num_dist.append(atom_num)
        canonical_smiles_list.append(canonical)
    print("number of successfully processed smiles: ", len(canonical_smiles_list))

    # Filter by position (not by value) and copy, so that the new column is
    # written to an independent frame whose rows line up with the list.
    row_mask = pd.Series(keep_mask, index=smiles_tasks_df.index, dtype=bool)
    smiles_tasks_df = smiles_tasks_df.loc[row_mask].copy()
    smiles_tasks_df[cano_field] = canonical_smiles_list
    smiles_tasks_df.to_csv(output_filename, index=False)
    print('saving processed file as ' + output_filename)

    print('================== saving feature files ========================')
    # Reuse the atom counts gathered above instead of parsing every
    # canonical SMILES a second time.
    smilesList = [
        canonical
        for canonical, atom_num in zip(canonical_smiles_list, atom_num_dist)
        if atom_num < max_atom_num
    ]
    if os.path.isfile(feature_filename):
        print('feature file has been generated.')
    else:
        save_smiles_dicts(smilesList, feature_filename)
        print('saving feature file as ', feature_filename)
    return atom_num_dist
# Training hyperparameters.
epochs = 200
p_dropout = 0.5
fingerprint_dim = 100
weight_decay = 5  # also known as l2_regularization_lambda
learning_rate = 2.5
radius = 2
T = 2
per_task_output_units_num = 1  # for regression model
output_units_num = len(tasks) * per_task_output_units_num

# Load the cached feature dictionaries, or build them when no cache exists.
# `tasks`, `feature_filename`, `smilesList`, `filename`, `smiles_tasks_df`,
# `random_seed` and `canonical_smiles_list` must be defined before this point.
if os.path.isfile(feature_filename):
    # NOTE(review): pickle.load can execute arbitrary code stored in the
    # file; only load feature files that this project produced itself.
    with open(feature_filename, "rb") as feature_file:
        feature_dicts = pickle.load(feature_file)
else:
    feature_dicts = save_smiles_dicts(smilesList, filename)
# feature_dicts = get_smiles_dicts(smilesList)

# Keep only the rows whose canonical SMILES has an entry in the feature dicts.
remained_df = smiles_tasks_df[smiles_tasks_df["cano_smiles"].isin(
    feature_dicts['smiles_to_atom_mask'].keys())]
uncovered_df = smiles_tasks_df.drop(remained_df.index)
uncovered_df  # bare expression: has no effect when run as a plain script

# Random 80/20 train/test split, made reproducible through random_seed.
test_df = remained_df.sample(frac=0.2, random_state=random_seed)
train_df = remained_df.drop(test_df.index)
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Featurise a single molecule only to read the feature widths from the
# last axis of the returned arrays.
x_atom, x_bonds, x_atom_index, x_bond_index, x_mask, smiles_to_rdkit_list = get_smiles_array(
    [canonical_smiles_list[0]], feature_dicts)
num_atom_features = x_atom.shape[-1]
num_bond_features = x_bonds.shape[-1]
# Canonicalise every SMILES in smilesList. `smilesList`, `df`, `train_idx`,
# `valid_idx` and the model hyperparameters must be defined before this point.
print("number of all smiles: ",len(smilesList))
atom_num_dist = []
remained_smiles = []
canonical_smiles_list = []  # canonical SMILES of the parsed molecules only
cano_column = []  # one entry per input SMILES; None where parsing failed
for smiles in smilesList:
    canonical = None
    try:
        # MolFromSmiles returns None for an invalid SMILES and raises for
        # non-string input (e.g. NaN).
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None:
            atom_num = len(mol.GetAtoms())
            canonical = Chem.MolToSmiles(mol, isomericSmiles=True)
    except Exception:
        canonical = None
    cano_column.append(canonical)
    if canonical is None:
        print(smiles)
        continue
    # Append only after every step succeeded so the lists stay aligned.
    atom_num_dist.append(atom_num)
    remained_smiles.append(smiles)
    canonical_smiles_list.append(canonical)
print("number of successfully processed smiles: ", len(remained_smiles))

# Assign the row-aligned column: df is not filtered here, so a list holding
# only the successes would have the wrong length after a single failure.
# Failed rows get None and therefore end up in uncovered_idx below.
df["cano_smiles"] = cano_column
feature_dicts = save_smiles_dicts(smilesList, 'tmp')
remained_df = df[df["cano_smiles"].isin(feature_dicts['smiles_to_atom_mask'].keys())]

# Remove the rows without features from both index sets.
uncovered_idx = set(df.index) - set(remained_df.index)
train_idx = set(train_idx) - set(uncovered_idx)
valid_idx = set(valid_idx) - set(uncovered_idx)
print(len(train_idx), len(valid_idx))
# .loc does not accept a set as indexer; a sorted list also makes the row
# order deterministic.
train_df = remained_df.loc[sorted(train_idx)].reset_index(drop=True)
valid_df = remained_df.loc[sorted(valid_idx)].reset_index(drop=True)

# Featurise a single molecule only to read the feature widths from the
# last axis of the returned arrays.
x_atom, x_bonds, x_atom_index, x_bond_index, x_mask, smiles_to_rdkit_list = get_smiles_array([canonical_smiles_list[0]],feature_dicts)
num_atom_features = x_atom.shape[-1]
num_bond_features = x_bonds.shape[-1]
loss_function = nn.MSELoss()
model = Fingerprint(radius, T, num_atom_features, num_bond_features, fingerprint_dim, output_units_num, p_dropout)