def data_prepare_AttentiveFP(task_name, raw_filename, smiles_field,
                            cano_field = 'cano_smiles'):
    '''
    Canonicalise the SMILES column of a raw csv file, save the cleaned table
    and make sure a feature file exists for the molecules.

    INPUT
        task_name: user-defined name for the training project (not used by
            this function; kept so existing callers keep working)
        raw_filename: a csv file containing smiles and task values of compounds
        smiles_field: name of the csv column that holds the SMILES strings
        cano_field: name of the column added to hold the canonical SMILES
    OUTPUT
        atom_num_dist: list with the atom count of every molecule that could
            be parsed, in file order
    SIDE EFFECTS
        writes '<raw name>_processed.csv' (unparsable rows removed, canonical
        SMILES column added) and calls save_smiles_dicts when
        '<raw name>.pickle' does not exist yet
    '''
    feature_filename = raw_filename.replace('.csv','.pickle')
    output_filename = raw_filename.replace('.csv','') + '_processed.csv'

    print('============== Loading the raw file =====================')
    smiles_tasks_df = pd.read_csv(raw_filename)
    smilesList = smiles_tasks_df[smiles_field].values
    print("number of all smiles: ", len(smilesList))

    atom_num_dist = []
    kept_positions = []
    canonical_smiles_list = []
    for position, smiles in enumerate(smilesList):
        # Finish every step that can fail before touching the result lists,
        # so the three lists always stay the same length.
        try:
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                # RDKit signals an unparsable SMILES by returning None.
                raise ValueError('unparsable SMILES')
            atom_num = len(mol.GetAtoms())
            canonical_smiles = Chem.MolToSmiles(mol, isomericSmiles=True)
        except Exception:
            # Deliberate best effort: report the bad entry (NaN, malformed
            # SMILES, ...) and carry on with the rest of the file.
            print('not successfully processed smiles: ', smiles)
            continue
        atom_num_dist.append(atom_num)
        kept_positions.append(position)
        canonical_smiles_list.append(canonical_smiles)
    print("number of successfully processed smiles: ", len(kept_positions))

    # Select the surviving rows by position and copy them, so the new column
    # is written to an independent frame and lines up row by row with
    # canonical_smiles_list regardless of the frame's index labels.
    smiles_tasks_df = smiles_tasks_df.iloc[kept_positions].copy()
    smiles_tasks_df[cano_field] = canonical_smiles_list
    smiles_tasks_df.to_csv(output_filename, index = False)
    print('saving processed file as ' + output_filename)

    print('================== saving feature files ========================')
    if os.path.isfile(feature_filename):
        print('feature file has been generated.')
    else:
        # Only molecules with fewer than 151 atoms are featurised; the atom
        # counts gathered above are reused instead of re-parsing each SMILES.
        smilesList = [smiles
                      for smiles, atom_num in zip(canonical_smiles_list, atom_num_dist)
                      if atom_num < 151]
        # NOTE(review): confirm save_smiles_dicts writes to exactly
        # feature_filename; if it appends its own extension the isfile()
        # check above will never find the cached file.
        save_smiles_dicts(smilesList, feature_filename)
        print('saving feature file as ', feature_filename)
    return atom_num_dist
Beispiel #2
0
# ---- training hyper-parameters ----
epochs = 200

p_dropout = 0.5
fingerprint_dim = 100

# NOTE(review): 5 and 2.5 look like negative base-10 exponents rather than
# literal values -- confirm against the code that builds the optimizer.
weight_decay = 5  # also known as l2_regularization_lambda
learning_rate = 2.5
radius = 2
T = 2
per_task_output_units_num = 1  # for regression model
output_units_num = len(tasks) * per_task_output_units_num

# ---- load the cached molecular features, or build them ----
if os.path.isfile(feature_filename):
    # Context manager so the file handle is closed right after loading.
    # NOTE: pickle must only be used on feature files this project produced.
    with open(feature_filename, "rb") as feature_file:
        feature_dicts = pickle.load(feature_file)
else:
    feature_dicts = save_smiles_dicts(smilesList, filename)

# Keep only the rows whose canonical SMILES has an entry in the feature dict;
# everything else ends up in uncovered_df.
remained_df = smiles_tasks_df[smiles_tasks_df["cano_smiles"].isin(
    feature_dicts['smiles_to_atom_mask'].keys())]
uncovered_df = smiles_tasks_df.drop(remained_df.index)
uncovered_df  # bare expression: no effect when run as a plain script

# ---- reproducible 80/20 train/test split ----
test_df = remained_df.sample(frac=0.2, random_state=random_seed)
train_df = remained_df.drop(test_df.index)
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Featurise a single molecule just to read the feature dimensions off the
# last axis of the returned atom and bond arrays.
x_atom, x_bonds, x_atom_index, x_bond_index, x_mask, smiles_to_rdkit_list = get_smiles_array(
    [canonical_smiles_list[0]], feature_dicts)
num_atom_features = x_atom.shape[-1]
num_bond_features = x_bonds.shape[-1]
Beispiel #3
0
    print("number of all smiles: ",len(smilesList))
    atom_num_dist = []
    remained_smiles = []
    canonical_smiles_list = []
    for smiles in smilesList:
        try:        
            mol = Chem.MolFromSmiles(smiles)
            atom_num_dist.append(len(mol.GetAtoms()))
            remained_smiles.append(smiles)
            canonical_smiles_list.append(Chem.MolToSmiles(Chem.MolFromSmiles(smiles), isomericSmiles=True))
        except:
            print(smiles)
            pass
    print("number of successfully processed smiles: ", len(remained_smiles))
    df["cano_smiles"] = canonical_smiles_list
    feature_dicts = save_smiles_dicts(smilesList, 'tmp')
    remained_df = df[df["cano_smiles"].isin(feature_dicts['smiles_to_atom_mask'].keys())]
    uncovered_idx = set(df.index) - set(remained_df.index)
    train_idx = set(train_idx) - set(uncovered_idx)
    valid_idx = set(valid_idx) - set(uncovered_idx)
    print(len(train_idx), len(valid_idx))
    
    train_df = remained_df.loc[train_idx].reset_index(drop=True)
    valid_df = remained_df.loc[valid_idx].reset_index(drop=True)

    x_atom, x_bonds, x_atom_index, x_bond_index, x_mask, smiles_to_rdkit_list = get_smiles_array([canonical_smiles_list[0]],feature_dicts)
    num_atom_features = x_atom.shape[-1]
    num_bond_features = x_bonds.shape[-1]
    loss_function = nn.MSELoss()
    model = Fingerprint(radius, T, num_atom_features, num_bond_features,
                fingerprint_dim, output_units_num, p_dropout)