def read_hasc(self, data_path, n_channels, n_steps, target, types, n_class, issample=True):
    # Concatenate the per-worker splits produced during preprocessing.
    X = np.empty((0, n_steps, n_channels))
    Y = list()
    for index in range(self.process_num):
        store_data_file = "%s/split_data%d.npy" % (data_path, index)
        store_label_file = "%s/split_label%d.npy" % (data_path, index)
        temp_data = np.load(store_data_file)
        X = np.vstack([X, temp_data])
        temp_label = np.load(store_label_file)
        temp_label = self.map2id(temp_label, target, types)
        Y.extend(temp_label)
    Y = np.array(Y)
    # Optional re-sampling of the merged set.
    if target == 'binary':
        if issample:
            X, Y = sample(X, Y)
    elif target == 'one-class':
        X, Y = one_class_sample(X, Y)
    # One-hot encode the labels.
    Y = np.asarray(pd.get_dummies(Y), dtype=np.int8)
    logging.info("Label has %d classes" % Y[0].shape[0])
    assert Y[0].shape[0] == n_class
    return X, Y
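
# Usage sketch for read_hasc (hypothetical `loader` instance and illustrative
# values only): the data directory is expected to hold one
# split_data{i}.npy / split_label{i}.npy pair for each i in
# range(loader.process_num), as produced by the preprocessing workers.
#
#   X, Y = loader.read_hasc('data/hasc_split', n_channels=3, n_steps=256,
#                           target='binary', types=types, n_class=2)
#   # X: (n_samples, n_steps, n_channels) float array
#   # Y: (n_samples, n_class) one-hot int8 array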

def batch_gen(self, batch_size):
    while True:
        if self.record_point == 0:
            # Rebuild the (positive, negative) triple buffers once per pass.
            self.entity1 = np.zeros(self.record_num, dtype=np.int32)  # positive head
            self.entity2 = np.zeros(self.record_num, dtype=np.int32)  # positive tail
            self.entity3 = np.zeros(self.record_num, dtype=np.int32)  # corrupted head
            self.entity4 = np.zeros(self.record_num, dtype=np.int32)  # corrupted tail
            self.relation = np.zeros(self.record_num, dtype=np.int32)
            index = list(range(self.record_num))  # list() so shuffle works on Python 3
            random.shuffle(index)
            kneg_ind = 0
            if self.kneg:
                smp_ind = sample(2 * self.record_num, self.w_cnt)
            for i in index:
                # Draw a candidate replacement entity.
                if self.kneg:
                    j = smp_ind[kneg_ind % (2 * self.record_num)]
                    kneg_ind += 1
                else:
                    j = random.randint(0, self.entity_num - 1)
                # 'bern' uses per-relation corruption probabilities, otherwise 50/50.
                if self.version == 'bern':
                    pr = self.r_p[self.fb_r[i]]
                else:
                    pr = 0.5
                tmp_r = self.fb_r[i]
                if random.random() < pr:
                    # Corrupt the tail.
                    if self.center and random.random() < 0.4:
                        j = j % len(self.r_e_t[tmp_r])
                        j = self.r_e_t[tmp_r][j]
                    elif self.asym and random.random() < 0.05:
                        j = self.fb_h[i]
                    # Resample until the corrupted triple is not a known fact.
                    while (self.fb_h[i], self.fb_r[i], j) in self.ok:
                        if self.center and random.random() < 0.4:
                            j = random.randint(0, len(self.r_e_t[tmp_r]) - 1)
                            j = self.r_e_t[tmp_r][j]
                        else:
                            if self.kneg:
                                j = smp_ind[kneg_ind % (2 * self.record_num)]
                                kneg_ind += 1
                            else:
                                j = random.randint(0, self.entity_num - 1)
                    self.entity1[i] = self.fb_h[i]
                    self.entity2[i] = self.fb_l[i]
                    self.entity3[i] = self.fb_h[i]
                    self.entity4[i] = j
                    self.relation[i] = self.fb_r[i]
                else:
                    # Corrupt the head.
                    if self.center and random.random() < 0.4:
                        j = j % len(self.r_e_h[tmp_r])
                        j = self.r_e_h[tmp_r][j]
                    elif self.asym and random.random() < 0.05:
                        j = self.fb_l[i]
                    while (j, self.fb_r[i], self.fb_l[i]) in self.ok:
                        if self.center and random.random() < 0.4:
                            j = random.randint(0, len(self.r_e_h[tmp_r]) - 1)
                            j = self.r_e_h[tmp_r][j]
                        else:
                            if self.kneg:
                                j = smp_ind[kneg_ind % (2 * self.record_num)]
                                kneg_ind += 1
                            else:
                                j = random.randint(0, self.entity_num - 1)
                    self.entity1[i] = self.fb_h[i]
                    self.entity2[i] = self.fb_l[i]
                    self.entity3[i] = j
                    self.entity4[i] = self.fb_l[i]
                    self.relation[i] = self.fb_r[i]
        end = self.record_point + batch_size
        '''
        if end < self.record_num:
            ind = range(self.record_point, end)
            self.record_point += batch_size
        else:
            ind = range(self.record_point, self.record_num)
            ind.extend(range(end - self.record_num))
            self.record_point = 0
        '''
        ind = list(range(self.record_point, end))
        self.record_point += batch_size
        # Restart (and reshuffle) when the next batch would run past the end.
        if end + batch_size > self.record_num:
            self.record_point = 0
        entity1 = self.entity1[ind]
        entity2 = self.entity2[ind]
        entity3 = self.entity3[ind]
        entity4 = self.entity4[ind]
        relation = self.relation[ind]
        yield entity1, entity2, entity3, entity4, relation
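
# Usage sketch for batch_gen (hypothetical `trainer` instance): the generator
# refills and reshuffles its buffers whenever record_point wraps to 0, then
# yields (positive head, positive tail, corrupted head, corrupted tail,
# relation) index arrays of length batch_size.
#
#   gen = trainer.batch_gen(batch_size=1024)
#   e1, e2, e3, e4, r = next(gen)  # five np.int32 arrays of length 1024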

def train_tox21(seed: int = 19700101, limit: int = -1, use_cuda: bool = True, use_tqdm=True,
                force_save=False, special_config: dict = None,
                position_encoder_path: str = 'net/pe.pt', dataset='TOX21', tag='std'):
    cfg = DEFAULT_CONFIG.copy()
    if special_config:
        cfg.update(special_config)
    for k, v in cfg.items():
        print(k, ':', v)
    set_seed(seed, use_cuda)
    np.set_printoptions(precision=4, suppress=True, linewidth=140)
    if dataset == 'TOX21':
        smiles, info_list, properties = load_tox21(limit)
        graph_path = TOX21_GRAPH_PATH
    else:
        assert False, "Unknown dataset: {}.".format(dataset)
    n_label = properties.shape[-1]
    is_nan = np.isnan(properties)
    properties[is_nan] = 0.0
    # np.float / np.int were removed in NumPy 1.24; use explicit dtypes.
    not_nan = np.logical_not(is_nan).astype(np.float32)
    not_nan_mask = not_nan.astype(np.int64)
    not_nan = torch.tensor(not_nan, dtype=torch.float32)
    properties = torch.tensor(properties, dtype=torch.float32)
    if use_cuda:
        not_nan = not_nan.cuda()
        properties = properties.cuda()
    molecules = [HeteroGraph(info['nf'], info['ef'], info['us'], info['vs'], info['em'])
                 for info in info_list]
    n_dim = molecules[0].n_dim
    e_dim = molecules[0].e_dim
    node_num = len(molecules)
    train_mask, validate_mask, test_mask = sample(list(range(node_num)),
                                                  cfg['TRAIN_PER'], cfg['VALIDATE_PER'], cfg['TEST_PER'])
    n_seg = int(len(train_mask) / (cfg['BATCH'] + 1))
    train_mask_list = [train_mask[i::n_seg] for i in range(n_seg)]
    n_seg = int(len(validate_mask) / (cfg['BATCH'] + 1))
    validate_mask_list = [validate_mask[i::n_seg] for i in range(n_seg)]
    n_seg = int(len(test_mask) / (cfg['BATCH'] + 1))
    test_mask_list = [test_mask[i::n_seg] for i in range(n_seg)]
    print(train_mask, validate_mask, test_mask)
    print(len(train_mask_list), len(validate_mask_list), len(test_mask_list))
    if position_encoder_path and os.path.exists(position_encoder_path):
        position_encoder = torch.load(position_encoder_path)
        position_encoder.eval()
    else:
        print('NO POSITION ENCODER IS BEING USED!!!')
        position_encoder = None
    model = AMPNN(n_dim=n_dim, e_dim=e_dim, config=cfg,
                  position_encoder=position_encoder, use_cuda=use_cuda)
    classifier = MLP(cfg['F_DIM'], n_label, h_dims=cfg['MLP_DIMS'],
                     dropout=cfg['DROPOUT'], activation='sigmoid')
    if use_cuda:
        model.cuda()
        classifier.cuda()
    params = list(chain(model.parameters(), classifier.parameters()))
    for param in params:
        print(param.shape)
    optimizer = optim.Adam(params, lr=cfg['LR'], weight_decay=cfg['DECAY'])
    current_lr = cfg['LR']
    matrix_cache = MatrixCache(cfg['MAX_DICT'])
    loss_fuc = BCELoss()
    logs = []

    def forward(mask: list, name=None) -> (torch.Tensor, torch.Tensor, torch.Tensor):
        nfs = torch.cat([molecules[i].node_features for i in mask])
        efs = torch.cat([molecules[i].edge_features for i in mask])
        if use_cuda:
            nfs = nfs.cuda()
            efs = efs.cuda()
        us, vs, mm_tuple = matrix_cache.fetch(molecules, mask, nfs, name, use_cuda)
        embeddings, _ = model(nfs, efs, us, vs, mm_tuple, name, [smiles[i] for i in mask])
        std_loss = 0
        # Zero out predictions at positions whose label is missing (NaN).
        logits = classifier(embeddings) * not_nan[mask, :]
        target = properties[mask, :]
        if use_cuda:
            target = target.cuda()
        return logits, target, std_loss

    def train(mask_list: list, name=None):
        nonlocal current_lr
        model.train()
        classifier.train()
        u_losses = []
        t = enumerate(mask_list)
        if use_tqdm:
            t = tqdm(t, total=len(mask_list))
        for i, m in t:
            if name:
                name_ = name + str(i)
            else:
                name_ = None
            optimizer.zero_grad()
            logits, target, std_loss = forward(m, name=name_)
            u_loss = loss_fuc(logits, target)
            u_losses.append(u_loss.cpu().item())
            loss = u_loss + std_loss
            loss.backward()
            optimizer.step()
            current_lr *= 1 - cfg['DECAY']
        print('\t\tSemi-supervised loss: {:.4f}'.format(np.average(u_losses)))
        logs[-1].update({'on_train_loss': np.average(u_losses)})

    def calc_masked_roc(logits, target, mask) -> float:
        rocs = []
        for i in range(n_label):
            l = logits[:, i]
            t = target[:, i]
            # nnm = not_nan_mask[mask, i]
            # l = l[nnm == 1]
            # t = t[nnm == 1]
            rocs.append(roc_auc_score(t, l))
            # print(l.shape)
        # print(rocs)
        return np.average(rocs)

    def evaluate(mask_list: list, name=None):
        model.eval()
        classifier.eval()
        losses = []
        mask = []
        logits_list = []
        target_list = []
        t = enumerate(mask_list)
        if use_tqdm:
            t = tqdm(t, total=len(mask_list))
        for i, m in t:
            if name:
                name_ = name + str(i)
            else:
                name_ = None
            logits, target, _ = forward(m, name=name_)
            loss = loss_fuc(logits, target)
            losses.append(loss.cpu().item())
            mask.extend(m)
            logits_list.append(logits.cpu().detach().numpy())
            target_list.append(target.cpu().detach().numpy())
        all_logits = np.vstack(logits_list)
        all_target = np.vstack(target_list)
        # print(all_logits[: 10])
        # print(all_target[: 10])
        roc = calc_masked_roc(all_logits, all_target, mask)
        print('\t\tLoss: {:.3f}'.format(np.average(losses)))
        print('\t\tROC: {:.3f}'.format(roc))
        logs[-1].update({'{}_loss'.format(name): np.average(losses)})
        logs[-1].update({'{}_metric'.format(name): roc})

    for epoch in range(cfg['ITERATION']):
        logs.append({'epoch': epoch + 1})
        print('In iteration {}:'.format(epoch + 1))
        print('\tLearning rate: {:.8e}'.format(current_lr))
        print('\tTraining: ')
        train(train_mask_list, name='train')
        print('\tEvaluating training: ')
        evaluate(train_mask_list, name='train')
        print('\tEvaluating validation: ')
        evaluate(validate_mask_list, name='evaluate')
        print('\tEvaluating test: ')
        evaluate(test_mask_list, name='test')
        gc.collect()
    d = {'metric': 'Multi-ROC', 'logs': logs}
    with open('{}{}.json'.format(LOG_PATH, tag), 'w+', encoding='utf-8') as fp:
        json.dump(d, fp)
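
# Minimal self-contained sketch (dummy tensors, independent of the model above)
# of the NaN-label masking used in forward(): missing labels are zeroed in both
# the prediction and the target before BCELoss, so those cells contribute
# (numerically) zero loss. `_demo_nan_masked_bce` is purely illustrative.
def _demo_nan_masked_bce():
    import torch
    from torch.nn import BCELoss
    raw = torch.tensor([[1., float('nan')], [0., 1.]])
    not_nan = (~torch.isnan(raw)).float()               # mirrors np.logical_not(is_nan)
    target = torch.nan_to_num(raw, nan=0.0)             # mirrors properties[is_nan] = 0.0
    probs = torch.sigmoid(torch.randn(2, 2)) * not_nan  # mirrors classifier(...) * not_nan
    return BCELoss()(probs, target)                     # masked cells add ~0 to the mean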

def train_lipop(seed: int = 19700101, limit: int = -1, use_cuda: bool = True, use_tqdm=True,
                force_save=False, special_config: dict = None,
                position_encoder_path: str = 'net/pe.pt', tag='std', dataset='Lipop'):
    cfg = DEFAULT_CONFIG.copy()
    if special_config:
        cfg.update(special_config)
    for k, v in cfg.items():
        print(k, ':', v)
    set_seed(seed, use_cuda)
    np.set_printoptions(precision=4, suppress=True, linewidth=140)
    if dataset == 'FreeSolv':
        smiles, info_list, properties = load_freesolv(limit, force_save=force_save)
    elif dataset == 'ESOL':
        smiles, info_list, properties = load_esol(limit, force_save=force_save)
    else:
        smiles, info_list, properties = load_lipop(limit, force_save=force_save)
    molecules = [HeteroGraph(info['nf'], info['ef'], info['us'], info['vs'], info['em'])
                 for info in info_list]
    n_dim = molecules[0].n_dim
    e_dim = molecules[0].e_dim
    node_num = len(molecules)
    train_mask, validate_mask, test_mask = sample(list(range(node_num)),
                                                  cfg['TRAIN_PER'], cfg['VALIDATE_PER'], cfg['TEST_PER'])
    n_seg = int(len(train_mask) / (cfg['BATCH'] + 1))
    n_seg = max(1, n_seg)  # keep at least one segment when the split is smaller than a batch
    train_mask_list = [train_mask[i::n_seg] for i in range(n_seg)]
    n_seg = int(len(validate_mask) / (cfg['BATCH'] + 1))
    n_seg = max(1, n_seg)
    validate_mask_list = [validate_mask[i::n_seg] for i in range(n_seg)]
    n_seg = int(len(test_mask) / (cfg['BATCH'] + 1))
    n_seg = max(1, n_seg)
    test_mask_list = [test_mask[i::n_seg] for i in range(n_seg)]
    print(train_mask[0], validate_mask[0], test_mask[0])
    print(len(train_mask_list), len(validate_mask_list), len(test_mask_list))
    # Normalize targets with statistics computed on the training split only.
    t_properties = properties[train_mask, :]
    prop_mean = np.mean(t_properties, axis=0)
    print('mean:', prop_mean)
    prop_std = np.std(t_properties.tolist(), axis=0, ddof=1)
    print('std:', prop_std)
    prop_mad = robust.mad(t_properties.tolist(), axis=0)
    print('mad:', prop_mad)
    norm_properties = (properties - prop_mean) / prop_std
    if position_encoder_path and os.path.exists(position_encoder_path):
        position_encoder = torch.load(position_encoder_path)
        position_encoder.eval()
    else:
        print('NO POSITION ENCODER IS BEING USED!!!')
        position_encoder = None
    model = AMPNN(n_dim=n_dim, e_dim=e_dim, config=cfg,
                  position_encoder=position_encoder, use_cuda=use_cuda)
    regression = MLP(cfg['F_DIM'], 1, h_dims=cfg['MLP_DIMS'], dropout=cfg['DROPOUT'])
    if use_cuda:
        model.cuda()
        regression.cuda()
    for name, param in chain(model.named_parameters(), regression.named_parameters()):
        if param.requires_grad:
            print(name, ":", param.shape)
    optimizer = optim.Adam(
        filter(lambda x: x.requires_grad, chain(model.parameters(), regression.parameters())),
        lr=cfg['LR'], weight_decay=cfg['DECAY'])
    scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=1, gamma=cfg['GAMMA'])
    matrix_cache = MatrixCache(cfg['MAX_DICT'])
    loss_fuc = MSELoss()
    logs = []

    def forward(mask: list, name=None) -> (torch.Tensor, torch.Tensor, torch.Tensor):
        nfs = torch.cat([molecules[i].node_features for i in mask])
        efs = torch.cat([molecules[i].edge_features for i in mask])
        if use_cuda:
            nfs = nfs.cuda()
            efs = efs.cuda()
        us, vs, mm_tuple = matrix_cache.fetch(molecules, mask, nfs, name, use_cuda)
        embeddings, _ = model(nfs, efs, us, vs, mm_tuple, name, [smiles[i] for i in mask])
        std_loss = 0
        logits = regression(embeddings)
        target = norm_properties[mask, :]
        target = torch.tensor(target.astype(np.float32), dtype=torch.float32)
        if use_cuda:
            target = target.cuda()
        return logits, target, std_loss

    def train(mask_list: list, name=None):
        model.train()
        regression.train()
        u_losses = []
        losses = []
        t = enumerate(mask_list)
        if use_tqdm:
            t = tqdm(t, total=len(mask_list))
        for i, m in t:
            if name:
                name_ = name + str(i)
            else:
                name_ = None
            optimizer.zero_grad()
            logits, target, std_loss = forward(m, name=name_)
            u_loss = loss_fuc(logits, target)
            u_losses.append(u_loss.cpu().item())
            loss = u_loss + std_loss
            # loss.backward()
            # optimizer.step()
            losses.append(loss)
            # Accumulate PACK batches, then take one averaged optimizer step.
            if len(losses) >= cfg['PACK'] or i == len(mask_list) - 1:
                (sum(losses) / len(losses)).backward()
                optimizer.step()
                losses.clear()
        u_loss = np.average(u_losses)
        print('\t\tSemi-supervised loss: {:.4f}'.format(u_loss))
        logs[-1].update({'on_train_loss': u_loss})

    def evaluate(mask_list: list, name=None, visualize=None):
        model.eval()
        regression.eval()
        losses = []
        masks = []
        logits_list = []
        target_list = []
        t = enumerate(mask_list)
        if use_tqdm:
            t = tqdm(t, total=len(mask_list))
        for i, m in t:
            if name:
                name_ = name + str(i)
            else:
                name_ = None
            logits, target, _ = forward(m, name=name_)
            loss = loss_fuc(logits, target)
            losses.append(loss.cpu().item())
            if visualize:
                masks.extend(m)
                logits_list.append(logits.cpu().detach().numpy())
                target_list.append(target.cpu().detach().numpy())
        # De-normalize: the losses were computed on z-scored targets.
        mse_loss = np.average(losses) * (prop_std[0] ** 2)
        rmse_loss = np.average([loss ** 0.5 for loss in losses]) * prop_std[0]
        print('\t\tMSE Loss: {:.3f}'.format(mse_loss))
        print('\t\tRMSE Loss: {:.3f}'.format(rmse_loss))
        logs[-1].update({'{}_loss'.format(name): mse_loss})
        logs[-1].update({'{}_metric'.format(name): rmse_loss})
        if visualize:
            all_logits = np.vstack(logits_list)
            all_target = np.vstack(target_list)
            best_ids, best_ds, worst_ids, worst_ds = \
                plt_multiple_scatter(GRAPH_PATH + visualize, masks, all_logits, all_target)
            print('\t\tBest performance on:')
            for i, d in zip(best_ids, best_ds):
                print('\t\t\t{}: {}'.format(smiles[i], d))
            print('\t\tWorst performance on:')
            for i, d in zip(worst_ids, worst_ds):
                print('\t\t\t{}: {}'.format(smiles[i], d))

    for epoch in range(cfg['ITERATION']):
        logs.append({'epoch': epoch + 1})
        scheduler.step(epoch=epoch)
        print('In iteration {}:'.format(epoch + 1))
        print('\tTraining: ')
        train(train_mask_list, name='train')
        print('\tEvaluating training: ')
        evaluate(
            train_mask_list, name='train',
            # visualize='train_{}'.format(epoch + 1) if (epoch + 1) % cfg['EVAL'] == 0 else None
        )
        print('\tEvaluating validation: ')
        evaluate(
            validate_mask_list, name='evaluate',
            # visualize='val_{}'.format(epoch + 1) if (epoch + 1) % cfg['EVAL'] == 0 else None
        )
        print('\tEvaluating test: ')
        evaluate(
            test_mask_list, name='test',
            # visualize='test' if epoch + 1 == cfg['ITERATION'] else None
        )
        gc.collect()
    d = {'metric': 'RMSE', 'logs': logs}
    with open('{}{}.json'.format(LOG_PATH, tag), 'w+', encoding='utf-8') as fp:
        json.dump(d, fp)
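
# Minimal sketch of the PACK-style gradient accumulation used in train() above
# (dummy model and data; `pack` stands in for cfg['PACK']): losses from several
# mini-batches are averaged and back-propagated in a single optimizer step.
# `_demo_pack_accumulation` is purely illustrative.
def _demo_pack_accumulation(pack=4, steps=8):
    import torch
    net = torch.nn.Linear(4, 1)
    opt = torch.optim.Adam(net.parameters(), lr=1e-3)
    pending = []
    for step in range(steps):
        x, y = torch.randn(8, 4), torch.randn(8, 1)
        pending.append(torch.nn.functional.mse_loss(net(x), y))
        # Step once per `pack` batches (and flush any remainder at the end).
        if len(pending) >= pack or step == steps - 1:
            opt.zero_grad()
            (sum(pending) / len(pending)).backward()
            opt.step()
            pending.clear()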

def fit_qm9(seed: int = 19700101, limit: int = -1, use_cuda: bool = True, use_tqdm=True,
            force_save=False, special_config: dict = None,
            model_save_path: str = 'net/pe.pt', tag='std',
            mpnn_pos_encode=False, use_rdkit=False):
    t0 = time.time()
    cfg = DEFAULT_CONFIG.copy()
    if special_config:
        cfg.update(special_config)
    for k, v in cfg.items():
        print(k, ':', v)
    set_seed(seed, use_cuda)
    np.set_printoptions(precision=4, suppress=True, linewidth=140)
    smiles, info_list, _ = load_qm9(limit, force_save=force_save)
    mol_atom_pos = load_mol_atom_pos(limit)
    # Drop molecules flagged as inconsistent.
    with open('data/gdb9/incons.json') as fp:
        incons = json.load(fp)
    left = list(set(range(len(smiles))) - set(incons))
    print('{} / {}'.format(len(left), len(smiles)))
    smiles = [smiles[i] for i in left]
    info_list = [info_list[i] for i in left]
    mol_atom_pos = [mol_atom_pos[i] for i in left]
    molecules = [HeteroGraph(info['nf'], info['ef'], info['us'], info['vs'], info['em'])
                 for info in info_list]
    n_dim = molecules[0].n_dim
    e_dim = molecules[0].e_dim
    node_num = len(molecules)
    train_mask, validate_mask, test_mask = sample(list(range(node_num)),
                                                  cfg['TRAIN_PER'], cfg['VALIDATE_PER'], cfg['TEST_PER'])
    n_seg = int(len(train_mask) / (cfg['BATCH'] + 1))
    train_mask_list = [train_mask[i::n_seg] for i in range(n_seg)]
    n_seg = int(len(validate_mask) / (cfg['BATCH'] + 1))
    validate_mask_list = [validate_mask[i::n_seg] for i in range(n_seg)]
    n_seg = int(len(test_mask) / (cfg['BATCH'] + 1))
    test_mask_list = [test_mask[i::n_seg] for i in range(n_seg)]
    print(train_mask[0], validate_mask[0], test_mask[0])
    print(len(train_mask_list), len(validate_mask_list), len(test_mask_list))
    model = PositionEncoder(n_dim=n_dim, e_dim=e_dim, config=cfg, use_cuda=use_cuda,
                            use_mpnn=mpnn_pos_encode, use_rdkit=use_rdkit)
    if use_cuda:
        model.cuda()
    for name, param in model.named_parameters():
        print(name, ":", param.shape)
    optimizer = optim.Adam(model.parameters(), lr=cfg['LR'], weight_decay=cfg['DECAY'])
    current_lr = cfg['LR']
    matrix_cache = MatrixCache(cfg['MAX_DICT'])
    best_val = -1e8
    logs = []
    graph_logs = []

    def visualize(smiles_list, pos: torch.Tensor, fit_pos: torch.Tensor,
                  mol_node_matrix: torch.Tensor, vis=range(5)):
        if use_cuda:
            pos = pos.cpu()
            fit_pos = fit_pos.cpu()
            mol_node_matrix = mol_node_matrix.cpu()
        pos = pos.detach()
        fit_pos = fit_pos.detach()
        pos_list = []
        for i in vis:
            node_mask = mol_node_matrix[i] > 0
            pos_i = pos[node_mask == 1, :]
            fit_pos_i = fit_pos[node_mask == 1, :]
            # Align predicted and reference coordinates before logging.
            new_pos_i, new_fit_pos_i = kabsch(
                pos_i, fit_pos_i,
                torch.full([1, pos_i.shape[0]], 1, dtype=torch.float32),
                use_cuda=False)
            pos_list.append({'smiles': smiles_list[i],
                             'src': new_pos_i.tolist(),
                             'tgt': new_fit_pos_i.tolist()})
            # plt_molecule_3d(new_pos_i.numpy(), smiles_list[i],
            #                 title='fit_qm9_{}_{}_{}'.format(tag, epoch, i), d=GRAPH_PATH)
            # plt_molecule_3d(new_fit_pos_i.numpy(), smiles_list[i],
            #                 title='fit_qm9_origin_{}'.format(i), d=GRAPH_PATH)
        graph_logs[-1].update({'pos': pos_list})

    def forward(mask: list, name=None):
        nfs = torch.cat([molecules[i].node_features for i in mask])
        efs = torch.cat([molecules[i].edge_features for i in mask])
        atom_pos = torch.cat([torch.from_numpy(mol_atom_pos[i]).type(torch.float32) for i in mask])
        if use_cuda:
            nfs = nfs.cuda()
            efs = efs.cuda()
            atom_pos = atom_pos.cuda()
        us, vs, mm_tuple = matrix_cache.fetch(molecules, mask, nfs, name, use_cuda)
        mask_smiles = [smiles[i] for i in mask]
        adj3_loss, dis_loss, rmsd_loss, s_loss, c_loss, pos = model.fit(
            nfs, efs, mask_smiles, us, vs, mm_tuple, atom_pos,
            print_mode=name == 'test0')
        if name == 'test0':
            visualize(mask_smiles, pos, atom_pos, mm_tuple[0])
        return adj3_loss, dis_loss, rmsd_loss, s_loss, c_loss

    def _scalar(x):
        # Losses may come back as tensors or as plain Python numbers.
        return x.cpu().item() if torch.is_tensor(x) else x

    def train(mask_list: list, name=None):
        nonlocal current_lr
        model.train()
        a_losses = []
        d_losses = []
        r_losses = []
        s_losses = []
        c_losses = []
        t = enumerate(mask_list)
        if use_tqdm:
            t = tqdm(t, total=len(mask_list))
        for i, m in t:
            if name:
                name_ = name + str(i)
            else:
                name_ = None
            optimizer.zero_grad()
            a_loss, d_loss, r_loss, s_loss, c_loss = forward(m, name=name_)
            a_losses.append(_scalar(a_loss))
            d_losses.append(_scalar(d_loss))
            r_losses.append(_scalar(r_loss))
            s_losses.append(_scalar(s_loss))
            c_losses.append(_scalar(c_loss))
            # loss = a_loss + cfg['GAMMA_S'] * s_loss + cfg['GAMMA_C'] * c_loss
            # loss = d_loss + cfg['GAMMA_S'] * s_loss + cfg['GAMMA_C'] * c_loss
            loss = r_loss + cfg['GAMMA_S'] * s_loss + cfg['GAMMA_C'] * c_loss + cfg['GAMMA_A'] * a_loss
            loss.backward()
            optimizer.step()
            current_lr *= 1 - cfg['DECAY']
        print('\t\tADJ3 loss: {:.4f}'.format(np.average(a_losses)))
        print('\t\tDistance loss: {:.4f}'.format(np.average(d_losses)))
        print('\t\tRMSD metric: {:.4f}'.format(np.average(r_losses)))
        print('\t\tStationary loss: {:.4f}'.format(np.average(s_losses)))
        print('\t\tCentrality loss: {:.4f}'.format(np.average(c_losses)))
        logs[-1].update({'on_train_loss': np.average(a_losses)})

    def evaluate(mask_list: list, name=None):
        nonlocal best_val
        model.eval()
        losses = []
        a_losses = []
        d_losses = []
        r_losses = []
        t = enumerate(mask_list)
        if use_tqdm:
            t = tqdm(t, total=len(mask_list))
        for i, m in t:
            if name:
                name_ = name + str(i)
            else:
                name_ = None
            a_loss, d_loss, r_loss, s_loss, c_loss = forward(m, name=name_)
            # loss = a_loss + cfg['GAMMA_S'] * s_loss + cfg['GAMMA_C'] * c_loss
            # loss = d_loss + cfg['GAMMA_S'] * s_loss + cfg['GAMMA_C'] * c_loss
            loss = r_loss + cfg['GAMMA_S'] * s_loss + cfg['GAMMA_C'] * c_loss + cfg['GAMMA_A'] * a_loss
            losses.append(_scalar(loss))
            a_losses.append(_scalar(a_loss))
            d_losses.append(_scalar(d_loss))
            r_losses.append(_scalar(r_loss))
        if name == 'evaluate':
            # Keep the checkpoint with the best (lowest) validation loss.
            val = -np.average(losses)
            if val > best_val:
                print('\t\tSaving position encoder...')
                torch.save(model, model_save_path)
                best_val = val
                print('\t\tSaving finished!')
        print('\t\tLoss: {:.5f}'.format(np.average(losses)))
        print('\t\tADJ3 loss: {:.5f}'.format(np.average(a_losses)))
        print('\t\tDistance loss: {:.5f}'.format(np.average(d_losses)))
        print('\t\tRMSD metric: {:.5f}'.format(np.average(r_losses)))
        logs[-1].update({'{}_loss'.format(name): np.average(losses)})
        logs[-1].update({'{}_adj3_metric'.format(name): np.average(a_losses)})
        logs[-1].update({'{}_distance_metric'.format(name): np.average(d_losses)})
        logs[-1].update({'{}_rmsd_metric'.format(name): np.average(r_losses)})

    for epoch in range(cfg['ITERATION']):
        logs.append({'epoch': epoch + 1})
        graph_logs.append({'epoch': epoch + 1})
        print('In iteration {}:'.format(epoch + 1))
        print('\tLearning rate: {:.8e}'.format(current_lr))
        if not use_rdkit:
            print('\tTraining: ')
            train(train_mask_list, name='train')
        print('\tEvaluating training: ')
        evaluate(train_mask_list, name='train')
        print('\tEvaluating validation: ')
        evaluate(validate_mask_list, name='evaluate')
        print('\tEvaluating test: ')
        evaluate(test_mask_list, name='test')
        gc.collect()
    d = {'metric': 'Distance Loss', 'time': time.time() - t0, 'logs': logs}
    with open('{}{}.json'.format(LOG_PATH, tag), 'w+', encoding='utf-8') as fp:
        json.dump(d, fp)
    gd = graph_logs
    with open('{}{}.json'.format(GRAPH_PATH, tag), 'w+', encoding='utf-8') as fp:
        json.dump(gd, fp)
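
# Usage sketch of the two-stage workflow the defaults above imply (illustrative
# values only): fit_qm9 saves the best PositionEncoder to
# model_save_path='net/pe.pt', which train_tox21 / train_lipop then load
# through their position_encoder_path argument.
#
#   fit_qm9(limit=10000, model_save_path='net/pe.pt', tag='pe')
#   train_lipop(position_encoder_path='net/pe.pt', dataset='ESOL', tag='esol_pe')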

def train_gdb9(seed: int = 19700101, limit: int = -1, residual: bool = True,
               use_cuda: bool = False, prop: list = [9]):
    set_seed(seed, use_cuda)
    np.set_printoptions(precision=4, suppress=True, linewidth=140)
    molecule_set, properties = load_gdb9(limit)
    properties = properties[:, prop]
    molecules, n_dim, e_dim = encode_molecules(molecule_set)
    hidden_dims = [H_DIM] * len(C_DIMS)
    node_num = len(molecules)
    train_mask, validate_mask, test_mask = sample(list(range(node_num)),
                                                  TRAIN_PER, VALIDATE_PER, TEST_PER)
    print(train_mask, validate_mask, test_mask)
    t_properties = properties[train_mask, :]
    prop_mean = np.mean(t_properties, axis=0)
    print('mean:', prop_mean)
    prop_std = np.std(t_properties.tolist(), axis=0, ddof=1)
    print('std:', prop_std)
    prop_mad = robust.mad(t_properties.tolist(), axis=0)
    print('mad:', prop_mad)
    ratio = (prop_std / prop_mad) ** 2
    norm_properties = (properties - prop_mean) / prop_std
    model = AMPNN(n_dim, e_dim, H_DIM, C_DIMS, HE_DIM, HEAD_NUM, len(hidden_dims),
                  residual=residual, use_cuda=use_cuda, dropout=DROPOUT)
    if residual:
        r_dim = (n_dim + sum(hidden_dims))
    else:
        r_dim = hidden_dims[-1]
    regression = MLP(int(r_dim * HEAD_NUM), len(prop), [MLP_HIDDEN], dropout=DROPOUT)
    if use_cuda:
        model.cuda()
        regression.cuda()
    params = list(chain(model.parameters(), regression.parameters()))
    for param in params:
        print(param.shape)
    optimizer = optim.Adam(params, lr=LR, weight_decay=DECAY)
    loss_fuc = MSELoss()
    # forward_time = 0.
    bp_time = 0.

    def forward(mask: list, show_attention_cnt=0) -> (torch.Tensor, torch.Tensor, list):
        as_ = []
        embeddings = []
        target = norm_properties[mask, :]
        for i in mask:
            hg = molecules[i]
            if use_cuda:
                embedding, a = model(hg.node_features.cuda(), hg.edge_features.cuda(),
                                     hg.us, hg.vs, hg.edge_mask, GLOBAL_MASK)
            else:
                embedding, a = model(hg.node_features, hg.edge_features,
                                     hg.us, hg.vs, hg.edge_mask, GLOBAL_MASK)
            embeddings.append(embedding)
            as_.append(a)
            if show_attention_cnt:
                print('### For molecule {} ###'.format(i))
                molecule_set.molecules[i].show()
                if use_cuda:
                    a = a.cpu()
                print(a.detach().numpy())
                show_attention_cnt -= 1
        embeddings = torch.stack(embeddings)
        target = torch.tensor(target.astype(np.float32), dtype=torch.float32)
        if use_cuda:
            embeddings = embeddings.cuda()
            target = target.cuda()
        logits = regression(embeddings)
        return logits, target, as_

    def calc_normalized_loss(logits, target):
        losses = []
        for i in range(len(prop)):
            losses.append(loss_fuc(logits[:, i], target[:, i]) * ratio[i])
        return sum(losses)

    t_losses = []
    v_losses = []
    t_maes = []
    v_maes = []
    for epoch in range(ITERATION):
        optimizer.zero_grad()
        if len(train_mask) > TRN_BATCH:
            temp_train_mask = np.random.permutation(train_mask)[:TRN_BATCH]
        else:
            temp_train_mask = train_mask
        if len(validate_mask) > VAL_BATCH:
            temp_validate_mask = np.random.permutation(validate_mask)[:VAL_BATCH]
        else:
            temp_validate_mask = validate_mask
        # t1 = time.time()
        t_logits, t_target, tas = forward(temp_train_mask)
        v_logits, v_target, vas = forward(temp_validate_mask)
        # forward_time += time.time() - t1
        t_loss = calc_normalized_loss(t_logits, t_target)
        # tas_loss = 0.0 * t_loss.cpu().item() * sum([as_.sum(1).norm() for as_ in tas]) / len(tas)
        # total_loss = t_loss + tas_loss
        v_loss = calc_normalized_loss(v_logits, v_target)
        t_mae = torch.abs(t_logits - t_target).mean(dim=0)
        v_mae = torch.abs(v_logits - v_target).mean(dim=0)
        t_losses.append(t_loss.cpu().item())
        v_losses.append(v_loss.cpu().item())
        t_maes.append(t_mae.cpu().detach().numpy())
        v_maes.append(v_mae.cpu().detach().numpy())
        if (epoch + 1) % EVAL == 0:
            print('In iteration {}, training: {:.3f}; validation: {:.3f}'.format(
                epoch, np.average(t_losses[-EVAL:]), np.average(v_losses[-EVAL:])))
            print('\tFor training: {}.'.format(np.average(t_maes[-EVAL:], axis=0) * prop_std))
            print('\tFor validation: {}.'.format(np.average(v_maes[-EVAL:], axis=0) * prop_std))
            # print('\tBias: {}.'.format(regression.linear1.bias.cpu().detach().numpy()))
            # print(tas_loss.cpu().item())
        t1 = time.time()
        t_loss.backward()
        optimizer.step()
        bp_time += time.time() - t1
    if len(test_mask) > TST_BATCH:
        temp_test_mask = np.random.permutation(test_mask)[:TST_BATCH]
    else:
        temp_test_mask = test_mask
    e_logits, e_target, eas = forward(temp_test_mask, show_attention_cnt=10)
    print(e_logits.cpu().detach().numpy() * prop_std + prop_mean)
    print(e_target.cpu().detach().numpy() * prop_std + prop_mean)
    e_loss = calc_normalized_loss(e_logits, e_target)
    print('target MSE:', e_loss.cpu().item())
    e_mae = torch.abs(e_logits - e_target).mean(dim=0)
    print('target MAE:', e_mae.cpu().detach().numpy() * prop_std)
    # print(forward_time)
    print(bp_time)
    print(model.total_forward_time)
    print(model.layer_forward_time)
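
# Minimal numeric sketch (dummy data) of the (std / mad)**2 re-weighting used
# by calc_normalized_loss above: multiplying the z-score-normalized MSE of
# property i by ratio[i] re-expresses it in units of that property's MAD**2,
# so heavy-tailed properties whose outliers inflate the std are not
# under-weighted in the joint loss. `_demo_mad_ratio` is purely illustrative.
def _demo_mad_ratio(n=1000):
    import numpy as np
    from statsmodels import robust
    t = np.random.standard_t(df=3, size=(n, 2))  # heavy-tailed dummy targets
    std = np.std(t, axis=0, ddof=1)
    mad = robust.mad(t, axis=0)
    return (std / mad) ** 2                      # typically > 1 under heavy tails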

def train_hiv(seed: int = 19700101, limit: int = -1, use_cuda: bool = True, use_tqdm=True,
              use_model='HamGN', dataset='HIV'):
    set_seed(seed, use_cuda)
    np.set_printoptions(precision=4, suppress=True, linewidth=140)
    if dataset == 'HIV':
        smiles, info_list, properties = load_hiv(limit)
        graph_path = HIV_GRAPH_PATH
        default_config = HIVConfig
    elif dataset == 'BBBP':
        smiles, info_list, properties = load_bbbp(limit)
        graph_path = BBBP_GRAPH_PATH
        default_config = BBBPConfig
    else:
        assert False, "Unknown dataset: {}.".format(dataset)
    n_label = properties.max() + 1
    properties = torch.tensor(properties, dtype=torch.int64)
    if use_cuda:
        properties = properties.cuda()
    molecules = [HeteroGraph(info['nf'], info['ef'], info['us'], info['vs'], info['em'])
                 for info in info_list]
    n_dim = molecules[0].n_dim
    e_dim = molecules[0].e_dim
    node_num = len(molecules)
    train_mask, validate_mask, test_mask = sample(list(range(node_num)),
                                                  default_config.TRAIN_PER,
                                                  default_config.VALIDATE_PER,
                                                  default_config.TEST_PER)
    n_seg = int(len(train_mask) / (default_config.BATCH + 1))
    train_mask_list = [train_mask[i::n_seg] for i in range(n_seg)]
    n_seg = int(len(validate_mask) / (default_config.BATCH + 1))
    validate_mask_list = [validate_mask[i::n_seg] for i in range(n_seg)]
    n_seg = int(len(test_mask) / (default_config.BATCH + 1))
    test_mask_list = [test_mask[i::n_seg] for i in range(n_seg)]
    print(train_mask, validate_mask, test_mask)
    print(len(train_mask_list), len(validate_mask_list), len(test_mask_list))
    if use_model == 'HamGN':
        model = DynamicGraphEncoder(n_dim=n_dim, e_dim=e_dim,
                                    default_config=default_config, use_cuda=use_cuda)
    elif use_model == 'AMPNN':
        model = AMPNN(n_dim=n_dim, e_dim=e_dim,
                      default_config=default_config, use_cuda=use_cuda)
    else:
        assert False, 'Undefined model: {}!'.format(use_model)
    classifier = MLP(default_config.F_DIM, n_label, h_dims=default_config.H_DIMS,
                     dropout=default_config.DROPOUT, activation='softmax')
    if use_cuda:
        model.cuda()
        classifier.cuda()
    params = list(chain(model.parameters(), classifier.parameters()))
    for param in params:
        print(param.shape)
    optimizer = optim.Adam(params, lr=default_config.LR, weight_decay=default_config.DECAY)
    current_lr = default_config.LR
    loss_fuc = CrossEntropyLoss()
    # forward_time = 0.
    matrix_mask_dicts = {}
    s_losses = []
    c_losses = []
    a_losses = []
    u_losses = []

    def forward(mask: list, name=None) -> (torch.Tensor, torch.Tensor, torch.Tensor):
        nfs = torch.cat([molecules[i].node_features for i in mask])
        efs = torch.cat([molecules[i].edge_features for i in mask])
        if use_cuda:
            nfs = nfs.cuda()
            efs = efs.cuda()
        # Build batched molecule/node/edge index structures with a running offset.
        ms = []
        us = []
        vs = []
        em = []
        ptr = 0
        for i, m in enumerate(mask):
            nn = molecules[m].node_features.shape[0]
            ms.extend([i] * nn)
            for u in molecules[m].us:
                us.append(u + ptr)
            for v in molecules[m].vs:
                vs.append(v + ptr)
            em.extend(molecules[m].edge_mask)
            ptr += nn
        if name and name in matrix_mask_dicts.keys():
            mm_tuple = matrix_mask_dicts[name]
        else:
            n_node = nfs.shape[0]
            mol_node_matrix, mol_node_mask = \
                AMPNN.produce_node_edge_matrix(max(ms) + 1, ms, ms, [1] * len(ms))
            node_edge_matrix_global, node_edge_mask_global = \
                AMPNN.produce_node_edge_matrix(n_node, us, vs, [1] * len(us))
            if use_cuda:
                mol_node_matrix = mol_node_matrix.cuda()
                mol_node_mask = mol_node_mask.cuda()
                node_edge_matrix_global = node_edge_matrix_global.cuda()
                node_edge_mask_global = node_edge_mask_global.cuda()
            mm_tuple = (
                mol_node_matrix,
                mol_node_mask,
                node_edge_matrix_global,
                node_edge_mask_global,
            )
            if name and len(matrix_mask_dicts.keys()) < default_config.MAX_DICT:
                matrix_mask_dicts[name] = mm_tuple
        if use_model == 'HamGN':
            embeddings, s_loss, c_loss, a_loss = model(nfs, efs, us, vs, mm_tuple)
            # if np.random.randint(0, 1000) == 0:
            #     print(embeddings.cpu().detach().numpy())
            s_losses.append(s_loss.cpu().item())
            c_losses.append(c_loss.cpu().item())
            a_losses.append(a_loss.cpu().item())
            std_loss = default_config.GAMMA_S * s_loss + \
                default_config.GAMMA_C * c_loss + \
                default_config.GAMMA_A * a_loss
        elif use_model == 'AMPNN':
            embeddings, _ = model(nfs, efs, us, vs, mm_tuple)
            std_loss = 0
        else:
            assert False
        logits = classifier(embeddings)
        # print(logits.cpu())
        target = properties[mask]
        if use_cuda:
            target = target.cuda()
        return logits, target, std_loss

    def train(mask_list: list, name=None):
        nonlocal current_lr
        model.train()
        classifier.train()
        s_losses.clear()
        c_losses.clear()
        a_losses.clear()
        u_losses.clear()
        t = enumerate(mask_list)
        if use_tqdm:
            t = tqdm(t, total=len(mask_list))
        for i, m in t:
            if name:
                name_ = name + str(i)
            else:
                name_ = None
            optimizer.zero_grad()
            logits, target, std_loss = forward(m, name=name_)
            u_loss = loss_fuc(logits, target)
            u_losses.append(u_loss.cpu().item())
            loss = u_loss + std_loss
            loss.backward()
            optimizer.step()
            current_lr *= 1 - default_config.DECAY
        if use_model == 'HamGN':
            print('\t\tStationary loss: {:.4f}'.format(np.average(s_losses)))
            print('\t\tCentrality loss: {:.4f}'.format(np.average(c_losses)))
            print('\t\tAffinity loss: {:.4f}'.format(np.average(a_losses)))
        print('\t\tSemi-supervised loss: {:.4f}'.format(np.average(u_losses)))

    def evaluate(mask_list: list, name=None, visualize=None):
        model.eval()
        classifier.eval()
        losses = []
        masks = []
        logits_list = []
        target_list = []
        t = enumerate(mask_list)
        if use_tqdm:
            t = tqdm(t, total=len(mask_list))
        for i, m in t:
            if name:
                name_ = name + str(i)
            else:
                name_ = None
            logits, target, _ = forward(m, name=name_)
            loss = loss_fuc(logits, target)
            losses.append(loss.cpu().item())
            logits_list.append(logits.cpu().detach().numpy())
            target_list.append(target.cpu().detach().numpy())
            if visualize:
                masks.extend(m)
        all_logits = np.vstack(logits_list)
        all_target = np.concatenate(target_list)
        print('\t\tLoss: {:.3f}'.format(np.average(losses)))
        print('\t\tROC: {:.3f}'.format(roc_auc_score(all_target, all_logits[:, 1])))
        if visualize:
            best_ids, best_ds, worst_ids, worst_ds = \
                plt_multiple_scatter(graph_path + visualize, masks, all_logits, all_target)
            print('\t\tBest performance on:')
            for i, d in zip(best_ids, best_ds):
                print('\t\t\t{}: {}'.format(smiles[i], d))
            print('\t\tWorst performance on:')
            for i, d in zip(worst_ids, worst_ds):
                print('\t\t\t{}: {}'.format(smiles[i], d))

    for epoch in range(default_config.ITERATION):
        print('In iteration {}:'.format(epoch + 1))
        print('\tLearning rate: {:.8e}'.format(current_lr))
        print('\tTraining: ')
        train(train_mask_list, name='train')
        print('\tEvaluating training: ')
        evaluate(train_mask_list, name='train')
        print('\tEvaluating validation: ')
        evaluate(validate_mask_list, name='evaluate')
        print('\tEvaluating test: ')
        evaluate(test_mask_list, name='test')
        gc.collect()
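
# Usage sketch (illustrative values only; HIVConfig / BBBPConfig supply the
# split ratios, batch size and optimizer settings read above):
#
#   train_hiv(limit=-1, use_cuda=torch.cuda.is_available(),
#             use_model='AMPNN', dataset='BBBP')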