def get_class_sizes(data: MoleculeDataset) -> List[List[float]]:
    """
    Determines the proportions of the different classes in the classification dataset.

    :param data: A classification dataset.
    :return: A list of lists of class proportions. Each inner list contains the class proportions for a task.
    """
    targets = data.targets()

    # Filter out Nones
    valid_targets = [[] for _ in range(data.num_tasks())]
    for i in range(len(targets)):
        for task_num in range(len(targets[i])):
            if targets[i][task_num] is not None:
                valid_targets[task_num].append(targets[i][task_num])

    class_sizes = []
    for task_targets in valid_targets:
        # Make sure we're dealing with a binary classification task
        assert set(np.unique(task_targets)) <= {0, 1}

        try:
            ones = np.count_nonzero(task_targets) / len(task_targets)
        except ZeroDivisionError:
            ones = float('nan')
            print('Warning: class has no targets')

        class_sizes.append([1 - ones, ones])

    return class_sizes

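# A minimal sketch (hypothetical toy data, not part of the repo) of the same
# proportion logic on plain lists, to illustrate the expected output of
# get_class_sizes without constructing a MoleculeDataset:
import numpy as np

toy_targets = [[1, 0], [0, None], [1, 1]]  # 3 molecules, 2 tasks
valid = [[t[k] for t in toy_targets if t[k] is not None] for k in range(2)]
sizes = [[1 - np.count_nonzero(v) / len(v), np.count_nonzero(v) / len(v)]
         for v in valid]
# sizes == [[1/3, 2/3], [0.5, 0.5]]: task 0 is one-third zeros, task 1 balanced
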
def get_embedding(self, data):
    """Returns the latent embedding for each molecule in data as a numpy array."""
    self.net.eval()
    embedding = []
    num_iters, iter_step = len(data), self.args.batch_size

    for i in range(0, num_iters, iter_step):
        # Prepare batch
        mol_batch = MoleculeDataset(data[i:i + self.args.batch_size])
        smiles_batch, features_batch = mol_batch.smiles(), mol_batch.features()

        # Run model
        batch = smiles_batch
        with torch.no_grad():
            preds, latent = self.net(batch, features_batch)
        latent = latent.data.cpu().numpy()

        # Collect vectors
        latent = latent.tolist()
        embedding.extend(latent)

    return np.array(embedding)

def predict_prob_dropout_split(self, data, scaler=None):
    """Runs n_drop stochastic forward passes (MC dropout) and returns all predictions per molecule."""
    # Keep the network in train mode so dropout stays active during inference
    self.net.train()
    preds = []
    num_iters, iter_step = len(data), self.args.batch_size

    for i in range(0, num_iters, iter_step):
        # Prepare batch
        mol_batch = MoleculeDataset(data[i:i + self.args.batch_size])
        smiles_batch, features_batch = mol_batch.smiles(), mol_batch.features()

        # Run model n_drop times with dropout enabled
        batch = smiles_batch
        batch_preds = []
        with torch.no_grad():
            for _ in range(self.n_drop):  # '_' avoids shadowing the outer loop index
                batch_pred, e = self.net(batch, features_batch)
                batch_preds.append(batch_pred.data.cpu().numpy())

        # Inverse scale if regression
        if scaler is not None:
            batch_preds = scaler.inverse_transform(batch_preds)

        # Collect vectors: one row per molecule, columns ordered by dropout pass
        batch_preds = np.hstack(batch_preds).tolist()
        preds.extend(batch_preds)

    return np.array(preds)

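# Sketch (hypothetical shapes) of how the stacked MC-dropout output is
# consumed downstream: for a single-task model, predict_prob_dropout_split
# returns an array of shape (num_molecules, n_drop), and the per-molecule
# variance across passes serves as the uncertainty estimate.
import numpy as np

rng = np.random.default_rng(0)
mc_preds = rng.normal(size=(5, 10))  # 5 molecules, 10 dropout passes
uncertainty = mc_preds.var(axis=1)   # one variance per molecule
print(uncertainty.shape)             # (5,)
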
def predict(self, data, scaler=None):
    """Returns deterministic predictions for each molecule in data."""
    self.net.eval()
    preds = []
    num_iters, iter_step = len(data), self.args.batch_size

    for i in range(0, num_iters, iter_step):
        # Prepare batch
        mol_batch = MoleculeDataset(data[i:i + self.args.batch_size])
        smiles_batch, features_batch = mol_batch.smiles(), mol_batch.features()

        # Run model (the network returns predictions and a latent embedding)
        batch = smiles_batch
        with torch.no_grad():
            batch_preds, e = self.net(batch, features_batch)
        batch_preds = batch_preds.data.cpu().numpy()

        # Inverse scale if regression
        if scaler is not None:
            batch_preds = scaler.inverse_transform(batch_preds)

        # Collect vectors
        batch_preds = batch_preds.tolist()
        preds.extend(batch_preds)

    return preds

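# The three methods above assume self.net(batch, features_batch) returns a
# (predictions, latent_embedding) pair. A minimal stand-in with that interface
# (hypothetical, for illustration only; the real network consumes SMILES
# batches rather than raw tensors):
import torch
import torch.nn as nn

class TwoHeadedNet(nn.Module):
    def __init__(self, in_dim=16, hidden=8, out_dim=1):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(in_dim, hidden), nn.ReLU(),
                                     nn.Dropout(p=0.2))
        self.head = nn.Linear(hidden, out_dim)

    def forward(self, batch, features_batch=None):
        latent = self.encoder(batch)        # consumed by get_embedding
        return self.head(latent), latent    # (preds, latent) pair

net = TwoHeadedNet()
preds, latent = net(torch.randn(4, 16))  # preds: (4, 1), latent: (4, 8)
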
def get_data_from_smiles(smiles: List[str],
                         skip_invalid_smiles: bool = True,
                         logger: Logger = None) -> MoleculeDataset:
    """
    Converts SMILES to a MoleculeDataset.

    :param smiles: A list of SMILES strings.
    :param skip_invalid_smiles: Whether to skip and filter out invalid smiles.
    :param logger: Logger.
    :return: A MoleculeDataset with all of the provided SMILES.
    """
    debug = logger.debug if logger is not None else print

    data = MoleculeDataset([MoleculeDatapoint([smile]) for smile in smiles])

    # Filter out invalid SMILES
    if skip_invalid_smiles:
        original_data_len = len(data)
        data = filter_invalid_smiles(data)

        if len(data) < original_data_len:
            debug(f'Warning: {original_data_len - len(data)} SMILES are invalid.')

    return data

def filter_invalid_smiles(data: MoleculeDataset) -> MoleculeDataset:
    """
    Filters out invalid SMILES.

    :param data: A MoleculeDataset.
    :return: A MoleculeDataset with only valid molecules.
    """
    return MoleculeDataset([
        datapoint for datapoint in data
        if datapoint.smiles != '' and datapoint.mol is not None
        and datapoint.mol.GetNumHeavyAtoms() > 0
    ])

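# The validity test above relies on RDKit parsing. A standalone sketch of
# the same three checks, using RDKit directly rather than MoleculeDatapoint:
from rdkit import Chem

for smi in ['CCO', 'not_a_smiles', '[H][H]']:
    mol = Chem.MolFromSmiles(smi)
    valid = smi != '' and mol is not None and mol.GetNumHeavyAtoms() > 0
    print(smi, valid)
# 'CCO' parses; 'not_a_smiles' fails to parse (mol is None); '[H][H]' parses
# but has no heavy atoms, so it is filtered out as well.
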
def train(self, n_iter, n_epoch=None):
    """Trains on the currently labeled pool for n_epoch epochs."""
    if n_epoch is None:
        n_epoch = self.args.epoch

    idxs_train = np.arange(self.n_pool)[self.idxs_lb]
    data = MoleculeDataset(self.data[idxs_train])

    for epoch in range(1, n_epoch + 1):
        n_iter = self._train(epoch, data, n_iter)

    return n_iter

def query(self, n):
    """Selects n molecules from the unlabeled pool by the variance of MC-dropout predictions."""
    idxs_unlabeled = np.arange(self.n_pool)[~self.idxs_lb]
    if self.args.data_pool is not None:
        idxs_unlabeled = np.random.choice(idxs_unlabeled,
                                          self.args.data_pool,
                                          replace=False)

    mol_unlabeled = MoleculeDataset(self.data[idxs_unlabeled])
    preds = self.predict_prob_dropout_split(mol_unlabeled)
    pred_var = torch.Tensor(preds.var(1))

    # torch.sort is ascending, so this picks the n lowest-variance molecules;
    # classic uncertainty sampling would sort with descending=True instead
    return idxs_unlabeled[pred_var.sort()[1][:n]]

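# A toy sketch of the selection step on synthetic variances (hypothetical
# values, single-task case):
import numpy as np
import torch

pool_idxs = np.array([7, 11, 42, 99])
variances = torch.tensor([0.30, 0.05, 0.90, 0.20])
order = variances.sort()[1]                                # ascending: [1, 3, 0, 2]
print(pool_idxs[order[:2]])                                # [11 99], the two most certain
print(pool_idxs[variances.sort(descending=True)[1][:2]])   # [42  7], the two most uncertain
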
def query(self, n):
    """Selects n diverse molecules via RDKit's MaxMin picker on the learned embeddings."""
    idxs_unlabeled = np.arange(self.n_pool)[~self.idxs_lb]
    if self.args.data_pool is not None:
        idxs_unlabeled = np.random.choice(idxs_unlabeled,
                                          self.args.data_pool,
                                          replace=False)

    embedding = self.get_embedding(MoleculeDataset(self.data[idxs_unlabeled]))

    def distij(i, j, data=embedding):
        # Euclidean distance between embedding vectors
        return np.sqrt(np.sum(np.square(np.array(data[i]) - np.array(data[j]))))

    picker = MaxMinPicker()
    pickIndices = picker.LazyPick(distij, embedding.shape[0], n)
    return idxs_unlabeled[pickIndices]

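# A standalone sketch of MaxMin diversity picking on random vectors
# (synthetic data; assumes an RDKit installation):
import numpy as np
from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker

rng = np.random.default_rng(0)
vecs = rng.normal(size=(50, 4))

def dist(i, j, data=vecs):
    return float(np.sqrt(np.sum(np.square(data[i] - data[j]))))

picker = MaxMinPicker()
picks = picker.LazyPick(dist, len(vecs), 5)
print(list(picks))  # 5 mutually distant indices into vecs
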
def query(self, n):
    """Selects n molecules by k-means clustering the embeddings and taking the point closest to each centroid."""
    idxs_unlabeled = np.arange(self.n_pool)[~self.idxs_lb]
    if self.args.data_pool is not None:
        idxs_unlabeled = np.random.choice(idxs_unlabeled,
                                          self.args.data_pool,
                                          replace=False)

    embedding = self.get_embedding(MoleculeDataset(self.data[idxs_unlabeled]))

    cluster_learner = KMeans(n_clusters=n)
    cluster_learner.fit(embedding)
    cluster_idxs = cluster_learner.predict(embedding)

    # Squared distance of each point to its assigned cluster center
    centers = cluster_learner.cluster_centers_[cluster_idxs]
    dis = (embedding - centers) ** 2
    dis = dis.sum(axis=1)

    # For each cluster, keep the point nearest its center
    q_idxs = np.array([
        np.arange(embedding.shape[0])[cluster_idxs == i][dis[cluster_idxs == i].argmin()]
        for i in range(n)
    ])
    return idxs_unlabeled[q_idxs]

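# The same center-nearest selection on synthetic 2-D points (standalone
# sketch, assuming scikit-learn):
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
points = rng.normal(size=(100, 2))

km = KMeans(n_clusters=3, n_init=10, random_state=0).fit(points)
labels = km.predict(points)
sq_dist = ((points - km.cluster_centers_[labels]) ** 2).sum(axis=1)
chosen = [np.arange(len(points))[labels == c][sq_dist[labels == c].argmin()]
          for c in range(3)]
print(chosen)  # one representative index per cluster
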
def get_data(path: str = None,
             skip_invalid_smiles: bool = True,
             args: Namespace = None,
             features_path: List[str] = None,
             max_data_size: int = None,
             use_compound_names: bool = None,
             logger: Logger = None) -> MoleculeDataset:
    """
    Gets smiles string and target values (and optionally compound names if provided) from a CSV file.

    :param path: Path to a CSV file.
    :param skip_invalid_smiles: Whether to skip and filter out invalid smiles.
    :param args: Arguments.
    :param features_path: A list of paths to files containing features. If provided, it is used in place of args.features_path.
    :param max_data_size: The maximum number of data points to load.
    :param use_compound_names: Whether file has compound names in addition to smiles strings.
    :param logger: Logger.
    :return: A MoleculeDataset containing smiles strings and target values along with other info such as additional features and compound names when desired.
    """
    debug = logger.debug if logger is not None else print

    if path is None:
        path = args.init_data
        path_pool = args.pool_data
    else:
        path_pool = None

    if args is not None:
        # Prefer explicit function arguments but default to args if not provided
        features_path = features_path if features_path is not None else args.features_path
        max_data_size = max_data_size if max_data_size is not None else args.max_data_size
        use_compound_names = use_compound_names if use_compound_names is not None else args.use_compound_names
    else:
        use_compound_names = False

    max_data_size = max_data_size or float('inf')

    # Load features
    if features_path is not None:
        features_data = []
        for feat_path in features_path:
            features_data.append(load_features(feat_path))  # each is num_data x num_features
        features_data = np.concatenate(features_data, axis=1)
    else:
        features_data = None

    skip_smiles = set()  # no SMILES are pre-excluded

    # Load data: column 0 holds the SMILES, args.mol_prop names the target column
    with open(path) as f:
        reader = csv.reader(f)
        header = next(reader)
        target_idx = header.index(args.mol_prop)
        idxs = [0, target_idx]

        lines = []
        for line in reader:
            smiles = line[0]

            if smiles in skip_smiles:
                continue

            lines.append([line[i] for i in idxs])

            if len(lines) >= max_data_size:
                break

    # Unlabeled pool molecules get a placeholder target of 0
    if path_pool is not None:
        with open(path_pool) as f:
            reader = csv.reader(f)
            next(reader)

            for line in reader:
                smiles = line[0]

                if smiles in skip_smiles:
                    continue

                lines.append([smiles, 0])

                if len(lines) >= max_data_size:
                    break

    data = MoleculeDataset([
        MoleculeDatapoint(
            line=line,
            args=args,
            features=features_data[i] if features_data is not None else None,
            use_compound_names=use_compound_names)
        for i, line in tqdm(enumerate(lines), total=len(lines))
    ])

    # Filter out invalid SMILES
    if skip_invalid_smiles:
        original_data_len = len(data)
        data = filter_invalid_smiles(data)

        if len(data) < original_data_len:
            debug(f'Warning: {original_data_len - len(data)} SMILES are invalid.')

    if data.data[0].features is not None:
        args.features_dim = len(data.data[0].features)

    return data

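# get_data expects CSVs shaped like the following (hypothetical file contents
# and column names; the property column is selected by args.mol_prop, and the
# pool file only needs SMILES in its first column):
#
#   init_data.csv          pool_data.csv
#   smiles,logP            smiles
#   CCO,-0.31              c1ccccc1O
#   CC(=O)O,-0.17          CCN
#
# Pool rows carry no label, so each is loaded with a placeholder target of 0.
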
def _train(self, epoch: int,
           data: Union[MoleculeDataset, List[MoleculeDataset]],
           n_iter: int) -> int:
    """
    Trains a model for an epoch.
    """
    debug = self.logger.debug if self.logger is not None else print
    debug(f'Running epoch: {epoch}')

    self.net.train()
    data.shuffle()

    loss_sum, iter_count = 0, 0
    num_iters = len(data) // self.args.batch_size * self.args.batch_size  # drop the last partial batch
    iter_size = self.args.batch_size

    for i in trange(0, num_iters, iter_size):
        # Prepare batch
        if i + self.args.batch_size > len(data):
            break
        mol_batch = MoleculeDataset(data[i:i + self.args.batch_size])
        smiles_batch, features_batch, target_batch = mol_batch.smiles(), mol_batch.features(), mol_batch.targets()
        batch = smiles_batch

        # Mask out missing targets so they contribute nothing to the loss
        mask = torch.Tensor([[x is not None for x in tb] for tb in target_batch])
        targets = torch.Tensor([[0 if x is None else x for x in tb] for tb in target_batch])

        if next(self.net.parameters()).is_cuda:
            mask, targets = mask.cuda(), targets.cuda()

        class_weights = torch.ones(targets.shape)
        if self.use_cuda:
            class_weights = class_weights.cuda()

        # Run model
        self.net.zero_grad()
        preds, e = self.net(batch, features_batch)

        loss = self.loss_func(preds, targets) * class_weights * mask
        loss = loss.sum() / mask.sum()

        loss_sum += loss.item()
        iter_count += len(mol_batch)

        loss.backward()
        self.optimizer.step()

        if (n_iter // self.args.batch_size) % self.args.learning_rate_decay_steps == 0:
            self.lr_schedule.step()

        n_iter += len(mol_batch)

        # Log and/or add to tensorboard
        if (n_iter // self.args.batch_size) % self.args.log_frequency == 0:
            lrs = self.lr_schedule.get_lr()
            loss_avg = loss_sum / iter_count
            loss_sum, iter_count = 0, 0

            lrs_str = ', '.join(f'lr_{i} = {lr:.4e}' for i, lr in enumerate(lrs))
            debug(f'Loss = {loss_avg:.4e}, {lrs_str}')

            if self.writer is not None:
                self.writer.add_scalar('train_loss', loss_avg, n_iter)
                # for i, lr in enumerate(lrs):
                #     self.writer.add_scalar(f'learning_rate_{i}', lr, n_iter)

    return n_iter

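# A standalone sketch of the masked loss used above: missing targets are
# zero-filled and masked so they do not contribute to the average.
# nn.BCELoss is a stand-in for self.loss_func (an assumption; any elementwise
# loss with reduction='none' fits the masking scheme).
import torch
import torch.nn as nn

target_batch = [[1.0, None], [0.0, 1.0]]  # None = missing label for that task
mask = torch.Tensor([[x is not None for x in tb] for tb in target_batch])
targets = torch.Tensor([[0 if x is None else x for x in tb] for tb in target_batch])
preds = torch.tensor([[0.8, 0.3], [0.2, 0.6]])

loss_func = nn.BCELoss(reduction='none')  # elementwise, as the masking requires
loss = (loss_func(preds, targets) * mask).sum() / mask.sum()
print(loss.item())  # averaged over the 3 observed targets only
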