Example #1
def get_class_sizes(data: MoleculeDataset) -> List[List[float]]:
    """
    Determines the proportions of the different classes in the classification dataset.
    :param data: A classification dataset
    :return: A list of lists of class proportions. Each inner list contains the class proportions
    for a task.
    """
    targets = data.targets()

    # Filter out Nones
    valid_targets = [[] for _ in range(data.num_tasks())]
    for i in range(len(targets)):
        for task_num in range(len(targets[i])):
            if targets[i][task_num] is not None:
                valid_targets[task_num].append(targets[i][task_num])

    class_sizes = []
    for task_targets in valid_targets:
        # Make sure we're dealing with a binary classification task
        assert set(np.unique(task_targets)) <= {0, 1}

        try:
            ones = np.count_nonzero(task_targets) / len(task_targets)
        except ZeroDivisionError:
            ones = float('nan')
            print('Warning: class has no targets')
        class_sizes.append([1 - ones, ones])

    return class_sizes
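For illustration, here is a minimal sketch of the same proportion computation on a toy two-task target matrix, using plain Python lists in place of the values returned by data.targets() (the toy data is hypothetical):

import numpy as np

# Hypothetical targets for four molecules and two binary tasks; None marks a missing label.
targets = [[1, None], [0, 1], [1, 0], [None, 1]]

valid_targets = [[t[task] for t in targets if t[task] is not None] for task in range(2)]
for task_targets in valid_targets:
    ones = np.count_nonzero(task_targets) / len(task_targets)
    print([1 - ones, ones])  # class proportions [fraction of 0s, fraction of 1s] per task
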
Example #2
    def get_embedding(self, data):
        self.net.eval()

        embedding = []

        num_iters, iter_step = len(data), self.args.batch_size

        for i in range(0, num_iters, iter_step):
            # Prepare batch
            mol_batch = MoleculeDataset(data[i:i + self.args.batch_size])
            smiles_batch, features_batch = mol_batch.smiles(), mol_batch.features()

            # Run model
            batch = smiles_batch

            with torch.no_grad():
                preds, latent = self.net(batch, features_batch)

            latent = latent.data.cpu().numpy()

            # Collect vectors
            latent = latent.tolist()
            embedding.extend(latent)

        return np.array(embedding)
Example #3
    def predict_prob_dropout_split(self, data, scaler=None):
        self.net.train()  # keep dropout active so repeated forward passes differ

        preds = []

        num_iters, iter_step = len(data), self.args.batch_size

        for i in range(0, num_iters, iter_step):
            # Prepare batch
            mol_batch = MoleculeDataset(data[i:i + self.args.batch_size])
            smiles_batch, features_batch = mol_batch.smiles(), mol_batch.features()

            # Run model
            batch = smiles_batch

            batch_preds = []
            with torch.no_grad():
                for _ in range(self.n_drop):  # n_drop stochastic forward passes
                    batch_pred, e = self.net(batch, features_batch)
                    batch_preds.append(batch_pred.data.cpu().numpy())

            # Inverse scale if regression
            if scaler is not None:
                batch_preds = scaler.inverse_transform(batch_preds)

            # Collect vectors
            batch_preds = np.hstack(batch_preds).tolist()
            preds.extend(batch_preds)

        return np.array(preds)
Example #4
    def predict(self, data, scaler=None):
        self.net.eval()

        preds = []

        num_iters, iter_step = len(data), self.args.batch_size

        for i in range(0, num_iters, iter_step):
            # Prepare batch
            mol_batch = MoleculeDataset(data[i:i + self.args.batch_size])
            smiles_batch, features_batch = mol_batch.smiles(), mol_batch.features()

            # Run model
            batch = smiles_batch

            with torch.no_grad():
                batch_preds, e = self.net(batch, features_batch)

            batch_preds = batch_preds.data.cpu().numpy()

            # Inverse scale if regression
            if scaler is not None:
                batch_preds = scaler.inverse_transform(batch_preds)

            # Collect vectors
            batch_preds = batch_preds.tolist()
            preds.extend(batch_preds)

        return preds
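The scaler argument only matters for regression targets that were standardized before training. Below is a small sketch of the inverse-scaling step, using sklearn.preprocessing.StandardScaler as a stand-in for whatever scaler object is actually passed in (an assumption; the scaler class used here is not shown):

import numpy as np
from sklearn.preprocessing import StandardScaler  # stand-in for the scaler used above

# Toy regression targets (num_molecules x num_tasks).
raw_targets = np.array([[1.2], [3.4], [5.6], [7.8]])

scaler = StandardScaler().fit(raw_targets)           # fitted on the training targets
scaled_preds = scaler.transform(raw_targets)         # what a model trained on scaled targets would emit
recovered = scaler.inverse_transform(scaled_preds)   # back to the original units, as in predict()
print(np.allclose(recovered, raw_targets))           # True
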
Example #5
def get_data_from_smiles(smiles: List[str],
                         skip_invalid_smiles: bool = True,
                         logger: Logger = None) -> MoleculeDataset:
    """
    Converts SMILES to a MoleculeDataset.
    :param smiles: A list of SMILES strings.
    :param skip_invalid_smiles: Whether to skip and filter out invalid smiles.
    :param logger: Logger.
    :return: A MoleculeDataset with all of the provided SMILES.
    """
    debug = logger.debug if logger is not None else print

    data = MoleculeDataset([MoleculeDatapoint([smile]) for smile in smiles])

    # Filter out invalid SMILES
    if skip_invalid_smiles:
        original_data_len = len(data)
        data = filter_invalid_smiles(data)

        if len(data) < original_data_len:
            debug(
                f'Warning: {original_data_len - len(data)} SMILES are invalid.'
            )

    return data
Example #6
def filter_invalid_smiles(data: MoleculeDataset) -> MoleculeDataset:
    """
    Filters out invalid SMILES.
    :param data: A MoleculeDataset.
    :return: A MoleculeDataset with only valid molecules.
    """
    return MoleculeDataset([
        datapoint for datapoint in data if datapoint.smiles != ''
        and datapoint.mol is not None and datapoint.mol.GetNumHeavyAtoms() > 0
    ])
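A minimal RDKit-only sketch of the same validity test applied to raw SMILES strings; it assumes, as the code above does, that datapoint.mol holds the RDKit molecule parsed from datapoint.smiles:

from rdkit import Chem

# Hypothetical inputs: a valid SMILES, an unparsable string, and a molecule with no heavy atoms.
smiles = ['CCO', 'not_a_smiles', '[H][H]']

for s in smiles:
    mol = Chem.MolFromSmiles(s)
    is_valid = s != '' and mol is not None and mol.GetNumHeavyAtoms() > 0
    print(s, is_valid)  # only 'CCO' passes all three checks
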
Example #7
    def train(self, n_iter, n_epoch=None):
        if n_epoch is None:
            n_epoch = self.args.epoch

        idxs_train = np.arange(self.n_pool)[self.idxs_lb]
        data = MoleculeDataset(self.data[idxs_train])

        for epoch in range(1, n_epoch + 1):
            n_iter = self._train(epoch, data, n_iter)

        return n_iter
Example #8
    def query(self, n):
        idxs_unlabeled = np.arange(self.n_pool)[~self.idxs_lb]
        if self.args.data_pool is not None:
            idxs_unlabeled = np.random.choice(idxs_unlabeled,
                                              self.args.data_pool,
                                              replace=False)

        mol_unlabeled = MoleculeDataset(self.data[idxs_unlabeled])
        preds = self.predict_prob_dropout_split(mol_unlabeled)
        pred_var = torch.Tensor(preds.var(1))
        # torch.sort is ascending, so this keeps the n lowest-variance points
        return idxs_unlabeled[pred_var.sort()[1][:n]]
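A small numpy/torch sketch of the ranking used above, on a hypothetical matrix of dropout predictions (rows are unlabeled molecules, columns are stochastic forward passes):

import numpy as np
import torch

# Hypothetical dropout predictions: 5 molecules x 3 stochastic forward passes.
preds = np.array([[0.1, 0.9, 0.5],
                  [0.4, 0.5, 0.6],
                  [0.2, 0.2, 0.3],
                  [0.7, 0.1, 0.8],
                  [0.5, 0.5, 0.5]])

pred_var = torch.Tensor(preds.var(1))
n = 2
print(pred_var.sort()[1][:n])  # indices of the n lowest-variance molecules
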
Example #9
    def query(self, n):
        idxs_unlabeled = np.arange(self.n_pool)[~self.idxs_lb]
        if self.args.data_pool is not None:
            idxs_unlabeled = np.random.choice(idxs_unlabeled,
                                              self.args.data_pool,
                                              replace=False)

        embedding = self.get_embedding(
            MoleculeDataset(self.data[idxs_unlabeled]))

        def distij(i, j, data=embedding):
            # Element-wise sqrt of a square is abs, so this sums absolute
            # differences, i.e. the Manhattan (L1) distance between embeddings.
            return sum(
                np.sqrt(np.square(np.array(data[i]) - np.array(data[j]))))

        picker = MaxMinPicker()
        pickIndices = picker.LazyPick(distij, embedding.shape[0], n)

        return idxs_unlabeled[pickIndices]
Example #10
    def query(self, n):
        idxs_unlabeled = np.arange(self.n_pool)[~self.idxs_lb]
        if self.args.data_pool is not None:
            idxs_unlabeled = np.random.choice(idxs_unlabeled,
                                              self.args.data_pool,
                                              replace=False)

        embedding = self.get_embedding(
            MoleculeDataset(self.data[idxs_unlabeled]))

        cluster_learner = KMeans(n_clusters=n)
        cluster_learner.fit(embedding)

        cluster_idxs = cluster_learner.predict(embedding)
        centers = cluster_learner.cluster_centers_[cluster_idxs]
        # Squared distance of each point to its assigned cluster center
        dis = (embedding - centers)**2
        dis = dis.sum(axis=1)
        # For each of the n clusters, pick the point closest to its center
        q_idxs = np.array([
            np.arange(embedding.shape[0])[cluster_idxs == i][dis[
                cluster_idxs == i].argmin()] for i in range(n)
        ])

        return idxs_unlabeled[q_idxs]
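A self-contained sketch of the same "nearest point to each cluster center" selection on a hypothetical 2-D embedding, which may help unpack the list comprehension above:

import numpy as np
from sklearn.cluster import KMeans

# Hypothetical embedding of 6 unlabeled molecules; pick n = 2 representatives.
embedding = np.array([[0.0, 0.0], [0.1, 0.1], [0.2, 0.0],
                      [5.0, 5.0], [5.1, 4.9], [4.8, 5.2]])
n = 2

km = KMeans(n_clusters=n, n_init=10, random_state=0).fit(embedding)
labels = km.predict(embedding)
dist = ((embedding - km.cluster_centers_[labels]) ** 2).sum(axis=1)

# For each cluster, keep the single point closest to its centroid.
q_idxs = np.array([np.arange(len(embedding))[labels == c][dist[labels == c].argmin()]
                   for c in range(n)])
print(q_idxs)
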
Example #11
def get_data(path: str = None,
             skip_invalid_smiles: bool = True,
             args: Namespace = None,
             features_path: List[str] = None,
             max_data_size: int = None,
             use_compound_names: bool = None,
             logger: Logger = None) -> MoleculeDataset:
    """
    Gets SMILES strings and target values (and optionally compound names, if provided) from a CSV file.
    :param path: Path to a CSV file.
    :param skip_invalid_smiles: Whether to skip and filter out invalid smiles.
    :param args: Arguments.
    :param features_path: A list of paths to files containing features. If provided, it is used
    in place of args.features_path.
    :param max_data_size: The maximum number of data points to load.
    :param use_compound_names: Whether file has compound names in addition to smiles strings.
    :param logger: Logger.
    :return: A MoleculeDataset containing smiles strings and target values along
    with other info such as additional features and compound names when desired.
    """
    debug = logger.debug if logger is not None else print

    if path is None:
        path = args.init_data
        path_pool = args.pool_data
    else:
        path_pool = None

    if args is not None:
        # Prefer explicit function arguments but default to args if not provided
        features_path = features_path if features_path is not None else args.features_path
        max_data_size = max_data_size if max_data_size is not None else args.max_data_size
        use_compound_names = use_compound_names if use_compound_names is not None else args.use_compound_names
    else:
        use_compound_names = False

    max_data_size = max_data_size or float('inf')

    # Load features
    if features_path is not None:
        features_data = []
        for feat_path in features_path:
            features_data.append(
                load_features(feat_path))  # each is num_data x num_features
        features_data = np.concatenate(features_data, axis=1)
    else:
        features_data = None

    skip_smiles = set()

    # Load data
    with open(path) as f:
        reader = csv.reader(f)
        header = next(reader)
        target_idx = header.index(args.mol_prop)
        idxs = [0, target_idx]

        lines = []
        for line in reader:
            smiles = line[0]

            if smiles in skip_smiles:
                continue

            lines.append(list(np.array(line)[idxs]))

            if len(lines) >= max_data_size:
                break

    if path_pool is not None:
        with open(path_pool) as f:
            reader = csv.reader(f)
            next(reader)

            for line in reader:
                smiles = line[0]

                if smiles in skip_smiles:
                    continue

                lines.append([smiles, 0])

                if len(lines) >= max_data_size:
                    break

    data = MoleculeDataset([
        MoleculeDatapoint(
            line=line,
            args=args,
            features=features_data[i] if features_data is not None else None,
            use_compound_names=use_compound_names)
        for i, line in tqdm(enumerate(lines), total=len(lines))
    ])

    # Filter out invalid SMILES
    if skip_invalid_smiles:
        original_data_len = len(data)
        data = filter_invalid_smiles(data)

        if len(data) < original_data_len:
            debug(
                f'Warning: {original_data_len - len(data)} SMILES are invalid.'
            )

    if data.data[0].features is not None:
        args.features_dim = len(data.data[0].features)

    return data
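get_data expects a CSV whose first column holds the SMILES string and whose header contains the target column named by args.mol_prop. A minimal sketch of such a file (column names and values are hypothetical):

import csv

# Hypothetical layout: SMILES in column 0, target column found via header.index(args.mol_prop).
rows = [['smiles', 'logP'],
        ['CCO', '-0.31'],
        ['c1ccccc1', '2.13']]

with open('train.csv', 'w', newline='') as f:
    csv.writer(f).writerows(rows)
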
Example #12
    def _train(self, epoch: int,
               data: Union[MoleculeDataset, List[MoleculeDataset]],
               n_iter: int) -> int:
        """
        Trains a model for an epoch.
        """
        debug = self.logger.debug if self.logger is not None else print

        debug(f'Running epoch: {epoch}')

        self.net.train()

        data.shuffle()
        loss_sum, iter_count = 0, 0
        num_iters = len(data) // self.args.batch_size * self.args.batch_size
        iter_size = self.args.batch_size

        for i in trange(0, num_iters, iter_size):
            # Prepare batch
            if i + self.args.batch_size > len(data):
                break
            mol_batch = MoleculeDataset(data[i:i + self.args.batch_size])
            smiles_batch, features_batch, target_batch = (
                mol_batch.smiles(), mol_batch.features(), mol_batch.targets())
            batch = smiles_batch
            mask = torch.Tensor([[x is not None for x in tb]
                                 for tb in target_batch])
            targets = torch.Tensor([[0 if x is None else x for x in tb]
                                    for tb in target_batch])

            if next(self.net.parameters()).is_cuda:
                mask, targets = mask.cuda(), targets.cuda()

            class_weights = torch.ones(targets.shape)

            if self.use_cuda:
                class_weights = class_weights.cuda()

            # Run model
            self.net.zero_grad()
            preds, e = self.net(batch, features_batch)

            loss = self.loss_func(preds, targets) * class_weights * mask
            loss = loss.sum() / mask.sum()

            loss_sum += loss.item()
            iter_count += len(mol_batch)

            loss.backward()
            self.optimizer.step()

            if (n_iter // self.args.batch_size
                ) % self.args.learning_rate_decay_steps == 0:
                self.lr_schedule.step()

            n_iter += len(mol_batch)

            # Log and/or add to tensorboard
            if (n_iter // self.args.batch_size) % self.args.log_frequency == 0:
                lrs = self.lr_schedule.get_lr()
                loss_avg = loss_sum / iter_count
                loss_sum, iter_count = 0, 0

                lrs_str = ', '.join(f'lr_{i} = {lr:.4e}'
                                    for i, lr in enumerate(lrs))
                debug(f'Loss = {loss_avg:.4e}, {lrs_str}')

                if self.writer is not None:
                    self.writer.add_scalar('train_loss', loss_avg, n_iter)
                    # for i, lr in enumerate(lrs):
                    #     self.writer.add_scalar(f'learning_rate_{i}', lr, n_iter)

        return n_iter
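The masking above is what lets a multi-task batch contain molecules with missing labels. Below is a standalone sketch of the masked loss on toy tensors, assuming self.loss_func is an element-wise loss such as nn.BCEWithLogitsLoss(reduction='none') (an assumption; the actual loss function is configured elsewhere and not shown):

import torch
import torch.nn as nn

# Hypothetical batch: 3 molecules x 2 tasks, with None marking missing labels.
target_batch = [[1, None], [0, 1], [None, 0]]

mask = torch.Tensor([[x is not None for x in tb] for tb in target_batch])
targets = torch.Tensor([[0 if x is None else x for x in tb] for tb in target_batch])
preds = torch.randn(3, 2)  # stand-in for the network's raw outputs

loss_func = nn.BCEWithLogitsLoss(reduction='none')  # assumed; the real loss_func is set elsewhere
loss = loss_func(preds, targets) * mask
loss = loss.sum() / mask.sum()  # average only over the labeled entries
print(loss.item())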