Example #1
import pickle
from typing import BinaryIO, Text, TextIO

# Project-local helpers (`models`, `util`, `ModelInterface`, `require_device`)
# are assumed to be importable from the surrounding package.

def _cache(data: TextIO, model_name: Text, output: BinaryIO, **kwargs):
    cpu = require_device(prefer_cuda=False)
    model_type = models.select(model_name)
    model = ModelInterface(model_type, cpu, False)

    csv = util.load_csv(data)
    cache = {}
    for smiles in csv:
        cache_key = (smiles,)  # memcached is indexed on the argument list
        processed = model.process(smiles)  # renamed to avoid shadowing the `data` argument
        cache[cache_key] = model.encode_data(processed, **kwargs)

    pickle.dump(cache, output)
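A minimal usage sketch for the example above (the file names and the model name 'gcn' are hypothetical); `_cache` only needs an open text handle for the input CSV and an open binary handle for the output:

# Hypothetical invocation: paths and model name are made up.
with open('data/train.csv') as data_file, \
        open('train.cache.pkl', 'wb') as cache_file:
    _cache(data_file, 'gcn', cache_file)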
Example #2
from pathlib import Path
from typing import Optional, Text

import torch

# Project-local helpers (`models`, `util`, `log`, `ModelInterface`, `Item`,
# `require_device`, `train_step`, `evaluate_model`) are assumed importable.

def train(directory: Text,
          model_name: Text,
          batch_size: int,
          learning_rate: float,
          epsilon: float,
          cuda: bool,
          train_with_test: bool,
          min_iteration: int,
          max_iteration: int,
          ndrop: Optional[float] = None,
          **kwargs) -> None:
    # filter out options that were not set on the command line
    kwargs = util.dict_filter(kwargs, lambda k, v: v is not None)

    data_folder = Path(directory)
    assert data_folder.is_dir(), 'Invalid data folder'

    dev = require_device(cuda)
    for fold in sorted(data_folder.iterdir()):
        log.info(f'Processing "{fold}"...')

        # model & optimizer
        model_type = models.select(model_name)  # see models/__init__.py
        model = ModelInterface(model_type, dev, **kwargs)
        optimizer = torch.optim.Adam(params=model.inst.parameters(),
                                     lr=learning_rate)

        # load the fold
        raw = [
            util.load_csv(fold / name)
            for name in ['train.csv', 'test.csv', 'dev.csv']
        ]

        # let the model parse these molecules
        data = []
        for split in raw:
            buf = [Item(model.process(smiles), activity)
                   for smiles, activity in split.items()]
            data.append(buf)
        log.debug(f'atom_map: {model.atom_map}')

        test_batch, _test_label = util.separate_items(data[1])
        test_label = torch.tensor(_test_label)

        # training phase
        train_data = (data[0] + data[1]) if train_with_test else data[0]

        # set up to randomly drop negative samples
        # see util.RandomIterator for details
        drop_fn = None
        if ndrop is not None:
            drop_fn = lambda x: ndrop if x.activity == 0 else 0
        data_ptr = util.RandomIterator(train_data, drop_fn=drop_fn)

        countdown = min_iteration
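        # `countdown` enforces at least `min_iteration` full passes (epochs)
        # over the training data before the convergence test below may fire.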
        min_loss = float('inf')  # lowest epoch-averaged loss seen so far
        sum_loss, batch_cnt = 0.0, 0
        for _ in range(max_iteration):
            # generate batch
            batch, _label = util.separate_items(data_ptr.iterate(batch_size))
            label = torch.tensor(_label)

            # train a mini-batch
            batch_loss = train_step(model, optimizer, batch, label)
            sum_loss += batch_loss
            batch_cnt += 1
            # log.debug(f'{batch_loss}, {sum_loss}')

            # convergence test
            if data_ptr.is_cycled():
                loss = sum_loss / batch_cnt
                pred = model.predict(test_batch)
                log.debug(
                    f'{util.stat_string(_test_label, pred)}. '
                    f'loss={loss}, min={min_loss}'
                )

                if countdown <= 0 and abs(min_loss - loss) < epsilon:
                    log.debug('Converged.')
                    break

                countdown -= 1
                min_loss = min(min_loss, loss)
                sum_loss, batch_cnt = 0.0, 0

        # model evaluation on `dev.csv`
        roc_auc, prc_auc = evaluate_model(model, data[2])
        log.info(f'ROC-AUC: {roc_auc}')
        log.info(f'PRC-AUC: {prc_auc}')
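A minimal usage sketch for `train` (all argument values are made-up placeholders). Per the loading code above, the directory must contain one sub-folder per fold, each holding train.csv, test.csv and dev.csv:

# Hypothetical invocation: directory and hyperparameter values are made up.
train('data/folds', 'gcn',
      batch_size=32,
      learning_rate=1e-3,
      epsilon=1e-4,
      cuda=False,
      train_with_test=False,
      min_iteration=10,
      max_iteration=10000,
      ndrop=0.5)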
Example #3
from typing import Dict, List, Text

# `ModelInterface` and `Item` are the project-local types used above.

def process_csv(model: ModelInterface, csv: Dict[Text, int],
                **kwargs) -> List[Item]:
    return [
        Item(model.process(smiles, **kwargs), activity)
        for smiles, activity in csv.items()
    ]
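A minimal usage sketch, assuming an already-constructed `ModelInterface` named `model` and the `util.load_csv` helper from the examples above (which yields a {smiles: activity} mapping):

# Hypothetical usage: the path is made up.
with open('data/folds/0/train.csv') as f:
    csv = util.load_csv(f)
items = process_csv(model, csv)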