def _cache(data: TextIO, model_name: Text, output: BinaryIO, **kwargs):
    """Precompute encoded features for every SMILES in a CSV and pickle them.

    Loads the CSV from `data`, runs each SMILES string through the selected
    model on CPU, and dumps a dict mapping ``(smiles,)`` -> encoded features
    to `output`.

    Args:
        data: open text handle of the CSV (SMILES -> activity mapping).
        model_name: model key understood by ``models.select``.
        output: open binary handle the pickled cache is written to.
        **kwargs: forwarded to ``model.encode_data``.
    """
    cpu = require_device(prefer_cuda=False)
    model_type = models.select(model_name)
    model = ModelInterface(model_type, cpu, False)
    csv = util.load_csv(data)
    cache = {}
    for smiles in csv:
        cache_key = (smiles, )  # memcached is indexed on argument list
        # Use a distinct local name here: the previous version rebound the
        # `data` parameter inside the loop, shadowing the input file handle.
        parsed = model.process(smiles)
        cache[cache_key] = model.encode_data(parsed, **kwargs)
    pickle.dump(cache, output)
def train(directory: Text,
          model_name: Text,
          batch_size: int,
          learning_rate: float,
          epsilon: float,
          cuda: bool,
          train_with_test: bool,
          min_iteration: int,
          max_iteration: int,
          ndrop: Optional[float] = None,
          **kwargs) -> None:
    """Train and evaluate one model per fold under `directory`.

    Each subdirectory of `directory` is treated as a fold containing
    ``train.csv``, ``test.csv`` and ``dev.csv``. For every fold a fresh
    model is built, trained with Adam until the mean epoch loss stops
    improving by more than `epsilon` (after at least `min_iteration`
    cycles, at most `max_iteration` mini-batches), then evaluated on
    ``dev.csv``.

    Args:
        directory: folder whose subdirectories are the data folds.
        model_name: model key understood by ``models.select``.
        batch_size: mini-batch size drawn per training step.
        learning_rate: Adam learning rate.
        epsilon: convergence threshold on the change of mean loss.
        cuda: prefer a CUDA device when available.
        train_with_test: if True, fold ``test.csv`` into the training set.
        min_iteration: minimum number of data cycles before convergence
            may be declared.
        max_iteration: hard cap on the number of mini-batch steps.
        ndrop: probability of dropping a negative (activity == 0) sample;
            None disables random dropping entirely.
        **kwargs: extra model options; unset (None) entries are removed.
    """
    # filter out options that are not set in command line
    kwargs = util.dict_filter(kwargs, lambda k, v: v is not None)
    data_folder = Path(directory)
    assert data_folder.is_dir(), 'Invalid data folder'
    dev = require_device(cuda)
    for fold in sorted(data_folder.iterdir()):
        log.info(f'Processing "{fold}"...')

        # model & optimizer
        model_type = models.select(model_name)  # see models/__init__.py
        model = ModelInterface(model_type, dev, **kwargs)
        optimizer = torch.optim.Adam(params=model.inst.parameters(),
                                     lr=learning_rate)

        # load the fold
        raw = [
            util.load_csv(fold / name)
            for name in ['train.csv', 'test.csv', 'dev.csv']
        ]

        # let the model parse these molecules
        # data[0]=train, data[1]=test, data[2]=dev (same order as `raw`)
        data = []
        for i in range(len(raw)):
            buf = []
            for smiles, activity in raw[i].items():
                obj = model.process(smiles)
                buf.append(Item(obj, activity))
            data.append(buf)
        log.debug(f'atom_map: {model.atom_map}')

        # held-out test split used for progress reporting each cycle
        test_batch, _test_label = util.separate_items(data[1])
        # NOTE(review): `test_label` is never read below (predictions are
        # compared against `_test_label`) — possibly dead; confirm.
        test_label = torch.tensor(_test_label)

        # training phase
        train_data = data[0] + data[1] if train_with_test else data[0]

        # set up to randomly drop negative samples
        # see util.RandomIterator for details
        drop_prob = ndrop if ndrop is not None else 0
        drop_fn = lambda x: drop_prob if x.activity == 0 else 0
        data_ptr = util.RandomIterator(
            train_data,
            drop_fn=drop_fn if ndrop is not None else None)

        countdown = min_iteration  # cycles remaining before convergence allowed
        min_loss = 1e99  # track history minimal loss
        sum_loss, batch_cnt = 0.0, 0  # accumulators for the current cycle
        for _ in range(max_iteration):
            # generate batch
            batch, _label = util.separate_items(data_ptr.iterate(batch_size))
            label = torch.tensor(_label)

            # train a mini-batch
            batch_loss = train_step(model, optimizer, batch, label)
            sum_loss += batch_loss
            batch_cnt += 1
            # log.debug(f'{batch_loss}, {sum_loss}')

            # convergence test: once per full pass over the training data,
            # compare the cycle's mean loss against the best seen so far
            if data_ptr.is_cycled():
                loss = sum_loss / batch_cnt
                pred = model.predict(test_batch)
                log.debug(
                    f'{util.stat_string(_test_label, pred)}. loss={loss},min={min_loss}'
                )
                if countdown <= 0 and abs(min_loss - loss) < epsilon:
                    log.debug('Converged.')
                    break
                countdown -= 1
                min_loss = min(min_loss, loss)
                sum_loss, batch_cnt = 0.0, 0  # reset for the next cycle

        # model evaluation on `dev.csv`
        roc_auc, prc_auc = evaluate_model(model, data[2])
        log.info(f'ROC-AUC: {roc_auc}')
        log.info(f'PRC-AUC: {prc_auc}')
def process_csv(model: ModelInterface, csv: Dict[Text, int],
                **kwargs) -> List[Item]:
    """Parse every SMILES entry of a loaded CSV into labelled Items.

    Args:
        model: interface whose ``process`` parses a SMILES string.
        csv: mapping of SMILES string -> activity label.
        **kwargs: forwarded to ``model.process``.

    Returns:
        One Item per CSV entry, pairing the parsed molecule with its label.
    """
    items = []
    for smiles, activity in csv.items():
        parsed = model.process(smiles, **kwargs)
        items.append(Item(parsed, activity))
    return items