Beispiel #1
0
def main():
    """
    Main function.

    Reads the input SMILES, builds a vocabulary from them, constructs a
    model with the network parameters taken from the command-line
    arguments and saves it at the requested output path.
    """
    args = parse_args()

    smiles = uc.read_smi_file(args.input_smiles_path)

    LOG.info("Building vocabulary")
    smiles_tokenizer = mv.SMILESTokenizer()
    vocab = mv.create_vocabulary(smiles, tokenizer=smiles_tokenizer)

    vocab_tokens = vocab.tokens()
    LOG.info("Vocabulary contains %d tokens: %s", len(vocab_tokens), vocab_tokens)

    # Network hyper-parameters are forwarded from the arguments under the
    # same names.
    param_names = ('num_layers', 'layer_size', 'cell_type',
                   'embedding_layer_size', 'dropout')
    network_params = {name: getattr(args, name) for name in param_names}

    new_model = mm.Model(
        no_cuda=True,
        vocabulary=vocab,
        tokenizer=smiles_tokenizer,
        network_params=network_params,
        max_sequence_length=args.max_sequence_length,
    )

    LOG.info("Saving model at %s", args.output_model_path)
    new_model.save(args.output_model_path)
Beispiel #2
0
def load_sets(set_path):
    """
    Yields SMILES sets over and over, one list per file.

    :param set_path: Path to a single file, or to a directory whose "*.smi"
                     files are used in sorted order.
    :return: A generator of lists, each holding the entries read from one
             file. Nothing is yielded when the directory has no "*.smi"
             files.
    """
    if os.path.isdir(set_path):
        smi_paths = sorted(glob.glob("{}/*.smi".format(set_path)))
    else:
        smi_paths = [set_path]

    # Cycle over the paths rather than the loaded sets: only the paths are
    # kept around and each file is read again every time it comes up.
    for smi_path in it.cycle(smi_paths):
        yield list(uc.read_smi_file(smi_path))
def load_sets(set_path):
    """
    Yields the SMILES sets found at a path, one list per file.

    :param set_path: Path to a single file, or to a directory whose "*.smi"
                     files are used in sorted order.
    :return: A generator of lists, each holding the entries read from one
             file; every file is visited exactly once.
    """
    smi_paths = (sorted(glob.glob("{}/*.smi".format(set_path)))
                 if os.path.isdir(set_path)
                 else [set_path])

    for smi_path in smi_paths:
        smiles_set = list(uc.read_smi_file(smi_path))
        yield smiles_set
def main():
    """
    Main function.

    Loads a decorator model, samples from it for the input scaffolds and
    writes one tab-separated row per sampled result: scaffold, decoration
    and, when requested with `with_nll`, the NLL formatted with 8 decimals.

    Rows go to `output_smiles_path` (with ".gz" appended when `use_gzip`
    is set) or, when no output path is given, are printed through
    `tqdm.tqdm.write`.
    """
    args = parse_args()

    model = mm.DecoratorModel.load_from_file(args.model_path, mode="eval")

    input_scaffolds = list(uc.read_smi_file(args.input_scaffold_path))

    output_file = None
    if args.output_smiles_path:
        if args.use_gzip:
            args.output_smiles_path += ".gz"
        output_file = uc.open_file(args.output_smiles_path, "w+")
        write_func = output_file.write
    else:
        # Rows already end with a newline, hence end="".
        write_func = functools.partial(tqdm.tqdm.write, end="")

    try:
        sample_model = ma.SampleModel(model, args.batch_size)

        for scaff, dec, nll in ul.progress_bar(sample_model.run(input_scaffolds),
                                               total=len(input_scaffolds)):
            output_row = [scaff, dec]
            if args.with_nll:
                output_row.append("{:.8f}".format(nll))
            write_func("\t".join(output_row) + "\n")
    finally:
        # Close the output file even when sampling or writing fails, so the
        # handle is not leaked and buffered rows are flushed.
        if output_file is not None:
            output_file.close()
Beispiel #5
0
def main():
    """
    Main function.

    Loads a model, calculates the NLL of every entry read from the input
    file and writes each input line followed by a tab and its NLL
    (8 decimals) to the output file. ".gz" is appended to the output path
    when `use_gzip` is set.
    """
    args = parse_args()

    ut.set_default_device("cuda")

    model = mm.Model.load_from_file(args.model_path, mode="sampling")

    input_csv = uc.open_file(args.input_csv_path, mode="rt")
    try:
        if args.use_gzip:
            args.output_csv_path += ".gz"
        output_csv = uc.open_file(args.output_csv_path, mode="wt+")
        try:
            calc_nlls_action = ma.CalculateNLLsFromModel(
                model,
                batch_size=args.batch_size,
                logger=LOG)
            smiles_list = list(uc.read_smi_file(args.input_csv_path))

            for nll in ul.progress_bar(calc_nlls_action.run(smiles_list),
                                       total=len(smiles_list)):
                # The input file is read a second time, line by line, to
                # echo the original row next to its NLL.
                # NOTE(review): this assumes read_smi_file yields exactly one
                # entry per input line, in file order - confirm.
                input_line = input_csv.readline().strip()
                output_csv.write("{}\t{:.8f}\n".format(input_line, nll))
        finally:
            # Always release the output handle, even if the calculation fails.
            output_csv.close()
    finally:
        # Always release the input handle, even if opening the output fails.
        input_csv.close()
Beispiel #6
0
 def _initialize_dataloader(self, path):
     """
     Builds a DataLoader over the SMILES read from a file.

     :param path: Path of the SMILES file to load.
     :return: A torch DataLoader wrapping an md.Dataset created with the
              model's vocabulary and a fresh SMILESTokenizer; batch size and
              shuffling follow the instance settings.
     """
     smiles_dataset = md.Dataset(
         smiles_list=uc.read_smi_file(path),
         vocabulary=self._model.vocabulary,
         tokenizer=mv.SMILESTokenizer(),
     )
     return torch.utils.data.DataLoader(
         smiles_dataset,
         batch_size=self._batch_size,
         shuffle=self._shuffle_each_epoch,
         collate_fn=md.Dataset.collate_fn,
     )
Beispiel #7
0
def main():
    """
    Main function.

    Loads a model together with the training and validation sets and runs
    `ma.CollectStatsFromModel` for the given epoch, logging through a
    summary writer created at `log_path`.
    """
    args = parse_args()

    model = mm.Model.load_from_file(args.model_path, mode="sampling")
    training_set = list(uc.read_smi_file(args.training_set_path))
    validation_set = list(uc.read_smi_file(args.validation_set_path))

    writer = tbx.SummaryWriter(log_dir=args.log_path)
    try:
        ma.CollectStatsFromModel(model,
                                 args.epoch,
                                 training_set,
                                 validation_set,
                                 writer,
                                 sample_size=args.sample_size,
                                 with_weights=args.with_weights,
                                 to_mol_func=uc.get_mol_func(args.smiles_type),
                                 logger=LOG).run()
    finally:
        # Close the writer even when the stats collection fails, so whatever
        # was already logged is not lost with an unclosed writer.
        writer.close()
    def run(self):
        """
        Calculates likelihoods of a set of molecules.

        Every line of the input CSV is written to the output CSV followed
        by a comma and its NLL formatted with 12 decimals.
        """
        ut.set_default_device("cuda")

        nll_model = mm.Model.load_from_file(self._model_path, sampling_mode=True)

        smiles_source = uc.read_smi_file(self._input_csv_path)
        nll_batches, total = md.calculate_nlls_from_model(
            nll_model,
            smiles_source,
            batch_size=self._batch_size)

        # The input file is opened a second time so that each original line
        # can be echoed next to its NLL.
        with open(self._input_csv_path, "r") as src, \
                open(self._output_csv_path, "w+") as dst:
            for batch in ul.progress_bar(nll_batches, total):
                for nll in batch:
                    row = src.readline().strip()
                    dst.write("{},{:.12f}\n".format(row, nll))
def main():
    """
    Main function.

    Loads a decorator model, runs SampleScaffolds on the input scaffolds
    and writes the resulting data frame as parquet to the output folder.
    """
    args = parse_args()

    decorator_model = mm.DecoratorModel.load_from_file(args.model_path, mode="eval")
    scaffolds = list(uc.read_smi_file(args.input_scaffold_path))

    # Sampling options are forwarded unchanged from the command line.
    sampler_options = dict(
        num_randomized_smiles=args.num_randomized_smiles,
        num_decorations_per_scaffold=args.num_decorations_per_scaffold,
        decorator_type=args.decorator_type,
        batch_size=args.batch_size,
        num_partitions=args.num_partitions,
        logger=LOG,
    )
    sampler = SampleScaffolds(decorator_model, **sampler_options)

    sampler.run(scaffolds).write.parquet(args.output_parquet_folder)
Beispiel #10
0
    def __init__(self,
                 input_smiles_path,
                 output_model_path='storage',
                 num_layers=1,
                 layer_size=512,
                 embedding_layer_size=128,
                 dropout=0.,
                 max_sequence_length=256,
                 memory_cells=32,
                 cell_size=20,
                 read_heads=8,
                 model_type='dnc',
                 controller_type='lstm',
                 num_controller_layers=3):
        """
        Creates a CreateModelRunner.

        The SMILES file is handed to uc.read_smi_file here; every other
        argument is only stored on the instance for later use.
        :param input_smiles_path: Path of the input SMILES file.
        :param output_model_path: The path to the newly created model.
        :param num_layers: Number of layers.
        :param layer_size: Size of each layer.
        :param embedding_layer_size: Size of the embedding layer.
        :param dropout: Dropout value.
        :param max_sequence_length: Maximum sequence length.
        :param memory_cells: Memory cells setting (presumably for the 'dnc'
                             model type - confirm).
        :param cell_size: Cell size setting (presumably for the 'dnc' model
                          type - confirm).
        :param read_heads: Read heads setting (presumably for the 'dnc'
                           model type - confirm).
        :param model_type: Type of the model.
        :param controller_type: Type of the controller.
        :param num_controller_layers: Number of controller layers.
        :return:
        """
        # Input data and output location.
        self._smiles_list = uc.read_smi_file(input_smiles_path)
        self._output_model_path = output_model_path

        # General network settings.
        self._num_layers = num_layers
        self._layer_size = layer_size
        self._embedding_layer_size = embedding_layer_size
        self._dropout = dropout
        self._max_sequence_length = max_sequence_length

        # Memory-related settings.
        self._memory_cells = memory_cells
        self._cell_size = cell_size
        self._read_heads = read_heads

        # Model and controller selection.
        self._model_type = model_type
        self._controller_type = controller_type
        self._num_controller_layers = num_controller_layers

        # Starts out False; nothing in this constructor changes it.
        self._already_run = False
 def calc_nlls(path):
     """
     Calculates the NLLs of a sample of the SMILES file at the given path.

     :param path: Path of the SMILES file; at most self._sample_size is
                  requested from uc.read_smi_file through `num`.
     :return: The NLL batches joined into one array with np.concatenate.
     """
     sampled_smiles = uc.read_smi_file(path, num=self._sample_size)
     nll_result = md.calculate_nlls_from_model(self._model, sampled_smiles)
     # Only the first element of the result (the NLL batches) is used.
     nll_batches = list(nll_result[0])
     return np.concatenate(nll_batches)