Esempio n. 1
0
def test_lcnn_reload():
  """Fit an LCNN regressor, check its predictions, and compare a restored copy.

  The test downloads and unpacks the featurized dataset into a temporary
  directory, fits a model for 10 epochs, checks prediction shapes and the
  test-split score, then builds a second model over the same `model_dir`,
  restores it, and requires both models to predict the same values.
  """
  # Fetch the archive into a scratch directory and unpack it in place.
  data_root = tempfile.mkdtemp()
  download_url(url=URL, dest_dir=data_root)
  archive = path.join(data_root, 'lcnn_data_feature.tar.gz')
  untargz_file(archive, data_root)

  # Only the dataset splits and the transformers are used below.
  loaded = load_dataset_from_disk(path.join(data_root, 'lcnn_data'))
  _, splits, transformers = loaded
  train, valid, test = splits

  # Both the trained and the reloaded model are built from these settings.
  checkpoint_dir = tempfile.mkdtemp()
  lcnn_kwargs = dict(
      mode='regression',
      batch_size=8,
      learning_rate=0.001,
      model_dir=checkpoint_dir)
  model = LCNNModel(**lcnn_kwargs)
  model.fit(train, nb_epoch=10)

  # Prediction shapes for a raw batch and for a whole dataset.
  expected_shape = (65, 1)
  batch_preds = model.predict_on_batch(valid.X)
  assert batch_preds.shape == expected_shape
  dataset_preds = model.predict(test)
  assert dataset_preds.shape == expected_shape

  # The score on the test split must stay below the threshold.
  regression_metric = Metric(mae_score)
  scores = model.evaluate(test, [regression_metric], transformers)
  mae = scores[regression_metric.name]
  assert mae < 0.6

  # A fresh model restored from the same directory must agree elementwise.
  reloaded_model = LCNNModel(**lcnn_kwargs)
  reloaded_model.restore()

  original_pred = model.predict(test)
  reload_pred = reloaded_model.predict(test)

  abs_diff = np.abs(original_pred - reload_pred)
  assert np.all(abs_diff < 1e-7)
    def __init__(self,
                 pretrain_model_path: Optional[str] = None,
                 radius: int = 1,
                 unseen: str = 'UNK',
                 gather_method: str = 'sum'):
        """Store the featurizer settings and load a pretrained Mol2vec model.

        Parameters
        ----------
        pretrain_model_path: str, optional
          The path for pretrained model. If this value is None, we use the
          model which is put on github repository
          (https://github.com/samoturk/mol2vec/tree/master/examples/models).
          The model is trained on 20 million compounds downloaded from ZINC.
          In that case the model is looked up as 'mol2vec_model_300dim.pkl'
          in the directory returned by `get_data_dir()`; when it is missing,
          the 'mol2vec_model_300dim.tar.gz' archive is downloaded (unless
          already present) and extracted there.
        radius: int, optional (default 1)
          The fingerprint radius. The default value was used to train the
          model which is put on github repository.
        unseen: str, optional (default 'UNK')
          The string used to replace uncommon words/identifiers while
          training.
        gather_method: str, optional (default 'sum')
          How to aggregate the vectors of identifiers extracted from Mol2vec.
          'sum' or 'mean' is supported. The value is stored as given and is
          not validated here.

        Raises
        ------
        ValueError
          If `gensim` or `mol2vec` cannot be imported.
        """
        # Imported lazily so the module can be loaded without these optional
        # dependencies installed.
        try:
            from gensim.models import word2vec
            from mol2vec.features import mol2alt_sentence, sentences2vec
        except ModuleNotFoundError as err:
            # ValueError is kept for existing callers; chaining marks the
            # import failure as the direct cause in the traceback.
            raise ValueError(
                "This class requires mol2vec to be installed.") from err

        self.radius = radius
        self.unseen = unseen
        self.gather_method = gather_method
        self.sentences2vec = sentences2vec
        self.mol2alt_sentence = mol2alt_sentence
        if pretrain_model_path is None:
            data_dir = get_data_dir()
            pretrain_model_path = path.join(data_dir,
                                            'mol2vec_model_300dim.pkl')
            if not path.exists(pretrain_model_path):
                targz_file = path.join(data_dir, 'mol2vec_model_300dim.tar.gz')
                # Download only when the archive is not already on disk.
                if not path.exists(targz_file):
                    download_url(DEFAULT_PRETRAINED_MODEL_URL, data_dir)
                untargz_file(targz_file, data_dir)
        # load pretrained models
        self.model = word2vec.Word2Vec.load(pretrain_model_path)