def featurize_input(
    input_file,
    feature_dir,
    user_specified_features,
    tasks,
    smiles_field,
    split_field,
    id_field,
    threshold,
    protein_pdb_field,
    ligand_pdb_field,
    ligand_mol2_field,
):
    """Featurize the raw data in ``input_file``.

    A ``DataFeaturizer`` is configured from the field/threshold arguments
    (always with ``verbose=True``) and its ``featurize`` method is called with
    ``input_file``, ``FeaturizedSamples.feature_types`` and a target path of
    the form ``<feature_dir>/<basename of input_file without extension>.joblib``.

    Nothing is returned.
    """
    featurizer_kwargs = {
        "tasks": tasks,
        "smiles_field": smiles_field,
        "split_field": split_field,
        "id_field": id_field,
        "threshold": threshold,
        "protein_pdb_field": protein_pdb_field,
        "ligand_pdb_field": ligand_pdb_field,
        "ligand_mol2_field": ligand_mol2_field,
        "user_specified_features": user_specified_features,
        "verbose": True,
    }
    featurizer = DataFeaturizer(**featurizer_kwargs)

    # Target name: the input's base name with its extension swapped for
    # ".joblib", placed under feature_dir.
    stem = os.path.splitext(os.path.basename(input_file))[0]
    out_name = "%s.joblib" % stem
    out_path = os.path.join(feature_dir, out_name)

    featurizer.featurize(input_file, FeaturizedSamples.feature_types, out_path)
  def test_API(self):
    """Exercise the multitask classification workflow end to end.

    Featurizes ``self.input_file`` with the ``ECFP`` feature type, does a
    ``scaffold`` train/test split, fits a ``multitask_deep_classifier`` model,
    saves it to ``self.model_dir`` and runs the evaluator on the test dataset.
    """
    # TODO(rbharath): There should be some automatic check to ensure that all
    # required model_params are specified.
    model_params = dict(
        nb_hidden=10, activation="relu",
        dropout=0.5, learning_rate=0.01,
        momentum=0.9, nesterov=False,
        decay=1e-4, batch_size=5,
        nb_epoch=2)
    feature_types = ["ECFP"]
    # This test applies no input or output transforms.
    input_transforms, output_transforms = [], []

    # Step 1: featurize the raw input.
    featurizer = DataFeaturizer(
        tasks=self.tasks, smiles_field=self.smiles_field, verbose=True)
    feature_files = featurizer.featurize(
        self.input_file, feature_types, self.feature_dir)

    # Step 2: wrap the featurized output as samples.
    samples = FeaturizedSamples(
        self.samplesdir, feature_files, reload_data=False)

    # Step 3: split the samples and build one dataset per split.
    train_samples, test_samples = samples.train_test_split(
        "scaffold", self.train_dir, self.test_dir)
    train_dataset = Dataset(self.train_dir, train_samples, feature_types)
    test_dataset = Dataset(self.test_dir, test_samples, feature_types)

    # Step 4: transform both datasets, train first and then test.
    for dataset in (train_dataset, test_dataset):
      dataset.transform(input_transforms, output_transforms)

    # Step 5: build, fit and save the model. Every task gets the same type.
    task_types = dict.fromkeys(self.tasks, "classification")
    model_params["data_shape"] = train_dataset.get_data_shape()
    model = Model.model_builder(
        "multitask_deep_classifier", task_types, model_params)
    model.fit(train_dataset)
    model.save(self.model_dir)

    # Step 6: evaluate the fitted model on the test dataset, handing the
    # evaluator two temporary files for its outputs.
    evaluator = Evaluator(model, test_dataset, verbose=True)
    new_tempfile = tempfile.NamedTemporaryFile
    with new_tempfile() as csv_out, new_tempfile() as stats_out:
      evaluator.compute_model_performance(csv_out, stats_out)
# Example #3
0
def featurize_input(input_file, feature_dir, user_specified_features, tasks,
                    smiles_field, split_field, id_field, threshold,
                    protein_pdb_field, ligand_pdb_field, ligand_mol2_field):
  """Featurize the raw data in ``input_file``.

  A ``DataFeaturizer`` is configured from the field/threshold arguments
  (always with ``verbose=True``) and its ``featurize`` method is called with
  ``input_file``, ``FeaturizedSamples.feature_types`` and ``feature_dir``.

  Nothing is returned.
  """
  featurizer_options = {
      "tasks": tasks,
      "smiles_field": smiles_field,
      "split_field": split_field,
      "id_field": id_field,
      "threshold": threshold,
      "protein_pdb_field": protein_pdb_field,
      "ligand_pdb_field": ligand_pdb_field,
      "ligand_mol2_field": ligand_mol2_field,
      "user_specified_features": user_specified_features,
      "verbose": True,
  }
  featurizer = DataFeaturizer(**featurizer_options)
  featurizer.featurize(
      input_file, FeaturizedSamples.feature_types, feature_dir)
  def _create_model(self, splittype, feature_types, input_transforms,
                    output_transforms, task_type, model_params, model_name,
                    input_file, tasks, protein_pdb_field=None,
                    ligand_pdb_field=None):
    """Shared test helper: featurize, split, fit, save and evaluate a model.

    ``input_file`` is resolved relative to ``self.current_dir``. The caller's
    ``model_params`` mapping is updated in place with a ``"data_shape"`` entry
    taken from the training dataset. The fitted model is saved to
    ``self.model_dir`` and then evaluated on the test dataset.

    Nothing is returned.
    """
    # Step 1: featurize the raw input.
    input_path = os.path.join(self.current_dir, input_file)
    featurizer_kwargs = {
        "tasks": tasks,
        "smiles_field": self.smiles_field,
        "protein_pdb_field": protein_pdb_field,
        "ligand_pdb_field": ligand_pdb_field,
        "verbose": True,
    }
    featurizer = DataFeaturizer(**featurizer_kwargs)
    feature_files = featurizer.featurize(
        input_path, feature_types, self.feature_dir)

    # Step 2: wrap the featurized output as samples.
    samples = FeaturizedSamples(
        self.samplesdir, feature_files, reload_data=False)

    # Step 3: split the samples and build one dataset per split.
    train_samples, test_samples = samples.train_test_split(
        splittype, self.train_dir, self.test_dir)
    train_dataset = Dataset(self.train_dir, train_samples, feature_types)
    test_dataset = Dataset(self.test_dir, test_samples, feature_types)

    # Step 4: transform both datasets, train first and then test.
    for dataset in (train_dataset, test_dataset):
      dataset.transform(input_transforms, output_transforms)

    # Step 5: build, fit and save the model. Every task gets the same type.
    task_types = dict.fromkeys(tasks, task_type)
    data_shape = train_dataset.get_data_shape()
    model_params["data_shape"] = data_shape
    model = Model.model_builder(model_name, task_types, model_params)
    model.fit(train_dataset)
    model.save(self.model_dir)

    # Step 6: evaluate the fitted model on the test dataset, handing the
    # evaluator two temporary files for its outputs.
    evaluator = Evaluator(model, test_dataset, verbose=True)
    new_tempfile = tempfile.NamedTemporaryFile
    with new_tempfile() as csv_out, new_tempfile() as stats_out:
      # The result is unpacked into exactly two values, both discarded.
      _first, _second = evaluator.compute_model_performance(
          csv_out, stats_out)