def fit_model(model_name, model_params, model_dir, data_dir):
    """Fit a model on the featurized training data and save it.

    Loads the dataset stored under ``<data_dir>/train-data``, builds the model
    selected by ``model_name``, fits it on that dataset and saves it to
    ``model_dir``.

    Args:
        model_name: Name handed to ``Model.get_task_type`` and
            ``Model.model_builder``.
        model_params: Mapping of model parameters. It is left untouched; the
            builder receives a shallow copy extended with a ``"data_shape"``
            entry taken from the training dataset.
        model_dir: Destination passed to ``model.save``.
        data_dir: Directory that contains the ``train-data`` subdirectory.
    """
    task_type = Model.get_task_type(model_name)
    train = Dataset(os.path.join(data_dir, "train-data"))

    # All tasks found in the training data get the model's task type.
    task_types = {task: task_type for task in train.get_task_names()}

    # Copy before adding "data_shape" so the caller's dict is not mutated
    # as a hidden side effect of fitting.
    params = dict(model_params)
    params["data_shape"] = train.get_data_shape()

    model = Model.model_builder(model_name, task_types, params)
    model.fit(train)
    model.save(model_dir)
Exemple #2
0
def featurize_and_split(input_file, feature_dir, samples_dir, train_dir,
                        test_dir, splittype, feature_types, input_transforms,
                        output_transforms, tasks, feature_files=None,
                        shard_size=8):
  """Featurize inputs (or reuse feature files) and do train-test split.

  Args:
    input_file: Input handed to ``loader.featurize``.
    feature_dir: Feature directory handed to ``loader.featurize``.
    samples_dir: Directory used for the featurized samples.
    train_dir: Directory used for the train split and the train dataset.
    test_dir: Directory used for the test split and the test dataset.
    splittype: Split type forwarded to ``samples.train_test_split``.
    feature_types: Feature types forwarded to ``Dataset``.
    input_transforms: Input transforms applied to both datasets.
    output_transforms: Output transforms applied to both datasets.
    tasks: Tasks forwarded to ``DataLoader``.
    feature_files: Optional existing feature files. When None the input is
      featurized; otherwise the samples are built from these files.
    shard_size: Shard size forwarded to ``loader.featurize``; only used when
      ``feature_files`` is None. Defaults to 8.

  Returns:
    Tuple ``(train_dataset, test_dataset)`` of transformed datasets.
  """
  # The loader is constructed unconditionally, as before, even though it is
  # only used on the featurization path.
  loader = DataLoader(tasks=tasks,
                      smiles_field="smiles",
                      protein_pdb_field="protein_pdb",
                      ligand_pdb_field="ligand_pdb",
                      verbose=True)

  if feature_files is None:
    print("About to featurize.")
    samples = loader.featurize(input_file, feature_dir, samples_dir,
                               shard_size=shard_size)
    print("Completed Featurization")
  else:
    # Transform data into arrays for ML
    samples = FeaturizedSamples(samples_dir, feature_files,
                                reload_data=False)

  # Split into train/test
  train_samples, test_samples = samples.train_test_split(
      splittype, train_dir, test_dir)
  print("Finished train test split.")

  train_dataset = Dataset(train_dir, train_samples, feature_types)
  test_dataset = Dataset(test_dir, test_samples, feature_types)
  print("Finished creating train test datasets")

  # Transforming train/test data
  train_dataset.transform(input_transforms, output_transforms)
  test_dataset.transform(input_transforms, output_transforms)
  print("Finished Transforming train test data.")

  return train_dataset, test_dataset
Exemple #3
0
    def _featurize_train_test_split(self,
                                    splittype,
                                    compound_featurizers,
                                    complex_featurizers,
                                    input_transforms,
                                    output_transforms,
                                    input_file,
                                    tasks,
                                    protein_pdb_field=None,
                                    ligand_pdb_field=None,
                                    user_specified_features=None,
                                    shard_size=100):
        """Featurize ``input_file``, split it, and return transformed datasets.

        ``input_file`` is resolved relative to ``self.current_dir``. The
        featurized samples are split into ``self.train_dir`` and
        ``self.test_dir``; a ``Dataset`` is built for each split and both are
        transformed with the given input/output transforms.

        Returns:
            Tuple ``(train_dataset, test_dataset)``.
        """
        # Datasets are told about every featurizer, compound ones first.
        featurizers = compound_featurizers + complex_featurizers
        input_path = os.path.join(self.current_dir, input_file)

        data_featurizer = DataFeaturizer(
            tasks=tasks,
            smiles_field=self.smiles_field,
            protein_pdb_field=protein_pdb_field,
            ligand_pdb_field=ligand_pdb_field,
            compound_featurizers=compound_featurizers,
            complex_featurizers=complex_featurizers,
            user_specified_features=user_specified_features,
            verbose=True)

        # Featurize the input into samples, sharded by ``shard_size``.
        samples = data_featurizer.featurize(
            input_path, self.feature_dir, self.samples_dir,
            shard_size=shard_size)

        # Split the featurized samples into train/test.
        train_samples, test_samples = samples.train_test_split(
            splittype, self.train_dir, self.test_dir)

        # Keyword arguments that are identical for both datasets.
        shared_kwargs = {
            "featurizers": featurizers,
            "tasks": tasks,
            "use_user_specified_features": user_specified_features is not None,
        }
        train_dataset = Dataset(
            data_dir=self.train_dir, samples=train_samples, **shared_kwargs)
        test_dataset = Dataset(
            data_dir=self.test_dir, samples=test_samples, **shared_kwargs)

        # Apply the transforms to train first, then test.
        train_dataset.transform(input_transforms, output_transforms)
        test_dataset.transform(input_transforms, output_transforms)

        return train_dataset, test_dataset
def train_test_split(input_transforms, output_transforms, feature_types, splittype, data_dir):
    """Split the stored samples into train/test and build transformed datasets.

    Reloads the featurized samples from ``<data_dir>/samples``, splits them
    into ``train-samples`` and ``test-samples``, creates datasets in
    ``train-data`` and ``test-data``, and applies the given transforms to
    both. Nothing is returned.
    """
    samples = FeaturizedSamples(os.path.join(data_dir, "samples"), reload_data=True)

    print("Split data into train/test")
    train_samples, test_samples = samples.train_test_split(
        splittype,
        os.path.join(data_dir, "train-samples"),
        os.path.join(data_dir, "test-samples"),
    )

    print("Generating train dataset.")
    train_dataset = Dataset(
        os.path.join(data_dir, "train-data"), train_samples, feature_types)

    print("Generating test dataset.")
    test_dataset = Dataset(
        os.path.join(data_dir, "test-data"), test_samples, feature_types)

    print("Transforming train data.")
    train_dataset.transform(input_transforms, output_transforms)

    print("Transforming test data.")
    test_dataset.transform(input_transforms, output_transforms)
  def test_API(self):
    """End-to-end run of the multitask classification API.

    Featurizes the input, splits it, fits a small multitask classifier,
    saves it, and evaluates it on the test split.
    """
    model_name = "multitask_deep_classifier"
    task_type = "classification"
    splittype = "scaffold"
    feature_types = ["ECFP"]
    input_transforms = []
    output_transforms = []
    # TODO(rbharath): There should be some automatic check to ensure that all
    # required model_params are specified.
    model_params = {
        "nb_hidden": 10,
        "activation": "relu",
        "dropout": .5,
        "learning_rate": .01,
        "momentum": .9,
        "nesterov": False,
        "decay": 1e-4,
        "batch_size": 5,
        "nb_epoch": 2,
    }

    # Featurize the input file.
    featurizer = DataFeaturizer(
        tasks=self.tasks, smiles_field=self.smiles_field, verbose=True)
    feature_files = featurizer.featurize(
        self.input_file, feature_types, self.feature_dir)

    # Build samples from the feature files.
    samples = FeaturizedSamples(
        self.samplesdir, feature_files, reload_data=False)

    # Split, then wrap each split in a dataset.
    train_samples, test_samples = samples.train_test_split(
        splittype, self.train_dir, self.test_dir)
    train_dataset = Dataset(self.train_dir, train_samples, feature_types)
    test_dataset = Dataset(self.test_dir, test_samples, feature_types)

    # Transform train first, then test.
    train_dataset.transform(input_transforms, output_transforms)
    test_dataset.transform(input_transforms, output_transforms)

    # Fit and save the model.
    task_types = {task: task_type for task in self.tasks}
    model_params["data_shape"] = train_dataset.get_data_shape()
    model = Model.model_builder(model_name, task_types, model_params)
    model.fit(train_dataset)
    model.save(self.model_dir)

    # Evaluate the model on the test split, writing to throwaway files.
    evaluator = Evaluator(model, test_dataset, verbose=True)
    with tempfile.NamedTemporaryFile() as test_csv_out, \
        tempfile.NamedTemporaryFile() as test_stats_out:
      evaluator.compute_model_performance(test_csv_out, test_stats_out)
  def _create_model(self, splittype, feature_types, input_transforms,
                    output_transforms, task_type, model_params, model_name,
                    input_file, tasks, protein_pdb_field=None, ligand_pdb_field=None):
    """Featurize, split, fit, save and evaluate a model for a test.

    ``input_file`` is resolved relative to ``self.current_dir``. Note that
    ``model_params`` gets a ``"data_shape"`` entry written into it.
    """
    # Featurize the input file.
    input_path = os.path.join(self.current_dir, input_file)
    featurizer = DataFeaturizer(
        tasks=tasks,
        smiles_field=self.smiles_field,
        protein_pdb_field=protein_pdb_field,
        ligand_pdb_field=ligand_pdb_field,
        verbose=True)
    feature_files = featurizer.featurize(
        input_path, feature_types, self.feature_dir)

    # Build samples from the feature files.
    samples = FeaturizedSamples(
        self.samplesdir, feature_files, reload_data=False)

    # Split, then wrap each split in a dataset.
    train_samples, test_samples = samples.train_test_split(
        splittype, self.train_dir, self.test_dir)
    train_dataset = Dataset(self.train_dir, train_samples, feature_types)
    test_dataset = Dataset(self.test_dir, test_samples, feature_types)

    # Transform train first, then test.
    train_dataset.transform(input_transforms, output_transforms)
    test_dataset.transform(input_transforms, output_transforms)

    # Fit and save the model.
    task_types = {task: task_type for task in tasks}
    model_params["data_shape"] = train_dataset.get_data_shape()
    model = Model.model_builder(model_name, task_types, model_params)
    model.fit(train_dataset)
    model.save(self.model_dir)

    # Evaluate the model on the test split, writing to throwaway files.
    evaluator = Evaluator(model, test_dataset, verbose=True)
    with tempfile.NamedTemporaryFile() as test_csv_out, \
        tempfile.NamedTemporaryFile() as test_stats_out:
      # The two-element unpack is kept: it checks the result's arity.
      _, _ = evaluator.compute_model_performance(
          test_csv_out, test_stats_out)
def create_model(args):
    """Run the pipeline stages that are switched on in ``args``.

    Working directories come either from ``args.base_dir`` (with
    ``features``, ``data`` and ``model`` subdirectories) or from the explicit
    ``args.feature_dir``, ``args.data_dir`` and ``args.model_dir``. The stages
    ``featurize``, ``generate_dataset``, ``train_test_split``, ``fit``,
    ``eval`` and ``eval_full`` then run in that order when their flag is set.

    Raises:
        ValueError: If ``args.base_dir`` is None and any of the three
            explicit directories is None as well.
    """
    banner = "+++++++++++++++++++++++++++++++++"
    model_name = args.model

    if args.base_dir is None:
        have_all_dirs = (
            args.model_dir is not None
            and args.data_dir is not None
            and args.feature_dir is not None
        )
        if not have_all_dirs:
            raise ValueError(
                "If base-dir not specified, must specify feature-dir, data-dir, model-dir.")

        feature_dir = args.feature_dir
        model_dir = args.model_dir
        data_dir = args.data_dir
        ensure_exists([feature_dir, data_dir, model_dir])
    else:
        feature_dir = os.path.join(args.base_dir, "features")
        data_dir = os.path.join(args.base_dir, "data")
        model_dir = os.path.join(args.base_dir, "model")
        ensure_exists([args.base_dir, feature_dir, data_dir, model_dir])

    if args.featurize:
        print(banner)
        print("Perform featurization")
        featurize_inputs(
            feature_dir,
            data_dir,
            args.input_files,
            args.user_specified_features,
            args.tasks,
            args.smiles_field,
            args.split_field,
            args.id_field,
            args.threshold,
            args.protein_pdb_field,
            args.ligand_pdb_field,
            args.ligand_mol2_field,
        )

    if args.generate_dataset:
        print(banner)
        print("Generate dataset for featurized samples")
        samples = FeaturizedSamples(
            os.path.join(data_dir, "samples"), reload_data=True)

        print("Generating dataset.")
        full_dataset = Dataset(
            os.path.join(data_dir, "full-data"), samples, args.feature_types)

        print("Transform data.")
        full_dataset.transform(args.input_transforms, args.output_transforms)

    if args.train_test_split:
        print(banner)
        print("Perform train-test split")
        train_test_split(
            args.input_transforms,
            args.output_transforms,
            args.feature_types,
            args.splittype,
            data_dir,
        )

    if args.fit:
        print(banner)
        print("Fit model")
        fit_model(model_name, extract_model_params(args), model_dir, data_dir)

    if args.eval:
        print(banner)
        # Train is evaluated before test; each entry is
        # (title, underline, dataset dir, csv name, stats name).
        eval_runs = (
            ("Eval Model on Train", "-------------------",
             "train-data", "train.csv", "train-stats.txt"),
            ("Eval Model on Test", "------------------",
             "test-data", "test.csv", "test-stats.txt"),
        )
        for title, underline, dataset_name, csv_name, stats_name in eval_runs:
            print(title)
            print(underline)
            dataset_dir = os.path.join(data_dir, dataset_name)
            csv_out = os.path.join(data_dir, csv_name)
            stats_out = os.path.join(data_dir, stats_name)
            eval_trained_model(model_name, model_dir, dataset_dir, csv_out, stats_out)

    if args.eval_full:
        print(banner)
        print("Eval Model on Full Dataset")
        print("--------------------------")
        full_data_dir = os.path.join(data_dir, "full-data")
        csv_out_full = os.path.join(data_dir, "full.csv")
        stats_out_full = os.path.join(data_dir, "full-stats.txt")
        eval_trained_model(model_name, model_dir, full_data_dir, csv_out_full, stats_out_full)