def get_targets(self):
    """
    Parse data file and return targets and corresponding SMILES.

    Procedure
    ---------
    1. Read the data with ``self.read_data()`` and load the ID map
       pickled at ``self.map_filename``.
    2. Map the ``self.primary_key`` column to SMILES, which also yields
       the indices of the rows that were matched.
    3. Extract targets and keep only the matched rows.

    Returns
    -------
    smiles
        SMILES as returned by ``self.map_ids_to_smiles``.
    targets : np.ndarray
        Targets restricted to the matched rows. One float column per
        entry of ``self.column_indices`` when it is set; otherwise the
        result of comparing the ``self.activity_key`` column with
        ``self.activity_value``.
    """
    frame = self.read_data()
    smiles_by_id = read_pickle(self.map_filename)

    # `matched` indexes the data rows that were mapped to SMILES
    smiles, matched = self.map_ids_to_smiles(frame[self.primary_key],
                                             smiles_by_id)

    if self.column_indices is None:
        # single target: compare the activity column with the given value
        targets = np.asarray(
            frame[self.activity_key] == self.activity_value)
    else:
        # one float column per requested (positional) data column
        n_columns = len(self.column_indices)
        targets = np.zeros((frame.shape[0], n_columns), dtype=float)
        for position, column_index in enumerate(self.column_indices):
            targets[:, position] = frame[frame.columns[column_index]]

    # reduce targets to matched structures
    return smiles, targets[matched]
Example #2
0
    def get_targets(self):
        """
        Build the target array for the compounds that have SMILES.

        The data comes from ``self.read_data()`` and the ID map from the
        pickle at ``self.map_filename``. IDs in the ``self.primary_key``
        column are mapped to SMILES, and targets are kept only for the rows
        that were mapped.

        Returns
        -------
        smiles
            SMILES as returned by ``self.map_ids_to_smiles``.
        targets : np.ndarray
            Float columns selected by ``self.column_indices`` when it is not
            None, otherwise the comparison of the ``self.activity_key``
            column against ``self.activity_value``; matched rows only.
        """
        data = self.read_data()
        id_map = read_pickle(self.map_filename)

        # `keep` holds the indices of data rows successfully mapped to SMILES
        smiles, keep = self.map_ids_to_smiles(data[self.primary_key], id_map)

        column_indices = self.column_indices
        if column_indices is not None:
            shape = (data.shape[0], len(column_indices))
            targets = np.zeros(shape, dtype=float)
            for col, idx in enumerate(column_indices):
                column_name = data.columns[idx]
                targets[:, col] = data[column_name]
        else:
            is_active = data[self.activity_key] == self.activity_value
            targets = np.asarray(is_active)

        # reduce targets to matched structures
        targets = targets[keep]
        return smiles, targets
Example #3
0
 def test_read_pickle_gz(self):
     """
     Test read_pickle with gzipped pickle.

     Writes a small dict to a gzipped pickle created inside
     ``self.temp_dir`` and checks that ``read_pickle`` returns the stored
     value.
     """
     import os  # local import: only needed to release the mkstemp handle

     # mkstemp returns an already-open OS-level descriptor; close it so the
     # test does not leak a file handle (gzip reopens the file by name)
     fd, filename = tempfile.mkstemp(dir=self.temp_dir, suffix='.pkl.gz')
     os.close(fd)
     with gzip.open(filename, 'wb') as f:
         cPickle.dump({'foo': 'bar'}, f, cPickle.HIGHEST_PROTOCOL)
     assert read_pickle(filename)['foo'] == 'bar'
Example #4
0
 def test_read_pickle_gz(self):
     """
     Test read_pickle with gzipped pickle.

     A ``{'foo': 'bar'}`` dict is pickled through ``gzip.open`` into a
     temporary ``.pkl.gz`` file under ``self.temp_dir`` and read back with
     ``read_pickle``.
     """
     import os  # local import: used only to close the mkstemp descriptor

     handle, filename = tempfile.mkstemp(dir=self.temp_dir,
                                         suffix='.pkl.gz')
     # the descriptor from mkstemp is open and otherwise never released;
     # the file itself is written by name below
     os.close(handle)

     payload = {'foo': 'bar'}
     with gzip.open(filename, 'wb') as f:
         cPickle.dump(payload, f, cPickle.HIGHEST_PROTOCOL)
     assert read_pickle(filename)['foo'] == 'bar'
Example #5
0
  def check_output(self, featurize_args, shape, targets=None, mol_ids=None,
                   smiles=None, output_suffix='.pkl'):
    """
    Check features shape, targets, and mol_ids.

    Runs ``main`` on ``self.input_filename`` / ``self.targets_filename``
    with a fresh temporary output file, reads the output back and compares
    it against the expected values.

    Parameters
    ----------
    featurize_args : list
        Featurizer-specific arguments for script.
    shape : tuple
        Expected shape of features. ``shape[0]`` is compared with
        ``len(data)``; any remaining dimensions are compared with the
        shape of the first row's features.
    targets : list, optional
        Expected targets. Defaults to self.targets.
    mol_ids: list, optional
        Expected mol_ids. Defaults to self.mol_ids.
    smiles : list, optional
        Expected SMILES. Defaults to self.smiles.
    output_suffix : str, optional (default '.pkl')
        Suffix for output files.

    Returns
    -------
    data
        The object loaded from the output file, in case anything else
        needs to be checked.
    """
    import os  # local import: only needed to release the mkstemp handle

    # generate command-line arguments
    # mkstemp hands back an open descriptor; close it so it is not leaked
    # (the script writes the file by name)
    fd, output_filename = tempfile.mkstemp(suffix=output_suffix,
                                           dir=self.temp_dir)
    os.close(fd)
    input_args = [self.input_filename, '-t', self.targets_filename,
                  output_filename] + featurize_args

    # run script
    args = parse_args(input_args)
    main(args.klass, args.input, args.output, target_filename=args.targets,
         featurizer_kwargs=vars(args.featurizer_kwargs),
         include_smiles=True, scaffolds=args.scaffolds,
         chiral_scaffolds=args.chiral_scaffolds)

    # read output file
    if output_filename.endswith('.joblib'):
        data = joblib.load(output_filename)
    else:
        data = read_pickle(output_filename)

    # check values
    if targets is None:
        targets = self.targets
    if mol_ids is None:
        mol_ids = self.mol_ids
    if smiles is None:
        smiles = self.smiles
    assert len(data) == shape[0]
    if len(shape) > 1:
        # positional access to the first row; the ``.ix`` indexer used
        # previously is deprecated and no longer exists in current pandas
        assert data['features'].iloc[0].shape == shape[1:]
    assert np.array_equal(data['y'], targets), data['y']
    assert np.array_equal(data['mol_id'], mol_ids), data['mol_id']
    assert np.array_equal(data['smiles'], smiles), data['smiles']

    # return output in case anything else needs to be checked
    return data
 def test_map_ids_to_smiles(self):
     """
     Test AssayDataParser.map_ids_to_smiles.

     Maps the PUBCHEM_CID column of the engine's data through the pickled
     ID map and expects exactly two matches, at data rows 0 and 3.
     """
     frame = self.engine.read_data()
     smiles_by_id = read_pickle(self.map_filename)
     smiles, indices = self.engine.map_ids_to_smiles(frame.PUBCHEM_CID,
                                                     smiles_by_id)

     # both outputs must have exactly two entries
     assert len(smiles) == len(indices)
     assert len(indices) == 2

     first, second = smiles[0], smiles[1]
     assert first == self.map["CID645443"]
     assert second == self.map["CID2997889"]
     assert np.array_equal(indices, [0, 3])
Example #7
0
 def test_main(self):
     """
     Test main.

     Runs ``main`` with a 'CID' prefix and checks that the pickled output
     maps every prefixed compound ID to the isomeric SMILES RDKit
     generates for the corresponding input SMILES.
     """
     argv = ['-i', self.input_filename, '-o', self.output_filename,
             '-p', 'CID']
     args = parse_args(argv)
     main(args.input, args.output, args.prefix)

     smiles_map = read_pickle(self.output_filename)
     assert len(smiles_map) == len(self.smiles)
     for smile, cid in zip(self.smiles, self.cids):
         found = smiles_map['CID{}'.format(cid)]
         expected = Chem.MolToSmiles(Chem.MolFromSmiles(smile),
                                     isomericSmiles=True)
         assert found == expected
 def test_map_ids_to_smiles(self):
     """
     Test AssayDataParser.map_ids_to_smiles.

     Only two compound IDs are expected to be mapped to SMILES, and they
     are expected to come from data rows 0 and 3.
     """
     data = self.engine.read_data()
     id_map = read_pickle(self.map_filename)
     mapper = self.engine.map_ids_to_smiles
     smiles, indices = mapper(data.PUBCHEM_CID, id_map)

     n_smiles, n_indices = len(smiles), len(indices)
     assert n_smiles == n_indices == 2

     # matched SMILES must agree with the reference map
     assert smiles[0] == self.map['CID645443']
     assert smiles[1] == self.map['CID2997889']

     expected_rows = [0, 3]
     assert np.array_equal(indices, expected_rows)
 def test_main(self):
     """
     Test main.

     The output map must have one entry per input SMILES, keyed by the
     'CID'-prefixed compound ID and holding the isomeric SMILES produced
     by RDKit for that input.
     """
     args = parse_args(['-i', self.input_filename,
                        '-o', self.output_filename,
                        '-p', 'CID'])
     main(args.input, args.output, args.prefix)

     data = read_pickle(self.output_filename)
     assert len(data) == len(self.smiles)

     for smile, cid in zip(self.smiles, self.cids):
         key = 'CID{}'.format(cid)
         stored = data[key]
         mol = Chem.MolFromSmiles(smile)
         assert stored == Chem.MolToSmiles(mol, isomericSmiles=True)
    def check_output(self, input_args):
        """
        Check main output.

        Runs ``main`` with the parsed arguments, loads the pickle at
        ``self.output_filename`` and verifies that every output SMILES is
        one of ``self.smiles`` and carries the target stored at the same
        position in ``self.y``.

        Parameters
        ----------
        input_args : list
            Command-line arguments.

        Returns
        -------
        tuple
            The 'smiles' and 'targets' entries of the output.
        """
        parsed = parse_args(input_args)
        main(parsed.actives, parsed.decoys, parsed.output,
             parsed.stereo_from_3d)

        output = read_pickle(self.output_filename)
        pairs = zip(output['smiles'], output['targets'])
        for this_smiles, this_target in pairs:
            assert this_smiles in self.smiles
            # position of this SMILES in the reference list
            position = self.smiles.index(this_smiles)
            assert this_target == self.y[position]
        return output['smiles'], output['targets']
    def check_output(self, input_args):
        """
        Check main output.

        Each (SMILES, target) pair in the pickled output must match the
        reference data held in ``self.smiles`` and ``self.y``.

        Parameters
        ----------
        input_args : list
            Command-line arguments.

        Returns
        -------
        tuple
            ``(data['smiles'], data['targets'])`` from the output pickle.
        """
        args = parse_args(input_args)
        actives, decoys = args.actives, args.decoys
        main(actives, decoys, args.output, args.stereo_from_3d)

        data = read_pickle(self.output_filename)
        reference = self.smiles
        for smiles, target in zip(data['smiles'], data['targets']):
            assert smiles in reference
            expected_target = self.y[reference.index(smiles)]
            assert target == expected_target
        return data['smiles'], data['targets']
def main(input_filenames,
         output_filename,
         id_prefix=None,
         allow_duplicates=True,
         update=False,
         assign_stereo_from_3d=False):
    """
    Get SMILES for compounds and map to compound names.

    Every molecule read from the input files is added to a ``SmilesMap``;
    molecules for which ``add_mol`` raises ``ValueError`` are reported on
    stdout and skipped. The resulting map is pickled to
    ``output_filename``.

    Parameters
    ----------
    input_filenames : list
        Input molecule filenames.
    output_filename : str
        Output filename.
    id_prefix : str, optional
        Prefix to prepend to IDs.
    allow_duplicates : bool, optional (default True)
        Allow duplicate SMILES.
    update : bool, optional (default False)
        Update an existing map with the same output filename. If False, a new
        map will be generated using only the input file(s).
    assign_stereo_from_3d : bool, optional (default False)
        Assign stereochemistry from 3D coordinates.
    """
    smiles = SmilesMap(prefix=id_prefix,
                       allow_duplicates=allow_duplicates,
                       assign_stereo_from_3d=assign_stereo_from_3d)

    # update existing map: start from the map already stored at the output
    if update:
        smiles.map = read_pickle(output_filename)

    for input_filename in input_filenames:
        # single-argument print(...) behaves the same under the Python 2
        # print statement and the Python 3 print function
        print(input_filename)
        with serial.MolReader().open(input_filename) as reader:
            for mol in reader:
                try:
                    smiles.add_mol(mol)
                except ValueError:
                    # report the molecule by name when it has one,
                    # otherwise by its isomeric SMILES
                    if mol.HasProp('_Name'):
                        label = mol.GetProp('_Name')
                    else:
                        label = Chem.MolToSmiles(mol, isomericSmiles=True)
                    print('Skipping {}'.format(label))
    write_pickle(smiles.get_map(), output_filename)
Example #13
0
def main(input_filenames, output_filename, id_prefix=None,
         allow_duplicates=True, update=False, assign_stereo_from_3d=False):
    """
    Get SMILES for compounds and map to compound names.

    Molecules from each input file are added to a ``SmilesMap`` and the
    final map is written with ``write_pickle``. A molecule whose
    ``add_mol`` call raises ``ValueError`` is skipped, and a 'Skipping ...'
    line identifying it is printed.

    Parameters
    ----------
    input_filenames : list
        Input molecule filenames.
    output_filename : str
        Output filename.
    id_prefix : str, optional
        Prefix to prepend to IDs.
    allow_duplicates : bool, optional (default True)
        Allow duplicate SMILES.
    update : bool, optional (default False)
        Update an existing map with the same output filename. If False, a new
        map will be generated using only the input file(s).
    assign_stereo_from_3d : bool, optional (default False)
        Assign stereochemistry from 3D coordinates.
    """
    smiles = SmilesMap(prefix=id_prefix, allow_duplicates=allow_duplicates,
                       assign_stereo_from_3d=assign_stereo_from_3d)

    # update existing map
    if update:
        smiles.map = read_pickle(output_filename)

    for input_filename in input_filenames:
        # parenthesized single-argument form: same output with the
        # Python 2 print statement, and valid as a Python 3 function call
        print(input_filename)
        with serial.MolReader().open(input_filename) as reader:
            for mol in reader:
                try:
                    smiles.add_mol(mol)
                except ValueError:
                    # identify the skipped molecule by its name, falling
                    # back to its isomeric SMILES when it has no name
                    if mol.HasProp('_Name'):
                        print('Skipping {}'.format(mol.GetProp('_Name')))
                    else:
                        print('Skipping {}'.format(
                            Chem.MolToSmiles(mol, isomericSmiles=True)))
    write_pickle(smiles.get_map(), output_filename)
    def test_main(self):
        """
        Test main.

        Runs ``main`` on ``self.filename`` with ``self.temp_dir`` as the
        output directory, then checks the number of ``.pkl.gz`` files
        written and the contents of each one.
        """
        args = get_args([self.filename, '-d', self.temp_dir])
        main(args.input, args.merge, args.dir)

        # check for the right number of files
        # (list the outputs once and reuse the listing below)
        filenames = glob.glob(os.path.join(self.temp_dir, '*.pkl.gz'))
        assert len(filenames) == 6

        # inspect files individually
        for filename in filenames:
            data = read_pickle(filename)
            assert len(data['smiles']) == len(data['targets'])

            # try to read SMILES
            # MolFromSmiles signals a parse failure by returning None
            # instead of raising, so the result has to be checked
            for this_smiles in data['smiles']:
                assert Chem.MolFromSmiles(this_smiles) is not None, (
                    this_smiles)

            # check type of targets
            assert data['targets'].dtype == int
Example #15
0
    def test_update(self):
        """
        Test update existing map.

        Builds a map, appends one molecule to the input file, re-runs
        ``main`` in update mode and checks that the stored map covers all
        molecules.
        """
        args = parse_args(['-i', self.input_filename, '-o',
                           self.output_filename, '-p', 'CID'])
        # NOTE(review): ``update`` is passed by keyword. When ``main``
        # declares ``allow_duplicates`` ahead of ``update``, a fourth
        # positional argument binds to ``allow_duplicates`` and the update
        # path is never exercised.
        main(args.input, args.output, args.prefix, update=args.update)

        # add another molecule
        self.smiles.append('CC(=O)NC1=CC=C(C=C1)O')
        self.cids.append(1983)
        with open(self.input_filename, 'wb') as f:
            for smile, cid in zip(self.smiles, self.cids):
                f.write('{}\t{}\n'.format(smile, cid))

        # update existing map
        main(args.input, args.output, args.prefix, update=True)
        data = read_pickle(self.output_filename)
        assert len(data) == len(self.smiles)
        for smile, cid in zip(self.smiles, self.cids):
            assert data['CID{}'.format(cid)] == Chem.MolToSmiles(
                Chem.MolFromSmiles(smile), isomericSmiles=True)
    def test_update(self):
        """
        Test update existing map.

        After an initial run of ``main``, one more molecule is written to
        the input file and ``main`` is run again with ``update=True``; the
        resulting map must contain every molecule.
        """
        args = parse_args([
            '-i', self.input_filename, '-o', self.output_filename, '-p', 'CID'
        ])
        # NOTE(review): keyword form on purpose. A fourth positional
        # argument would bind to ``allow_duplicates`` rather than
        # ``update`` if ``main`` lists ``allow_duplicates`` first.
        main(args.input, args.output, args.prefix, update=args.update)

        # add another molecule
        self.smiles.append('CC(=O)NC1=CC=C(C=C1)O')
        self.cids.append(1983)
        with open(self.input_filename, 'wb') as f:
            for smile, cid in zip(self.smiles, self.cids):
                f.write('{}\t{}\n'.format(smile, cid))

        # update existing map
        main(args.input, args.output, args.prefix, update=True)
        data = read_pickle(self.output_filename)
        assert len(data) == len(self.smiles)
        for smile, cid in zip(self.smiles, self.cids):
            expected = Chem.MolToSmiles(Chem.MolFromSmiles(smile),
                                        isomericSmiles=True)
            assert data['CID{}'.format(cid)] == expected
Example #17
0
def main(featurizer_class,
         input_filename,
         output_filename,
         target_filename=None,
         featurizer_kwargs=None,
         parallel=False,
         client_kwargs=None,
         view_flags=None,
         compression_level=3,
         smiles_hydrogens=False,
         include_smiles=False,
         scaffolds=False,
         chiral_scaffolds=False,
         mol_id_prefix=None):
    """
    Featurize molecules in input_filename using the given featurizer.

    The molecule IDs, features and (optionally) targets, SMILES and
    scaffolds are collected into a pandas DataFrame, which is handed to
    ``write_output_file``.

    Parameters
    ----------
    featurizer_class : Featurizer
        Featurizer class.
    input_filename : str
        Filename containing molecules to be featurized.
    output_filename : str
        Output filename. For names ending in .csv or .csv.gz,
        multidimensional features are stored as space-separated strings.
    target_filename : str, optional
        Pickle containing target values. Should either be array_like (one
        value per molecule) or a dict containing 'mol_id' and 'y' keys,
        corresponding to molecule IDs and target values.
    featurizer_kwargs : dict, optional
        Keyword arguments passed to featurizer.
    parallel : bool, optional
        Passed to ``featurizer.featurize``; whether to featurize in
        parallel using IPython.parallel (default False).
    client_kwargs : dict, optional
        Keyword arguments for IPython.parallel Client.
    view_flags : dict, optional
        Flags for IPython.parallel LoadBalancedView.
    compression_level : int, optional (default 3)
        Compression level (0-9) to use with joblib.dump.
    smiles_hydrogens : bool, optional (default False)
        Whether to keep hydrogens when generating SMILES.
    include_smiles : bool, optional (default False)
        Include SMILES in output.
    scaffolds : bool, optional (default False)
        Whether to include scaffolds in output.
    chiral_scaffolds : bool, optional (default False)
        Whether to include chirality in scaffolds.
    mol_id_prefix : str, optional
        Prefix for molecule IDs.
    """
    mols, mol_ids = read_mols(input_filename, mol_id_prefix=mol_id_prefix)

    # get targets
    data = {}
    if target_filename is not None:
        targets = read_pickle(target_filename)
        if isinstance(targets, dict):
            # keep only molecules and targets that can be paired by ID
            mol_indices, target_indices = collate_mols(mols, mol_ids,
                                                       targets['y'],
                                                       targets['mol_id'])
            mols = mols[mol_indices]
            mol_ids = mol_ids[mol_indices]
            targets = np.asarray(targets['y'])[target_indices]
        else:
            assert len(targets) == len(mols)
        data['y'] = targets

    # featurize molecules
    # (single-argument print(...) gives the same output under the Python 2
    # print statement and the Python 3 print function)
    print("Featurizing molecules...")
    if featurizer_kwargs is None:
        featurizer_kwargs = {}
    featurizer = featurizer_class(**featurizer_kwargs)
    features = featurizer.featurize(mols, parallel, client_kwargs, view_flags)

    # fill in data container
    print("Saving results...")
    data['mol_id'] = mol_ids
    data['features'] = features

    # sanity checks
    assert data['features'].shape[0] == len(mols), (
        "Features do not match molecules.")
    assert data['mol_id'].shape[0] == len(mols), (
        "Molecule IDs do not match molecules.")

    # smiles, scaffolds, args
    if include_smiles:
        smiles = SmilesGenerator(remove_hydrogens=(not smiles_hydrogens))
        data['smiles'] = np.asarray([smiles.get_smiles(mol) for mol in mols])
    if scaffolds:
        data['scaffolds'] = get_scaffolds(mols, chiral_scaffolds)

    # construct a DataFrame
    # multidimensional features are split into one object per row so that
    # they fit in a single DataFrame column; an AttributeError raised while
    # doing so leaves the features untouched
    try:
        if data['features'].ndim > 1:
            # numpy arrays will be "summarized" when written as strings
            # use str(row.tolist())[1:-1] to remove the surrounding brackets
            # remove commas (keeping spaces) to avoid conflicts with csv
            if output_filename.endswith(('.csv', '.csv.gz')):
                data['features'] = [
                    str(row.tolist())[1:-1].replace(', ', ' ')
                    for row in data['features']
                ]
            else:
                data['features'] = list(data['features'])
    except AttributeError:
        pass
    df = pd.DataFrame(data)

    # write output file
    write_output_file(df, output_filename, compression_level)
Example #18
0
def main(featurizer_class, input_filename, output_filename,
         target_filename=None, featurizer_kwargs=None, parallel=False,
         client_kwargs=None, view_flags=None, compression_level=3,
         smiles_hydrogens=False, names=False, scaffolds=False,
         chiral_scaffolds=False):
    """
    Featurize molecules in input_filename using the given featurizer.

    Features, SMILES and (optionally) targets, names and scaffolds are
    collected in a dict together with an 'args' record of the call, and
    the dict is handed to ``write_output_file``.

    Parameters
    ----------
    featurizer_class : Featurizer
        Featurizer class.
    input_filename : str
        Filename containing molecules to be featurized.
    output_filename : str
        Output filename. Should end with .pkl or .pkl.gz.
    target_filename : str, optional
        Pickle containing target values. Should either be array_like (one
        value per molecule) or a dict containing 'names' and 'y' keys,
        corresponding to molecule names and target values.
    featurizer_kwargs : dict, optional
        Keyword arguments passed to featurizer.
    parallel : bool, optional
        Passed to ``featurizer.featurize``; whether to featurize in
        parallel using IPython.parallel (default False).
    client_kwargs : dict, optional
        Keyword arguments for IPython.parallel Client.
    view_flags : dict, optional
        Flags for IPython.parallel LoadBalancedView.
    compression_level : int, optional (default 3)
        Compression level (0-9) to use with joblib.dump.
    smiles_hydrogens : bool, optional (default False)
        Whether to keep hydrogens when generating SMILES.
    names : bool, optional (default False)
        Whether to include molecule names in output.
    scaffolds : bool, optional (default False)
        Whether to include scaffolds in output.
    chiral_scaffolds : bool, optional (default False)
        Whether to include chirality in scaffolds.
    """
    mols, mol_names = read_mols(input_filename)

    # get targets
    data = {}
    if target_filename is not None:
        targets = read_pickle(target_filename)
        if isinstance(targets, dict):
            # keep only molecules and targets that can be paired by name
            mol_indices, target_indices = collate_mols(
                mols, mol_names, targets['y'], targets['names'])
            mols = mols[mol_indices]
            mol_names = mol_names[mol_indices]
            targets = np.asarray(targets['y'])[target_indices]
        else:
            assert len(targets) == len(mols)
        data['y'] = targets

    # featurize molecules
    # (single-argument print(...) gives the same output under the Python 2
    # print statement and the Python 3 print function)
    print("Featurizing molecules...")
    if featurizer_kwargs is None:
        featurizer_kwargs = {}
    featurizer = featurizer_class(**featurizer_kwargs)
    features = featurizer.featurize(mols, parallel, client_kwargs, view_flags)

    # fill in data container
    print("Saving results...")
    data['features'] = features

    # calculate SMILES
    smiles = SmilesGenerator(remove_hydrogens=(not smiles_hydrogens))
    data['smiles'] = np.asarray([smiles.get_smiles(mol) for mol in mols])

    # sanity checks
    assert data['features'].shape[0] == len(mols), (
        "Features do not match molecules.")
    assert data['smiles'].shape[0] == len(mols), (
        "SMILES do not match molecules.")

    # names, scaffolds, args
    if names:
        data['names'] = mol_names
    if scaffolds:
        data['scaffolds'] = get_scaffolds(mols, chiral_scaffolds)
    # record how the features were generated alongside the results
    data['args'] = {'featurizer_class': featurizer_class.__name__,
                    'input_filename': input_filename,
                    'target_filename': target_filename,
                    'featurizer_kwargs': featurizer_kwargs,
                    'chiral_scaffolds': chiral_scaffolds}

    # write output file
    write_output_file(data, output_filename, compression_level)
Example #19
0
def main(featurizer_class, input_filename, output_filename,
         target_filename=None, featurizer_kwargs=None, parallel=False,
         client_kwargs=None, view_flags=None, compression_level=3,
         smiles_hydrogens=False, include_smiles=False, scaffolds=False,
         chiral_scaffolds=False, mol_id_prefix=None):
    """
    Featurize molecules in input_filename using the given featurizer.

    Builds a pandas DataFrame holding molecule IDs and features, plus
    targets, SMILES and scaffolds when requested, and passes it to
    ``write_output_file``.

    Parameters
    ----------
    featurizer_class : Featurizer
        Featurizer class.
    input_filename : str
        Filename containing molecules to be featurized.
    output_filename : str
        Output filename. For names ending in .csv or .csv.gz,
        multidimensional features are stored as space-separated strings.
    target_filename : str, optional
        Pickle containing target values. Should either be array_like (one
        value per molecule) or a dict containing 'mol_id' and 'y' keys,
        corresponding to molecule IDs and target values.
    featurizer_kwargs : dict, optional
        Keyword arguments passed to featurizer.
    parallel : bool, optional
        Passed to ``featurizer.featurize``; whether to featurize in
        parallel using IPython.parallel (default False).
    client_kwargs : dict, optional
        Keyword arguments for IPython.parallel Client.
    view_flags : dict, optional
        Flags for IPython.parallel LoadBalancedView.
    compression_level : int, optional (default 3)
        Compression level (0-9) to use with joblib.dump.
    smiles_hydrogens : bool, optional (default False)
        Whether to keep hydrogens when generating SMILES.
    include_smiles : bool, optional (default False)
        Include SMILES in output.
    scaffolds : bool, optional (default False)
        Whether to include scaffolds in output.
    chiral_scaffolds : bool, optional (default False)
        Whether to include chirality in scaffolds.
    mol_id_prefix : str, optional
        Prefix for molecule IDs.
    """
    mols, mol_ids = read_mols(input_filename, mol_id_prefix=mol_id_prefix)

    # get targets
    data = {}
    if target_filename is not None:
        targets = read_pickle(target_filename)
        if isinstance(targets, dict):
            # keep only molecules and targets that can be paired by ID
            mol_indices, target_indices = collate_mols(
                mols, mol_ids, targets['y'], targets['mol_id'])
            mols = mols[mol_indices]
            mol_ids = mol_ids[mol_indices]
            targets = np.asarray(targets['y'])[target_indices]
        else:
            assert len(targets) == len(mols)
        data['y'] = targets

    # featurize molecules
    # (single-argument print(...) gives the same output under the Python 2
    # print statement and the Python 3 print function)
    print("Featurizing molecules...")
    if featurizer_kwargs is None:
        featurizer_kwargs = {}
    featurizer = featurizer_class(**featurizer_kwargs)
    features = featurizer.featurize(mols, parallel, client_kwargs, view_flags)

    # fill in data container
    print("Saving results...")
    data['mol_id'] = mol_ids
    data['features'] = features

    # sanity checks
    assert data['features'].shape[0] == len(mols), (
        "Features do not match molecules.")
    assert data['mol_id'].shape[0] == len(mols), (
        "Molecule IDs do not match molecules.")

    # smiles, scaffolds, args
    if include_smiles:
        smiles = SmilesGenerator(remove_hydrogens=(not smiles_hydrogens))
        data['smiles'] = np.asarray([smiles.get_smiles(mol) for mol in mols])
    if scaffolds:
        data['scaffolds'] = get_scaffolds(mols, chiral_scaffolds)

    # construct a DataFrame
    # multidimensional features are split into one object per row so that
    # they fit in a single DataFrame column; an AttributeError raised while
    # doing so leaves the features untouched
    try:
        if data['features'].ndim > 1:
            # numpy arrays will be "summarized" when written as strings
            # use str(row.tolist())[1:-1] to remove the surrounding brackets
            # remove commas (keeping spaces) to avoid conflicts with csv
            if output_filename.endswith(('.csv', '.csv.gz')):
                data['features'] = [str(row.tolist())[1:-1].replace(', ', ' ')
                                    for row in data['features']]
            else:
                data['features'] = list(data['features'])
    except AttributeError:
        pass
    df = pd.DataFrame(data)

    # write output file
    write_output_file(df, output_filename, compression_level)
Example #20
0
    def check_output(self,
                     featurize_args,
                     shape,
                     targets=None,
                     names=None,
                     smiles=None,
                     output_suffix='.pkl'):
        """
        Check features shape, targets, and names.

        Runs ``main`` on ``self.input_filename`` / ``self.targets_filename``
        with the '--names' flag and a fresh temporary output file, reads
        the output back and compares it against the expected values.

        Parameters
        ----------
        featurize_args : list
            Featurizer-specific arguments for script.
        shape : tuple
            Expected shape of features. ``shape[0]`` is compared with
            ``len(data)``; any remaining dimensions are compared with the
            shape of the first row's features.
        targets : list, optional
            Expected targets. Defaults to self.targets.
        names : list, optional
            Expected names. Defaults to self.names.
        smiles : list, optional
            Expected SMILES. Defaults to self.smiles.
        output_suffix : str, optional (default '.pkl')
            Suffix for output files.

        Returns
        -------
        data
            The object loaded from the output file, in case anything else
            needs to be checked.
        """
        import os  # local import: only needed to release the mkstemp handle

        # generate command-line arguments
        # mkstemp hands back an open descriptor; close it so it is not
        # leaked (the script writes the file by name)
        fd, output_filename = tempfile.mkstemp(suffix=output_suffix,
                                               dir=self.temp_dir)
        os.close(fd)
        input_args = [
            self.input_filename, '-t', self.targets_filename, output_filename,
            '--names'
        ] + featurize_args

        # run script
        args = parse_args(input_args)
        main(args.klass,
             args.input,
             args.output,
             target_filename=args.targets,
             featurizer_kwargs=vars(args.featurizer_kwargs),
             names=args.names,
             scaffolds=args.scaffolds,
             chiral_scaffolds=args.chiral_scaffolds)

        # read output file
        if output_filename.endswith('.joblib'):
            data = joblib.load(output_filename)
        else:
            data = read_pickle(output_filename)

        # check values
        if targets is None:
            targets = self.targets
        if names is None:
            names = self.names
        if smiles is None:
            smiles = self.smiles
        assert len(data) == shape[0]
        if len(shape) > 1:
            # positional access to the first row; the ``.ix`` indexer used
            # previously is deprecated and no longer exists in current pandas
            assert data['features'].iloc[0].shape == shape[1:]
        assert np.array_equal(data['y'], targets), data['y']
        assert np.array_equal(data['names'], names), data['names']
        assert np.array_equal(data['smiles'], smiles), data['smiles']

        # return output in case anything else needs to be checked
        return data