Ejemplo n.º 1
0
    def setUp(self):
        """
        Set up for tests. Writes molecules and targets to files.

        Builds conformers (max_conformers=1) for aspirin and ibuprofen,
        writes the molecules to a temporary '.sdf' file and the targets to a
        temporary '.pkl' file, both inside a fresh temporary directory.
        """
        self.temp_dir = tempfile.mkdtemp()
        self.names = ['aspirin', 'ibuprofen']
        input_smiles = [
            'CC(=O)OC1=CC=CC=C1C(=O)O', 'C[C@@H](C1=CC=C(C=C1)CC(C)C)C(=O)O'
        ]
        engine = conformers.ConformerGenerator(max_conformers=1)
        self.mols = []
        self.smiles = []  # use RDKit-generated SMILES
        for name, smi in zip(self.names, input_smiles):
            mol = Chem.MolFromSmiles(smi)
            mol.SetProp('_Name', name)
            self.mols.append(engine.generate_conformers(mol))
            self.smiles.append(
                Chem.MolToSmiles(mol, isomericSmiles=True, canonical=True))

        # write mols
        # NamedTemporaryFile(delete=False) closes its handle on exit but
        # keeps the file; the raw descriptor returned by mkstemp used to be
        # discarded here and was therefore leaked.
        with tempfile.NamedTemporaryFile(suffix='.sdf',
                                         dir=self.temp_dir,
                                         delete=False) as tmp:
            self.input_filename = tmp.name
        writer = serial.MolWriter()
        writer.open(self.input_filename)
        writer.write(self.mols)
        writer.close()

        # write targets
        self.targets = [0, 1]
        with tempfile.NamedTemporaryFile(suffix='.pkl',
                                         dir=self.temp_dir,
                                         delete=False) as tmp:
            self.targets_filename = tmp.name
        write_pickle(self.targets, self.targets_filename)
Ejemplo n.º 2
0
def main(active_filename,
         decoy_filename,
         output_filename,
         assign_stereo_from_3d=False):
    """
    Build and pickle binary targets for an active/decoy dataset.

    Actives are labelled 1 and decoys 0. SMILES and labels are concatenated
    with the actives first and written as a dict with 'smiles' and 'targets'
    keys.

    Parameters
    ----------
    active_filename : str
        Active molecule filename.
    decoy_filename : str
        Decoy molecule filename.
    output_filename : str
        Output filename.
    assign_stereo_from_3d : bool, optional (default False)
        Assign stereochemistry from 3D coordinates.
    """
    # actives first, then decoys; the label order below relies on this
    smiles_by_class = [
        get_smiles(filename, assign_stereo_from_3d)
        for filename in (active_filename, decoy_filename)
    ]
    n_active, n_decoy = (len(group) for group in smiles_by_class)
    labels = np.concatenate(
        (np.ones(n_active, dtype=int), np.zeros(n_decoy, dtype=int)))
    payload = {'smiles': np.concatenate(smiles_by_class), 'targets': labels}
    write_pickle(payload, output_filename)
def main(input_filename, map_filename, output_filename, column_indices=None):
    """
    Extract targets from PCBA data and pickle them with their SMILES.

    Also prints how many assay records with a non-NaN PUBCHEM_CID produced a
    target.

    Parameters
    ----------
    input_filename : str
        PCBA data filename.
    map_filename : str
        ID->SMILES map filename.
    output_filename : str
        Output filename.
    column_indices : list, optional
        Column indices to include. If None, compounds are classified by
        activity.
    """
    parser = PcbaParser(input_filename, map_filename,
                        column_indices=column_indices)
    # Single-argument print(...) calls behave identically on Python 2 and
    # are also valid on Python 3, unlike the bare print statement.
    if column_indices is not None:
        print("Extracting data from the following columns:")
        for col in parser.get_column_names():
            print('\t%s' % (col,))
    smiles, targets = parser.get_targets()

    # print the fraction of valid assay records that were found in the map
    total = np.count_nonzero(~np.isnan(parser.read_data().PUBCHEM_CID))
    print('{}/{} records matched'.format(len(targets), total))

    # save SMILES and targets
    write_pickle({'smiles': smiles, 'targets': targets}, output_filename)
Ejemplo n.º 4
0
  def setUp(self):
    """
    Set up for tests. Writes molecules and targets to files.

    Builds conformers (max_conformers=1) for aspirin and ibuprofen, writes
    the molecules to a temporary '.sdf' file and the targets to a temporary
    '.pkl' file, both inside a fresh temporary directory.
    """
    self.temp_dir = tempfile.mkdtemp()
    self.names = ['aspirin', 'ibuprofen']
    raw_smiles = ['CC(=O)OC1=CC=CC=C1C(=O)O',
                  'C[C@@H](C1=CC=C(C=C1)CC(C)C)C(=O)O']
    engine = conformers.ConformerGenerator(max_conformers=1)
    self.mols = []
    self.smiles = []  # use RDKit-generated SMILES
    for name, smi in zip(self.names, raw_smiles):
      mol = Chem.MolFromSmiles(smi)
      mol.SetProp('_Name', name)
      self.mols.append(engine.generate_conformers(mol))
      self.smiles.append(Chem.MolToSmiles(mol, isomericSmiles=True,
                                          canonical=True))

    # write mols
    # mkstemp hands back an open descriptor that was being thrown away
    # (leaked); NamedTemporaryFile(delete=False) closes the handle on exit
    # and leaves the file in place.
    with tempfile.NamedTemporaryFile(suffix='.sdf', dir=self.temp_dir,
                                     delete=False) as tmp:
      self.input_filename = tmp.name
    writer = serial.MolWriter()
    writer.open(self.input_filename)
    writer.write(self.mols)
    writer.close()

    # write targets
    self.targets = [0, 1]
    with tempfile.NamedTemporaryFile(suffix='.pkl', dir=self.temp_dir,
                                     delete=False) as tmp:
      self.targets_filename = tmp.name
    write_pickle(self.targets, self.targets_filename)
Ejemplo n.º 5
0
def main(input_filename, map_filename, directory='.', prefix='nci60',
         suffix='pkl.gz'):
    """
    Split NCI60 targets by dataset and pickle each dataset separately.

    One file named '<prefix>-<NN>-targets.<suffix>' is written per dataset,
    where NN is the zero-padded position of the dataset in the parser output.

    Parameters
    ----------
    input_filename : str
        NCI60 data filename (passed to Nci60Parser).
    map_filename : str
        ID->SMILES map filename.
    directory : str, optional (default '.')
        Directory in which to write target files.
    prefix : str, optional (default 'nci60')
        Prefix for target files.
    suffix : str, optional (default 'pkl.gz')
        Suffix for target files.
    """
    parser = Nci60Parser(input_filename, map_filename)
    split_targets = parser.split_targets()

    # get total record count
    total = np.count_nonzero(~np.isnan(parser.read_data().NSC))

    # write a separate file for each dataset
    # note that split_targets is an OrderedDict
    for i, (name, data) in enumerate(split_targets.items()):
        # print the fraction of valid assay records that were found in the map
        # (single-argument print call: same output on Python 2, valid on 3)
        print('{}\t{}/{} records matched'.format(
            name, len(data['targets']), total))
        target_filename = '{}-{:02}-targets.{}'.format(prefix, i, suffix)
        write_pickle(data, os.path.join(directory, target_filename))
    def setUp(self):
        """
        Set up tests.

        Pickles a small ID->SMILES map into a fresh temporary directory and
        builds an AssayDataParser over data/test_pcba_data.csv, located
        relative to this file.
        """
        self.temp_dir = tempfile.mkdtemp()
        self.map = {
            'CID645443':
            'Cc1ccc(-n2c3c(cc(C(=O)Nc4cccc(C)n4)c2=O)C(=O)CCC3)cc1',
            'CID2997889': 'CC(C)(C)C(=O)Nc1ccc(-c2cn3ccsc3n2)cc1',
            'CID2244': 'CC(=O)Oc1ccccc1C(=O)O',
            'CID2662': 'Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1',
            'CID3672': 'CC(C)Cc1ccc(C(C)C(=O)O)cc1'
        }
        # NamedTemporaryFile(delete=False) closes its handle on exit and
        # keeps the file; the descriptor returned by mkstemp was previously
        # discarded and leaked.
        with tempfile.NamedTemporaryFile(dir=self.temp_dir,
                                         suffix='.pkl',
                                         delete=False) as tmp:
            self.map_filename = tmp.name
        write_pickle(self.map, self.map_filename)

        # use a subset of AID588342
        # note that CID 654924 is duplicated
        this_dir = os.path.dirname(os.path.realpath(__file__))
        self.data_filename = os.path.join(this_dir, 'data/test_pcba_data.csv')

        # set up parser
        # settings match PcbaParser defaults
        self.engine = AssayDataParser(self.data_filename,
                                      self.map_filename,
                                      delimiter=',',
                                      primary_key='PUBCHEM_CID',
                                      activity_key='PUBCHEM_ACTIVITY_OUTCOME',
                                      activity_value='Active',
                                      id_prefix='CID')
    def setUp(self):
        """
        Set up tests.

        Pickles a small ID->SMILES map into a fresh temporary directory and
        builds an AssayDataParser over data/test_pcba_data.csv, located
        relative to this file.
        """
        self.temp_dir = tempfile.mkdtemp()
        self.map = {
            "CID645443": "Cc1ccc(-n2c3c(cc(C(=O)Nc4cccc(C)n4)c2=O)C(=O)CCC3)cc1",
            "CID2997889": "CC(C)(C)C(=O)Nc1ccc(-c2cn3ccsc3n2)cc1",
            "CID2244": "CC(=O)Oc1ccccc1C(=O)O",
            "CID2662": "Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1",
            "CID3672": "CC(C)Cc1ccc(C(C)C(=O)O)cc1",
        }
        # mkstemp returns an open descriptor that was being dropped (leaked);
        # NamedTemporaryFile(delete=False) closes the handle and keeps the
        # file so write_pickle can reopen it by name.
        with tempfile.NamedTemporaryFile(
            dir=self.temp_dir, suffix=".pkl", delete=False
        ) as tmp:
            self.map_filename = tmp.name
        write_pickle(self.map, self.map_filename)

        # use a subset of AID588342
        # note that CID 654924 is duplicated
        this_dir = os.path.dirname(os.path.realpath(__file__))
        self.data_filename = os.path.join(this_dir, "data/test_pcba_data.csv")

        # set up parser
        # settings match PcbaParser defaults
        self.engine = AssayDataParser(
            self.data_filename,
            self.map_filename,
            delimiter=",",
            primary_key="PUBCHEM_CID",
            activity_key="PUBCHEM_ACTIVITY_OUTCOME",
            activity_value="Active",
            id_prefix="CID",
        )
Ejemplo n.º 8
0
 def test_write_pickle_gz(self):
     """
     Test write_pickle with gzipped pickle.

     Writes a small dict to a '.pkl.gz' path and checks that it can be read
     back through gzip + cPickle.
     """
     # NamedTemporaryFile(delete=False) closes its handle on exit and keeps
     # the file; the descriptor returned by mkstemp used to be discarded and
     # leaked.
     with tempfile.NamedTemporaryFile(dir=self.temp_dir, suffix='.pkl.gz',
                                      delete=False) as tmp:
         filename = tmp.name
     write_pickle({'foo': 'bar'}, filename)
     with gzip.open(filename) as f:
         assert cPickle.load(f)['foo'] == 'bar'
Ejemplo n.º 9
0
 def test_write_pickle_gz(self):
     """
     Test write_pickle with gzipped pickle.

     Round-trips a small dict through a '.pkl.gz' file: written with
     write_pickle, read back with gzip.open + cPickle.load.
     """
     # Reserve the path without leaking a descriptor: mkstemp returns an
     # open OS-level handle, while NamedTemporaryFile(delete=False) closes
     # its handle when the with-block ends and leaves the file on disk.
     with tempfile.NamedTemporaryFile(dir=self.temp_dir, suffix='.pkl.gz',
                                      delete=False) as tmp:
         filename = tmp.name
     write_pickle({'foo': 'bar'}, filename)
     with gzip.open(filename) as f:
         loaded = cPickle.load(f)
     assert loaded['foo'] == 'bar'
Ejemplo n.º 10
0
    def test_collate_mols3(self):
        """
        Test collate_mols where targets are in a different order than
        molecules.
        """
        # write targets (ibuprofen listed before aspirin)
        reordered = dict(names=['ibuprofen', 'aspirin'], y=[1, 0])
        write_pickle(reordered, self.targets_filename)

        # run script
        self.check_output(['circular'], (2, 2048))
Ejemplo n.º 11
0
  def test_collate_mols1(self):
    """
    Test collate_mols where molecules are pruned.
    """
    # write targets for ibuprofen only
    kept_names = ['ibuprofen']
    kept_y = [0]
    write_pickle({'names': kept_names, 'y': kept_y}, self.targets_filename)

    # run script
    self.check_output(['circular'], (1, 2048), targets=kept_y,
                      names=kept_names, smiles=[self.smiles[1]])
Ejemplo n.º 12
0
  def test_collate_mols3(self):
    """
    Test collate_mols where targets are in a different order than
    molecules.
    """
    # write targets (ibuprofen listed before aspirin)
    names = ['ibuprofen', 'aspirin']
    y = [1, 0]
    write_pickle({'names': names, 'y': y}, self.targets_filename)

    # run script
    self.check_output(['circular'], (2, 2048))
Ejemplo n.º 13
0
    def test_collate_mols1(self):
        """
        Test collate_mols where molecules are pruned.
        """
        # write targets for ibuprofen only
        pruned = dict(names=['ibuprofen'], y=[0])
        write_pickle(pruned, self.targets_filename)

        # run script
        expected = dict(targets=pruned['y'],
                        names=pruned['names'],
                        smiles=[self.smiles[1]])
        self.check_output(['circular'], (1, 2048), **expected)
Ejemplo n.º 14
0
    def setUp(self):
        """
        Set up tests.

        Pickles a one-entry ID->SMILES map into a fresh temporary directory
        and builds an Nci60Parser over data/test_nci60_data.txt, located
        relative to this file.
        """
        self.temp_dir = tempfile.mkdtemp()
        self.map = {"NSC1": "CC1=CC(=O)C=CC1=O"}
        # mkstemp returns an open descriptor that was being dropped (leaked);
        # NamedTemporaryFile(delete=False) closes the handle and keeps the
        # file so write_pickle can reopen it by name.
        with tempfile.NamedTemporaryFile(
            dir=self.temp_dir, suffix=".pkl", delete=False
        ) as tmp:
            self.map_filename = tmp.name
        write_pickle(self.map, self.map_filename)

        this_dir = os.path.dirname(os.path.realpath(__file__))
        self.data_filename = os.path.join(this_dir, "data/test_nci60_data.txt")

        # set up parser
        self.engine = Nci60Parser(self.data_filename, self.map_filename)
Ejemplo n.º 15
0
    def setUp(self):
        """
        Set up tests.

        Pickles a one-entry ID->SMILES map into a fresh temporary directory
        and builds an Nci60Parser over data/test_nci60_data.txt, located
        relative to this file.
        """
        self.temp_dir = tempfile.mkdtemp()
        self.map = {'NSC1': 'CC1=CC(=O)C=CC1=O'}
        # NamedTemporaryFile(delete=False) closes its handle on exit and
        # keeps the file; the descriptor returned by mkstemp was previously
        # discarded and leaked.
        with tempfile.NamedTemporaryFile(dir=self.temp_dir,
                                         suffix='.pkl',
                                         delete=False) as tmp:
            self.map_filename = tmp.name
        write_pickle(self.map, self.map_filename)

        this_dir = os.path.dirname(os.path.realpath(__file__))
        self.data_filename = os.path.join(this_dir, 'data/test_nci60_data.txt')

        # set up parser
        self.engine = Nci60Parser(self.data_filename, self.map_filename)
Ejemplo n.º 16
0
def main(input_filenames,
         output_filename,
         id_prefix=None,
         allow_duplicates=True,
         update=False,
         assign_stereo_from_3d=False):
    """
    Get SMILES for compounds and map to compound names.

    Each input filename is printed as it is processed. Molecules for which
    SmilesMap.add_mol raises ValueError are reported with a 'Skipping ...'
    message and left out of the map.

    Parameters
    ----------
    input_filenames : list
        Input molecule filenames.
    output_filename : str
        Output filename.
    id_prefix : str, optional
        Prefix to prepend to IDs.
    allow_duplicates : bool, optional (default True)
        Allow duplicate SMILES.
    update : bool, optional (default False)
        Update an existing map with the same output filename. If False, a new
        map will be generated using only the input file(s).
    assign_stereo_from_3d : bool, optional (default False)
        Assign stereochemistry from 3D coordinates.
    """
    smiles = SmilesMap(prefix=id_prefix,
                       allow_duplicates=allow_duplicates,
                       assign_stereo_from_3d=assign_stereo_from_3d)

    # update existing map
    if update:
        smiles.map = read_pickle(output_filename)

    # Single-argument print(...) calls give the same output on Python 2 and
    # are also valid on Python 3, unlike the bare print statement.
    for input_filename in input_filenames:
        print(input_filename)
        with serial.MolReader().open(input_filename) as reader:
            for mol in reader:
                try:
                    smiles.add_mol(mol)
                except ValueError:
                    # identify the skipped molecule by name when it has one,
                    # otherwise by its isomeric SMILES
                    if mol.HasProp('_Name'):
                        label = mol.GetProp('_Name')
                    else:
                        label = Chem.MolToSmiles(mol, isomericSmiles=True)
                    print('Skipping {}'.format(label))
    write_pickle(smiles.get_map(), output_filename)
Ejemplo n.º 17
0
  def test_collate_mols2(self):
    """
    Test collate_mols where targets are pruned.
    """
    # write targets for both molecules
    both = dict(names=['aspirin', 'ibuprofen'], y=[0, 1])
    write_pickle(both, self.targets_filename)

    # write mols: overwrite the input file with the first molecule only
    first_only = [self.mols[0]]
    mol_writer = serial.MolWriter()
    mol_writer.open(self.input_filename)
    mol_writer.write(first_only)
    mol_writer.close()

    # run script
    self.check_output(['circular'], (1, 2048), targets=[0],
                      names=['aspirin'], smiles=[self.smiles[0]])
Ejemplo n.º 18
0
def write_output_file(data, output_filename, compression_level=3):
    """
    Pickle output data, possibly to a compressed file.

    The writer is chosen from the filename extension: '.pkl' and '.pkl.gz'
    go through write_pickle, '.joblib' goes through joblib.dump.

    Parameters
    ----------
    data : object
        Object to pickle in output file.
    output_filename : str
        Output filename. Should end with .joblib, .pkl, or .pkl.gz.
    compression_level : int, optional (default 3)
        Compression level (0-9) to use with joblib.dump.

    Raises
    ------
    NotImplementedError
        If the filename has none of the recognized extensions.
    """
    if output_filename.endswith(('.pkl', '.pkl.gz')):
        write_pickle(data, output_filename)
        return
    if output_filename.endswith('.joblib'):
        joblib.dump(data, output_filename, compress=compression_level)
        return
    raise NotImplementedError('Unrecognized output file extension.')
Ejemplo n.º 19
0
    def test_collate_mols2(self):
        """
        Test collate_mols where targets are pruned.
        """
        # write targets for both molecules
        names = ['aspirin', 'ibuprofen']
        y = [0, 1]
        write_pickle({'names': names, 'y': y}, self.targets_filename)

        # write mols: overwrite the input file with the first molecule only
        mol_writer = serial.MolWriter()
        mol_writer.open(self.input_filename)
        mol_writer.write(self.mols[:1])
        mol_writer.close()

        # run script
        expected = dict(targets=[0],
                        names=['aspirin'],
                        smiles=[self.smiles[0]])
        self.check_output(['circular'], (1, 2048), **expected)
Ejemplo n.º 20
0
def main(input_filenames, output_filename, id_prefix=None,
         allow_duplicates=True, update=False, assign_stereo_from_3d=False):
    """
    Get SMILES for compounds and map to compound names.

    Each input filename is printed as it is processed. Molecules for which
    SmilesMap.add_mol raises ValueError are reported with a 'Skipping ...'
    message and left out of the map.

    Parameters
    ----------
    input_filenames : list
        Input molecule filenames.
    output_filename : str
        Output filename.
    id_prefix : str, optional
        Prefix to prepend to IDs.
    allow_duplicates : bool, optional (default True)
        Allow duplicate SMILES.
    update : bool, optional (default False)
        Update an existing map with the same output filename. If False, a new
        map will be generated using only the input file(s).
    assign_stereo_from_3d : bool, optional (default False)
        Assign stereochemistry from 3D coordinates.
    """
    smiles = SmilesMap(prefix=id_prefix, allow_duplicates=allow_duplicates,
                       assign_stereo_from_3d=assign_stereo_from_3d)

    # update existing map
    if update:
        smiles.map = read_pickle(output_filename)

    # print(...) with one argument is valid on both Python 2 and Python 3;
    # the bare print statement is a SyntaxError on Python 3.
    for input_filename in input_filenames:
        print(input_filename)
        with serial.MolReader().open(input_filename) as reader:
            for mol in reader:
                try:
                    smiles.add_mol(mol)
                except ValueError:
                    # report by name when available, else by isomeric SMILES
                    skipped = (mol.GetProp('_Name')
                               if mol.HasProp('_Name')
                               else Chem.MolToSmiles(mol, isomericSmiles=True))
                    print('Skipping {}'.format(skipped))
    write_pickle(smiles.get_map(), output_filename)
Ejemplo n.º 21
0
    def setUp(self):
        """
        Set up tests.

        Pickles a small ID->SMILES map into a fresh temporary directory and
        builds a PcbaParser over data/test_pcba_data.csv, located relative to
        this file.
        """
        self.temp_dir = tempfile.mkdtemp()
        self.map = {
            "CID645443": "Cc1ccc(-n2c3c(cc(C(=O)Nc4cccc(C)n4)c2=O)C(=O)CCC3)cc1",
            "CID2997889": "CC(C)(C)C(=O)Nc1ccc(-c2cn3ccsc3n2)cc1",
            "CID2244": "CC(=O)Oc1ccccc1C(=O)O",
            "CID2662": "Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1",
            "CID3672": "CC(C)Cc1ccc(C(C)C(=O)O)cc1",
        }
        # mkstemp returns an open descriptor that was being dropped (leaked);
        # NamedTemporaryFile(delete=False) closes the handle and keeps the
        # file so write_pickle can reopen it by name.
        with tempfile.NamedTemporaryFile(
            dir=self.temp_dir, suffix=".pkl", delete=False
        ) as tmp:
            self.map_filename = tmp.name
        write_pickle(self.map, self.map_filename)

        # use a subset of AID588342
        # note that CID 654924 is duplicated
        this_dir = os.path.dirname(os.path.realpath(__file__))
        self.data_filename = os.path.join(this_dir, "data/test_pcba_data.csv")

        # set up parser
        self.engine = PcbaParser(self.data_filename, self.map_filename)
Ejemplo n.º 22
0
def main(active_filename, decoy_filename, output_filename,
         assign_stereo_from_3d=False):
    """
    Construct target files for datasets with active/decoy labels.

    Actives receive target 1 and decoys target 0; the pickled dict stores the
    concatenated SMILES under 'smiles' and the labels under 'targets', with
    the actives first in both.

    Parameters
    ----------
    active_filename : str
        Active molecule filename.
    decoy_filename : str
        Decoy molecule filename.
    output_filename : str
        Output filename.
    assign_stereo_from_3d : bool, optional (default False)
        Assign stereochemistry from 3D coordinates.
    """
    actives = get_smiles(active_filename, assign_stereo_from_3d)
    decoys = get_smiles(decoy_filename, assign_stereo_from_3d)

    active_labels = np.ones(len(actives), dtype=int)
    decoy_labels = np.zeros(len(decoys), dtype=int)

    write_pickle(
        {
            'smiles': np.concatenate((actives, decoys)),
            'targets': np.concatenate((active_labels, decoy_labels)),
        },
        output_filename)
Ejemplo n.º 23
0
def main(config_filename, output_filename, pdb_filename=None):
    """
    Build and pickle a table of target meta-information.

    Meta-information consists of a row for each target, with a column each
    for associated AIDs and PDBs (lists). Targets that parse as integers are
    renamed with a 'gi_' prefix. A target with no PDB entry gets None in the
    'pdbs' column.

    Parameters
    ----------
    config_filename : str
        CSV file read with pandas; each row must provide 'target' and 'aid'.
    output_filename : str
        Output filename passed to write_pickle.
    pdb_filename : str, optional
        Whitespace-delimited file with one "<target> <codes>" pair per line,
        where multiple PDB codes are separated by ','.
    """
    # read target->PDB associations
    pdb = {}
    if pdb_filename is not None:
        with open(pdb_filename) as f:
            for line in f:
                if not line.strip():
                    # a blank line would otherwise fail the two-field unpack
                    continue
                target, code = line.split()
                pdb[target] = code.split(',')  # multiple PDBs separated by ','
    config = pd.read_csv(config_filename)

    # get AIDs for each target
    targets = {}
    for _, row in config.iterrows():
        target = row['target']
        try:
            int(target)
        except ValueError:
            pass
        else:
            target = 'gi_{}'.format(target)  # add 'gi_' to integer targets
        targets.setdefault(target, []).append(row['aid'])

    # construct dataframe
    # dict.items() works on both Python 2 and 3 (iteritems is Python 2 only)
    points = [{
        'target': target,
        'aids': aids,
        'pdbs': pdb.get(target)
    } for target, aids in targets.items()]
    df = pd.DataFrame(points)
    write_pickle(df, output_filename)
Ejemplo n.º 24
0
def main(input_filename,
         map_filename,
         directory='.',
         prefix='nci60',
         suffix='pkl.gz'):
    """
    Split NCI60 targets by dataset and pickle each dataset separately.

    One file named '<prefix>-<NN>-targets.<suffix>' is written per dataset,
    where NN is the zero-padded position of the dataset in the parser output.

    Parameters
    ----------
    input_filename : str
        NCI60 data filename (passed to Nci60Parser).
    map_filename : str
        ID->SMILES map filename.
    directory : str, optional (default '.')
        Directory in which to write target files.
    prefix : str, optional (default 'nci60')
        Prefix for target files.
    suffix : str, optional (default 'pkl.gz')
        Suffix for target files.
    """
    parser = Nci60Parser(input_filename, map_filename)
    split_targets = parser.split_targets()

    # get total record count
    total = np.count_nonzero(~np.isnan(parser.read_data().NSC))

    # write a separate file for each dataset
    # note that split_targets is an OrderedDict
    for i, (name, data) in enumerate(split_targets.items()):
        # print the fraction of valid assay records that were found in the map
        # (one-argument print call: identical on Python 2, valid on Python 3)
        matched = len(data['targets'])
        print('{}\t{}/{} records matched'.format(name, matched, total))
        write_pickle(
            data,
            os.path.join(directory,
                         '{}-{:02}-targets.{}'.format(prefix, i, suffix)))
Ejemplo n.º 25
0
    def setUp(self):
        """
        Set up tests.

        Pickles a small ID->SMILES map into a fresh temporary directory and
        builds a PcbaParser over data/test_pcba_data.csv, located relative to
        this file.
        """
        self.temp_dir = tempfile.mkdtemp()
        self.map = {
            'CID645443':
            'Cc1ccc(-n2c3c(cc(C(=O)Nc4cccc(C)n4)c2=O)C(=O)CCC3)cc1',
            'CID2997889': 'CC(C)(C)C(=O)Nc1ccc(-c2cn3ccsc3n2)cc1',
            'CID2244': 'CC(=O)Oc1ccccc1C(=O)O',
            'CID2662': 'Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1',
            'CID3672': 'CC(C)Cc1ccc(C(C)C(=O)O)cc1'
        }
        # NamedTemporaryFile(delete=False) closes its handle on exit and
        # keeps the file; the descriptor returned by mkstemp was previously
        # discarded and leaked.
        with tempfile.NamedTemporaryFile(dir=self.temp_dir,
                                         suffix='.pkl',
                                         delete=False) as tmp:
            self.map_filename = tmp.name
        write_pickle(self.map, self.map_filename)

        # use a subset of AID588342
        # note that CID 654924 is duplicated
        this_dir = os.path.dirname(os.path.realpath(__file__))
        self.data_filename = os.path.join(this_dir, 'data/test_pcba_data.csv')

        # set up parser
        self.engine = PcbaParser(self.data_filename, self.map_filename)