Example #1
def load_esol_dataset(data_path, task_names=None, featurizer=None):
    """tbd"""
    if task_names is None:
        task_names = get_default_esol_task_names()

    # NB: some examples have multiple species
    file = os.listdir(data_path)[0]
    input_df = pd.read_csv(join(data_path, file), sep=',')
    smiles_list = input_df['smiles']
    labels = input_df[task_names]

    data_list = []
    for i in range(len(smiles_list)):
        raw_data = {}
        raw_data['smiles'] = smiles_list[i]        
        raw_data['label'] = labels.values[i]

        if featurizer is not None:
            data = featurizer.gen_features(raw_data)
        else:
            data = raw_data

        if data is not None:
            data_list.append(data)

    dataset = InMemoryDataset(data_list)
    return dataset
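A minimal usage sketch for the loader above. Anything that exposes gen_features(raw_data) can serve as the featurizer, and returning None drops a sample, as the loop above shows; the PassthroughFeaturizer below is a hypothetical stand-in, and './esol/raw' is assumed to contain the ESOL csv file.

class PassthroughFeaturizer(object):
    """Hypothetical featurizer: any object with a gen_features(raw_data) method works."""
    def gen_features(self, raw_data):
        # Drop entries with an empty SMILES string, keep everything else unchanged.
        if not raw_data['smiles']:
            return None
        return raw_data

dataset = load_esol_dataset('./esol/raw', featurizer=PassthroughFeaturizer())
print(len(dataset))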
Example #2
def load_esol_dataset(data_path, task_names=None, featurizer=None):
    """Load esol dataset ,process the classification labels and the input information.

    Description:

        The data file contains a csv table, in which columns below are used:
            
            smiles: SMILES representation of the molecular structure
            
            Compound ID: Name of the compound
            
            measured log solubility in mols per litre: Log-scale water solubility of the compound, used as label
   
    Args:
        data_path(str): the path to the cached npz path.
        task_names(list): a list of header names to specify the columns to fetch from 
            the csv file.
        featurizer(pahelix.featurizers.Featurizer): the featurizer to use for 
            processing the data. If not None, ``Featurizer.gen_features`` will be
            applied to the raw data.
    
    Returns:
        an InMemoryDataset instance.
    
    Example:
        .. code-block:: python

            dataset = load_esol_dataset('./esol/raw')
            print(len(dataset))
    
    References:
    
    [1] Delaney, John S. "ESOL: estimating aqueous solubility directly from molecular structure." Journal of chemical information and computer sciences 44.3 (2004): 1000-1005.

    """
    if task_names is None:
        task_names = get_default_esol_task_names()

    # NB: some examples have multiple species
    csv_file = os.listdir(data_path)[0]
    input_df = pd.read_csv(join(data_path, csv_file), sep=',')
    smiles_list = input_df['smiles']
    labels = input_df[task_names]

    data_list = []
    for i in range(len(smiles_list)):
        raw_data = {}
        raw_data['smiles'] = smiles_list[i]
        raw_data['label'] = labels.values[i]

        if featurizer is not None:
            data = featurizer.gen_features(raw_data)
        else:
            data = raw_data

        if data is not None:
            data_list.append(data)

    dataset = InMemoryDataset(data_list)
    return dataset
Example #3
def load_zinc_dataset(data_path, featurizer=None):
    """Load zinc dataset,process the input information and the featurizer.

    The data file contains a csv table, in which columns below are used:

    :smiles:  SMILES representation of the molecular structure.
    :zinc_id: the id of the compound

    Args:
        data_path(str): the path to the cached npz path.
        featurizer: the featurizer to use for processing the data.  
        
    Returns:
        dataset(InMemoryDataset): the data_list(list of dict of numpy ndarray).

    References:
    [1]Teague Sterling and John J. Irwin. Zinc 15 – ligand discovery for everyone. Journal of Chemical Information and Modeling, 55(11):2324–2337, 2015. doi: 10.1021/acs.jcim.5b00559. PMID: 26479676.

    """
    smiles_list = _load_zinc_dataset(data_path)

    data_list = []
    for i in range(len(smiles_list)):
        raw_data = {}
        raw_data['smiles'] = smiles_list[i]
        if featurizer is not None:
            data = featurizer.gen_features(raw_data)
        else:
            data = raw_data
        if data is not None:
            data_list.append(data)
    dataset = InMemoryDataset(data_list)
    return dataset
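A usage sketch for the loader above, assuming './zinc/raw' holds the ZINC csv expected by the _load_zinc_dataset helper; without a featurizer each entry is just a {'smiles': ...} dict.

dataset = load_zinc_dataset('./zinc/raw')
print(len(dataset))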
Example #4
def load_bace_dataset(data_path, task_names=None, featurizer=None):
    """tbd"""
    if task_names is None:
        task_names = get_default_bace_task_names()

    file = os.listdir(data_path)[0]
    input_df = pd.read_csv(join(data_path, file), sep=',')
    smiles_list = input_df['mol']
    labels = input_df[task_names]
    # convert 0 to -1
    labels = labels.replace(0, -1)
    # there are no nans

    data_list = []
    for i in range(len(smiles_list)):
        raw_data = {}
        raw_data['smiles'] = smiles_list[i]
        raw_data['label'] = labels.values[i]

        if featurizer is not None:
            data = featurizer.gen_features(raw_data)
        else:
            data = raw_data

        if data is not None:
            data_list.append(data)

    dataset = InMemoryDataset(data_list)
    return dataset
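The loader above recodes the binary class column so that inactives become -1 and actives stay 1. If a downstream loss expects 0/1 targets, the mapping is easy to undo; a small stand-alone numpy sketch (not part of the loader):

import numpy as np

labels_pm1 = np.array([-1, 1, 1, -1])   # labels as stored by the loader above
labels_01 = (labels_pm1 + 1) // 2       # -1 -> 0, 1 -> 1
print(labels_01)                        # [0 1 1 0]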
Example #5
def load_hiv_dataset(data_path, task_names=None, featurizer=None):
    """Load hiv dataset,process the input information and the featurizer.
    
    The data file contains a csv table, in which columns below are used:

    :smiles:SMILES representation of the molecular structure
    :activity:Three-class labels for screening results: CI/CM/CA
    :HIV_active:Binary labels for screening results: 1(CA/CM) and 0CI)

    Args:
        data_path(str): the path to the cached npz path.
        task_names(list): a list of header names to specify the columns to fetch from 
            the csv file.
        featurizer(pahelix.featurizers.Featurizer): the featurizer to use for 
            processing the data. If not None, ``Featurizer.gen_features`` will be
            applied to the raw data.
    
    Returns:
        an InMemoryDataset instance.
    
    Example:
        .. code-block:: python

            dataset = load_hiv_dataset('./hiv/raw')
            print(len(dataset))


    References:
    [1] AIDS Antiviral Screen Data. https://wiki.nci.nih.gov/display/NCIDTPdata/AIDS+Antiviral+Screen+Data

    """
    if task_names is None:
        task_names = get_default_hiv_task_names()

    csv_file = os.listdir(data_path)[0]
    input_df = pd.read_csv(join(data_path, csv_file), sep=',')
    smiles_list = input_df['smiles']
    labels = input_df[task_names]
    # convert 0 to -1
    labels = labels.replace(0, -1)
    # there are no nans

    data_list = []
    for i in range(len(smiles_list)):
        raw_data = {}
        raw_data['smiles'] = smiles_list[i]
        raw_data['label'] = labels.values[i]

        if featurizer is not None:
            data = featurizer.gen_features(raw_data)
        else:
            data = raw_data

        if data is not None:
            data_list.append(data)

    dataset = InMemoryDataset(data_list)
    return dataset
Example #6
def load_sider_dataset(data_path, task_names=None, featurizer=None):
    """Load sider dataset,process the input information and the featurizer.

    The data file contains a csv table, in which columns below are used:

    :smiles:  SMILES representation of the molecular structure.
    :Hepatobiliary disorders ~ Injury, poisoning and procedural complications:Recorded side effects for the drug
    
    Args:
        data_path(str): the path to the cached npz path.
        task_names(list): a list of header names to specify the columns to fetch from 
            the csv file.
        featurizer(pahelix.featurizers.Featurizer): the featurizer to use for 
            processing the data. If not None, ``Featurizer.gen_features`` will be
            applied to the raw data.
    
    Returns:
        an InMemoryDataset instance.
    
    Example:
        .. code-block:: python

            dataset = load_sider_dataset('./sider/raw')
            print(len(dataset))

    References:
    [1]Kuhn, Michael, et al. “The SIDER database of drugs and side effects.” Nucleic acids research 44.D1 (2015): D1075-D1079.
    [2]Altae-Tran, Han, et al. “Low data drug discovery with one-shot learning.” ACS central science 3.4 (2017): 283-293.
    [3]Medical Dictionary for Regulatory Activities. http://www.meddra.org/
    [4]Please refer to http://sideeffects.embl.de/se/?page=98 for details on ADRs.

    """
    if task_names is None:
        task_names = get_default_sider_task_names()

    csv_file = os.listdir(data_path)[0]
    input_df = pd.read_csv(join(data_path, csv_file), sep=',')
    smiles_list = input_df['smiles']
    labels = input_df[task_names]
    labels = labels.replace(0, -1)  # convert 0 to -1

    data_list = []
    for i in range(len(smiles_list)):
        raw_data = {}
        raw_data['smiles'] = smiles_list[i]
        raw_data['label'] = labels.values[i]

        if featurizer is not None:
            data = featurizer.gen_features(raw_data)
        else:
            data = raw_data

        if data is not None:
            data_list.append(data)

    dataset = InMemoryDataset(data_list)
    return dataset
Example #7
def load_tox21_dataset(data_path, task_names=None, featurizer=None):
    """Load the Tox21 dataset and process the input information with the featurizer.

    The data file contains a csv table, in which columns below are used:

    :smiles:  SMILES representation of the molecular structure.
    :NR-XXX: Nuclear receptor signaling bioassays results.
    :SR-XXX: Stress response bioassays results
    
    Args:
        data_path(str): the path to the cached npz path.
        task_names(list): a list of header names to specify the columns to fetch from 
            the csv file.
        featurizer(pahelix.featurizers.Featurizer): the featurizer to use for 
            processing the data. If not None, ``Featurizer.gen_features`` will be
            applied to the raw data.
    
    Returns:
        an InMemoryDataset instance.
    
    Example:
        .. code-block:: python

            dataset = load_tox21_dataset('./tox21/raw')
            print(len(dataset))

    References:
    [1]Tox21 Challenge. https://tripod.nih.gov/tox21/challenge/
    [2]please refer to the links at https://tripod.nih.gov/tox21/challenge/data.jsp for details.

    """
    if task_names is None:
        task_names = get_default_tox21_task_names()

    csv_file = os.listdir(data_path)[0]
    input_df = pd.read_csv(join(data_path, csv_file), sep=',')
    smiles_list = input_df['smiles']
    labels = input_df[task_names]
    labels = labels.replace(0, -1)  # convert 0 to -1
    labels = labels.fillna(0)   # convert nan to 0

    data_list = []
    for i in range(len(smiles_list)):
        raw_data = {}
        raw_data['smiles'] = smiles_list[i]        
        raw_data['label'] = labels.values[i]

        if featurizer is not None:
            data = featurizer.gen_features(raw_data)
        else:
            data = raw_data

        if data is not None:
            data_list.append(data)

    dataset = InMemoryDataset(data_list)
    return dataset
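After the two conversions above, every Tox21 label takes one of three values: 1 (active), -1 (inactive) and 0 (originally NaN, i.e. not measured). A common way to consume this encoding is to build a validity mask and recover 0/1 targets only where a measurement exists; a stand-alone numpy sketch (not part of the loader):

import numpy as np

label = np.array([1, -1, 0, 1, 0])   # one label row as produced by the loader above
valid_mask = label != 0              # True where the assay was actually measured
target_01 = (label[valid_mask] + 1) // 2
print(valid_mask)                    # [ True  True False  True False]
print(target_01)                     # [1 0 1]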
Example #8
def load_bace_dataset(data_path, task_names=None, featurizer=None):
    """load bace dataset ,process the classification labels and the input information.

    The data file contains a csv table, in which columns below are used:

    :mol: The smile representation of the molecular structure;
    :pIC50: The negative log of the IC50 binding affinity;
    :class: The binary labels for inhibitor.
   
    Args:
        data_path(str): the path to the cached npz path.
        task_names(list): a list of header names to specify the columns to fetch from 
            the csv file.
        featurizer(pahelix.featurizers.Featurizer): the featurizer to use for 
            processing the data. If not None, ``Featurizer.gen_features`` will be
            applied to the raw data.
    
    Returns:
        an InMemoryDataset instance.
    
    Example:
        .. code-block:: python

            dataset = load_bace_dataset('./bace/raw')
            print(len(dataset))

    References:
    [1]Subramanian, Govindan, et al. “Computational modeling of β-secretase 1 (BACE-1) inhibitors using ligand based approaches.” Journal of chemical information and modeling 56.10 (2016): 1936-1949.
    """

    if task_names is None:
        task_names = get_default_bace_task_names()

    csv_file = os.listdir(data_path)[0]
    input_df = pd.read_csv(join(data_path, csv_file), sep=',')
    smiles_list = input_df['mol']
    labels = input_df[task_names]
    # convert 0 to -1
    labels = labels.replace(0, -1)
    # there are no nans

    data_list = []
    for i in range(len(smiles_list)):
        raw_data = {}
        raw_data['smiles'] = smiles_list[i]
        raw_data['label'] = labels.values[i]

        if featurizer is not None:
            data = featurizer.gen_features(raw_data)
        else:
            data = raw_data

        if data is not None:
            data_list.append(data)

    dataset = InMemoryDataset(data_list)
    return dataset
Example #9
def load_lipophilicity_dataset(data_path, task_names=None, featurizer=None):
    """Load the Lipophilicity dataset and process the input information with the featurizer.
    
    Description:

        The data file contains a csv table, in which columns below are used:
            
            smiles: SMILES representation of the molecular structure
            
            exp: Measured octanol/water distribution coefficient (logD) of the compound, used as label
    
    Args:
        data_path(str): the path to the cached npz path.
        task_names(list): a list of header names to specify the columns to fetch from 
            the csv file.
        featurizer(pahelix.featurizers.Featurizer): the featurizer to use for 
            processing the data. If not None, ``Featurizer.gen_features`` will be
            applied to the raw data.
    
    Returns:
        an InMemoryDataset instance.
    
    Example:
        .. code-block:: python

            dataset = load_lipophilicity_dataset('./lipophilicity/raw')
            print(len(dataset))

    References:
    
    [1]Hersey, A. ChEMBL Deposited Data Set - AZ dataset; 2015. https://doi.org/10.6019/chembl3301361

    """
    if task_names is None:
        task_names = get_default_lipophilicity_task_names()

    csv_file = os.listdir(data_path)[0]
    input_df = pd.read_csv(join(data_path, csv_file), sep=',')
    smiles_list = input_df['smiles']
    labels = input_df[task_names]

    data_list = []
    for i in range(len(smiles_list)):
        raw_data = {}
        raw_data['smiles'] = smiles_list[i]
        raw_data['label'] = labels.values[i]

        if featurizer is not None:
            data = featurizer.gen_features(raw_data)
        else:
            data = raw_data

        if data is not None:
            data_list.append(data)

    dataset = InMemoryDataset(data_list)
    return dataset
Example #10
def load_muv_dataset(data_path, task_names=None, featurizer=None):
    """Load muv dataset,process the input information and the featurizer.

    The data file contains a csv table, in which columns below are used:

    :smiles:  SMILES representation of the molecular structure.
    :mol_id:  PubChem CID of the compound.
    :MUV-XXX: Measured results (Active/Inactive) for bioassays.

    Args:
        data_path(str): the path to the cached npz path.
        task_names(list): a list of header names to specify the columns to fetch from 
            the csv file.
        featurizer(pahelix.featurizers.Featurizer): the featurizer to use for 
            processing the data. If not None, ``Featurizer.gen_features`` will be
            applied to the raw data.
    
    Returns:
        an InMemoryDataset instance.
    
    Example:
        .. code-block:: python

            dataset = load_muv_dataset('./muv/raw')
            print(len(dataset))

    References:
    [1]Rohrer, Sebastian G., and Knut Baumann. “Maximum unbiased validation (MUV) data sets for virtual screening based on PubChem bioactivity data.” Journal of chemical information and modeling 49.2 (2009): 169-184.

    """
    if task_names is None:
        task_names = get_default_muv_task_names()

    csv_file = os.listdir(data_path)[0]
    input_df = pd.read_csv(join(data_path, csv_file), sep=',')
    smiles_list = input_df['smiles']
    labels = input_df[task_names]
    labels = labels.replace(0, -1)  # convert 0 to -1
    labels = labels.fillna(0)  # convert nan to 0

    data_list = []
    for i in range(len(smiles_list)):
        raw_data = {}
        raw_data['smiles'] = smiles_list[i]
        raw_data['label'] = labels.values[i]

        if featurizer is not None:
            data = featurizer.gen_features(raw_data)
        else:
            data = raw_data

        if data is not None:
            data_list.append(data)

    dataset = InMemoryDataset(data_list)
    return dataset
Example #11
def load_smiles_to_dataset(data_path):
    """tbd"""
    files = sorted(glob('%s/*' % data_path))
    data_list = []
    for file in files:
        with open(file, 'r') as f:
            tmp_data_list = [line.strip() for line in f.readlines()]
        data_list.extend(tmp_data_list)
    dataset = InMemoryDataset(data_list=data_list)
    return dataset
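A self-contained sketch of how the helper above could be exercised: it globs every file under data_path and collects one stripped line per SMILES string. The temporary directory and file names below are illustrative only.

import os
import tempfile

tmp_dir = tempfile.mkdtemp()
with open(os.path.join(tmp_dir, 'part-000.smi'), 'w') as f:
    f.write('CCO\nc1ccccc1\n')
with open(os.path.join(tmp_dir, 'part-001.smi'), 'w') as f:
    f.write('CC(=O)O\n')

dataset = load_smiles_to_dataset(tmp_dir)
print(len(dataset))   # 3 SMILES strings collected from both files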
Example #12
def load_zinc_dataset(data_path,
                      featurizer=None,
                      return_smiles=False,
                      indices=None):
    """Load ZINC dataset,process the input information and the featurizer.

    Description:
        
        The data file contains a csv table, in which columns below are used:
            
            smiles:  SMILES representation of the molecular structure.
            
            zinc_id: the id of the compound

    Args:
        data_path(str): the path to the cached npz path.
        featurizer(pahelix.featurizers.Featurizer): the featurizer to use for 
            processing the data. If not None, ``Featurizer.gen_features`` will be
            applied to the raw data.
        return_smiles(bool): directly return the list of all smiles if True.
        indices(list): the indices of smiles to select.
    
    Returns:
        an InMemoryDataset instance.
    
    Example:
        .. code-block:: python

            dataset = load_zinc_dataset('./zinc/raw')
            print(len(dataset))

    References:
    
    [1]Teague Sterling and John J. Irwin. Zinc 15 – ligand discovery for everyone. Journal of Chemical Information and Modeling, 55(11):2324–2337, 2015. doi: 10.1021/acs.jcim.5b00559. PMID: 26479676.

    """
    smiles_list = _load_zinc_dataset(data_path)
    if return_smiles:
        return smiles_list

    if indices is not None:
        smiles_list = [smiles_list[i] for i in indices]

    data_list = []
    for i in range(len(smiles_list)):
        raw_data = {}
        raw_data['smiles'] = smiles_list[i]
        if featurizer is not None:
            data = featurizer.gen_features(raw_data)
        else:
            data = raw_data
        if data is not None:
            data_list.append(data)
    dataset = InMemoryDataset(data_list)
    return dataset
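A usage sketch for the two extra switches: return_smiles short-circuits featurization and hands back the raw SMILES list, while indices restricts the build to a subset. './zinc/raw' is assumed to hold the ZINC csv file.

smiles_list = load_zinc_dataset('./zinc/raw', return_smiles=True)
print(len(smiles_list))

# Featurize only the first 100 molecules.
subset = load_zinc_dataset('./zinc/raw', indices=list(range(100)))
print(len(subset))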
Example #13
def load_hiv_dataset(data_path, task_names=None):
    """Load hiv dataset,process the input information.

    Description:
        
        The data file contains a csv table, in which columns below are used:
            
            smiles: SMILES representation of the molecular structure
            
            activity: Three-class labels for screening results: CI/CM/CA.
            
            HIV_active: Binary labels for screening results: 1 (CA/CM) and 0 (CI)

    Args:
        data_path(str): the path to the cached npz path
        task_names(list): a list of header names to specify the columns to fetch from 
            the csv file.
    
    Returns:
        an InMemoryDataset instance.
    
    Example:
        .. code-block:: python

            dataset = load_hiv_dataset('./hiv')
            print(len(dataset))


    References:
    
    [1] AIDS Antiviral Screen Data. https://wiki.nci.nih.gov/display/NCIDTPdata/AIDS+Antiviral+Screen+Data

    """
    if task_names is None:
        task_names = get_default_hiv_task_names()

    raw_path = join(data_path, 'raw')
    csv_file = os.listdir(raw_path)[0]
    input_df = pd.read_csv(join(raw_path, csv_file), sep=',')
    smiles_list = input_df['smiles']
    labels = input_df[task_names]
    # convert 0 to -1
    labels = labels.replace(0, -1)
    # there are no nans

    data_list = []
    for i in range(len(smiles_list)):
        data = {}
        data['smiles'] = smiles_list[i]
        data['label'] = labels.values[i]
        data_list.append(data)
    dataset = InMemoryDataset(data_list)
    return dataset
Example #14
def load_bace_dataset(data_path, task_names=None, featurizer=None):
    """load bace dataset ,process the classification labels and the input information.

    The data file contains a csv table, in which columns below are used:

    :mol: The smile representation of the molecular structure;
    :pIC50: The negative log of the IC50 binding affinity;
    :class: The binary labels for inhibitor.
    :Valid ratio: 1.0.
    :Task evaluated: 1/1 .

   
    Args:
        data_path(str): the path to the cached npz path.
        task_names(list): a list of header names to specify the columns to fetch from
            the csv file. Defaults to the BACE task names.
        featurizer: the featurizer to use for processing the data.  
    
    Returns:
        dataset(InMemoryDataset):the data_list(list of dict of numpy ndarray). 
    

    References:
    [1]Subramanian, Govindan, et al. “Computational modeling of β-secretase 1 (BACE-1) inhibitors using ligand based approaches.” Journal of chemical information and modeling 56.10 (2016): 1936-1949.

    """

    if task_names is None:
        task_names = get_default_bace_task_names()

    file = os.listdir(data_path)[0]
    input_df = pd.read_csv(join(data_path, file), sep=',')
    smiles_list = input_df['mol']
    labels = input_df[task_names]
    # convert 0 to -1
    labels = labels.replace(0, -1)
    # there are no nans

    data_list = []
    for i in range(len(smiles_list)):
        raw_data = {}
        raw_data['smiles'] = smiles_list[i]
        raw_data['label'] = labels.values[i]

        if featurizer is not None:
            data = featurizer.gen_features(raw_data)
        else:
            data = raw_data

        if data is not None:
            data_list.append(data)

    dataset = InMemoryDataset(data_list)
    return dataset
Example #15
def load_freesolv_dataset(data_path, task_names=None):
    """Load freesolv dataset,process the input information and the featurizer.
    
    Description:
        
        The data file contains a csv table, in which columns below are used:
            
            smiles: SMILES representation of the molecular structure
            
            Compound ID: Name of the compound
            
            measured log solubility in mols per litre: Log-scale water solubility of the compound, used as label.
   
    Args:
        data_path(str): the path to the cached npz path.
        task_names(list): a list of header names to specify the columns to fetch from 
            the csv file.
    
    Returns:
        an InMemoryDataset instance.
    
    Example:
        .. code-block:: python

            dataset = load_freesolv_dataset('./freesolv')
            print(len(dataset))

    References:
    
    [1] Mobley, David L., and J. Peter Guthrie. "FreeSolv: a database of experimental and calculated hydration free energies, with input files." Journal of computer-aided molecular design 28.7 (2014): 711-720.
    
    [2] https://github.com/MobleyLab/FreeSolv

    """
    if task_names is None:
        task_names = get_default_freesolv_task_names()

    raw_path = join(data_path, 'raw')
    csv_file = os.listdir(raw_path)[0]
    input_df = pd.read_csv(join(raw_path, csv_file), sep=',')
    smiles_list = input_df['smiles']
    labels = input_df[task_names]

    data_list = []
    for i in range(len(labels)):
        data = {
            'smiles': smiles_list[i],
            'label': labels.values[i],
        }
        data_list.append(data)
    dataset = InMemoryDataset(data_list)
    return dataset
Example #16
def load_hiv_dataset(data_path, task_names=None, featurizer=None):
    """Load hiv dataset,process the input information and the featurizer.
    
   The data file contains a csv table, in which columns below are used:

    :smiles:  SMILES representation of the molecular structure
    :activity: Three-class labels for screening results: CI/CM/CA
    :HIV_active: Binary labels for screening results: 1 (CA/CM) and 0 (CI)
    :Valid ratio:1.0
    :Task evaluated:1/1

    Args:
        data_path(str): the path to the cached npz path.
        task_names(list): a list of header names to specify the columns to fetch from
            the csv file. Defaults to the HIV task names.
        featurizer: the featurizer to use for processing the data.    
   
    Returns:
        dataset(InMemoryDataset): the data_list(list of dict of numpy ndarray).


    References:
    [1] AIDS Antiviral Screen Data. https://wiki.nci.nih.gov/display/NCIDTPdata/AIDS+Antiviral+Screen+Data

    """
    if task_names is None:
        task_names = get_default_hiv_task_names()

    file = os.listdir(data_path)[0]
    input_df = pd.read_csv(join(data_path, file), sep=',')
    smiles_list = input_df['smiles']
    labels = input_df[task_names]
    # convert 0 to -1
    labels = labels.replace(0, -1)
    # there are no nans

    data_list = []
    for i in range(len(smiles_list)):
        raw_data = {}
        raw_data['smiles'] = smiles_list[i]
        raw_data['label'] = labels.values[i]

        if featurizer is not None:
            data = featurizer.gen_features(raw_data)
        else:
            data = raw_data

        if data is not None:
            data_list.append(data)

    dataset = InMemoryDataset(data_list)
    return dataset
Example #17
def load_tox21_dataset(data_path, task_names=None, featurizer=None):
    """Load tox21 dataset,process the input information and the featurizer.

    The data file contains a csv table, in which columns below are used:

    :smiles:  SMILES representation of the molecular structure.
    :NR-XXX: Nuclear receptor signaling bioassays results.
    :SR-XXX: Stress response bioassays results
    :Valid ratio: we get two ratio: 0.751、0.760
    :Task evaluated: 12/12

    Args:
        data_path(str): the path to the cached npz path.
        task_names(list): a list of header names to specify the columns to fetch from
            the csv file. Defaults to the Tox21 task names.
        featurizer: the featurizer to use for processing the data. 
        

    Returns:
        dataset(InMemoryDataset): the data_list(list of dict of numpy ndarray).

    References:
    [1]Tox21 Challenge. https://tripod.nih.gov/tox21/challenge/
    [2]please refer to the links at https://tripod.nih.gov/tox21/challenge/data.jsp for details.

    """
    if task_names is None:
        task_names = get_default_tox21_task_names()

    file = os.listdir(data_path)[0]
    input_df = pd.read_csv(join(data_path, file), sep=',')
    smiles_list = input_df['smiles']
    labels = input_df[task_names]
    labels = labels.replace(0, -1)  # convert 0 to -1
    labels = labels.fillna(0)  # convert nan to 0

    data_list = []
    for i in range(len(smiles_list)):
        raw_data = {}
        raw_data['smiles'] = smiles_list[i]
        raw_data['label'] = labels.values[i]

        if featurizer is not None:
            data = featurizer.gen_features(raw_data)
        else:
            data = raw_data

        if data is not None:
            data_list.append(data)

    dataset = InMemoryDataset(data_list)
    return dataset
Example #18
def load_sider_dataset(data_path, task_names=None, featurizer=None):
    """Load the SIDER dataset and process the input information with the featurizer.

    The data file contains a csv table, in which columns below are used:

    :smiles:  SMILES representation of the molecular structure.
    :Hepatobiliary disorders ~ Injury, poisoning and procedural complications: Recorded side effects for the drug
    :Valid ratio: 1.0
    :Task evaluated: 27/27

    Args:
        data_path(str): the path to the cached npz path.
        task_names(list): a list of header names to specify the columns to fetch from
            the csv file. Defaults to the SIDER task names.
        featurizer: the featurizer to use for processing the data.    

    Returns:
       dataset(InMemoryDataset): the data_list(list of dict of numpy ndarray).

    References:
    [1]Kuhn, Michael, et al. “The SIDER database of drugs and side effects.” Nucleic acids research 44.D1 (2015): D1075-D1079.
    [2]Altae-Tran, Han, et al. “Low data drug discovery with one-shot learning.” ACS central science 3.4 (2017): 283-293.
    [3]Medical Dictionary for Regulatory Activities. http://www.meddra.org/
    [4]Please refer to http://sideeffects.embl.de/se/?page=98 for details on ADRs.

    """
    if task_names is None:
        task_names = get_default_sider_task_names()

    file = os.listdir(data_path)[0]
    input_df = pd.read_csv(join(data_path, file), sep=',')
    smiles_list = input_df['smiles']
    labels = input_df[task_names]
    labels = labels.replace(0, -1)  # convert 0 to -1

    data_list = []
    for i in range(len(smiles_list)):
        raw_data = {}
        raw_data['smiles'] = smiles_list[i]        
        raw_data['label'] = labels.values[i]

        if featurizer is not None:
            data = featurizer.gen_features(raw_data)
        else:
            data = raw_data

        if data is not None:
            data_list.append(data)

    dataset = InMemoryDataset(data_list)
    return dataset
Example #19
def load_muv_dataset(data_path, task_names=None, featurizer=None):
    """Load muv dataset,process the input information and the featurizer.

    The data file contains a csv table, in which columns below are used:

    :smiles:  SMILES representation of the molecular structure.
    :mol_id:  PubChem CID of the compound.
    :MUV-XXX: Measured results (Active/Inactive) for bioassays.
    :Valid ratio: we get two ratio: 0.155、0.160
    :Task evaluated: we get two values: 15/17、16/17

    Args:
        data_path(str): the path to the cached npz path.
        task_names(list): a list of header names to specify the columns to fetch from
            the csv file. Defaults to the MUV task names.
        featurizer: the featurizer to use for processing the data.       

    Returns:
        dataset(InMemoryDataset): the data_list(list of dict of numpy ndarray).

    References:
    [1]Rohrer, Sebastian G., and Knut Baumann. “Maximum unbiased validation (MUV) data sets for virtual screening based on PubChem bioactivity data.” Journal of chemical information and modeling 49.2 (2009): 169-184.

    """
    if task_names is None:
        task_names = get_default_muv_task_names()

    file = os.listdir(data_path)[0]
    input_df = pd.read_csv(join(data_path, file), sep=',')
    smiles_list = input_df['smiles']
    labels = input_df[task_names]
    labels = labels.replace(0, -1)  # convert 0 to -1
    labels = labels.fillna(0)   # convert nan to 0

    data_list = []
    for i in range(len(smiles_list)):
        raw_data = {}
        raw_data['smiles'] = smiles_list[i]        
        raw_data['label'] = labels.values[i]

        if featurizer is not None:
            data = featurizer.gen_features(raw_data)
        else:
            data = raw_data

        if data is not None:
            data_list.append(data)

    dataset = InMemoryDataset(data_list)
    return dataset
Example #20
def load_lipophilicity_dataset(data_path, task_names=None):
    """Load lipophilicity dataset,process the input information.
    
    Description:

        The data file contains a csv table, in which columns below are used:
            
            smiles: SMILES representation of the molecular structure
            
            exp: Measured octanol/water distribution coefficient (logD) of the compound, used as label
    
    Args:
        data_path(str): the path to the cached npz path.
        task_names(list): a list of header names to specify the columns to fetch from 
            the csv file.
    
    Returns:
        an InMemoryDataset instance.
    
    Example:
        .. code-block:: python

            dataset = load_lipophilicity_dataset('./lipophilicity')
            print(len(dataset))

    References:
    
    [1]Hersey, A. ChEMBL Deposited Data Set - AZ dataset; 2015. https://doi.org/10.6019/chembl3301361

    """
    if task_names is None:
        task_names = get_default_lipophilicity_task_names()

    raw_path = join(data_path, 'raw')
    csv_file = os.listdir(raw_path)[0]
    input_df = pd.read_csv(join(raw_path, csv_file), sep=',')
    smiles_list = input_df['smiles']
    labels = input_df[task_names]

    data_list = []
    for i in range(len(labels)):
        data = {
            'smiles': smiles_list[i],
            'label': labels.values[i],
        }
        data_list.append(data)
    dataset = InMemoryDataset(data_list)
    return dataset
Example #21
 def test_split(self):
     raw_data_list = [
         {
             'smiles': 'CCOc1ccc2nc(S(N)(=O)=O)sc2c1'
         },
         {
             'smiles': 'CCOc1ccc2nc(S(N)(=O)=O)sc2c1'
         },
         {
             'smiles': 'CCOc1ccc2nc(S(N)(=O)=O)sc2c1'
         },
         {
             'smiles': 'CCOc1ccc2nc(S(N)(=O)=O)sc2c1'
         },
         {
             'smiles': 'CC(C)CCCCCCCOP(OCCCCCCCC(C)C)Oc1ccccc1'
         },
         {
             'smiles': 'CC(C)CCCCCCCOP(OCCCCCCCC(C)C)Oc1ccccc1'
         },
         {
             'smiles': 'CC(C)CCCCCCCOP(OCCCCCCCC(C)C)Oc1ccccc1'
         },
         {
             'smiles': 'CC(C)CCCCCCCOP(OCCCCCCCC(C)C)Oc1ccccc1'
         },
         {
             'smiles': 'CC(C)CCCCCCCOP(OCCCCCCCC(C)C)Oc1ccccc1'
         },
         {
             'smiles': 'CCCCCCCCCCOCC(O)CN'
         },
         {
             'smiles': 'CCCCCCCCCCOCC(O)CN'
         },
         {
             'smiles': 'CCCCCCCCCCOCC(O)CN'
         },
         {
             'smiles': 'CCCCCCCCCCOCC(O)CN'
         },
     ]
     dataset = InMemoryDataset(raw_data_list)
     splitter = IndexSplitter()
     train_dataset, valid_dataset, test_dataset = splitter.split(
         dataset, frac_train=0.34, frac_valid=0.33, frac_test=0.33)
     n = len(train_dataset) + len(valid_dataset) + len(test_dataset)
     self.assertEqual(n, len(dataset))
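The assertion above only pins down the invariant that the three pieces partition the dataset. A stand-alone sketch of an order-preserving index split with the same fractions, shown purely to illustrate that invariant (it is not pahelix's IndexSplitter implementation):

def index_split(n, frac_train, frac_valid):
    """Return contiguous index ranges that cover 0..n-1 exactly once."""
    n_train = int(frac_train * n)
    n_valid = int(frac_valid * n)
    train = list(range(0, n_train))
    valid = list(range(n_train, n_train + n_valid))
    test = list(range(n_train + n_valid, n))
    return train, valid, test

train, valid, test = index_split(13, frac_train=0.34, frac_valid=0.33)
assert len(train) + len(valid) + len(test) == 13
print(len(train), len(valid), len(test))   # 4 4 5 with these fractions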
Example #22
def main(args):
    with open(args.config, 'r') as f:
        config = json.load(f)

    logging.info('Load data ...')

    dataset = InMemoryDataset(npz_data_path=os.path.join(
        args.root, args.dataset, 'processed'))
    collate_fn = MoleculeCollateFunc(
        config['atom_names'],
        config['bond_names'],
        with_graph_label=False,
        with_pos_neg_mask=True)
    eval_collate_fn = MoleculeCollateFunc(
        config['atom_names'],
        config['bond_names'],
        with_graph_label=True,
        with_pos_neg_mask=False)

    logging.info("Data loaded.")
    logging.info("Train Examples: %s" % len(dataset))
    sys.stdout.flush()

    if args.emb_dir is not None:
        # pylint: disable=E1123
        os.makedirs(args.emb_dir, exist_ok=True)

    if args.model_dir is not None:
        # pylint: disable=E1123
        os.makedirs(args.model_dir, exist_ok=True)

    model = InfoGraph(config)
    criterion = InfoGraphCriterion(config)
    optimizer = paddle.optimizer.Adam(
        learning_rate=args.lr,
        parameters=model.parameters())

    save_embedding(args, model, dataset, eval_collate_fn, -1)
    for epoch_id in range(args.max_epoch):
        train_loss = train(args, model, criterion, optimizer,
                           dataset, collate_fn, epoch_id)
        logging.info('Epoch %d, train/loss: %f' % (epoch_id, train_loss))

        pdparams = os.path.join(args.model_dir, 'epoch_%d.pdparams' % epoch_id)
        paddle.save(model.state_dict(), pdparams)

        save_embedding(args, model, dataset, eval_collate_fn, epoch_id)
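main() only ever reads a handful of attributes from args. A minimal argparse sketch covering exactly those attributes; the flag names and defaults are assumptions, not the original script's interface:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--config', required=True)            # JSON config consumed by main()
parser.add_argument('--root', required=True)              # root folder of the processed datasets
parser.add_argument('--dataset', required=True)           # dataset sub-folder name
parser.add_argument('--lr', type=float, default=1e-3)     # Adam learning rate
parser.add_argument('--max_epoch', type=int, default=10)
parser.add_argument('--emb_dir', default=None)            # where embeddings get saved
parser.add_argument('--model_dir', default=None)          # where .pdparams checkpoints go
args = parser.parse_args()
main(args)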
Example #23
def load_freesolv_dataset(data_path, task_names=None, featurizer=None):
    """Load the FreeSolv dataset and process the input information with the featurizer.

    The data file contains a csv table, in which columns below are used:

    :smiles:  SMILES representation of the molecular structure
    :Compound ID: Name of the compound
    :measured log solubility in mols per litre: Log-scale water solubility of the compound, used as label.
   
    Args:
        data_path(str): the path to the cached npz path.
        task_names(list): a list of header names to specify the columns to fetch from
            the csv file. Defaults to the FreeSolv task names.
        featurizer: the featurizer to use for processing the data.    

    Returns:
        dataset(InMemoryDataset): the data_list(list of dict of numpy ndarray).

    References:
    [1] Mobley, David L., and J. Peter Guthrie. "FreeSolv: a database of experimental and calculated hydration free energies, with input files." Journal of computer-aided molecular design 28.7 (2014): 711-720.
    [2] https://github.com/MobleyLab/FreeSolv

    """
    if task_names is None:
        task_names = get_default_freesolv_task_names()

    file = os.listdir(data_path)[0]
    input_df = pd.read_csv(join(data_path, file), sep=',')
    smiles_list = input_df['smiles']
    labels = input_df[task_names]

    data_list = []
    for i in range(len(smiles_list)):
        raw_data = {}
        raw_data['smiles'] = smiles_list[i]
        raw_data['label'] = labels.values[i]

        if featurizer is not None:
            data = featurizer.gen_features(raw_data)
        else:
            data = raw_data

        if data is not None:
            data_list.append(data)

    dataset = InMemoryDataset(data_list)
    return dataset
Example #24
def load_esol_dataset(data_path, task_names=None, featurizer=None):
    """load esol dataset ,process the classification labels and the input information.

    The data file contains a csv table, in which columns below are used:

    :smiles:SMILES representation of the molecular structure
    :Compound ID:Name of the compound
    :measured log solubility in mols per litre - Log-scale water solubility of the compound, used as label
   
    Args:
        data_path(str): the path to the cached npz path.
        task_names(list): a list of header names to specify the columns to fetch from
            the csv file. Defaults to the ESOL task names.
        featurizer: the featurizer to use for processing the data.  
    
    Returns:
        dataset(InMemoryDataset): the data_list(list of dict of numpy ndarray).
    
    References:
    [1] Delaney, John S. "ESOL: estimating aqueous solubility directly from molecular structure." Journal of chemical information and computer sciences 44.3 (2004): 1000-1005.

    """
    if task_names is None:
        task_names = get_default_esol_task_names()

    # NB: some examples have multiple species
    file = os.listdir(data_path)[0]
    input_df = pd.read_csv(join(data_path, file), sep=',')
    smiles_list = input_df['smiles']
    labels = input_df[task_names]

    data_list = []
    for i in range(len(smiles_list)):
        raw_data = {}
        raw_data['smiles'] = smiles_list[i]        
        raw_data['label'] = labels.values[i]

        if featurizer is not None:
            data = featurizer.gen_features(raw_data)
        else:
            data = raw_data

        if data is not None:
            data_list.append(data)

    dataset = InMemoryDataset(data_list)
    return dataset
Example #25
def load_ppi_dataset(data_path, task_names=None, featurizer=None):
    """Load ppi dataset,process the input information and the featurizer.

    Description:

        The data file contains a txt file, in which columns below are used:
            
            protein1: protein1 name
            
            protein2: protein2 name
        
    Args:
        data_path(str): the path to the cached npz path.
        task_names(list): a list of header names to specify the columns to fetch from 
            the txt file.
    
    Returns:
        an InMemoryDataset instance.
    
    Example:
        .. code-block:: python

            dataset = load_ppi_dataset('./ppi/raw')
            print(len(dataset))

    """
    if task_names is None:
        task_names = get_default_ppi_task_names()

    txt_file = os.listdir(data_path)[0]
    input_df = pd.read_csv(join(data_path, txt_file), sep=' ')

    # there are no nans
    data_list = []
    for i in range(input_df.shape[0]):
        raw_data = {}
        raw_data['pair'] = (input_df.loc[i, 'protein1'], input_df.loc[i, 'protein2'])

        data = raw_data
        if data is not None:
            data_list.append(data)
    dataset = InMemoryDataset(data_list)
    return dataset
Example #26
def load_lipophilicity_dataset(data_path, task_names=None, featurizer=None):
    """Load lipophilicity dataset,process the input information and the featurizer.
    
    The data file contains a csv table, in which columns below are used:

    :smiles:  SMILES representation of the molecular structure
    :exp: Measured octanol/water distribution coefficient (logD) of the compound, used as label
    
    Args:
        data_path(str): the path to the cached npz path.
        task_names(list): a list of header names to specify the columns to fetch from
            the csv file. Defaults to the lipophilicity task names.
        featurizer: the featurizer to use for processing the data.  
        
    Returns:
        dataset(InMemoryDataset): the data_list(list of dict of numpy ndarray).

    References:
    [1]Hersey, A. ChEMBL Deposited Data Set - AZ dataset; 2015. https://doi.org/10.6019/chembl3301361

    """
    if task_names is None:
        task_names = get_default_lipophilicity_task_names()

    file = os.listdir(data_path)[0]
    input_df = pd.read_csv(join(data_path, file), sep=',')
    smiles_list = input_df['smiles']
    labels = input_df[task_names]

    data_list = []
    for i in range(len(smiles_list)):
        raw_data = {}
        raw_data['smiles'] = smiles_list[i]
        raw_data['label'] = labels.values[i]

        if featurizer is not None:
            data = featurizer.gen_features(raw_data)
        else:
            data = raw_data

        if data is not None:
            data_list.append(data)

    dataset = InMemoryDataset(data_list)
    return dataset
Example #27
def load_ogbg_molpcba_dataset(data_path, task_names=None):
    """Load the ogbg-molpcba dataset from its ``mapping/mol.csv.gz`` file, recode labels (0 -> -1, NaN -> 0) and return an InMemoryDataset."""
    if task_names is None:
        task_names = get_default_ogbg_molpcba_task_names(data_path)
    
    input_df = pd.read_csv(os.path.join(data_path, "mapping", "mol.csv.gz"), sep=',')
    smiles_list = input_df['smiles']
    labels = input_df[task_names]
    labels = labels.replace(0, -1)  # convert 0 to -1
    labels = labels.fillna(0)   # convert nan to 0

    data_list = []
    for i in range(len(smiles_list)):
        data = {}
        data['smiles'] = smiles_list[i]        
        data['label'] = labels.values[i]
        data_list.append(data)
    dataset = InMemoryDataset(data_list)
    return dataset
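A usage sketch, assuming data_path points at an extracted ogbg-molpcba folder so that mapping/mol.csv.gz exists underneath it:

dataset = load_ogbg_molpcba_dataset('./ogbg_molpcba')
print(len(dataset))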
Example #28
def load_qm7_dataset(data_path, task_names=None):
    """
    min/max/mean: -2192.0/-404.88/-1544.8360893118595 
    """
    if task_names is None:
        task_names = get_default_qm7_task_names()

    csv_file = join(data_path, 'raw/qm7.csv')
    input_df = pd.read_csv(csv_file, sep=',')
    smiles_list = input_df['smiles']
    labels = input_df[task_names]

    data_list = []
    for i in range(len(labels)):
        data = {
            'smiles': smiles_list[i],
            'label': labels.values[i],
        }
        data_list.append(data)
    dataset = InMemoryDataset(data_list)
    return dataset
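The docstring records the label statistics, which are typically used to center the regression target before training. A stand-alone numpy sketch using the mean quoted above (the raw label values here are made up):

import numpy as np

label_mean = -1544.8360893118595
labels = np.array([-2192.0, -404.88, -1500.0])   # illustrative raw QM7 labels
centered = labels - label_mean
print(centered)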
Example #29
def load_qm9_dataset(data_path, task_names=None):
    """
    tbd
    """
    if task_names is None:
        task_names = get_default_qm9_task_names()

    csv_file = join(data_path, 'raw/qm9.csv')
    input_df = pd.read_csv(csv_file, sep=',')
    smiles_list = input_df['smiles']
    labels = input_df[task_names]

    data_list = []
    for i in range(len(labels)):
        data = {
            'smiles': smiles_list[i],
            'label': labels.values[i],
        }
        data_list.append(data)
    dataset = InMemoryDataset(data_list)
    return dataset
Example #30
def load_bbbp_dataset(data_path, task_names=None, featurizer=None):
    """tbd"""
    if task_names is None:
        task_names = get_default_bbbp_task_names()

    file = os.listdir(data_path)[0]
    input_df = pd.read_csv(join(data_path, file), sep=',')
    smiles_list = input_df['smiles']
    from rdkit.Chem import AllChem
    rdkit_mol_objs_list = [AllChem.MolFromSmiles(s) for s in smiles_list]
    # SMILES that RDKit fails to parse stay as None and are skipped in the loop below.
    preprocessed_rdkit_mol_objs_list = [
        m if m is not None else None for m in rdkit_mol_objs_list
    ]
    smiles_list = [
        AllChem.MolToSmiles(m) if m is not None else None
        for m in preprocessed_rdkit_mol_objs_list
    ]
    labels = input_df[task_names]
    # convert 0 to -1
    labels = labels.replace(0, -1)
    # there are no nans

    data_list = []
    for i in range(len(smiles_list)):
        if smiles_list[i] is None:
            continue
        raw_data = {}
        raw_data['smiles'] = smiles_list[i]
        raw_data['label'] = labels.values[i]

        if featurizer is not None:
            data = featurizer.gen_features(raw_data)
        else:
            data = raw_data

        if data is not None:
            data_list.append(data)

    dataset = InMemoryDataset(data_list)
    return dataset
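A usage sketch for the BBBP loader above; './bbbp/raw' is assumed to contain the BBBP csv file. Because SMILES that RDKit cannot parse are mapped to None and skipped, len(dataset) can be smaller than the number of rows in the csv.

dataset = load_bbbp_dataset('./bbbp/raw')
print(len(dataset))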