def load_esol_dataset(data_path, task_names=None, featurizer=None):
    """Load the ESOL dataset from a raw csv file into an ``InMemoryDataset``.

    The first entry returned by ``os.listdir(data_path)`` is read as a
    comma-separated table. The ``smiles`` column supplies the molecule
    strings and the columns listed in ``task_names`` supply the labels,
    which are passed through unchanged.

    Args:
        data_path(str): directory that holds the raw csv file.
        task_names(list): header names of the label columns to fetch from
            the csv file. Defaults to ``get_default_esol_task_names()``.
        featurizer: optional object exposing ``gen_features(raw_data)``.
            When given, it is applied to every ``{'smiles', 'label'}``
            record; records for which it returns ``None`` are dropped.

    Returns:
        an InMemoryDataset instance built from the kept records.
    """
    if task_names is None:
        task_names = get_default_esol_task_names()

    # NB: some examples have multiple species
    # NOTE: os.listdir gives no ordering guarantee, so data_path is
    # expected to contain exactly one csv file.
    csv_file = os.listdir(data_path)[0]
    input_df = pd.read_csv(join(data_path, csv_file), sep=',')
    smiles_list = input_df['smiles']
    # Materialise the label matrix once instead of on every iteration.
    label_rows = input_df[task_names].values

    data_list = []
    # zip pairs smiles and labels positionally, row by row.
    for smiles, label in zip(smiles_list, label_rows):
        raw_data = {'smiles': smiles, 'label': label}
        if featurizer is not None:
            data = featurizer.gen_features(raw_data)
        else:
            data = raw_data
        if data is not None:
            data_list.append(data)
    return InMemoryDataset(data_list)
def load_esol_dataset(data_path, task_names=None, featurizer=None):
    """Read the ESOL csv table and wrap its records in an ``InMemoryDataset``.

    Description:

        The data file contains a csv table, in which columns below are used:

            smiles: SMILES representation of the molecular structure

            Compound ID: Name of the compound

            measured log solubility in mols per litre: Log-scale water
            solubility of the compound, used as label

    Args:
        data_path(str): directory holding the raw csv file; the first entry
            listed by ``os.listdir`` is the file that gets read.
        task_names(list): a list of header names to specify the columns to
            fetch from the csv file. Falls back to
            ``get_default_esol_task_names()`` when ``None``.
        featurizer(pahelix.featurizers.Featurizer): the featurizer to use
            for processing the data. If not none, ``Featurizer.gen_features``
            is applied to each raw record and ``None`` results are skipped.

    Returns:
        an InMemoryDataset instance.

    Example:
        .. code-block:: python

            dataset = load_esol_dataset('./esol/raw')
            print(len(dataset))

    References:

    [1] Delaney, John S. "ESOL: estimating aqueous solubility directly from
    molecular structure." Journal of chemical information and computer
    sciences 44.3 (2004): 1000-1005.
    """
    task_names = get_default_esol_task_names() if task_names is None else task_names

    # NB: some examples have multiple species
    first_entry = os.listdir(data_path)[0]
    table = pd.read_csv(join(data_path, first_entry), sep=',')
    smiles_column = table['smiles']
    label_frame = table[task_names]

    samples = []
    for row in range(len(smiles_column)):
        sample = {
            'smiles': smiles_column[row],
            'label': label_frame.values[row],
        }
        if featurizer is not None:
            sample = featurizer.gen_features(sample)
        if sample is None:
            continue
        samples.append(sample)
    return InMemoryDataset(samples)
def load_zinc_dataset(data_path, featurizer=None):
    """Build an ``InMemoryDataset`` from the ZINC SMILES collection.

    The data file contains a csv table, in which columns below are used:

    :smiles: SMILES representation of the molecular structure.
    :zinc_id: the id of the compound

    Args:
        data_path(str): location handed to ``_load_zinc_dataset``, which
            yields the sequence of SMILES strings.
        featurizer: optional featurizer; when given, its ``gen_features``
            is applied to each ``{'smiles': ...}`` record and records for
            which it returns ``None`` are left out.

    Returns:
        dataset(InMemoryDataset): wraps the list of kept records.

    References:
    [1] Teague Sterling and John J. Irwin. Zinc 15 – ligand discovery for
    everyone. Journal of Chemical Information and Modeling,
    55(11):2324–2337, 2015. doi: 10.1021/acs.jcim.5b00559. PMID: 26479676.
    """
    smiles_list = _load_zinc_dataset(data_path)

    def _to_sample(smiles):
        # Raw record first, then the optional featurizer on top of it.
        record = {'smiles': smiles}
        return record if featurizer is None else featurizer.gen_features(record)

    candidates = (_to_sample(smiles_list[idx]) for idx in range(len(smiles_list)))
    kept = [sample for sample in candidates if sample is not None]
    return InMemoryDataset(kept)
def load_bace_dataset(data_path, task_names=None, featurizer=None):
    """Load the BACE dataset from a raw csv file into an ``InMemoryDataset``.

    The first entry returned by ``os.listdir(data_path)`` is read as a
    comma-separated table. The ``mol`` column supplies the SMILES strings
    and the columns listed in ``task_names`` supply the labels; label
    values equal to 0 are replaced by -1.

    Args:
        data_path(str): directory that holds the raw csv file.
        task_names(list): header names of the label columns to fetch from
            the csv file. Defaults to ``get_default_bace_task_names()``.
        featurizer: optional object exposing ``gen_features(raw_data)``.
            When given, it is applied to every ``{'smiles', 'label'}``
            record; records for which it returns ``None`` are dropped.

    Returns:
        an InMemoryDataset instance built from the kept records.
    """
    if task_names is None:
        task_names = get_default_bace_task_names()

    # NOTE: os.listdir gives no ordering guarantee, so data_path is
    # expected to contain exactly one csv file.
    csv_file = os.listdir(data_path)[0]
    input_df = pd.read_csv(join(data_path, csv_file), sep=',')
    smiles_list = input_df['mol']
    # convert 0 to -1; there are no nans
    labels = input_df[task_names].replace(0, -1)
    # Materialise the label matrix once instead of on every iteration.
    label_rows = labels.values

    data_list = []
    # zip pairs smiles and labels positionally, row by row.
    for smiles, label in zip(smiles_list, label_rows):
        raw_data = {'smiles': smiles, 'label': label}
        if featurizer is not None:
            data = featurizer.gen_features(raw_data)
        else:
            data = raw_data
        if data is not None:
            data_list.append(data)
    return InMemoryDataset(data_list)
def load_hiv_dataset(data_path, task_names=None, featurizer=None):
    """Read the HIV csv table and wrap its records in an ``InMemoryDataset``.

    The data file contains a csv table, in which columns below are used:

    :smiles: SMILES representation of the molecular structure
    :activity: Three-class labels for screening results: CI/CM/CA
    :HIV_active: Binary labels for screening results: 1 (CA/CM) and 0 (CI)

    Label values equal to 0 are replaced by -1 before the records are built.

    Args:
        data_path(str): directory holding the raw csv file; the first entry
            listed by ``os.listdir`` is the file that gets read.
        task_names(list): a list of header names to specify the columns to
            fetch from the csv file. Falls back to
            ``get_default_hiv_task_names()`` when ``None``.
        featurizer(pahelix.featurizers.Featurizer): the featurizer to use
            for processing the data. If not none, ``Featurizer.gen_features``
            is applied to each raw record and ``None`` results are skipped.

    Returns:
        an InMemoryDataset instance.

    Example:
        .. code-block:: python

            dataset = load_hiv_dataset('./hiv/raw')
            print(len(dataset))

    References:
    [1] AIDS Antiviral Screen Data.
    https://wiki.nci.nih.gov/display/NCIDTPdata/AIDS+Antiviral+Screen+Data
    """
    task_names = get_default_hiv_task_names() if task_names is None else task_names

    first_entry = os.listdir(data_path)[0]
    table = pd.read_csv(join(data_path, first_entry), sep=',')
    smiles_column = table['smiles']
    # convert 0 to -1; there are no nans
    label_frame = table[task_names].replace(0, -1)

    samples = []
    for row in range(len(smiles_column)):
        sample = {
            'smiles': smiles_column[row],
            'label': label_frame.values[row],
        }
        if featurizer is not None:
            sample = featurizer.gen_features(sample)
        if sample is None:
            continue
        samples.append(sample)
    return InMemoryDataset(samples)
def load_sider_dataset(data_path, task_names=None, featurizer=None):
    """Read the SIDER csv table and wrap its records in an ``InMemoryDataset``.

    The data file contains a csv table, in which columns below are used:

    :smiles: SMILES representation of the molecular structure.
    :Hepatobiliary disorders ~ Injury, poisoning and procedural
        complications: Recorded side effects for the drug

    Label values equal to 0 are replaced by -1 before the records are built.

    Args:
        data_path(str): directory holding the raw csv file; the first entry
            listed by ``os.listdir`` is the file that gets read.
        task_names(list): a list of header names to specify the columns to
            fetch from the csv file. Falls back to
            ``get_default_sider_task_names()`` when ``None``.
        featurizer(pahelix.featurizers.Featurizer): the featurizer to use
            for processing the data. If not none, ``Featurizer.gen_features``
            is applied to each raw record and ``None`` results are skipped.

    Returns:
        an InMemoryDataset instance.

    Example:
        .. code-block:: python

            dataset = load_sider_dataset('./sider/raw')
            print(len(dataset))

    References:
    [1] Kuhn, Michael, et al. “The SIDER database of drugs and side
    effects.” Nucleic acids research 44.D1 (2015): D1075-D1079.
    [2] Altae-Tran, Han, et al. “Low data drug discovery with one-shot
    learning.” ACS central science 3.4 (2017): 283-293.
    [3] Medical Dictionary for Regulatory Activities. http://www.meddra.org/
    [4] Please refer to http://sideeffects.embl.de/se/?page=98 for details
    on ADRs.
    """
    task_names = get_default_sider_task_names() if task_names is None else task_names

    first_entry = os.listdir(data_path)[0]
    table = pd.read_csv(join(data_path, first_entry), sep=',')
    smiles_column = table['smiles']
    # convert 0 to -1
    label_frame = table[task_names].replace(0, -1)

    samples = []
    for row in range(len(smiles_column)):
        sample = {
            'smiles': smiles_column[row],
            'label': label_frame.values[row],
        }
        if featurizer is not None:
            sample = featurizer.gen_features(sample)
        if sample is None:
            continue
        samples.append(sample)
    return InMemoryDataset(samples)
def load_tox21_dataset(data_path, task_names=None, featurizer=None):
    """Read the Tox21 csv table and wrap its records in an ``InMemoryDataset``.

    The data file contains a csv table, in which columns below are used:

    :smiles: SMILES representation of the molecular structure.
    :NR-XXX: Nuclear receptor signaling bioassays results.
    :SR-XXX: Stress response bioassays results

    Label values equal to 0 are replaced by -1, and missing labels (NaN)
    are then filled with 0.

    Args:
        data_path(str): directory holding the raw csv file; the first entry
            listed by ``os.listdir`` is the file that gets read.
        task_names(list): a list of header names to specify the columns to
            fetch from the csv file. Falls back to
            ``get_default_tox21_task_names()`` when ``None``.
        featurizer(pahelix.featurizers.Featurizer): the featurizer to use
            for processing the data. If not none, ``Featurizer.gen_features``
            is applied to each raw record and ``None`` results are skipped.

    Returns:
        an InMemoryDataset instance.

    Example:
        .. code-block:: python

            dataset = load_tox21_dataset('./tox21/raw')
            print(len(dataset))

    References:
    [1] Tox21 Challenge. https://tripod.nih.gov/tox21/challenge/
    [2] please refer to the links at
    https://tripod.nih.gov/tox21/challenge/data.jsp for details.
    """
    task_names = get_default_tox21_task_names() if task_names is None else task_names

    first_entry = os.listdir(data_path)[0]
    table = pd.read_csv(join(data_path, first_entry), sep=',')
    smiles_column = table['smiles']
    # convert 0 to -1 first, then nan to 0 (order matters)
    label_frame = table[task_names].replace(0, -1).fillna(0)

    samples = []
    for row in range(len(smiles_column)):
        sample = {
            'smiles': smiles_column[row],
            'label': label_frame.values[row],
        }
        if featurizer is not None:
            sample = featurizer.gen_features(sample)
        if sample is None:
            continue
        samples.append(sample)
    return InMemoryDataset(samples)
def load_bace_dataset(data_path, task_names=None, featurizer=None):
    """Read the BACE csv table and wrap its records in an ``InMemoryDataset``.

    The data file contains a csv table, in which columns below are used:

    :mol: The smile representation of the molecular structure;
    :pIC50: The negative log of the IC50 binding affinity;
    :class: The binary labels for inhibitor.

    Label values equal to 0 are replaced by -1 before the records are built.

    Args:
        data_path(str): directory holding the raw csv file; the first entry
            listed by ``os.listdir`` is the file that gets read.
        task_names(list): a list of header names to specify the columns to
            fetch from the csv file. Falls back to
            ``get_default_bace_task_names()`` when ``None``.
        featurizer(pahelix.featurizers.Featurizer): the featurizer to use
            for processing the data. If not none, ``Featurizer.gen_features``
            is applied to each raw record and ``None`` results are skipped.

    Returns:
        an InMemoryDataset instance.

    Example:
        .. code-block:: python

            dataset = load_bace_dataset('./bace/raw')
            print(len(dataset))

    References:
    [1] Subramanian, Govindan, et al. “Computational modeling of
    β-secretase 1 (BACE-1) inhibitors using ligand based approaches.”
    Journal of chemical information and modeling 56.10 (2016): 1936-1949.
    """
    task_names = get_default_bace_task_names() if task_names is None else task_names

    first_entry = os.listdir(data_path)[0]
    table = pd.read_csv(join(data_path, first_entry), sep=',')
    smiles_column = table['mol']
    # convert 0 to -1; there are no nans
    label_frame = table[task_names].replace(0, -1)

    samples = []
    for row in range(len(smiles_column)):
        sample = {
            'smiles': smiles_column[row],
            'label': label_frame.values[row],
        }
        if featurizer is not None:
            sample = featurizer.gen_features(sample)
        if sample is None:
            continue
        samples.append(sample)
    return InMemoryDataset(samples)
def load_lipophilicity_dataset(data_path, task_names=None, featurizer=None):
    """Read the lipophilicity csv table into an ``InMemoryDataset``.

    Description:

        The data file contains a csv table, in which columns below are used:

            smiles: SMILES representation of the molecular structure

            exp: Measured octanol/water distribution coefficient (logD) of
            the compound, used as label

    Args:
        data_path(str): directory holding the raw csv file; the first entry
            listed by ``os.listdir`` is the file that gets read.
        task_names(list): a list of header names to specify the columns to
            fetch from the csv file. Falls back to
            ``get_default_lipophilicity_task_names()`` when ``None``.
        featurizer(pahelix.featurizers.Featurizer): the featurizer to use
            for processing the data. If not none, ``Featurizer.gen_features``
            is applied to each raw record and ``None`` results are skipped.

    Returns:
        an InMemoryDataset instance.

    Example:
        .. code-block:: python

            dataset = load_lipophilicity_dataset('./lipophilicity/raw')
            print(len(dataset))

    References:

    [1] Hersey, A. ChEMBL Deposited Data Set - AZ dataset; 2015.
    https://doi.org/10.6019/chembl3301361
    """
    if task_names is None:
        task_names = get_default_lipophilicity_task_names()

    first_entry = os.listdir(data_path)[0]
    table = pd.read_csv(join(data_path, first_entry), sep=',')
    smiles_column = table['smiles']
    label_frame = table[task_names]

    samples = []
    for row in range(len(smiles_column)):
        sample = {
            'smiles': smiles_column[row],
            'label': label_frame.values[row],
        }
        if featurizer is not None:
            sample = featurizer.gen_features(sample)
        if sample is None:
            continue
        samples.append(sample)
    return InMemoryDataset(samples)
def load_muv_dataset(data_path, task_names=None, featurizer=None):
    """Read the MUV csv table and wrap its records in an ``InMemoryDataset``.

    The data file contains a csv table, in which columns below are used:

    :smiles: SMILES representation of the molecular structure.
    :mol_id: PubChem CID of the compound.
    :MUV-XXX: Measured results (Active/Inactive) for bioassays.

    Label values equal to 0 are replaced by -1, and missing labels (NaN)
    are then filled with 0.

    Args:
        data_path(str): directory holding the raw csv file; the first entry
            listed by ``os.listdir`` is the file that gets read.
        task_names(list): a list of header names to specify the columns to
            fetch from the csv file. Falls back to
            ``get_default_muv_task_names()`` when ``None``.
        featurizer(pahelix.featurizers.Featurizer): the featurizer to use
            for processing the data. If not none, ``Featurizer.gen_features``
            is applied to each raw record and ``None`` results are skipped.

    Returns:
        an InMemoryDataset instance.

    Example:
        .. code-block:: python

            dataset = load_muv_dataset('./muv/raw')
            print(len(dataset))

    References:
    [1] Rohrer, Sebastian G., and Knut Baumann. “Maximum unbiased
    validation (MUV) data sets for virtual screening based on PubChem
    bioactivity data.” Journal of chemical information and modeling 49.2
    (2009): 169-184.
    """
    task_names = get_default_muv_task_names() if task_names is None else task_names

    first_entry = os.listdir(data_path)[0]
    table = pd.read_csv(join(data_path, first_entry), sep=',')
    smiles_column = table['smiles']
    # convert 0 to -1 first, then nan to 0 (order matters)
    label_frame = table[task_names].replace(0, -1).fillna(0)

    samples = []
    for row in range(len(smiles_column)):
        sample = {
            'smiles': smiles_column[row],
            'label': label_frame.values[row],
        }
        if featurizer is not None:
            sample = featurizer.gen_features(sample)
        if sample is None:
            continue
        samples.append(sample)
    return InMemoryDataset(samples)
def load_smiles_to_dataset(data_path):
    """Load every line of every file under ``data_path`` into a dataset.

    All paths matching ``<data_path>/*`` are read in sorted order. Each
    line is stripped of surrounding whitespace and stored as one item;
    blank lines are kept as empty strings.

    Args:
        data_path(str): directory whose entries are text files, presumably
            with one SMILES string per line (the content is not validated).

    Returns:
        an InMemoryDataset instance wrapping the list of stripped lines.
    """
    data_list = []
    for path in sorted(glob('%s/*' % data_path)):
        with open(path, 'r') as f:
            # Stream the file line by line instead of building an
            # intermediate list with readlines().
            data_list.extend(line.strip() for line in f)
    return InMemoryDataset(data_list=data_list)
def load_zinc_dataset(data_path, featurizer=None, return_smiles=False, indices=None):
    """Build an ``InMemoryDataset`` (or return raw SMILES) for ZINC.

    Description:

        The data file contains a csv table, in which columns below are used:

            smiles: SMILES representation of the molecular structure.

            zinc_id: the id of the compound

    Args:
        data_path(str): location handed to ``_load_zinc_dataset``, which
            yields the sequence of SMILES strings.
        featurizer(pahelix.featurizers.Featurizer): the featurizer to use
            for processing the data. If not none, ``Featurizer.gen_features``
            is applied to each raw record and ``None`` results are skipped.
        return_smiles(bool): directly return the list of all smiles if True;
            ``indices`` and ``featurizer`` are ignored in that case.
        indices(list): the indices of smiles to select.

    Returns:
        an InMemoryDataset instance, or the full smiles list when
        ``return_smiles`` is true.

    Example:
        .. code-block:: python

            dataset = load_zinc_dataset('./zinc/raw')
            print(len(dataset))

    References:

    [1] Teague Sterling and John J. Irwin. Zinc 15 – ligand discovery for
    everyone. Journal of Chemical Information and Modeling,
    55(11):2324–2337, 2015. doi: 10.1021/acs.jcim.5b00559. PMID: 26479676.
    """
    smiles_list = _load_zinc_dataset(data_path)
    if return_smiles:
        return smiles_list

    if indices is not None:
        smiles_list = [smiles_list[pos] for pos in indices]

    def _to_sample(smiles):
        # Raw record first, then the optional featurizer on top of it.
        record = {'smiles': smiles}
        return record if featurizer is None else featurizer.gen_features(record)

    candidates = (_to_sample(smiles_list[idx]) for idx in range(len(smiles_list)))
    kept = [sample for sample in candidates if sample is not None]
    return InMemoryDataset(kept)
def load_hiv_dataset(data_path, task_names=None):
    """Read the HIV csv table under ``<data_path>/raw`` into a dataset.

    Description:

        The data file contains a csv table, in which columns below are used:

            smiles: SMILES representation of the molecular structure

            activity: Three-class labels for screening results: CI/CM/CA.

            HIV_active: Binary labels for screening results: 1 (CA/CM) and
            0 (CI)

        Label values equal to 0 are replaced by -1.

    Args:
        data_path(str): dataset root; the csv file is the first entry that
            ``os.listdir`` reports inside its ``raw`` sub-directory.
        task_names(list): a list of header names to specify the columns to
            fetch from the csv file. Falls back to
            ``get_default_hiv_task_names()`` when ``None``.

    Returns:
        an InMemoryDataset instance.

    Example:
        .. code-block:: python

            dataset = load_hiv_dataset('./hiv')
            print(len(dataset))

    References:

    [1] AIDS Antiviral Screen Data.
    https://wiki.nci.nih.gov/display/NCIDTPdata/AIDS+Antiviral+Screen+Data
    """
    task_names = get_default_hiv_task_names() if task_names is None else task_names

    raw_dir = join(data_path, 'raw')
    first_entry = os.listdir(raw_dir)[0]
    table = pd.read_csv(join(raw_dir, first_entry), sep=',')
    smiles_column = table['smiles']
    # convert 0 to -1; there are no nans
    label_frame = table[task_names].replace(0, -1)

    samples = [
        {'smiles': smiles_column[row], 'label': label_frame.values[row]}
        for row in range(len(smiles_column))
    ]
    return InMemoryDataset(samples)
def load_bace_dataset(data_path, task_names=None, featurizer=None):
    """Load the BACE dataset from a raw csv file into an ``InMemoryDataset``.

    The data file contains a csv table, in which columns below are used:

    :mol: The smile representation of the molecular structure;
    :pIC50: The negative log of the IC50 binding affinity;
    :class: The binary labels for inhibitor.
    :Valid ratio: 1.0.
    :Task evaluated: 1/1.

    Label values equal to 0 are replaced by -1.

    Args:
        data_path(str): directory that holds the raw csv file; the first
            entry returned by ``os.listdir`` is read.
        task_names(list): header names of the label columns to fetch from
            the csv file. Defaults to ``get_default_bace_task_names()``.
        featurizer: optional object exposing ``gen_features(raw_data)``.
            When given, it is applied to every ``{'smiles', 'label'}``
            record; records for which it returns ``None`` are dropped.

    Returns:
        dataset(InMemoryDataset): wraps the list of kept records.

    References:
    [1] Subramanian, Govindan, et al. “Computational modeling of
    β-secretase 1 (BACE-1) inhibitors using ligand based approaches.”
    Journal of chemical information and modeling 56.10 (2016): 1936-1949.
    """
    if task_names is None:
        task_names = get_default_bace_task_names()

    # NOTE: os.listdir gives no ordering guarantee, so data_path is
    # expected to contain exactly one csv file.
    csv_file = os.listdir(data_path)[0]
    input_df = pd.read_csv(join(data_path, csv_file), sep=',')
    smiles_list = input_df['mol']
    # convert 0 to -1; there are no nans
    labels = input_df[task_names].replace(0, -1)
    # Materialise the label matrix once instead of on every iteration.
    label_rows = labels.values

    data_list = []
    # zip pairs smiles and labels positionally, row by row.
    for smiles, label in zip(smiles_list, label_rows):
        raw_data = {'smiles': smiles, 'label': label}
        if featurizer is not None:
            data = featurizer.gen_features(raw_data)
        else:
            data = raw_data
        if data is not None:
            data_list.append(data)
    return InMemoryDataset(data_list)
def load_freesolv_dataset(data_path, task_names=None):
    """Read the FreeSolv csv table under ``<data_path>/raw`` into a dataset.

    Description:

        The data file contains a csv table. The ``smiles`` column (SMILES
        representation of the molecular structure) and the columns named in
        ``task_names`` (used as label, passed through unchanged) are read.

    Args:
        data_path(str): dataset root; the csv file is the first entry that
            ``os.listdir`` reports inside its ``raw`` sub-directory.
        task_names(list): a list of header names to specify the columns to
            fetch from the csv file. Falls back to
            ``get_default_freesolv_task_names()`` when ``None``.

    Returns:
        an InMemoryDataset instance.

    Example:
        .. code-block:: python

            dataset = load_freesolv_dataset('./freesolv')
            print(len(dataset))

    References:

    [1] Mobley, David L., and J. Peter Guthrie. "FreeSolv: a database of
    experimental and calculated hydration free energies, with input files."
    Journal of computer-aided molecular design 28.7 (2014): 711-720.

    [2] https://github.com/MobleyLab/FreeSolv
    """
    task_names = get_default_freesolv_task_names() if task_names is None else task_names

    raw_dir = join(data_path, 'raw')
    first_entry = os.listdir(raw_dir)[0]
    table = pd.read_csv(join(raw_dir, first_entry), sep=',')
    smiles_column = table['smiles']
    label_frame = table[task_names]

    samples = []
    for row in range(len(label_frame)):
        sample = {}
        sample['smiles'] = smiles_column[row]
        sample['label'] = label_frame.values[row]
        samples.append(sample)
    return InMemoryDataset(samples)
def load_hiv_dataset(data_path, task_names=None, featurizer=None):
    """Load the HIV dataset from a raw csv file into an ``InMemoryDataset``.

    The data file contains a csv table, in which columns below are used:

    :smiles: SMILES representation of the molecular structure
    :activity: Three-class labels for screening results: CI/CM/CA
    :HIV_active: Binary labels for screening results: 1 (CA/CM) and 0 (CI)
    :Valid ratio: 1.0
    :Task evaluated: 1/1

    Label values equal to 0 are replaced by -1.

    Args:
        data_path(str): directory that holds the raw csv file; the first
            entry returned by ``os.listdir`` is read.
        task_names(list): header names of the label columns to fetch from
            the csv file. Defaults to ``get_default_hiv_task_names()``.
        featurizer: optional object exposing ``gen_features(raw_data)``.
            When given, it is applied to every ``{'smiles', 'label'}``
            record; records for which it returns ``None`` are dropped.

    Returns:
        dataset(InMemoryDataset): wraps the list of kept records.

    References:
    [1] AIDS Antiviral Screen Data.
    https://wiki.nci.nih.gov/display/NCIDTPdata/AIDS+Antiviral+Screen+Data
    """
    if task_names is None:
        task_names = get_default_hiv_task_names()

    # NOTE: os.listdir gives no ordering guarantee, so data_path is
    # expected to contain exactly one csv file.
    csv_file = os.listdir(data_path)[0]
    input_df = pd.read_csv(join(data_path, csv_file), sep=',')
    smiles_list = input_df['smiles']
    # convert 0 to -1; there are no nans
    labels = input_df[task_names].replace(0, -1)
    # Materialise the label matrix once instead of on every iteration.
    label_rows = labels.values

    data_list = []
    # zip pairs smiles and labels positionally, row by row.
    for smiles, label in zip(smiles_list, label_rows):
        raw_data = {'smiles': smiles, 'label': label}
        if featurizer is not None:
            data = featurizer.gen_features(raw_data)
        else:
            data = raw_data
        if data is not None:
            data_list.append(data)
    return InMemoryDataset(data_list)
def load_tox21_dataset(data_path, task_names=None, featurizer=None):
    """Load the Tox21 dataset from a raw csv file into an ``InMemoryDataset``.

    The data file contains a csv table, in which columns below are used:

    :smiles: SMILES representation of the molecular structure.
    :NR-XXX: Nuclear receptor signaling bioassays results.
    :SR-XXX: Stress response bioassays results
    :Valid ratio: we get two ratio: 0.751, 0.760
    :Task evaluated: 12/12

    Label values equal to 0 are replaced by -1, and missing labels (NaN)
    are then filled with 0.

    Args:
        data_path(str): directory that holds the raw csv file; the first
            entry returned by ``os.listdir`` is read.
        task_names(list): header names of the label columns to fetch from
            the csv file. Defaults to ``get_default_tox21_task_names()``.
        featurizer: optional object exposing ``gen_features(raw_data)``.
            When given, it is applied to every ``{'smiles', 'label'}``
            record; records for which it returns ``None`` are dropped.

    Returns:
        dataset(InMemoryDataset): wraps the list of kept records.

    References:
    [1] Tox21 Challenge. https://tripod.nih.gov/tox21/challenge/
    [2] please refer to the links at
    https://tripod.nih.gov/tox21/challenge/data.jsp for details.
    """
    if task_names is None:
        task_names = get_default_tox21_task_names()

    # NOTE: os.listdir gives no ordering guarantee, so data_path is
    # expected to contain exactly one csv file.
    csv_file = os.listdir(data_path)[0]
    input_df = pd.read_csv(join(data_path, csv_file), sep=',')
    smiles_list = input_df['smiles']
    labels = input_df[task_names]
    labels = labels.replace(0, -1)  # convert 0 to -1
    labels = labels.fillna(0)  # convert nan to 0 (must follow the replace)
    # Materialise the label matrix once instead of on every iteration.
    label_rows = labels.values

    data_list = []
    # zip pairs smiles and labels positionally, row by row.
    for smiles, label in zip(smiles_list, label_rows):
        raw_data = {'smiles': smiles, 'label': label}
        if featurizer is not None:
            data = featurizer.gen_features(raw_data)
        else:
            data = raw_data
        if data is not None:
            data_list.append(data)
    return InMemoryDataset(data_list)
def load_sider_dataset(data_path, task_names=None, featurizer=None):
    """Load the SIDER dataset from a raw csv file into an ``InMemoryDataset``.

    The data file contains a csv table, in which columns below are used:

    :smiles: SMILES representation of the molecular structure.
    :Hepatobiliary disorders ~ Injury, poisoning and procedural
        complications: Recorded side effects for the drug
    :Valid ratio: 1.0
    :Task evaluated: 27/27

    Label values equal to 0 are replaced by -1.

    Args:
        data_path(str): directory that holds the raw csv file; the first
            entry returned by ``os.listdir`` is read.
        task_names(list): header names of the label columns to fetch from
            the csv file. Defaults to ``get_default_sider_task_names()``.
        featurizer: optional object exposing ``gen_features(raw_data)``.
            When given, it is applied to every ``{'smiles', 'label'}``
            record; records for which it returns ``None`` are dropped.

    Returns:
        dataset(InMemoryDataset): wraps the list of kept records.

    References:
    [1] Kuhn, Michael, et al. “The SIDER database of drugs and side
    effects.” Nucleic acids research 44.D1 (2015): D1075-D1079.
    [2] Altae-Tran, Han, et al. “Low data drug discovery with one-shot
    learning.” ACS central science 3.4 (2017): 283-293.
    [3] Medical Dictionary for Regulatory Activities. http://www.meddra.org/
    [4] Please refer to http://sideeffects.embl.de/se/?page=98 for details
    on ADRs.
    """
    if task_names is None:
        task_names = get_default_sider_task_names()

    # NOTE: os.listdir gives no ordering guarantee, so data_path is
    # expected to contain exactly one csv file.
    csv_file = os.listdir(data_path)[0]
    input_df = pd.read_csv(join(data_path, csv_file), sep=',')
    smiles_list = input_df['smiles']
    labels = input_df[task_names].replace(0, -1)  # convert 0 to -1
    # Materialise the label matrix once instead of on every iteration.
    label_rows = labels.values

    data_list = []
    # zip pairs smiles and labels positionally, row by row.
    for smiles, label in zip(smiles_list, label_rows):
        raw_data = {'smiles': smiles, 'label': label}
        if featurizer is not None:
            data = featurizer.gen_features(raw_data)
        else:
            data = raw_data
        if data is not None:
            data_list.append(data)
    return InMemoryDataset(data_list)
def load_muv_dataset(data_path, task_names=None, featurizer=None):
    """Load the MUV dataset from a raw csv file into an ``InMemoryDataset``.

    The data file contains a csv table, in which columns below are used:

    :smiles: SMILES representation of the molecular structure.
    :mol_id: PubChem CID of the compound.
    :MUV-XXX: Measured results (Active/Inactive) for bioassays.
    :Valid ratio: we get two ratio: 0.155, 0.160
    :Task evaluated: we get two values: 15/17, 16/17

    Label values equal to 0 are replaced by -1, and missing labels (NaN)
    are then filled with 0.

    Args:
        data_path(str): directory that holds the raw csv file; the first
            entry returned by ``os.listdir`` is read.
        task_names(list): header names of the label columns to fetch from
            the csv file. Defaults to ``get_default_muv_task_names()``.
        featurizer: optional object exposing ``gen_features(raw_data)``.
            When given, it is applied to every ``{'smiles', 'label'}``
            record; records for which it returns ``None`` are dropped.

    Returns:
        dataset(InMemoryDataset): wraps the list of kept records.

    References:
    [1] Rohrer, Sebastian G., and Knut Baumann. “Maximum unbiased
    validation (MUV) data sets for virtual screening based on PubChem
    bioactivity data.” Journal of chemical information and modeling 49.2
    (2009): 169-184.
    """
    if task_names is None:
        task_names = get_default_muv_task_names()

    # NOTE: os.listdir gives no ordering guarantee, so data_path is
    # expected to contain exactly one csv file.
    csv_file = os.listdir(data_path)[0]
    input_df = pd.read_csv(join(data_path, csv_file), sep=',')
    smiles_list = input_df['smiles']
    labels = input_df[task_names]
    labels = labels.replace(0, -1)  # convert 0 to -1
    labels = labels.fillna(0)  # convert nan to 0 (must follow the replace)
    # Materialise the label matrix once instead of on every iteration.
    label_rows = labels.values

    data_list = []
    # zip pairs smiles and labels positionally, row by row.
    for smiles, label in zip(smiles_list, label_rows):
        raw_data = {'smiles': smiles, 'label': label}
        if featurizer is not None:
            data = featurizer.gen_features(raw_data)
        else:
            data = raw_data
        if data is not None:
            data_list.append(data)
    return InMemoryDataset(data_list)
def load_lipophilicity_dataset(data_path, task_names=None):
    """Read the lipophilicity csv table under ``<data_path>/raw``.

    Description:

        The data file contains a csv table, in which columns below are used:

            smiles: SMILES representation of the molecular structure

            exp: Measured octanol/water distribution coefficient (logD) of
            the compound, used as label

    Args:
        data_path(str): dataset root; the csv file is the first entry that
            ``os.listdir`` reports inside its ``raw`` sub-directory.
        task_names(list): a list of header names to specify the columns to
            fetch from the csv file. Falls back to
            ``get_default_lipophilicity_task_names()`` when ``None``.

    Returns:
        an InMemoryDataset instance.

    Example:
        .. code-block:: python

            dataset = load_lipophilicity_dataset('./lipophilicity')
            print(len(dataset))

    References:

    [1] Hersey, A. ChEMBL Deposited Data Set - AZ dataset; 2015.
    https://doi.org/10.6019/chembl3301361
    """
    if task_names is None:
        task_names = get_default_lipophilicity_task_names()

    raw_dir = join(data_path, 'raw')
    first_entry = os.listdir(raw_dir)[0]
    table = pd.read_csv(join(raw_dir, first_entry), sep=',')
    smiles_column = table['smiles']
    label_frame = table[task_names]

    samples = []
    for row in range(len(label_frame)):
        sample = {}
        sample['smiles'] = smiles_column[row]
        sample['label'] = label_frame.values[row]
        samples.append(sample)
    return InMemoryDataset(samples)
def test_split(self):
    """Check that IndexSplitter partitions a dataset without losing items.

    Thirteen records (three distinct SMILES repeated 4, 5 and 4 times) are
    split with fractions 0.34 / 0.33 / 0.33; the sizes of the three parts
    must add up to the size of the original dataset.
    """
    repeated_smiles = [
        ('CCOc1ccc2nc(S(N)(=O)=O)sc2c1', 4),
        ('CC(C)CCCCCCCOP(OCCCCCCCC(C)C)Oc1ccccc1', 5),
        ('CCCCCCCCCCOCC(O)CN', 4),
    ]
    # One fresh dict per record, in the same order as the counts above.
    raw_data_list = [
        {'smiles': smiles}
        for smiles, copies in repeated_smiles
        for _ in range(copies)
    ]
    dataset = InMemoryDataset(raw_data_list)

    parts = IndexSplitter().split(
        dataset, frac_train=0.34, frac_valid=0.33, frac_test=0.33)
    train_dataset, valid_dataset, test_dataset = parts

    total = len(train_dataset) + len(valid_dataset) + len(test_dataset)
    self.assertEqual(total, len(dataset))
def main(args):
    """Train an InfoGraph model and dump embeddings after every epoch.

    Reads the model configuration from the JSON file ``args.config`` and
    loads the processed dataset from ``<args.root>/<args.dataset>/processed``.
    Embeddings are saved once before training (epoch id ``-1``) and again
    after each of the ``args.max_epoch`` epochs. Model parameters are saved
    after each epoch as ``epoch_<id>.pdparams`` when ``args.model_dir`` is
    set; checkpointing is skipped when it is ``None``.

    Args:
        args: parsed command-line namespace. The attributes read here are
            ``config``, ``root``, ``dataset``, ``emb_dir``, ``model_dir``,
            ``lr`` and ``max_epoch``; ``train`` and ``save_embedding`` also
            receive the whole namespace.
    """
    with open(args.config, 'r') as f:
        config = json.load(f)

    logging.info('Load data ...')
    dataset = InMemoryDataset(npz_data_path=os.path.join(
        args.root, args.dataset, 'processed'))
    # Training batches: positive/negative masks, no graph labels.
    collate_fn = MoleculeCollateFunc(
        config['atom_names'],
        config['bond_names'],
        with_graph_label=False,
        with_pos_neg_mask=True)
    # Embedding batches: graph labels, no positive/negative masks.
    eval_collate_fn = MoleculeCollateFunc(
        config['atom_names'],
        config['bond_names'],
        with_graph_label=True,
        with_pos_neg_mask=False)

    logging.info("Data loaded.")
    logging.info("Train Examples: %s", len(dataset))
    sys.stdout.flush()

    if args.emb_dir is not None:
        # pylint: disable=E1123
        os.makedirs(args.emb_dir, exist_ok=True)

    if args.model_dir is not None:
        # pylint: disable=E1123
        os.makedirs(args.model_dir, exist_ok=True)

    model = InfoGraph(config)
    criterion = InfoGraphCriterion(config)
    optimizer = paddle.optimizer.Adam(
        learning_rate=args.lr,
        parameters=model.parameters())

    # Embeddings of the untrained model, tagged with epoch id -1.
    save_embedding(args, model, dataset, eval_collate_fn, -1)
    for epoch_id in range(args.max_epoch):
        train_loss = train(args, model, criterion, optimizer,
                           dataset, collate_fn, epoch_id)
        logging.info('Epoch %d, train/loss: %f', epoch_id, train_loss)

        # model_dir is optional (see the makedirs guard above); joining a
        # None path would raise TypeError, so only checkpoint when set.
        if args.model_dir is not None:
            pdparams = os.path.join(args.model_dir,
                                    'epoch_%d.pdparams' % epoch_id)
            paddle.save(model.state_dict(), pdparams)

        save_embedding(args, model, dataset, eval_collate_fn, epoch_id)
def load_freesolv_dataset(data_path, task_names=None, featurizer=None):
    """Load the FreeSolv dataset from a raw csv file into an ``InMemoryDataset``.

    The data file contains a csv table. The ``smiles`` column (SMILES
    representation of the molecular structure) supplies the molecule
    strings and the columns listed in ``task_names`` supply the labels,
    which are passed through unchanged.

    Args:
        data_path(str): directory that holds the raw csv file; the first
            entry returned by ``os.listdir`` is read.
        task_names(list): header names of the label columns to fetch from
            the csv file. Defaults to ``get_default_freesolv_task_names()``.
        featurizer: optional object exposing ``gen_features(raw_data)``.
            When given, it is applied to every ``{'smiles', 'label'}``
            record; records for which it returns ``None`` are dropped.

    Returns:
        dataset(InMemoryDataset): wraps the list of kept records.

    References:
    [1] Mobley, David L., and J. Peter Guthrie. "FreeSolv: a database of
    experimental and calculated hydration free energies, with input files."
    Journal of computer-aided molecular design 28.7 (2014): 711-720.
    [2] https://github.com/MobleyLab/FreeSolv
    """
    if task_names is None:
        task_names = get_default_freesolv_task_names()

    # NOTE: os.listdir gives no ordering guarantee, so data_path is
    # expected to contain exactly one csv file.
    csv_file = os.listdir(data_path)[0]
    input_df = pd.read_csv(join(data_path, csv_file), sep=',')
    smiles_list = input_df['smiles']
    # Materialise the label matrix once instead of on every iteration.
    label_rows = input_df[task_names].values

    data_list = []
    # zip pairs smiles and labels positionally, row by row.
    for smiles, label in zip(smiles_list, label_rows):
        raw_data = {'smiles': smiles, 'label': label}
        if featurizer is not None:
            data = featurizer.gen_features(raw_data)
        else:
            data = raw_data
        if data is not None:
            data_list.append(data)
    return InMemoryDataset(data_list)
def load_esol_dataset(data_path, task_names=None, featurizer=None):
    """Load the ESOL dataset from a raw csv file into an ``InMemoryDataset``.

    The data file contains a csv table, in which columns below are used:

    :smiles: SMILES representation of the molecular structure
    :Compound ID: Name of the compound
    :measured log solubility in mols per litre: Log-scale water solubility
        of the compound, used as label

    Labels are passed through unchanged.

    Args:
        data_path(str): directory that holds the raw csv file; the first
            entry returned by ``os.listdir`` is read.
        task_names(list): header names of the label columns to fetch from
            the csv file. Defaults to ``get_default_esol_task_names()``.
        featurizer: optional object exposing ``gen_features(raw_data)``.
            When given, it is applied to every ``{'smiles', 'label'}``
            record; records for which it returns ``None`` are dropped.

    Returns:
        dataset(InMemoryDataset): wraps the list of kept records.

    References:
    [1] Delaney, John S. "ESOL: estimating aqueous solubility directly from
    molecular structure." Journal of chemical information and computer
    sciences 44.3 (2004): 1000-1005.
    """
    if task_names is None:
        task_names = get_default_esol_task_names()

    # NB: some examples have multiple species
    # NOTE: os.listdir gives no ordering guarantee, so data_path is
    # expected to contain exactly one csv file.
    csv_file = os.listdir(data_path)[0]
    input_df = pd.read_csv(join(data_path, csv_file), sep=',')
    smiles_list = input_df['smiles']
    # Materialise the label matrix once instead of on every iteration.
    label_rows = input_df[task_names].values

    data_list = []
    # zip pairs smiles and labels positionally, row by row.
    for smiles, label in zip(smiles_list, label_rows):
        raw_data = {'smiles': smiles, 'label': label}
        if featurizer is not None:
            data = featurizer.gen_features(raw_data)
        else:
            data = raw_data
        if data is not None:
            data_list.append(data)
    return InMemoryDataset(data_list)
def load_ppi_dataset(data_path, task_names=None, featurizer=None):
    """Load the ppi dataset as a collection of protein-name pairs.

    Description:

        The data file is a space-separated text table, in which columns
        below are used:

            protein1: protein1 name

            protein2: protein2 name

        Every row becomes one record ``{'pair': (protein1, protein2)}``.

    Args:
        data_path(str): a directory; the first entry returned by
            ``os.listdir(data_path)`` is read as the txt file. Since
            ``os.listdir`` gives no ordering guarantee, the directory should
            contain only that file.
        task_names(list): a list of header names. Accepted for signature
            parity with the other loaders; it is not used to select columns
            here.
        featurizer: accepted for signature parity with the other loaders;
            it is not applied to the records.

    Returns:
        an InMemoryDataset instance.

    Example:
        .. code-block:: python

            dataset = load_ppi_dataset('./ppi/raw')
            print(len(dataset))

    """
    if task_names is None:
        # Resolved as before for parity with the other loaders; the value is
        # not consumed below.
        task_names = get_default_ppi_task_names()

    txt_file = os.listdir(data_path)[0]
    input_df = pd.read_csv(join(data_path, txt_file), sep=' ')

    # there are no nans
    # Walk both columns in lockstep instead of doing two scalar ``.loc``
    # lookups per row.
    protein_pairs = zip(input_df['protein1'].values,
                        input_df['protein2'].values)
    data_list = [
        {'pair': (protein1, protein2)}
        for protein1, protein2 in protein_pairs
    ]
    return InMemoryDataset(data_list)
def load_lipophilicity_dataset(data_path, task_names=None, featurizer=None):
    """Load the lipophilicity dataset from a csv file into an InMemoryDataset.

    Description:

        The data file contains a csv table, in which columns below are used:

            smiles: SMILES representation of the molecular structure

            exp: Measured octanol/water distribution coefficient (logD) of
                the compound, used as label

        Only ``smiles`` and the columns named in ``task_names`` are actually
        read by this function.

    Args:
        data_path(str): a directory; the first entry returned by
            ``os.listdir(data_path)`` is read as the csv file. Since
            ``os.listdir`` gives no ordering guarantee, the directory should
            contain only that file.
        task_names(list): a list of header names to specify the columns to
            fetch from the csv file. Defaults to
            ``get_default_lipophilicity_task_names()``.
        featurizer(pahelix.featurizers.Featurizer): the featurizer to use for
            processing the data. If not none, ``Featurizer.gen_features`` is
            applied to every ``{'smiles': ..., 'label': ...}`` record, and
            records for which it returns None are dropped.

    Returns:
        an InMemoryDataset instance.

    References:

    [1] Hersey, A. ChEMBL Deposited Data Set - AZ dataset; 2015.
    https://doi.org/10.6019/chembl3301361

    """
    if task_names is None:
        task_names = get_default_lipophilicity_task_names()

    csv_file = os.listdir(data_path)[0]
    input_df = pd.read_csv(join(data_path, csv_file), sep=',')
    smiles_values = input_df['smiles'].values
    # Build the label array once; ``DataFrame.values`` is not free and was
    # previously re-evaluated on every loop iteration.
    label_values = input_df[task_names].values

    data_list = []
    for smiles, label in zip(smiles_values, label_values):
        raw_data = {'smiles': smiles, 'label': label}
        if featurizer is not None:
            data = featurizer.gen_features(raw_data)
        else:
            data = raw_data
        # A featurizer signals "skip this example" by returning None.
        if data is not None:
            data_list.append(data)
    return InMemoryDataset(data_list)
def load_ogbg_molpcba_dataset(data_path, task_names=None):
    """Load the ogbg-molpcba dataset into an InMemoryDataset.

    Description:

        The table is read from ``<data_path>/mapping/mol.csv.gz`` with ``,``
        as separator. Columns used:

            smiles: SMILES representation of the molecular structure

            the columns named in ``task_names``: used as the label

        Labels are remapped before being stored: every 0 becomes -1, and
        every missing value (NaN) then becomes 0.

    Args:
        data_path(str): the dataset root directory, which must contain
            ``mapping/mol.csv.gz``.
        task_names(list): a list of header names to specify the columns to
            fetch from the csv file. Defaults to
            ``get_default_ogbg_molpcba_task_names(data_path)``.

    Returns:
        an InMemoryDataset instance whose records are
        ``{'smiles': ..., 'label': ...}`` dicts.

    """
    if task_names is None:
        task_names = get_default_ogbg_molpcba_task_names(data_path)

    csv_file = os.path.join(data_path, "mapping", "mol.csv.gz")
    input_df = pd.read_csv(csv_file, sep=',')
    smiles_values = input_df['smiles'].values
    labels = input_df[task_names]
    # Order matters: 0 -> -1 first, then NaN -> 0, so that the 0 written for
    # missing entries is not turned into -1.
    labels = labels.replace(0, -1)
    labels = labels.fillna(0)
    # Build the label array once rather than on every loop iteration.
    label_values = labels.values

    data_list = [
        {'smiles': smiles, 'label': label}
        for smiles, label in zip(smiles_values, label_values)
    ]
    return InMemoryDataset(data_list)
def load_qm7_dataset(data_path, task_names=None):
    """Load the qm7 dataset from a csv file into an InMemoryDataset.

    Description:

        The table is read from ``<data_path>/raw/qm7.csv`` with ``,`` as
        separator. Columns used:

            smiles: SMILES representation of the molecular structure

            the columns named in ``task_names``: used as the label

        Note kept from the original author (presumably statistics of the
        default label; not verified here):
        min/max/mean: -2192.0/-404.88/-1544.8360893118595

    Args:
        data_path(str): the dataset root directory, which must contain
            ``raw/qm7.csv``.
        task_names(list): a list of header names to specify the columns to
            fetch from the csv file. Defaults to
            ``get_default_qm7_task_names()``.

    Returns:
        an InMemoryDataset instance whose records are
        ``{'smiles': ..., 'label': ...}`` dicts.

    """
    if task_names is None:
        task_names = get_default_qm7_task_names()

    csv_file = join(data_path, 'raw/qm7.csv')
    input_df = pd.read_csv(csv_file, sep=',')
    smiles_values = input_df['smiles'].values
    # Build the label array once rather than on every loop iteration.
    label_values = input_df[task_names].values

    data_list = [
        {'smiles': smiles, 'label': label}
        for smiles, label in zip(smiles_values, label_values)
    ]
    return InMemoryDataset(data_list)
def load_qm9_dataset(data_path, task_names=None):
    """Load the qm9 dataset from a csv file into an InMemoryDataset.

    Description:

        The table is read from ``<data_path>/raw/qm9.csv`` with ``,`` as
        separator. Columns used:

            smiles: SMILES representation of the molecular structure

            the columns named in ``task_names``: used as the label

    Args:
        data_path(str): the dataset root directory, which must contain
            ``raw/qm9.csv``.
        task_names(list): a list of header names to specify the columns to
            fetch from the csv file. Defaults to
            ``get_default_qm9_task_names()``.

    Returns:
        an InMemoryDataset instance whose records are
        ``{'smiles': ..., 'label': ...}`` dicts.

    """
    if task_names is None:
        task_names = get_default_qm9_task_names()

    csv_file = join(data_path, 'raw/qm9.csv')
    input_df = pd.read_csv(csv_file, sep=',')
    smiles_values = input_df['smiles'].values
    # Build the label array once rather than on every loop iteration.
    label_values = input_df[task_names].values

    data_list = [
        {'smiles': smiles, 'label': label}
        for smiles, label in zip(smiles_values, label_values)
    ]
    return InMemoryDataset(data_list)
def load_bbbp_dataset(data_path, task_names=None, featurizer=None):
    """Load the bbbp dataset from a csv file into an InMemoryDataset.

    Description:

        The csv table is read with ``,`` as separator. Columns used:

            smiles: SMILES representation of the molecular structure

            the columns named in ``task_names``: used as the label

        Every SMILES string is round-tripped through RDKit
        (``MolFromSmiles`` followed by ``MolToSmiles``); rows whose SMILES
        RDKit cannot parse are skipped. In the labels every 0 is replaced
        by -1.

    Args:
        data_path(str): a directory; the first entry returned by
            ``os.listdir(data_path)`` is read as the csv file. Since
            ``os.listdir`` gives no ordering guarantee, the directory should
            contain only that file.
        task_names(list): a list of header names to specify the columns to
            fetch from the csv file. Defaults to
            ``get_default_bbbp_task_names()``.
        featurizer(pahelix.featurizers.Featurizer): the featurizer to use for
            processing the data. If not none, ``Featurizer.gen_features`` is
            applied to every ``{'smiles': ..., 'label': ...}`` record, and
            records for which it returns None are dropped.

    Returns:
        an InMemoryDataset instance.

    """
    from rdkit.Chem import AllChem

    if task_names is None:
        task_names = get_default_bbbp_task_names()

    csv_file = os.listdir(data_path)[0]
    input_df = pd.read_csv(join(data_path, csv_file), sep=',')

    # Keep a None placeholder for unparsable molecules so that positions
    # stay aligned with the label rows.
    rdkit_mols = [AllChem.MolFromSmiles(s) for s in input_df['smiles']]
    smiles_list = [
        AllChem.MolToSmiles(mol) if mol is not None else None
        for mol in rdkit_mols
    ]

    # convert 0 to -1; there are no nans
    # Build the label array once rather than on every loop iteration.
    label_values = input_df[task_names].replace(0, -1).values

    data_list = []
    for smiles, label in zip(smiles_list, label_values):
        if smiles is None:
            continue
        raw_data = {'smiles': smiles, 'label': label}
        if featurizer is not None:
            data = featurizer.gen_features(raw_data)
        else:
            data = raw_data
        # A featurizer signals "skip this example" by returning None.
        if data is not None:
            data_list.append(data)
    return InMemoryDataset(data_list)