Example #1
def test_split_and_save_data_frame(mocker, tmpdir, default_config):
    default_config["output_path"] = str(tmpdir)
    default_config["file_prefix"] = "dummy"
    filename_train = str(
        tmpdir / "dummy" + default_config["file_postfix"]["training_library"]
    )
    filename_valid = str(
        tmpdir / "dummy" + default_config["file_postfix"]["validation_library"]
    )
    filename_test = str(
        tmpdir / "dummy" + default_config["file_postfix"]["testing_library"]
    )
    data = pd.DataFrame.from_dict({"one": np.zeros(100), "two": np.ones(100)})

    split_and_save_data(data, "library", default_config)

    assert os.path.exists(filename_train)
    assert os.path.exists(filename_valid)
    assert os.path.exists(filename_test)

    data_read = pd.read_csv(filename_train, header=None, names=["one", "two"])
    assert len(data_read) == 90

    data_read = pd.read_csv(filename_valid, header=None, names=["one", "two"])
    assert len(data_read) == 5

    data_read = pd.read_csv(filename_test, header=None, names=["one", "two"])
    assert len(data_read) == 5
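
The tests in this section rely on a `default_config` pytest fixture that is not shown. Judging from the keys they read and write, a minimal stand-in could look like the sketch below; the postfix values, and the use of a plain dict rather than the project's richer config object, are assumptions made only so the expected filenames resolve.

import pytest


@pytest.fixture
def default_config():
    # Minimal dict-based stand-in covering only the keys the tests touch.
    # The real fixture is likely a fuller configuration object; the postfix
    # values here are assumptions.
    return {
        "output_path": "",
        "file_prefix": "",
        "file_postfix": {
            "training_library": "_training.csv",
            "validation_library": "_validation.csv",
            "testing_library": "_testing.csv",
            "training_inputs": "_training_inputs.npz",
            "validation_inputs": "_validation_inputs.npz",
            "testing_inputs": "_testing_inputs.npz",
        },
    }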
Example #2
def test_split_and_save_data_sparse(default_config, mocker, tmpdir):
    default_config["output_path"] = str(tmpdir)
    default_config["file_prefix"] = "dummy"
    filename_train = str(
        tmpdir / "dummy" + default_config["file_postfix"]["training_inputs"]
    )
    filename_valid = str(
        tmpdir / "dummy" + default_config["file_postfix"]["validation_inputs"]
    )
    filename_test = str(
        tmpdir / "dummy" + default_config["file_postfix"]["testing_inputs"]
    )
    data = sparse.csr_matrix(np.ones([100, 2]))

    split_and_save_data(data, "inputs", default_config)

    assert os.path.exists(filename_train)
    assert os.path.exists(filename_valid)
    assert os.path.exists(filename_test)

    data_read = sparse.load_npz(str(filename_train))
    assert data_read.shape[0] == 90

    data_read = sparse.load_npz(str(filename_valid))
    assert data_read.shape[0] == 5

    data_read = sparse.load_npz(str(filename_test))
    assert data_read.shape[0] == 5
Example #3
def test_split_and_save_data_ndarray(mocker, tmpdir, default_config):
    default_config["output_path"] = str(tmpdir)
    default_config["file_prefix"] = "dummy"
    filename_train = str(
        tmpdir / "dummy" + default_config["file_postfix"]["training_inputs"]
    )
    filename_valid = str(
        tmpdir / "dummy" + default_config["file_postfix"]["validation_inputs"]
    )
    filename_test = str(
        tmpdir / "dummy" + default_config["file_postfix"]["testing_inputs"]
    )
    data = np.ones([100, 2])

    split_and_save_data(data, "inputs", default_config)

    assert os.path.exists(filename_train)
    assert os.path.exists(filename_valid)
    assert os.path.exists(filename_test)

    data_read = np.load(filename_train)["arr_0"]
    assert len(data_read) == 90

    data_read = np.load(filename_valid)["arr_0"]
    assert len(data_read) == 5

    data_read = np.load(filename_test)["arr_0"]
    assert len(data_read) == 5
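
All three tests assert the same 90/5/5 row split across the training, validation, and testing files, and infer the on-disk format from the data type (CSV for data frames, `save_npz` for sparse matrices, `savez` for arrays). The sketch below is one splitter that would satisfy those assertions; it is illustrative only and not the project's actual `split_and_save_data`, and the `train_frac` default and postfix lookup are assumptions inferred from the tests.

import os

import numpy as np
import pandas as pd
from scipy import sparse


def split_and_save_data_sketch(data, data_type, config, train_frac=0.9):
    # Illustrative 90/5/5 splitter consistent with the assertions above;
    # not the project's actual implementation.
    n_samples = data.shape[0]
    indices = np.random.permutation(n_samples)
    n_train = int(train_frac * n_samples)
    n_valid = (n_samples - n_train) // 2
    subsets = {
        "training": indices[:n_train],
        "validation": indices[n_train : n_train + n_valid],
        "testing": indices[n_train + n_valid :],
    }
    prefix = os.path.join(config["output_path"], config["file_prefix"])
    for subset_name, subset_idx in subsets.items():
        filename = prefix + config["file_postfix"][subset_name + "_" + data_type]
        if isinstance(data, pd.DataFrame):
            # Written without header or index so it re-reads with header=None
            data.iloc[subset_idx].to_csv(filename, header=False, index=False)
        elif sparse.issparse(data):
            sparse.save_npz(filename, data[subset_idx].tocsr(), compressed=True)
        else:
            # np.savez stores the array under the default key "arr_0"
            np.savez(filename, data[subset_idx])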
Example #4
def main() -> None:
    """Entry-point for the preprocess_recommender tool"""
    config = _get_config()

    filename = config.filename("library")
    dataset = pd.read_csv(
        filename,
        index_col=False,
        header=None,
        names=config["library_headers"],
    )

    print("Dataset loaded, generating Labels...", flush=True)
    lb = LabelBinarizer(neg_label=0, pos_label=1, sparse_output=True)
    labels = lb.fit_transform(dataset["template_hash"])
    split_and_save_data(labels, "labels", config)

    print("Labels created and splitted, generating Inputs...", flush=True)
    reactants = dataset["reactants"].to_numpy()
    inputs = np.apply_along_axis(reactants_to_fingerprint, 0, [reactants],
                                 config)
    inputs = sparse.lil_matrix(inputs.T).tocsr()
    split_and_save_data(inputs, "inputs", config)

    print("Inputs created and splitted, splitting Full Dataset...", flush=True)
    split_and_save_data(dataset, "library", config)

    print("Full Dataset splitted, creating unique template set", flush=True)
    _save_unique_templates(dataset, config)
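# Note on the LabelBinarizer step above: with sparse_output=True it one-hot
# encodes the "template_hash" column into a sparse CSR matrix with one row
# per reaction and one column per unique template. A tiny, made-up
# illustration of that behaviour:
#
#     from sklearn.preprocessing import LabelBinarizer
#     lb = LabelBinarizer(neg_label=0, pos_label=1, sparse_output=True)
#     labels = lb.fit_transform(["hash_a", "hash_b", "hash_c", "hash_a"])
#     labels.toarray()
#     # array([[1, 0, 0],
#     #        [0, 1, 0],
#     #        [0, 0, 1],
#     #        [1, 0, 0]])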
def main() -> None:
    """Entry-point for the preprocess_expansion tool"""
    config = _get_config()
    if config["library_headers"][-1] != "template_code":
        config["library_headers"].append("template_code")

    filename = config.filename("library")
    if not os.path.exists(filename):
        dataset = _filter_dataset(config)
    else:
        dataset = pd.read_csv(
            filename,
            index_col=False,
            header=None,
            names=config["library_headers"],
        )

    print("Dataset filtered/loaded, generating labels...", flush=True)
    labelb = LabelBinarizer(neg_label=0, pos_label=1, sparse_output=True)
    labels = labelb.fit_transform(dataset["template_hash"])
    split_and_save_data(labels, "labels", config)

    print("Labels created and split, generating inputs...", flush=True)
    products = dataset["products"].to_numpy()
    inputs = np.apply_along_axis(smiles_to_fingerprint, 0, [products], config)
    inputs = sparse.lil_matrix(inputs.T).tocsr()
    split_and_save_data(inputs, "inputs", config)

    print("Inputs created and split, splitting full Dataset...", flush=True)
    split_and_save_data(dataset, "library", config)

    print("Full Dataset split, creating unique template set", flush=True)
    _save_unique_templates(dataset, config)
def main() -> None:
    """Entry-point for the preprocess_filter tool"""
    config = _get_config()

    true_dataset = pd.read_csv(
        config.filename("library"),
        index_col=False,
        header=None,
        names=config["library_headers"][:-1],
    )
    true_dataset["true_product"] = 1
    false_dataset = pd.read_csv(
        config.filename("false_library"),
        index_col=False,
        header=None,
        names=config["library_headers"][:-1],
    )
    false_dataset["true_product"] = 0
    dataset = true_dataset.append(false_dataset, sort=False)

    print("Dataset loaded, generating Labels...", flush=True)
    labels = dataset["true_product"].to_numpy()
    split_and_save_data(labels, "labels", config)

    print("Labels created and split, generating Inputs...", flush=True)
    products = dataset["products"].to_numpy()
    reactants = dataset["reactants"].to_numpy()
    inputs = np.apply_along_axis(
        reaction_to_fingerprints, 0, [products, reactants], config
    ).astype(np.int8)
    inputs = sparse.lil_matrix(inputs.T).tocsr()
    split_and_save_data(inputs, "inputs2", config)

    inputs = np.apply_along_axis(smiles_to_fingerprint, 0, [products], config).astype(
        np.int8
    )
    inputs = sparse.lil_matrix(inputs.T).tocsr()
    split_and_save_data(inputs, "inputs", config)

    print("Inputs created and split, splitting Full Dataset...", flush=True)
    split_and_save_data(dataset, "library", config)
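
All three entry points build their input matrices the same way: a fingerprint function is applied column-wise with np.apply_along_axis over an array of SMILES strings, and the transposed result is converted to a CSR matrix so each row holds one molecule's fingerprint. The fingerprint functions themselves are not shown above; the sketch below is one plausible smiles_to_fingerprint-style stand-in using RDKit Morgan fingerprints, and the "fingerprint_radius" and "fingerprint_len" config keys are assumptions, not confirmed parts of the project's configuration.

import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from scipy import sparse


def smiles_to_fingerprint_sketch(row, config):
    # Hypothetical stand-in: row is a length-1 slice holding one SMILES string,
    # because apply_along_axis is called with axis=0 on a (1, n) array.
    mol = Chem.MolFromSmiles(row[0])
    bits = AllChem.GetMorganFingerprintAsBitVect(
        mol, config["fingerprint_radius"], nBits=config["fingerprint_len"]
    )
    return np.array(bits, dtype=np.int8)


# Toy usage mirroring the pattern in the entry points above
config = {"fingerprint_radius": 2, "fingerprint_len": 2048}
products = np.array(["CCO", "c1ccccc1", "CC(=O)O"])
inputs = np.apply_along_axis(smiles_to_fingerprint_sketch, 0, [products], config)
inputs = sparse.lil_matrix(inputs.T).tocsr()
print(inputs.shape)  # (3, 2048): one fingerprint row per product SMILES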