def __init__(self, config: Config, dataset_label: str) -> None:
    self.batch_size = config["batch_size"]
    # Resolve the input/label files for this dataset split,
    # e.g. "training_inputs" and "training_labels"
    input_filename = config.filename(dataset_label + "_inputs")
    label_filename = config.filename(dataset_label + "_labels")
    self.input_matrix = self._load_data(input_filename)
    self.label_matrix = self._load_data(label_filename)
    self.input_dim = self.input_matrix.shape[1]
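# The _load_data helper used above is not shown in these snippets. A minimal
# sketch, assuming the matrices are stored either as SciPy sparse .npz files
# or as compressed NumPy archives (both storage formats are assumptions):
import numpy as np
from scipy import sparse

def _load_data(filename: str):
    try:
        # Try a SciPy sparse matrix first ...
        return sparse.load_npz(filename)
    except ValueError:
        # ... fall back to a dense array stored with np.savez
        return np.load(filename)["arr_0"]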
def _save_unique_templates(dataset: pd.DataFrame, config: Config) -> None:
    # Count occurrences of each template, preserving first-appearance order
    # so the counts line up with the de-duplicated rows below
    template_group = dataset.groupby("template_hash", sort=False).size()
    dataset = dataset[["retro_template", "template_code"] +
                      config["metadata_headers"]]
    if "classification" in dataset.columns:
        # assign() avoids pandas chained-assignment warnings on the slice
        dataset = dataset.assign(
            classification=dataset["classification"].fillna("-"))
    dataset = dataset.drop_duplicates(subset="template_code", keep="first")
    dataset["library_occurence"] = template_group.values
    dataset.set_index("template_code", inplace=True)
    dataset = dataset.sort_index()
    dataset.to_hdf(config.filename("unique_templates"), key="table")
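# Usage sketch for _save_unique_templates with a stubbed Config (the real
# Config class is not shown; FakeConfig here is purely hypothetical):
import pandas as pd

class FakeConfig(dict):
    def filename(self, label: str) -> str:
        return f"{label}.hdf5"

data = pd.DataFrame({
    "template_hash": ["a", "a", "b"],
    "retro_template": ["T1", "T1", "T2"],
    "template_code": [0, 0, 1],
})
_save_unique_templates(data, FakeConfig(metadata_headers=[]))
# -> writes unique_templates.hdf5 with one row per template_code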
def _filter_dataset(config: Config) -> pd.DataFrame:
    filename = config.filename("raw_library")
    if not os.path.exists(filename):
        raise FileNotFoundError(
            f"The file {filename} is missing - cannot proceed without the full template library."
        )

    # Skipping the last header as it is not available in the raw data
    full_data = pd.read_csv(
        filename,
        index_col=False,
        header=None,
        names=config["library_headers"][:-1],
    )

    if config["remove_unsanitizable_products"]:
        products = full_data["products"].to_numpy()
        idx = np.apply_along_axis(is_sanitizable, 0, [products])
        full_data = full_data[idx]

    full_data = full_data.drop_duplicates(subset="reaction_hash")
    # Keep only templates that occur at least "template_occurrence" times
    template_group = full_data.groupby("template_hash")
    template_group = template_group.size().sort_values(ascending=False)
    min_index = template_group[
        template_group >= config["template_occurrence"]].index
    dataset = full_data[full_data["template_hash"].isin(min_index)]

    # Encode each template hash as an integer class label for training
    template_labels = LabelEncoder()
    dataset = dataset.assign(
        template_code=template_labels.fit_transform(dataset["template_hash"]))
    dataset.to_csv(
        config.filename("library"),
        mode="w",
        header=False,
        index=False,
    )
    return dataset
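# is_sanitizable is not defined in these snippets. A plausible sketch with
# RDKit (an assumption about the intended check): apply_along_axis passes a
# length-1 column holding one product SMILES, and the row is kept only if
# RDKit can parse and sanitize it.
from rdkit import Chem

def is_sanitizable(column) -> bool:
    return Chem.MolFromSmiles(column[0]) is not None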
def _setup_callbacks(config: Config) -> List[Any]:
    # Stop when validation loss has not improved for 10 epochs
    early_stopping = EarlyStopping(monitor="val_loss", patience=10)
    csv_logger = CSVLogger(config.filename("_keras_training.log"), append=True)

    checkpoint_path = os.path.join(config["output_path"], "checkpoints")
    os.makedirs(checkpoint_path, exist_ok=True)
    # Save only the best model (lowest training loss) seen so far
    checkpoint = ModelCheckpoint(
        os.path.join(checkpoint_path, "keras_model.hdf5"),
        monitor="loss",
        save_best_only=True,
    )

    # Halve the learning rate when validation loss plateaus for 5 epochs
    reduce_lr = ReduceLROnPlateau(
        monitor="val_loss",
        factor=0.5,
        patience=5,
        verbose=0,
        mode="auto",
        min_delta=0.000001,
        cooldown=0,
        min_lr=0,
    )
    return [early_stopping, csv_logger, checkpoint, reduce_lr]
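# The returned callbacks plug straight into Keras model training. A usage
# sketch (model, train_seq and val_seq are assumed to exist elsewhere):
callbacks = _setup_callbacks(config)
model.fit(
    train_seq,
    validation_data=val_seq,
    epochs=100,
    callbacks=callbacks,
)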
def __init__(self, config: Config, dataset_label: str) -> None:
    super().__init__(config, dataset_label)
    # Two-input variant: load a second input matrix alongside the first
    filename = config.filename(dataset_label + "_inputs2")
    self.input_matrix2 = self._load_data(filename)
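# How the two input matrices are consumed is not shown. A hypothetical Keras
# Sequence __getitem__ for the two-input variant, assuming the matrices are
# SciPy sparse and both inputs are sliced with the same batch rows:
def __getitem__(self, idx: int):
    start, end = idx * self.batch_size, (idx + 1) * self.batch_size
    return (
        [self.input_matrix[start:end].toarray(),
         self.input_matrix2[start:end].toarray()],
        self.label_matrix[start:end].toarray(),
    )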
def _save_unique_templates(dataset: pd.DataFrame, config: Config) -> None:
    # Leaner variant: keep only the template string and its integer code
    dataset = dataset[["retro_template", "template_code"]]
    dataset = dataset.drop_duplicates(subset="template_code", keep="first")
    dataset.set_index("template_code", inplace=True)
    dataset = dataset.sort_index()
    dataset.to_hdf(config.filename("unique_templates"), key="table")
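# Reading the saved templates back; "table" is the HDF key used above
# (config is assumed to be the same Config object passed in):
import pandas as pd

templates = pd.read_hdf(config.filename("unique_templates"), key="table")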