def __init__(self, config: Config, dataset_label: str) -> None:
    self.batch_size = config["batch_size"]

    input_filename = config.filename(dataset_label + "_inputs")
    label_filename = config.filename(dataset_label + "_labels")
    self.input_matrix = self._load_data(input_filename)
    self.label_matrix = self._load_data(label_filename)
    self.input_dim = self.input_matrix.shape[1]
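# A minimal sketch of the _load_data helper used above, assuming the input
# and label matrices are stored either as SciPy sparse matrices (.npz) or as
# compressed NumPy archives with a single "arr_0" entry. The storage format
# and the fallback order are assumptions, not the confirmed implementation;
# `from scipy import sparse` and `import numpy as np` are assumed in scope.
@staticmethod
def _load_data(filename: str) -> Any:
    try:
        # load_npz raises ValueError if the file is not a sparse matrix
        return sparse.load_npz(filename)
    except ValueError:
        return np.load(filename)["arr_0"]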
def _save_unique_templates(dataset: pd.DataFrame, config: Config) -> None:
    # Group sizes are computed with sort=False so they line up with the first
    # occurrence of each template kept by drop_duplicates below
    template_group = dataset.groupby("template_hash", sort=False).size()

    dataset = dataset[["retro_template", "template_code"] + config["metadata_headers"]]
    if "classification" in dataset.columns:
        dataset["classification"] = dataset["classification"].fillna("-")
    dataset = dataset.drop_duplicates(subset="template_code", keep="first")
    dataset["library_occurence"] = template_group.values
    dataset.set_index("template_code", inplace=True)
    dataset = dataset.sort_index()
    dataset.to_hdf(config.filename("unique_templates"), "table")
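# An illustrative helper (not part of the original code) showing how the
# saved table can be read back; pd.read_hdf requires the pytables backend
# that to_hdf above also relies on.
def _load_unique_templates(config: Config) -> pd.DataFrame:
    return pd.read_hdf(config.filename("unique_templates"), "table")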
def _filter_dataset(config: Config) -> pd.DataFrame:
    filename = config.filename("raw_library")
    if not os.path.exists(filename):
        raise FileNotFoundError(
            f"The file {filename} is missing - cannot proceed without the full template library."
        )

    # Skipping the last header as it is not available in the raw data
    full_data = pd.read_csv(
        filename,
        index_col=False,
        header=None,
        names=config["library_headers"][:-1],
    )

    if config["remove_unsanitizable_products"]:
        products = full_data["products"].to_numpy()
        idx = np.apply_along_axis(is_sanitizable, 0, [products])
        full_data = full_data[idx]

    full_data = full_data.drop_duplicates(subset="reaction_hash")

    # Keep only templates that occur at least `template_occurrence` times
    template_group = full_data.groupby("template_hash")
    template_group = template_group.size().sort_values(ascending=False)
    min_index = template_group[
        template_group >= config["template_occurrence"]
    ].index
    dataset = full_data[full_data["template_hash"].isin(min_index)]

    # Encode each template hash as an integer class label
    template_labels = LabelEncoder()
    dataset = dataset.assign(
        template_code=template_labels.fit_transform(dataset["template_hash"])
    )
    dataset.to_csv(
        config.filename("library"),
        mode="w",
        header=False,
        index=False,
    )
    return dataset
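# is_sanitizable is referenced above but not defined in this section. A
# plausible RDKit-based sketch is given below: a product SMILES counts as
# sanitizable if RDKit can parse and sanitize it. This is an assumption
# about the helper, not its confirmed implementation.
from rdkit import Chem

def is_sanitizable(args) -> bool:
    # args is the length-1 column slice handed over by np.apply_along_axis;
    # MolFromSmiles returns None when parsing or sanitization fails
    return Chem.MolFromSmiles(args[0]) is not None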
def _setup_callbacks(config: Config) -> List[Any]:
    early_stopping = EarlyStopping(monitor="val_loss", patience=10)
    csv_logger = CSVLogger(config.filename("_keras_training.log"), append=True)

    checkpoint_path = os.path.join(config["output_path"], "checkpoints")
    if not os.path.exists(checkpoint_path):
        os.mkdir(checkpoint_path)
    checkpoint = ModelCheckpoint(
        os.path.join(checkpoint_path, "keras_model.hdf5"),
        monitor="loss",
        save_best_only=True,
    )

    reduce_lr = ReduceLROnPlateau(
        monitor="val_loss",
        factor=0.5,
        patience=5,
        verbose=0,
        mode="auto",
        min_delta=0.000001,
        cooldown=0,
        min_lr=0,
    )
    return [early_stopping, csv_logger, checkpoint, reduce_lr]
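# Illustrative training call (not in the original code): the callback list is
# handed to Keras' model.fit. The names `model`, `train_seq`, `valid_seq` and
# the "epochs" config key are placeholder assumptions.
callbacks = _setup_callbacks(config)
model.fit(
    train_seq,
    validation_data=valid_seq,
    epochs=config["epochs"],
    callbacks=callbacks,
)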
def __init__(self, config: Config, dataset_label: str) -> None:
    super().__init__(config, dataset_label)
    filename = config.filename(dataset_label + "_inputs2")
    self.input_matrix2 = self._load_data(filename)
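# A hedged sketch of how this two-input sequence might assemble a batch in
# __getitem__: both input matrices are sliced with the same indices so they
# stay aligned with the labels. The .toarray() calls assume sparse storage;
# this is an illustration, not the confirmed implementation.
def __getitem__(self, idx: int) -> Tuple[List[np.ndarray], np.ndarray]:
    start = idx * self.batch_size
    end = (idx + 1) * self.batch_size
    input1 = self.input_matrix[start:end].toarray()
    input2 = self.input_matrix2[start:end].toarray()
    labels = self.label_matrix[start:end].toarray()
    return [input1, input2], labels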
def _save_unique_templates(dataset: pd.DataFrame, config: Config) -> None:
    dataset = dataset[["retro_template", "template_code"]]
    dataset = dataset.drop_duplicates(subset="template_code", keep="first")
    dataset.set_index("template_code", inplace=True)
    dataset = dataset.sort_index()
    dataset.to_hdf(config.filename("unique_templates"), "table")