def copy_vectorizer(dataset_name: str, dest_model_name: str) -> bool:
    """Copies the vectorizer from a given dataset to the destination model

    Parameters
    ----------
    dataset_name : str
        Qualified dataset name
    dest_model_name : str
        Qualified model name

    Returns
    -------
    bool
        True if the copy succeeded, False if the dataset has no vectorizer
        or its vectorizer is unavailable
    """
    config = load_config()
    data_home = config["model"]["data_home"]
    model_home = config["model"]["model_home"]

    src = f"{data_home}/{dataset_name}/vectorizer.pkl"
    dest = f"{model_home}/{dest_model_name}/vectorizer.pkl"

    try:
        shutil.copyfile(src, dest)
        return True
    except FileNotFoundError:
        print(f"Dataset {dataset_name} has no defined vectorizer. It will not be usable for searching.")
        return False
def run(self):
    self.status.emit(f"Loading model '{self.model_name}'")

    # Load model and vectorizer
    model, vectorizer = vdsh.utility.load_model(self.model_name)

    # Reset the model before refitting if it has already been fit
    if model.meta.is_fit:
        model_home = load_config()["model"]["model_home"]
        source = f"{model_home}/{self.model_name}"
        dest = f"{model_home}/{self.model_name}__swap"

        os.rename(source, dest)
        os.mkdir(source)

        model.meta.info["fit"] = False
        model.meta.info["fit_dataset"] = ""
        model.meta.info["fit_time"] = ""
        model.meta.dump(source)

        shutil.rmtree(dest)

        model, vectorizer = vdsh.utility.load_model(self.model_name)

    self.status.emit(
        f"Model loaded. Extracting train from '{self.dataset_name}' dataset"
    )

    # Extract train dataset
    X = extract_train(self.dataset_name)

    self.status.emit("Compiling the model...")

    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        self.initial_rate,
        decay_steps=self.decay_steps,
        decay_rate=self.decay_rate,
        staircase=True)

    if self.optimizer == "adam":
        opt = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
    else:
        raise NotImplementedError("No such optimizer")

    model.compile(optimizer=opt)

    self.status.emit(TRAINING_START_MSG)

    model.fit(X,
              epochs=self.epochs,
              batch_size=self.batch_size,
              callbacks=[self.progbar_callback])

    # Flag model fit
    model.meta.flag_fit(self.dataset_name)

    # The fitted model is saved and marked as fit; it cannot be fit again without copying
    vdsh.utility.dump_model(model)

    self.status.emit(MODEL_SAVED_MSG)
    self.finished.emit()
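# The worker above passes `self.progbar_callback` to model.fit(), but that callback is not
# defined in this excerpt. Below is a minimal sketch of what such a callback could look like,
# assuming it only needs to report per-epoch progress through a callable (e.g. a Qt signal's
# emit). The class name and constructor argument are hypothetical, not part of the original code.
from typing import Callable

import tensorflow as tf


class EpochProgressCallback(tf.keras.callbacks.Callback):
    """Reports training progress as a percentage after every epoch (illustrative sketch)."""

    def __init__(self, report: Callable[[int], None]):
        super().__init__()
        # Any callable taking an int works here, e.g. a pyqtSignal(int).emit
        self._report = report

    def on_epoch_end(self, epoch, logs=None):
        # self.params is populated by model.fit() and includes the total epoch count
        total_epochs = self.params.get("epochs", 1)
        percent = int(100 * (epoch + 1) / total_epochs)
        self._report(percent)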
def scan_datasets() -> list[DatasetMetaInfo]:
    """Returns the list of DatasetMetaInfo for dirs in data_home;
    if no meta.json is provided, the name and path are inferred"""
    data_home = usersetup.load_config()["model"]["data_home"]

    result: list[DatasetMetaInfo] = list()

    # Get the list of paths and dataset names, skip the first entry (the parent dir)
    datasets = os.walk(data_home)
    next(datasets)

    for d in datasets:
        path, _, files = d
        name = os.path.basename(path)

        if META_FILE_NAME in files and DATA_FILE_NAME in files:
            mi = DatasetMetaInfo.from_file(path)
        elif DATA_FILE_NAME in files:
            mi = DatasetMetaInfo(name, np.NAN, np.NAN, np.NAN)
        else:
            mi = DatasetMetaInfo.undefined_preset(name)

        result.append(mi)

    return result
def check_model_available(name: str) -> Optional[ModelMetaInfo]:
    """Returns the model meta info if it exists, else None"""
    model_home = load_config()["model"]["model_home"]
    dest = f"{model_home}/{name}"

    try:
        return ModelMetaInfo.from_file(dest)
    except OSError:
        return None
def check_dataset_available(name: str) -> Optional[DatasetMetaInfo]:
    """Returns the dataset meta info if it exists, else None"""
    data_home = load_config()["model"]["data_home"]
    dest = f"{data_home}/{name}"

    try:
        return DatasetMetaInfo.from_file(dest)
    except OSError:
        return None
def run(self):
    self.status.emit(f"Loading model '{self._model_name}'...")
    self.progress.emit(PROGRESS_LOADING)

    model, _ = vdsh.utility.load_model(self._model_name)

    self.status.emit("Loading data...")
    self.progress.emit(PROGRESS_LOADING_MODEL)

    data_home = load_config()["model"]["data_home"]
    try:
        with h5py.File(f"{data_home}/{self._dataset_name}/data.hdf5", "r") as hf:
            train: np.ndarray = hf["train"][:]
            train_targets: np.ndarray = hf["train_labels"][:]
            test: np.ndarray = hf["test"][:]
            test_targets: np.ndarray = hf["test_labels"][:]

        self.status.emit("Running predict...")
        self.progress.emit(PROGRESS_IO_COMPLETE)

        train_pred = model.predict(train)
        test_pred = model.predict(test)

        self.status.emit("Transforming to binary codes")
        self.progress.emit(PROGRESS_AFTER_PREDICT)

        train_codes = medhash_transform(train_pred)
        test_codes = medhash_transform(test_pred)

        current_progress = PROGRESS_AFTER_PREDICT
        end_progress = TARGET_PROGRESS
        steps = len(test_codes)
        progress_per_step = (end_progress - current_progress) / steps

        self.status.emit("Running metrics tests...")

        precision_scores = []
        for idx, tc in enumerate(test_codes):
            r = precision(test_targets[idx],
                          train_targets,
                          top_k_indices(tc, train_codes, self._k)[0],
                          self._k)
            precision_scores.append(r)

            current_progress += progress_per_step
            self.progress.emit(math.floor(current_progress))

        mean_precision = np.array(precision_scores).mean()
        self.precisionResult.emit(mean_precision)

        self.progress.emit(FINISHED_PROGRESS)
        self.finished.emit()
        self.status.emit("Finished")
    except (IOError, OSError):
        print(f"Cannot reach data.hdf5 in {self._dataset_name}")
        self.progress.emit(0)
        self.finished.emit()
        self.status.emit("Failed to read data")
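# The evaluation worker above relies on medhash_transform, top_k_indices and precision,
# which are not shown in this excerpt. The sketches below illustrate one plausible
# implementation, assuming median-threshold binarization, Hamming-distance ranking and a
# label-match notion of relevance; the actual helpers in the project may differ.
import numpy as np


def medhash_transform(latent: np.ndarray) -> np.ndarray:
    # Binarize each latent dimension against its median over the whole collection
    medians = np.median(latent, axis=0)
    return (latent > medians).astype(np.int8)


def top_k_indices(query_code: np.ndarray, codes: np.ndarray, k: int) -> tuple[np.ndarray, np.ndarray]:
    # Rank stored codes by Hamming distance to the query code; return indices and distances
    distances = np.count_nonzero(codes != query_code, axis=1)
    idx = np.argsort(distances)[:k]
    return idx, distances[idx]


def precision(query_label, labels: np.ndarray, retrieved_idx: np.ndarray, k: int) -> float:
    # Precision@k: fraction of the k retrieved documents sharing the query's label
    return float(np.sum(labels[retrieved_idx] == query_label)) / k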
def create_20ng(vocab_size: int, name: str = "20ng"):
    """Fetches the 20ng dataset in plaintext via sklearn, then uses a Tfidf
    vectorizer to encode the dataset according to the specified vocab_size

    Parameters
    ----------
    vocab_size : int
        Target vocabulary size of the dataset
    name : str
        Output name of the dataset, default '20ng'

    Returns
    -------
    None
    """
    try:
        data_home = load_config()["model"]["data_home"]
        dest = f"{data_home}/{name}"

        try:
            os.mkdir(dest)
        except FileExistsError:
            pass

        print("Fetching 20ng...")
        train = fetch_20newsgroups(subset="train", remove=("headers", "footers", "quotes"))
        test = fetch_20newsgroups(subset="test", remove=("headers", "footers", "quotes"))

        print("Vectorizing...")
        v = TfidfVectorizer(stop_words="english", max_features=vocab_size)

        # Scipy sparse matrices
        sparse_train_tfidf: scipy.sparse.csr_matrix = v.fit_transform(train.data)
        sparse_test_tfidf = v.transform(test.data)

        print("Saving dataset...")
        with h5py.File(f"{dest}/data.hdf5", "w") as hf:
            hf.create_dataset(name="train", data=sparse_train_tfidf.toarray(), compression="gzip")
            hf.create_dataset(name="train_labels", data=train.target)
            hf.create_dataset(name="test", data=sparse_test_tfidf.toarray(), compression="gzip")
            hf.create_dataset(name="test_labels", data=test.target)

        mi = DatasetMetaInfo(name,
                             vocab_size,
                             num_train=sparse_train_tfidf.shape[0],
                             num_test=sparse_test_tfidf.shape[0],
                             num_labels=1)
        mi.dump(dest)
    except (KeyError, IOError):
        print("Couldn't read config.json file")
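# Illustrative usage of create_20ng, assuming config.json already points data_home at a
# writable directory and that check_dataset_available is importable from the same place.
# The helper name and the vocabulary size below are hypothetical.
def _example_build_20ng():
    create_20ng(vocab_size=10000)
    # If the build succeeded, the dataset should now be visible to the rest of the app
    print(check_dataset_available("20ng"))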
def load_model(model_name: str) -> tuple[VDSH, DocumentVectorizer]:
    """Loads the model from model_home/model_name

    If the saved model exists, it is returned compiled and ready to use.
    If the model doesn't exist but a meta info file is present, the model is
    created ad hoc and returned instead.

    Parameters
    ----------
    model_name : str
        A qualified model name

    Returns
    -------
    tuple[VDSH, DocumentVectorizer]
        Retrieved model and the vectorizer if present, else None
    """
    config = load_config()
    model_home = config["model"]["model_home"]

    mi = ModelMetaInfo.from_file(f"{model_home}/{model_name}")

    try:
        model = tf.keras.models.load_model(f"{model_home}/{model_name}")
    except OSError:
        print("Model not found. Creating model...")
        model = create_vdsh(mi.vocab_size,
                            mi.hidden_dim,
                            mi.latent_dim,
                            mi.kl_step,
                            mi.dropout_prob,
                            mi.name)

    # Push meta info to model
    model.meta = mi

    try:
        vec = storage.load_vectorizer(f"{model_home}/{model_name}")
    except (FileNotFoundError, IOError):
        print("Vectorizer not found")
        vec = None

    print("Model loaded:")
    print(model.meta.info)

    return model, vec
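# Illustrative usage of load_model: load a previously saved model and run a forward pass on
# a dummy tfidf vector to confirm the weights are in place. The helper and model name are
# hypothetical; the zero-vector trick mirrors what dump_model does to build the weights.
def _example_load_and_predict():
    model, vectorizer = load_model("my_model")
    dummy = np.zeros(shape=(1, model.meta.vocab_size))
    print(model.predict(dummy).shape)
    if vectorizer is None:
        print("Model has no vectorizer; it will not be usable for searching")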
def remove_model(name: str) -> bool:
    """Permanently removes the model from files at model_home

    Parameters
    ----------
    name : str
        Qualified model name

    Returns
    -------
    bool
        True if deletion succeeded, False otherwise
    """
    model_home = load_config()["model"]["model_home"]
    dirpath = f"{model_home}/{name}"

    return _remove_entity(dirpath)
def scan_models() -> list[ModelMetaInfo]:
    """Returns the list of ModelMetaInfo for dirs in model_home;
    if no meta.json is provided, the name and path are inferred"""
    model_home = usersetup.load_config()["model"]["model_home"]

    result: list[ModelMetaInfo] = list()

    # Get the list of model directory paths and names
    model_paths = glob(f"{model_home}/*/")

    for path in model_paths:
        name = os.path.basename(os.path.normpath(path))

        try:
            mi = ModelMetaInfo.from_file(path)
            result.append(mi)
        except FileNotFoundError:
            mi = ModelMetaInfo(name)
            result.append(mi)

    return result
def dump_model(model: VDSH):
    """Dumps the model and the vectorizer at data_home/fit_dataset to model_home/model.meta.name

    Intended for use after fitting a model with a given dataset.
    A saved model cannot be refitted directly; it has to be copied first.

    Parameters
    ----------
    model : VDSH
        Model with meta info

    Returns
    -------
    None
    """
    config = load_config()
    model_home = config["model"]["model_home"]

    # Infer export model name and dataset name from meta info
    mi = model.meta
    model_name = mi.name
    dataset_name = mi.dataset_name

    model_dest = f"{model_home}/{model_name}"

    try:
        os.mkdir(model_dest)
    except FileExistsError:
        pass

    # Run predict once to build the model's weights before saving
    vocab_size = mi.vocab_size
    model.predict(np.zeros(shape=(1, vocab_size)))

    model.save(model_dest)
    mi.dump(model_dest)

    if dataset_name:
        datasets.copy_vectorizer(dataset_name, model_name)
def extract_train(dataset_name: str) -> Optional[np.ndarray]:
    """Extracts the train subset of a given dataset if available

    Parameters
    ----------
    dataset_name : str
        Qualified dataset name

    Returns
    -------
    Optional[np.ndarray]
        The train subset of the dataset, a numpy ndarray with tfidf vectors as rows
    """
    data_home = load_config()["model"]["data_home"]

    try:
        with h5py.File(f"{data_home}/{dataset_name}/data.hdf5", "r") as hf:
            train: np.ndarray = hf["train"][:]
            return train
    except (IOError, OSError):
        print(f"Cannot reach data.hdf5 in {dataset_name}")
        return None
def check_model_has_vectorizer(model_name: str) -> bool:
    """Returns True if the specified model has a vectorizer assigned, False otherwise"""
    model_home = load_config()["model"]["model_home"]
    return os.path.isfile(f"{model_home}/{model_name}/vectorizer.pkl")
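# Illustrative pre-flight check combining the helpers above: verify that the model and the
# dataset exist and that the model has a vectorizer before starting an evaluation run.
# The helper name and the default arguments are hypothetical, and this assumes the check_*
# functions are importable from the same place.
def _example_preflight_checks(model_name: str = "my_model", dataset_name: str = "20ng") -> bool:
    if check_model_available(model_name) is None:
        print(f"Model {model_name} not found")
        return False
    if check_dataset_available(dataset_name) is None:
        print(f"Dataset {dataset_name} not found")
        return False
    if not check_model_has_vectorizer(model_name):
        print(f"Model {model_name} has no vectorizer; searching will be unavailable")
    return True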