def _read(self, file_path):
    # Load the pre-computed bag-of-words matrix and yield one instance per row.
    mat = load_sparse(file_path)
    mat = mat.tolil()
    for ix in range(mat.shape[0]):
        instance = self.text_to_instance(vec=mat[ix].toarray().squeeze())
        if instance is not None:
            yield instance
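# NOTE (sketch): `load_sparse` is a project helper that is not defined in this
# file. Assuming the .npz inputs were written with scipy.sparse.save_npz, a
# minimal implementation could look like the following; the .tocsr() call and
# the exact return type are assumptions, not the project's confirmed API.
import scipy.sparse


def load_sparse(file_path):
    """Hypothetical sketch: load a sparse matrix saved with scipy.sparse.save_npz."""
    return scipy.sparse.load_npz(file_path).tocsr()


# `_read` converts the loaded matrix to LIL format, which supports cheap
# per-row access, before iterating over rows and densifying one row at a time.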
def __init__(self,
             vocab: Vocabulary,
             bow_embedder: TokenEmbedder,
             vae: VAE,
             apply_batchnorm_on_recon: bool = False,
             batchnorm_weight_learnable: bool = False,
             batchnorm_bias_learnable: bool = True,
             kl_weight_annealing: str = "constant",
             linear_scaling: float = 1000.0,
             sigmoid_weight_1: float = 0.25,
             sigmoid_weight_2: float = 15,
             reference_counts: str = None,
             reference_vocabulary: str = None,
             use_doc_info: bool = False,
             use_background: bool = False,
             background_data_path: str = None,
             update_background_freq: bool = False,
             track_topics: bool = True,
             track_npmi: bool = True,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super().__init__(vocab, regularizer)
    self.metrics = {'nkld': Average(), 'nll': Average(), 'perp': Average()}
    self.vocab = vocab
    self.vae = vae
    self.track_topics = track_topics
    self.track_npmi = track_npmi
    self.vocab_namespace = "persona_based"
    self._update_background_freq = update_background_freq
    vocab_size = self.vocab.get_vocab_size(self.vocab_namespace)
    self._use_doc_info = use_doc_info
    if use_doc_info:
        # Learnable weights for interpolating document-level information.
        self.interpolation = torch.nn.Parameter(torch.zeros(2, requires_grad=True))
    self._background_freq = (self.initialize_bg_from_file(file_=background_data_path)
                             if use_background else 0)
    self._ref_counts = reference_counts

    if reference_vocabulary is not None:
        # Compute the data necessary for calculating NPMI every epoch.
        logger.info("Loading reference vocabulary.")
        self._ref_vocab = read_json(cached_path(reference_vocabulary))
        self._ref_vocab_index = dict(zip(self._ref_vocab, range(len(self._ref_vocab))))
        logger.info("Loading reference count matrix.")
        self._ref_count_mat = load_sparse(cached_path(self._ref_counts))
        logger.info("Computing word interaction matrix.")
        self._ref_doc_counts = (self._ref_count_mat > 0).astype(float)
        self._ref_interaction = self._ref_doc_counts.T.dot(self._ref_doc_counts)
        self._ref_doc_sum = np.array(self._ref_doc_counts.sum(0).tolist()[0])
        logger.info("Generating npmi matrices.")
        (self._npmi_numerator,
         self._npmi_denominator) = self.generate_npmi_vals(self._ref_interaction,
                                                           self._ref_doc_sum)
        self.n_docs = self._ref_count_mat.shape[0]

    self._bag_of_words_embedder = bow_embedder

    self._kl_weight_annealing = kl_weight_annealing
    self._linear_scaling = float(linear_scaling)
    self._sigmoid_weight_1 = float(sigmoid_weight_1)
    self._sigmoid_weight_2 = float(sigmoid_weight_2)
    if kl_weight_annealing == "linear":
        self._kld_weight = min(1.0, 1 / self._linear_scaling)
    elif kl_weight_annealing == "sigmoid":
        self._kld_weight = float(1 / (1 + np.exp(-self._sigmoid_weight_1 *
                                                 (1 - self._sigmoid_weight_2))))
    elif kl_weight_annealing == "constant":
        self._kld_weight = 1.0
    else:
        raise ConfigurationError("anneal type {} not found".format(kl_weight_annealing))

    # Set up batchnorm on the reconstructed bag of words.
    self._apply_batchnorm_on_recon = apply_batchnorm_on_recon
    if apply_batchnorm_on_recon:
        self.bow_bn = create_trainable_BatchNorm1d(vocab_size,
                                                   weight_learnable=batchnorm_weight_learnable,
                                                   bias_learnable=batchnorm_bias_learnable,
                                                   eps=0.001, momentum=0.001, affine=True)

    # Maintain these states for periodically printing topics and updating the KLD weight.
    self._metric_epoch_tracker = 0
    self._kl_epoch_tracker = 0
    self._cur_epoch = 0
    self._cur_npmi = 0.0
    self.batch_num = 0

    initializer(self)
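# NOTE (sketch): `create_trainable_BatchNorm1d` is called above but defined
# elsewhere in the project. A minimal sketch consistent with the call site and
# with how `doc_bow_bn` is configured below (scale fixed at 1, shift learnable);
# the signature is copied from the call, but the body is an assumption:
import torch


def create_trainable_BatchNorm1d(num_features,
                                 weight_learnable=False,
                                 bias_learnable=True,
                                 eps=0.001,
                                 momentum=0.001,
                                 affine=True):
    """Hypothetical sketch: BatchNorm1d whose scale/shift can be frozen independently."""
    bn = torch.nn.BatchNorm1d(num_features, eps=eps, momentum=momentum, affine=affine)
    if affine:
        bn.weight.data.fill_(1.0)               # start the scale at identity
        bn.weight.requires_grad = weight_learnable
        bn.bias.data.fill_(0.0)                 # start the shift at zero
        bn.bias.requires_grad = bias_learnable
    return bn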
ret += " %" return ret if __name__ == "__main__": import sys from glob import glob from scipy.stats import describe from pprint import pprint stdout_target = lambda m: open( f"eval_output_{m}_movies_vampire_persona.txt", "w") model_dir_name_func = lambda m: lambda k, p: f"{PROJ_DIR}/archives/vampire_persona/movies/K{k}P{p}-{m}-namefree/" # train = pickle.load(open("examples/movies/entity_based_namefree/train.pk", "rb")) dev = load_sparse( f"{PROJ_DIR}/examples/movies/vampire_persona_namefree/dev.npz") test = dev K_vals = [25, 50, 100] P_vals = [25, 50, 100] metrics_mean = { "name": { "VI": np.zeros((len(P_vals), len(K_vals))), "Purity": np.zeros((len(P_vals), len(K_vals))) }, "tvtrope": { "VI": np.zeros((len(P_vals), len(K_vals))), "Purity": np.zeros((len(P_vals), len(K_vals))) } } metrics_std = {
def __init__(self,
             vocab: Vocabulary,
             bow_embedder: TokenEmbedder,
             vae: VAE,
             doc_kl_weight_annealing: str = "constant",
             doc_linear_scaling: float = 1000.0,
             doc_sigmoid_weight_1: float = 0.25,
             doc_sigmoid_weight_2: float = 15,
             doc_saturation_period: int = 2,
             doc_period: int = 10,
             entity_kl_weight_annealing: str = "constant",
             entity_linear_scaling: float = 1000.0,
             entity_sigmoid_weight_1: float = 0.25,
             entity_sigmoid_weight_2: float = 15,
             entity_saturation_period: int = 2,
             entity_period: int = 10,
             reference_counts: str = None,
             reference_vocabulary: str = None,
             background_data_path: str = None,
             update_background_freq: bool = False,
             track_topics: bool = True,
             track_npmi: bool = True,
             visual_topic: bool = True,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super().__init__(vocab, regularizer)
    self.metrics = {
        'nkld': Average(),
        'd_nkld': Average(),
        'e_nkld': Average(),
        'nll': Average()
    }
    self.vocab = vocab
    self.vae = vae
    self.track_topics = track_topics
    self.track_npmi = track_npmi
    self.visual_topic = visual_topic
    self.vocab_namespace = "entity_based"
    self._update_background_freq = update_background_freq
    self._background_freq = self.initialize_bg_from_file(file_=background_data_path)
    self._ref_counts = reference_counts
    self._npmi_updated = False
    # if dev_path is not None:
    #     self.dev_set = pickle.load(open(dev_path, "rb"))

    if reference_vocabulary is not None:
        # Compute the data necessary for calculating NPMI every epoch.
        logger.info("Loading reference vocabulary.")
        self._ref_vocab = read_json(cached_path(reference_vocabulary))
        self._ref_vocab_index = dict(zip(self._ref_vocab, range(len(self._ref_vocab))))
        logger.info("Loading reference count matrix.")
        self._ref_count_mat = load_sparse(cached_path(self._ref_counts))
        logger.info("Computing word interaction matrix.")
        self._ref_doc_counts = (self._ref_count_mat > 0).astype(float)
        self._ref_interaction = self._ref_doc_counts.T.dot(self._ref_doc_counts)
        self._ref_doc_sum = np.array(self._ref_doc_counts.sum(0).tolist()[0])
        logger.info("Generating npmi matrices.")
        (self._npmi_numerator,
         self._npmi_denominator) = self.generate_npmi_vals(self._ref_interaction,
                                                           self._ref_doc_sum)
        self.n_docs = self._ref_count_mat.shape[0]

    vocab_size = self.vocab.get_vocab_size(self.vocab_namespace)
    self._bag_of_words_embedder = bow_embedder

    self._doc_kl_weight_annealing = doc_kl_weight_annealing
    self._doc_linear_scaling = float(doc_linear_scaling)
    self._doc_sigmoid_weight_1 = float(doc_sigmoid_weight_1)
    self._doc_sigmoid_weight_2 = float(doc_sigmoid_weight_2)
    if doc_kl_weight_annealing == "linear":
        self._doc_kld_weight = min(1.0, 1 / self._doc_linear_scaling)
    elif doc_kl_weight_annealing == "sigmoid":
        self._doc_kld_weight = float(1 / (1 + np.exp(-self._doc_sigmoid_weight_1 *
                                                     (1 - self._doc_sigmoid_weight_2))))
    elif doc_kl_weight_annealing == "constant":
        self._doc_kld_weight = 1.0
    elif doc_kl_weight_annealing == "cyclic-linear":
        self._doc_period = doc_period
        self._doc_saturation_period = doc_saturation_period
        self._doc_cyclic_kl_anneal_tracker = 0
        self._doc_kld_weight = 1 / self._doc_period
    else:
        raise ConfigurationError("anneal type (doc) {} not found".format(
            doc_kl_weight_annealing))

    self._entity_kl_weight_annealing = entity_kl_weight_annealing
    self._entity_linear_scaling = float(entity_linear_scaling)
    self._entity_sigmoid_weight_1 = float(entity_sigmoid_weight_1)
    self._entity_sigmoid_weight_2 = float(entity_sigmoid_weight_2)
    if entity_kl_weight_annealing == "linear":
        self._entity_kld_weight = min(1.0, 1 / self._entity_linear_scaling)
    elif entity_kl_weight_annealing == "sigmoid":
        self._entity_kld_weight = float(1 / (1 + np.exp(-self._entity_sigmoid_weight_1 *
                                                        (1 - self._entity_sigmoid_weight_2))))
    elif entity_kl_weight_annealing == "constant":
        self._entity_kld_weight = 1.0
    elif entity_kl_weight_annealing == "cyclic-linear":
        self._entity_period = entity_period
        self._entity_saturation_period = entity_saturation_period
        self._entity_cyclic_kl_anneal_tracker = 0
        self._entity_kld_weight = 1 / self._entity_period
    else:
        raise ConfigurationError("anneal type (entity) {} not found".format(
            entity_kl_weight_annealing))

    # Set up batchnorm on the reconstructed document bag of words, with the
    # scale fixed at 1 (non-learnable).
    self.doc_bow_bn = torch.nn.BatchNorm1d(vocab_size, eps=0.001,
                                           momentum=0.001, affine=True)
    self.doc_bow_bn.weight.data.copy_(torch.ones(vocab_size, dtype=torch.float64))
    self.doc_bow_bn.weight.requires_grad = False
    # self.entity_bow_bn = torch.nn.BatchNorm1d(vocab_size, eps=0.001, momentum=0.001, affine=True)
    # self.entity_bow_bn.weight.data.copy_(torch.ones(vocab_size, dtype=torch.float64))
    # self.entity_bow_bn.weight.requires_grad = False

    # Maintain these states for periodically printing topics and updating the KLD weight.
    self._metric_epoch_tracker = 0
    self._kl_epoch_tracker = 0
    self._cur_epoch = 0
    self._cur_entity_npmi = 0.0
    self._cur_doc_npmi = 0.0
    self.batch_num = 0

    initializer(self)
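# NOTE (sketch): both constructors above call `self.generate_npmi_vals(...)`,
# which is defined elsewhere in the class. Under the standard NPMI definition,
#     npmi(i, j) = log( p(i, j) / (p(i) * p(j)) ) / -log p(i, j)
# with probabilities estimated from document co-occurrence counts, the
# precomputed numerator/denominator plausibly look like this standalone sketch
# (the real method reads n_docs from the instance; here it is a parameter, and
# `interaction` is assumed to be densified beforehand):
import numpy as np


def generate_npmi_vals(interaction, doc_sum, n_docs):
    """Hypothetical sketch: precompute NPMI numerator and denominator matrices.

    interaction : dense (V, V) array, interaction[i, j] = number of documents
                  containing both word i and word j.
    doc_sum     : (V,) array, doc_sum[i] = number of documents containing word i.
    """
    with np.errstate(divide="ignore", invalid="ignore"):
        p_joint = interaction / n_docs                    # p(i, j)
        p_marg = doc_sum / n_docs                         # p(i)
        numerator = np.log(p_joint / np.outer(p_marg, p_marg))
        denominator = -np.log(p_joint)
    # NPMI itself is numerator / denominator, typically evaluated later over
    # each topic's top words and averaged to score topic coherence.
    return numerator, denominator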