if not os.path.exists(args.save_path):
    os.makedirs(args.save_path)

if args.mode == 'eval':
    ckpt = args.load_from
else:
    ckpt = os.path.join(
        args.save_path,
        'etm_{}_K_{}_Htheta_{}_Optim_{}_Clip_{}_ThetaAct_{}_Lr_{}_Bsz_{}_RhoSize_{}_trainEmbeddings_{}'
        .format(args.dataset, args.num_topics, args.t_hidden_size, args.optimizer,
                args.clip, args.theta_act, args.lr, args.batch_size, args.rho_size,
                args.train_embeddings))

## define model and optimizer
model = ETM(args.num_topics, vocab_size, args.t_hidden_size, args.rho_size,
            args.emb_size, args.theta_act, embeddings, args.train_embeddings,
            args.enc_drop).to(device)
print('model: {}'.format(model))

if args.optimizer == 'adam':
    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wdecay)
elif args.optimizer == 'adagrad':
    optimizer = optim.Adagrad(model.parameters(), lr=args.lr, weight_decay=args.wdecay)
elif args.optimizer == 'adadelta':
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr, weight_decay=args.wdecay)
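# Hedged sketch: the fragment cuts off inside the adadelta branch; the
# weight_decay argument above and the fallback branches below mirror the
# pattern of the adam/adagrad branches and are assumptions, not confirmed
# by this file.
elif args.optimizer == 'rmsprop':
    optimizer = optim.RMSprop(model.parameters(), lr=args.lr, weight_decay=args.wdecay)
else:
    print('Defaulting to vanilla SGD')
    optimizer = optim.SGD(model.parameters(), lr=args.lr)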
if args.mode == 'eval':
    ckpt = args.load_from
else:
    ckpt = os.path.join(
        args.save_path,
        'Dec17_etm_{}_K_{}_Htheta_{}_Optim_{}_Clip_{}_ThetaAct_{}_Lr_{}_Bsz_{}_RhoSize_{}_trainEmbeddings_{}'
        .format(args.dataset, args.num_topics, args.t_hidden_size, args.optimizer,
                args.clip, args.theta_act, args.lr, args.batch_size, args.rho_size,
                args.train_embeddings))

for num_topics in [10, 15, 20, 25, 30, 35, 40, 45, 50]:
    args.num_topics = num_topics

    ## define model and optimizer
    model = ETM(args.num_topics, vocab_size, args.t_hidden_size, args.rho_size,
                args.emb_size, args.theta_act, embeddings, args.train_embeddings,
                args.enc_drop).to(device)
    # print('model: {}'.format(model))

    if args.optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wdecay)

def train(epoch):
    model.train()
    acc_loss = 0
    acc_kl_theta_loss = 0
    cnt = 0
    indices = torch.randperm(args.num_docs_train)
    indices = torch.split(indices, args.batch_size)
    for idx, ind in enumerate(indices):
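        # Hedged sketch of the per-batch step: the fragment ends at the loop
        # header. The body below follows the usual ETM training recipe
        # (zero grads, fetch a bag-of-words batch, normalize, backprop the
        # ELBO terms); data.get_batch, args.vocab_size, and args.bow_norm are
        # assumptions from that recipe, not confirmed by this fragment.
        optimizer.zero_grad()
        model.zero_grad()
        data_batch = data.get_batch(train_tokens, train_counts, ind, args.vocab_size, device)
        sums = data_batch.sum(1).unsqueeze(1)
        normalized_data_batch = data_batch / sums if args.bow_norm else data_batch
        recon_loss, kld_theta = model(data_batch, normalized_data_batch)
        total_loss = recon_loss + kld_theta
        total_loss.backward()
        if args.clip > 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()
        acc_loss += torch.sum(recon_loss).item()
        acc_kl_theta_loss += torch.sum(kld_theta).item()
        cnt += 1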
## -------------------------------------
## Finally training
## -------------------------------------
print("## -------------------------------------")
print("##\t TRAINING THE MODEL ")
print("## -------------------------------------")

# define model
etm_model = ETM(num_topics=num_topics,
                vocab_size=vocab_size,
                t_hidden_size=t_hidden_size,
                rho_size=rho_size,
                emsize=emb_size,
                theta_act=theta_act,
                embeddings=embedding,
                train_embeddings=train_embeddings,
                enc_drop=enc_drop).to(device)
print('model: {}'.format(etm_model))

optimizer = get_optimizer(name=_optimizer, model=etm_model)

# Initialising the data structures
best_epoch = 0
best_val_ppl = 1e9
all_val_ppls = []

# Let's get a sense of how bad the model is before training
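# Hedged sketch: get_optimizer is called above but not defined in this
# fragment. A minimal version for reference, assuming it dispatches on the
# optimizer name the way the argparse-based fragments do (the default lr
# and wdecay values here are placeholders, not confirmed by this file):
def get_optimizer(name, model, lr=0.005, wdecay=1.2e-6):
    optimizers = {'adam': optim.Adam, 'adagrad': optim.Adagrad, 'adadelta': optim.Adadelta}
    opt_cls = optimizers.get(name, optim.SGD)
    return opt_cls(model.parameters(), lr=lr, weight_decay=wdecay)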
    dataset = dataset.map(parse, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.padded_batch(batch_size, (None, ))
    return dataset


if __name__ == '__main__':
    vocab = [x.strip() for x in open(args.vocab_path, 'r').readlines()]
    vocab_dic = {x: i for i, x in enumerate(vocab)}

    # build model
    etm = ETM(num_topics=args.num_topics,
              rho_size=args.rho_size,
              theta_act=args.theta_act,
              train_embeddings=1,
              embeddings=None,
              topic_embeddings=None,
              enc_drop=0,
              vocab_size=len(vocab),
              t_hidden_size=args.t_hidden_size)
    input_layer = tf.keras.layers.Input(batch_shape=(None, None), dtype=tf.int32)
    model = tf.keras.Model(input_layer, etm(input_layer))
    model.load_weights(args.weight_path)
    # summary() prints the model itself; wrapping it in print() would emit a stray "None"
    model.summary()

    # loading data
    corpus = open(args.corpus, 'r').readlines()
    data = [[vocab_dic[word] for word in x.strip().split() if word in vocab_dic]
            for x in corpus]
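    # Hedged usage sketch: one way to run the restored model over the
    # token-id lists above. pad_sequences and the assumption that the model
    # consumes padded id batches directly are ours, not confirmed by this
    # fragment.
    padded = tf.keras.preprocessing.sequence.pad_sequences(data, padding='post')
    outputs = model.predict(padded)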
    os.path.join(config_dict['saving_models_path'][machine],
                 'etm_{}_K_{}_Htheta_{}_Optim_{}_Clip_{}_ThetaAct_{}_Lr_{}_Bsz_{}_RhoSize_{}_'
                 'trainEmbeddings_{}'.format(config_dict['dataset'],
                                             config_dict['model_params']['num_topics'],
                                             config_dict['model_params']['t_hidden_size'],
                                             config_dict['optimization_params']['optimizer'],
                                             config_dict['optimization_params']['clip'],
                                             config_dict['model_params']['theta_act'],
                                             config_dict['optimization_params']['lr'],
                                             config_dict['batch_size'],
                                             config_dict['model_params']['rho_size'],
                                             config_dict['model_params']['train_embeddings']))

if config_dict['optimization_params']['mode'] == 'train':
    # define model and optimizer
    etm_model = ETM(config_dict=config_dict, machine=machine, embeddings=embeddings)
    print('model: {}'.format(etm_model))
    optimizer = _set_optimizer()
    etm_model.fit(optimizer=optimizer, train_tokens=train_tokens, train_counts=train_counts,
                  test_1_tokens=test_1_tokens, test_1_counts=test_1_counts,
                  test_2_tokens=test_2_tokens, test_2_counts=test_2_counts,
                  vocab=vocab, ckpt=ckpt)

    print('Visualizing model quality after training...')
    with open(ckpt, 'rb') as f:
        etm_model = torch.load(f)
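    # Hedged sketch: a typical follow-up after reloading; assumes the
    # checkpoint pickles the whole nn.Module (as torch.load above implies)
    # and that `device` is defined earlier in this script.
    etm_model = etm_model.to(device)
    etm_model.eval()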
        tmp_emb = np.zeros(args.rho_size)
        for word in topic_words:
            tmp_emb += vectors[word]
        # average the word vectors of a topic's words to seed its embedding
        topic_embeddings[i] = tmp_emb / len(topic_words)
    topic_embeddings = np.float32(topic_embeddings)
else:
    embeddings = None
    topic_embeddings = None

# build model
etm = ETM(num_topics=args.num_topics,
          rho_size=args.rho_size,
          theta_act=args.theta_act,
          train_embeddings=args.train_embeddings,
          embeddings=embeddings,
          topic_embeddings=topic_embeddings,
          enc_drop=args.enc_drop,
          vocab_size=du.vocab_size,
          t_hidden_size=args.t_hidden_size)
input_layer = tf.keras.layers.Input(batch_shape=(None, None), dtype=tf.int32)
model = tf.keras.Model(input_layer, etm(input_layer))
# summary() prints the model itself; wrapping it in print() would emit a stray "None"
model.summary()

# loading data
data = du.load_dataset(args.data_path, args.batch_size)

# start training
if not os.path.exists(args.save_path):
    os.makedirs(args.save_path)
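# Hedged sketch of one possible training loop; it assumes the Keras-wrapped
# ETM returns its loss as the model output (so the mean of the output can be
# minimized directly) and that args.lr / args.epochs exist. None of these
# names beyond `model` and `data` are confirmed by this fragment.
opt = tf.keras.optimizers.Adam(learning_rate=args.lr)
for epoch in range(args.epochs):
    for batch in data:
        with tf.GradientTape() as tape:
            loss = tf.reduce_mean(model(batch, training=True))
        grads = tape.gradient(loss, model.trainable_variables)
        opt.apply_gradients(zip(grads, model.trainable_variables))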
    os.makedirs(args.save_path)

if args.mode == 'eval':
    ckpt = args.load_from
else:
    ckpt = Path.cwd().joinpath(
        args.save_path,
        'etm_{}_K_{}_Htheta_{}_Optim_{}_Clip_{}_ThetaAct_{}_Lr_{}_Bsz_{}_RhoSize_{}_trainEmbeddings_{}'.format(
            args.dataset, args.num_topics, args.t_hidden_size, args.optimizer, args.clip,
            args.theta_act, args.lr, args.batch_size, args.rho_size, args.train_embeddings))

## define model and optimizer
model = ETM(args.num_topics, vocab_size, args.t_hidden_size, args.rho_size,
            args.emb_size, args.theta_act, embeddings, args.train_embeddings,
            args.enc_drop).to(device)
print('model: {}'.format(model))

optimizer = model.get_optimizer(args)
tracemalloc.start()

if args.mode == 'train':
    ## train model on data
    best_epoch = 0
    best_val_ppl = 1e9
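    # Hedged sketch: since tracemalloc is started above, a natural companion
    # is reporting allocations once training finishes. get_traced_memory()
    # returns (current, peak) in bytes; the placement and formatting here are
    # our assumptions.
    current, peak = tracemalloc.get_traced_memory()
    print('current memory: {:.1f} MB, peak: {:.1f} MB'.format(current / 1e6, peak / 1e6))
    tracemalloc.stop()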
ckpt = \
    os.path.join(config_dict['saving_models_path'][machine],
                 'etm_{}_K_{}_Htheta_{}_Optim_{}_Clip_{}_ThetaAct_{}_Lr_{}_Bsz_{}_RhoSize_{}_'
                 'trainEmbeddings_{}'.format(config_dict['dataset'],
                                             config_dict['model_params']['num_topics'],
                                             config_dict['model_params']['t_hidden_size'],
                                             config_dict['optimization_params']['optimizer'],
                                             config_dict['optimization_params']['clip'],
                                             config_dict['model_params']['theta_act'],
                                             config_dict['optimization_params']['lr'],
                                             config_dict['batch_size'],
                                             config_dict['model_params']['rho_size'],
                                             config_dict['model_params']['train_embeddings']))

# define model and optimizer
etm_model = ETM(config_dict=config_dict, machine=machine, embeddings=embeddings)
print('model: {}'.format(etm_model))
optimizer = _set_optimizer()

if config_dict['optimization_params']['mode'] == 'train':
    etm_model.fit(optimizer=optimizer, train_tokens=train_tokens, train_counts=train_counts,
                  test_1_tokens=test_1_tokens, test_1_counts=test_1_counts,
                  test_2_tokens=test_2_tokens, test_2_counts=test_2_counts,
                  vocab=vocab, ckpt=ckpt)
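# Hedged sketch: _set_optimizer is called above but not defined in this
# fragment. A minimal version for reference, assuming it reads the optimizer
# name and lr from config_dict the same way the checkpoint name does, and
# that etm_model is a torch nn.Module (as torch.load elsewhere implies):
def _set_optimizer():
    params = config_dict['optimization_params']
    if params['optimizer'] == 'adam':
        return optim.Adam(etm_model.parameters(), lr=params['lr'],
                          weight_decay=params.get('wdecay', 0.0))
    raise ValueError('unsupported optimizer: {}'.format(params['optimizer']))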