from lda2vec import utils, b_model
import numpy as np
import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

# Path to preprocessed data
data_path = "data/clean_data"
# Whether or not to load saved embeddings file
load_embeds = True

# Load data from files
(idx_to_word, word_to_idx, freqs, pivot_ids, target_ids, doc_ids,
 embed_matrix, bias_idxes) = utils.load_preprocessed_data(
    data_path, load_embed_matrix=load_embeds, load_bias_idxes=True)

bias_words = ['privacy', 'anonymity', 'confidentiality', 'disclosure']
bias_idxes = [word_to_idx[word] for word in bias_words]

# Number of unique documents
num_docs = len(np.unique(doc_ids))
# Number of unique words in vocabulary (int)
vocab_size = embed_matrix.shape[0]
# Embed layer dimension size
# If not loading embeds, change 128 to whatever size you want.
embed_size = embed_matrix.shape[1] if load_embeds else 128
# Number of topics to cluster into
num_topics = 20
# Number of topics to bias
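
# Sketch of an optional sanity check (not in the original script): the lookup
# above raises a KeyError if a bias word was pruned during preprocessing, so
# checking vocabulary membership explicitly gives a clearer error message.
missing = [word for word in bias_words if word not in word_to_idx]
assert not missing, "Bias words missing from vocabulary: {}".format(missing)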
model_dir = "tests/webhose_50k/model/v3"
MODEL_RESTORE = False
if os.path.exists("{}/model.ckpt.meta".format(model_dir)):
    MODEL_RESTORE = True
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

# Whether or not to load saved embeddings file
load_embeds = True

# Load data from files
(idx_to_word, word_to_idx, freqs, pivot_ids, target_ids, doc_ids,
 embed_matrix) = utils.load_preprocessed_data(
    clean_data_dir, load_embed_matrix=load_embeds)

# Number of unique documents
num_docs = doc_ids.max() + 1
# Number of unique words in vocabulary (int)
vocab_size = len(freqs)
# Embed layer dimension size
# If not loading embeds, change 128 to whatever size you want.
embed_size = embed_matrix.shape[1] if load_embeds else 128
# Number of topics to cluster into
num_topics = 15
# Amount of iterations over entire dataset
num_epochs = 5
# Batch size - Increase/decrease depending on memory usage
batch_size = 8192
# Epoch that we want to "switch on" LDA loss
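
# A minimal sketch (assumes pivot_ids and batch_size from the script above):
# the number of minibatches in one pass over the skip-gram pairs, which is
# handy when deciding at which epoch the LDA loss should switch on.
batches_per_epoch = (len(pivot_ids) + batch_size - 1) // batch_size
print("{} batches per epoch".format(batches_per_epoch))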
from lda2vec import utils, s_model
import numpy as np
import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

# Path to preprocessed data
data_path = "data/clean_data"
# Whether or not to load saved embeddings file
load_embeds = True

# Load data from files
(idx_to_word, word_to_idx, freqs, pivot_ids, target_ids, doc_ids,
 embed_matrix) = utils.load_preprocessed_data(
    data_path, load_embed_matrix=load_embeds)

seed_words = ['privacy', 'anonymity', 'confidentiality', 'disclosure']
base_seed_idxes = [word_to_idx[word] for word in seed_words]
# Groups of seed word indexes, one group per seeded topic
seed_idxes = [[base_seed_idxes[0], base_seed_idxes[1]],
              [base_seed_idxes[0], base_seed_idxes[2]],
              [base_seed_idxes[0], base_seed_idxes[3]],
              [base_seed_idxes[0]],
              [base_seed_idxes[2]]]

# Number of unique documents
num_docs = len(np.unique(doc_ids))
# Number of unique words in vocabulary (int)
vocab_size = embed_matrix.shape[0]
# Embed layer dimension size
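
# A quick verification sketch (assumes seed_idxes and idx_to_word from above):
# print the seed words behind each group so the topic seeding can be checked
# by eye before training.
for topic_num, idx_group in enumerate(seed_idxes):
    words = [idx_to_word[idx] for idx in idx_group]
    print("Topic {} seeded with: {}".format(topic_num, ", ".join(words)))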
# Run the preprocessing on your dataframe
P.preprocess()

# Load embeddings from file if we choose to do so
if load_embeds:
    # Load embedding matrix from file path - change path to where you saved them
    embedding_matrix = P.load_glove(EMBEDDING_DIR + "/" + "glove.6B.100d.txt")
else:
    embedding_matrix = None

# Save data to data_dir
P.save_data(clean_data_dir, embedding_matrix=embedding_matrix)

# Load data from files
(idx_to_word, word_to_idx, freqs, pivot_ids, target_ids, doc_ids,
 embed_matrix) = utils.load_preprocessed_data(
    clean_data_dir, load_embed_matrix=load_embeds)

# Number of unique documents
num_docs = doc_ids.max() + 1
# Number of unique words in vocabulary (int)
vocab_size = len(freqs)
# Embed layer dimension size
# If not loading embeds, change 128 to whatever size you want.
embed_size = embed_matrix.shape[1] if load_embeds else 128
# Number of topics to cluster into
num_topics = 20
# Amount of iterations over entire dataset
num_epochs = 200
# Batch size - Increase/decrease depending on memory usage
batch_size = 4096
# Epoch that we want to "switch on" LDA loss
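
# A small inspection sketch (assumes the variables loaded above): print the
# shapes being fed to the model so vocabulary size, embedding dimensions, and
# corpus size can be eyeballed before training.
print("vocab_size={}, embed_matrix shape={}, num_docs={}, pairs={}".format(
    vocab_size, getattr(embed_matrix, "shape", None), num_docs, len(pivot_ids)))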
from lda2vec import utils, model

# Path to preprocessed data and the run name it was saved under
data_path = "data"
run_name = "my_run"
num_topics = 20
num_epochs = 20

# Load data from files
(idx_to_word, word_to_idx, freqs, embed_matrix, pivot_ids, target_ids,
 doc_ids, num_docs, vocab_size, embed_size) = utils.load_preprocessed_data(
    data_path, run_name)

# Initialize the model
m = model(num_docs,
          vocab_size,
          num_topics=num_topics,
          embedding_size=embed_size,
          load_embeds=True,
          pretrained_embeddings=embed_matrix,
          freqs=freqs)

# Train the model
m.train(pivot_ids,
        target_ids,
        doc_ids,
        len(pivot_ids),
        num_epochs,
        idx_to_word=idx_to_word,
        switch_loss_epoch=5)

# Visualize topics with pyLDAvis
utils.generate_ldavis_data(data_path, run_name, m, idx_to_word, freqs, vocab_size)