def __init__(self, V=10000):
    self.vocab = None
    self.zipped_filename = "data/sst/trainDevTestTrees_PTB.zip"
    self.target_names = None  # set by self.process()

    # Download datasets
    if not os.path.isfile(self.zipped_filename):
        data_dir = os.path.dirname(self.zipped_filename)
        print("Downloading treebank to {:s}".format(data_dir))
        self.zipped_filename = download_sst(data_dir)
    print("Loading SST from {:s}".format(self.zipped_filename))

    self.train_trees = self.get_trees("train")
    print("Training set: {:,} trees".format(len(self.train_trees)))
    self.dev_trees = self.get_trees("dev")
    print("Development set: {:,} trees".format(len(self.dev_trees)))
    self.test_trees = self.get_trees("test")
    print("Test set: {:,} trees".format(len(self.test_trees)))

    # Verify that the number of sentences matches the published split sizes.
    assert len(self.train_trees) == 8544
    assert len(self.dev_trees) == 1101
    assert len(self.test_trees) == 2210

    # Build vocabulary over the training set
    print("Building vocabulary - ", end="")
    train_words = utils.flatten(
        self.canonicalize(t.leaves()) for t in self.train_trees)
    self.vocab = vocabulary.Vocabulary(train_words, size=V)
    print("{:,} words".format(self.vocab.size))
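# A minimal usage sketch for the loader above. The enclosing class name is
# not shown in this snippet, so "SSTDataset" below is a hypothetical
# stand-in; the call pattern otherwise follows directly from the code.
if __name__ == "__main__":
    sst = SSTDataset(V=10000)              # downloads and loads all three splits
    tree = sst.train_trees[0]
    words = sst.canonicalize(tree.leaves())
    ids = sst.vocab.words_to_ids(words)    # out-of-vocabulary words map to <unk>
    print(words[:8], ids[:8])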
def __init__(self, ndim=50):
    assert ndim in self._AVAILABLE_DIMS
    self.vocab = None
    self.W = None
    self.zipped_filename = "data/glove/glove.6B.zip"

    # Download datasets
    if not os.path.isfile(self.zipped_filename):
        data_dir = os.path.dirname(self.zipped_filename)
        print("Downloading GloVe vectors to {:s}".format(data_dir))
        self.zipped_filename = download_glove(data_dir)
    print("Loading vectors from {:s}".format(self.zipped_filename))

    words, W = parse_glove_file(self.zipped_filename, ndim)

    # Set nonzero values for the three special tokens: the mean of all word
    # vectors, and the means of the first and second halves of the matrix
    # (so the special tokens get distinct, plausible vectors).
    half = W.shape[0] // 2
    mean_vec = np.mean(W[3:], axis=0)
    random_1 = np.mean(W[3:half], axis=0)
    random_2 = np.mean(W[half:], axis=0)
    W[0] = mean_vec
    W[1] = random_1
    W[2] = random_2

    self.W = W
    self.vocab = vocabulary.Vocabulary(words[3:])
    assert self.vocab.size == self.W.shape[0]
def __init__(self, ndim=50, quiet=False):
    assert ndim in self._AVAILABLE_DIMS
    self.vocab = None
    self.W = None
    self.zipped_filename = "data/glove/glove.6B.zip"

    # Download datasets
    if not os.path.isfile(self.zipped_filename):
        data_dir = os.path.dirname(self.zipped_filename)
        if not quiet:
            print("Downloading GloVe vectors to {:s}".format(data_dir))
        self.zipped_filename = download_glove(data_dir)
    if not quiet:
        print("Loading vectors from {:s}".format(self.zipped_filename))

    words, W = parse_glove_file(self.zipped_filename, ndim, quiet)

    # Set a nonzero value (the mean of all word vectors) for the three
    # special tokens.
    mean_vec = np.mean(W[3:], axis=0)
    for i in range(3):
        W[i] = mean_vec

    self.W = W
    self.vocab = vocabulary.Vocabulary(words[3:])
    assert self.vocab.size == self.W.shape[0]
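# A quick sketch of how these GloVe wrappers are used. The enclosing class
# name is not shown in the snippets above, so "GloveVectors" here is
# hypothetical. The row/id correspondence follows from the final assert:
# row i of W is the embedding for word id i in self.vocab.
glove = GloveVectors(ndim=50, quiet=True)    # hypothetical class name
word_id = glove.vocab.words_to_ids(["king"])[0]
vec = glove.W[word_id]                       # the 50-dimensional embedding
print(vec.shape)                             # -> (50,)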
def setUp(self):
    sequence = ["a", "b", "c", "d"]
    self.vocab = vocabulary.Vocabulary(sequence)
    ids = self.vocab.words_to_ids(sequence)
    self.train_ids = np.array(ids * 50000, dtype=int)
    self.test_ids = np.array(ids * 100, dtype=int)

    model_params = dict(V=self.vocab.size, H=10,
                        softmax_ns=2, num_layers=1)
    self.lm = rnnlm.RNNLM(**model_params)
    self.lm.BuildCoreGraph()
    self.lm.BuildTrainGraph()
    self.lm.BuildSamplerGraph()
    # For the toy model, ignore sampled softmax.
    self.lm.train_loss_ = self.lm.loss_
# In[39]:

reload(ds)
post, mbti_type, user = ds.splitPosts(df)

# Split data: 80% train, 20% test
post_train, post_test, label_train, label_test = train_test_split(
    post, mbti_type, test_size=0.2, random_state=88)
print("MBTI posts: ", post_train[:5])
print('')
print("MBTI labels: ", label_train[:5])

# Build a vocabulary over the canonicalized training corpus
# (V defaults to the full vocabulary of the text). Posts are strings,
# so tokenize each post before canonicalizing word by word.
vocab_mbti = vocabulary.Vocabulary(
    utils.canonicalize_word(w) for p in post_train for w in p.split())
print("Vocab size: ", vocab_mbti.size)

# Tokenize and canonicalize train and test sets, mapping words to ids.
# Words must be canonicalized the same way the vocabulary was built,
# or nearly everything will map to <unk>.
x_train = [vocab_mbti.words_to_ids(
    [utils.canonicalize_word(w) for w in p.split()]) for p in post_train]
x_test = [vocab_mbti.words_to_ids(
    [utils.canonicalize_word(w) for w in p.split()]) for p in post_test]

reload(ds)
y_train, y_test = ds.one_hot_label(mbti_type, label_train, label_test)
y_train_id, y_test_id, label_map = ds.label_to_id(mbti_type, label_train, label_test)
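# A small sanity check on the preprocessing above. This sketch assumes the
# Vocabulary class exposes ids_to_words and an UNK_ID attribute, which is
# consistent with how it is used elsewhere here but is not shown in these
# snippets.
print("First training example as ids:", x_train[0][:10])
print("Round-tripped back to words:  ", vocab_mbti.ids_to_words(x_train[0][:10]))
total_tokens = sum(len(seq) for seq in x_train)
unk_tokens = sum(w_id == vocab_mbti.UNK_ID for seq in x_train for w_id in seq)
print("Fraction of <unk> tokens in train: {:.4f}".format(
    unk_tokens / max(1, total_tokens)))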
def full_vocab_canon(x):
    # Build a vocabulary over the canonicalized training corpus
    # (V defaults to the full vocabulary of the text).
    vocab_mbti = vocabulary.Vocabulary(utils.canonicalize_word(w) for w in x)
    print("Full vocab built, size: ", vocab_mbti.size)
    return vocab_mbti.size, vocab_mbti
def generate_vocab(self, max_size=None):
    self._generate_word_list()
    self.vocab = vocabulary.Vocabulary(self.word_list, size=max_size)
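# Example call pattern for generate_vocab (a sketch; the enclosing class is
# not shown, so "corpus" is a hypothetical instance whose word_list is
# populated by _generate_word_list). Passing max_size caps the vocabulary at
# the most frequent words; max_size=None keeps every word, as in
# full_vocab_canon above.
corpus.generate_vocab(max_size=20000)
print("Vocab size:", corpus.vocab.size)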