def createEmbedMatrix(embeddingFileName="../../../glove.6B.50d.txt"):
    vocab = utils.glove2dict(embeddingFileName)
    emb_matrix = np.zeros((len(vocab) + 2, vocab["the"].shape[0]))
    word2Index = {}
    index2Word = {}
    index = 0
    for word in vocab:
        emb_matrix[index] = vocab[word]
        word2Index[word] = index
        index2Word[index] = word
        index += 1
    # Add UNK
    word2Index["<UNK>"] = len(word2Index)
    index2Word[word2Index["<UNK>"]] = "<UNK>"
    unkValue = []
    # Pick each component of the UNK vector from a randomly chosen vocabulary word.
    for i in range(0, vocab["the"].size):
        randomKey = random.choice(list(vocab.keys()))
        unkValue.append(vocab[randomKey][i])
    emb_matrix[word2Index["<UNK>"]] = np.asarray(unkValue)
    # Add PAD
    word2Index["<PAD>"] = len(word2Index)
    index2Word[word2Index["<PAD>"]] = "<PAD>"
    emb_matrix[word2Index["<PAD>"]] = np.zeros_like(vocab["the"])
    return emb_matrix, word2Index, index2Word
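# A minimal usage sketch (an assumption, not from the original code): map a
# tokenized sentence to embedding rows, falling back to <UNK> for
# out-of-vocabulary tokens and padding to a fixed length with <PAD>. It assumes
# the default GloVe path passed to `createEmbedMatrix` exists.
emb_matrix, word2Index, index2Word = createEmbedMatrix()

def sentence_to_ids(tokens, word2Index, max_len=20):
    ids = [word2Index.get(t.lower(), word2Index["<UNK>"]) for t in tokens[:max_len]]
    ids += [word2Index["<PAD>"]] * (max_len - len(ids))
    return ids

ids = sentence_to_ids("The quick brown fox".split(), word2Index)
sentence_vectors = emb_matrix[ids]  # (max_len, embed_dim)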
def analysis3(
        fileName,
        gloveFileName="../../../../../../GitHub/cs224u/vsmdata/glove/glove.6B.50d.txt"):
    included = 0
    count = []
    vocab = utils.glove2dict(gloveFileName)  # dict[word] -> numpy array(embed_dim,)
    with open(fileName) as f:
        lineNum = 0
        for line in f:
            if lineNum % 4 != 0:
                lineNum += 1
                continue
            # Check the raw find() results before offsetting, otherwise the
            # "not found" value -1 can never be detected.
            firstStartIdx = line.find('<e1>')
            firstEndIdx = line.find('</e1>')
            secondStartIdx = line.find('<e2>')
            secondEndIdx = line.find('</e2>')
            if -1 in (firstStartIdx, firstEndIdx, secondStartIdx, secondEndIdx):
                print("ERROR")
            firstEntity = line[firstStartIdx + 4:firstEndIdx]
            secondEntity = line[secondStartIdx + 4:secondEndIdx]
            #print(firstEntity, secondEntity)
            if firstEntity not in vocab:
                count.append(firstEntity)
            else:
                included += 1
            if secondEntity not in vocab:
                count.append(secondEntity)
            else:
                included += 1
            lineNum += 1
    print(included)
    print(len(count))
def derive_pca_on_glove(paragraph, num_dimensions_to_accumulate):
    """Helper function to get the PCA decomposition of the GloVe vectors in `paragraph`."""
    global glove
    if glove is None:
        glove = utils.glove2dict(
            os.path.join(path_to_glove, 'glove.6B.%dd.txt' % GLOVE_SIZE))
    approx_num_words = int(1.5 * len(paragraph.split()))
    word_vector = np.zeros((approx_num_words, GLOVE_SIZE))
    row = 0
    words = []
    for sent in sent_tokenize(paragraph):
        for word in word_tokenize(sent):
            word = word.lower()
            if word in glove and word not in words:
                word_vector[row] = glove[word]
                words.append(word)
                row += 1
    word_vector = word_vector[:row, :]
    pca = PCA(n_components=num_dimensions_to_accumulate)
    pca.fit(word_vector)
    return pca
def dimensional_decomposition(paragraph, unused_parse, num_dimensions_to_accumulate=5):
    """Gets the extent to which the word embeddings used in a paragraph can be
    reduced to a low-dimensional space. The low-dimensional space is derived by PCA.

    Central insight: if the words all lie in a low-dimensional space, then they
    are likely expressing variations on the same theme rather than a nuanced
    argument. The confounding variable of paragraph length actually seems good
    here.

    Empirical results: this works, and it captures something about content
    rather than just matching on highly functional words and phrases. On the
    toy dataset, the first dimension has a correlation of -0.51 with the score
    -- this is extremely encouraging! The second through fifth cumulative
    dimensions are even stronger, all at about -0.56. Best to use a smaller
    number of dimensions (like 50), else we risk a highly overdetermined
    system of equations when heading into PCA.

    Returns:
        dictionary with keys cum_pca_var_expl_# (where # is in
        range [0, num_dimensions_to_accumulate)); each value gives the amount
        of variance explained when that number of dimensions is considered.
    """
    global glove
    if glove is None:
        glove = utils.glove2dict(
            os.path.join(path_to_glove, 'glove.6B.%dd.txt' % GLOVE_SIZE))
    approx_num_words = int(1.5 * len(paragraph.split()))
    word_vector = np.zeros((approx_num_words, GLOVE_SIZE))
    row = 0
    words = []
    for sent in sent_tokenize(paragraph):
        for word in word_tokenize(sent):
            word = word.lower()
            if word in glove and word not in words:
                word_vector[row] = glove[word]
                words.append(word)
                row += 1
    word_vector = word_vector[:row, :]
    pca = PCA()
    pca.fit(word_vector)
    features = Counter()
    for i in range(num_dimensions_to_accumulate):
        features["cum_pca_var_expl_%d" % i] = np.sum(
            pca.explained_variance_ratio_[:i + 1])
    return features
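# A self-contained sketch (toy data, not the original pipeline) of the
# cumulative explained-variance features described above: fit PCA on a small
# matrix standing in for the paragraph's word vectors and accumulate
# `explained_variance_ratio_` exactly as `dimensional_decomposition` does.
import numpy as np
from sklearn.decomposition import PCA

toy_vectors = np.random.randn(30, 50)  # 30 "words", 50-dimensional embeddings
pca_demo = PCA()
pca_demo.fit(toy_vectors)
for i in range(5):
    print("cum_pca_var_expl_%d = %.3f"
          % (i, np.sum(pca_demo.explained_variance_ratio_[:i + 1])))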
def load_data_embedding(glove6B=False):
    X_train, y_train = build_dataset('train', max_sen_length)
    X_dev, y_dev = build_dataset('dev', max_sen_length)
    embedding_weights = np.zeros((vocab_size, vocab_dim))
    if vocab_dim == 50:
        GLOVE = utils.glove2dict(os.path.join(glove_home, 'glove.6B.50d.txt'))
    elif vocab_dim == 100:
        GLOVE = utils.glove2dict(os.path.join(glove_home, 'glove.6B.100d.txt'))
    elif vocab_dim == 200:
        GLOVE = utils.glove2dict(os.path.join(glove_home, 'glove.6B.200d.txt'))
    elif vocab_dim == 300:
        if glove6B:
            GLOVE = utils.glove2dict(os.path.join(glove_home, 'glove.6B.300d.txt'))
        else:
            # Use a separate local name: assigning to `glove_home` inside this
            # function would make it local everywhere and raise
            # UnboundLocalError in the branches above.
            glove_840b_home = 'glove_dir/glove.840B'
            GLOVE = utils.glove2dict(os.path.join(glove_840b_home, 'glove.840B.300d.txt'))
    for word, index in wordMap.items():
        if word in GLOVE:
            embedding_weights[index, :] = GLOVE[word]
        else:
            embedding_weights[index, :] = utils.randvec(vocab_dim)
    return X_train, y_train, X_dev, y_dev, embedding_weights
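# A minimal sketch (an assumption, not from the original code) of how the
# returned `embedding_weights` matrix could seed a Keras embedding layer; the
# matrix shape is taken as (vocab_size, vocab_dim).
import tensorflow as tf

def build_embedding_layer(embedding_weights, trainable=True):
    vocab_size, vocab_dim = embedding_weights.shape
    return tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=vocab_dim,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_weights),
        trainable=trainable)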
def loadData():
    vocab = utils.glove2dict(
        "../../../glove.6B.50d.txt")  # dict[word] -> numpy array(embed_dim,)
    # embedding_matrix: (embed_dim, vocab_size)
    embedding_matrix = np.zeros((len(vocab["and"]), len(vocab)))
    word2Index = {}
    index2Word = {}
    counter = 0
    for word, vec in vocab.items():
        embedding_matrix[:, counter] = vec
        word2Index[word] = counter
        index2Word[counter] = word
        counter += 1
    return embedding_matrix, word2Index, index2Word
def loadData(
        gloveFileName="../../../../../../GitHub/cs224u/vsmdata/glove/glove.6B.50d.txt"):
    vocab = utils.glove2dict(gloveFileName)  # dict[word] -> numpy array(embed_dim,)
    # embedding_matrix: (embed_dim, vocab_size)
    embedding_matrix = np.zeros((len(vocab["and"]), len(vocab)))
    word2Index = {}
    index2Word = {}
    counter = 0
    for word, vec in vocab.items():
        embedding_matrix[:, counter] = vec
        word2Index[word] = counter
        index2Word[counter] = word
        counter += 1
    return embedding_matrix, word2Index, index2Word
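# A minimal usage sketch (an assumption, not from the original code): since
# the columns of `embedding_matrix` are word vectors, a word's vector is the
# column at its position in `word2Index`.
embedding_matrix, word2Index, index2Word = loadData()
vec = embedding_matrix[:, word2Index["and"]]  # (embed_dim,)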
def generate_glove_embedding(dim=300):
    if dim not in GLOVE_EMBEDDING_FILE:
        print("GloVe file of dim {} is not found.".format(dim))
        return None
    glove_file_path = os.path.join(EMBEDDING_FOLDER, GLOVE_EMBEDDING_FILE[dim])
    glove_lookup = glove2dict(glove_file_path, dim)
    vocab = ['$UNK']
    embedding = [list(np.random.uniform(low=-1.0, high=1.0, size=dim))]
    for key, value in glove_lookup.items():
        vocab.append(key)
        embedding.append(list(value))
    embedding = np.array(embedding, dtype=float)  # `np.float` is removed in recent NumPy
    print("Vocabulary size: {}, embedding shape: {}".format(len(vocab), embedding.shape))
    return [vocab, embedding]
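# A minimal usage sketch (an assumption, not from the original code): build a
# word -> row lookup over the returned vocabulary, with row 0 ($UNK) as the
# fallback for out-of-vocabulary words.
vocab, embedding = generate_glove_embedding(dim=300)
word_to_row = {w: i for i, w in enumerate(vocab)}
vec = embedding[word_to_row.get("university", word_to_row['$UNK'])]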
def build_glove_featurized_dataset(df, dim=300, np_func=np.sum):
    if dim not in GLOVE_EMBEDDING_FILE:
        print("GloVe file of dim {} is not found.".format(dim))
        return None
    glove_file_path = os.path.join(EMBEDDING_FOLDER, GLOVE_EMBEDDING_FILE[dim])
    glove_lookup = glove2dict(glove_file_path, dim)
    feature_matrix = []
    labels = []
    for index, row in df.iterrows():
        feature_matrix.append(glove_featurizer(row['tweet'], glove_lookup, np_func))
        encoded_label = 1 if row['subtask_a'] == 'OFF' else 0
        labels.append(encoded_label)
    return {'X': np.array(feature_matrix), 'y': labels}
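# `glove_featurizer` is not shown above. A minimal sketch of what such a
# featurizer might look like (an assumption, not the original implementation):
# look up each whitespace token in the GloVe dict, combine the hits with
# `np_func`, and fall back to a zero vector when nothing matches.
def glove_featurizer(text, glove_lookup, np_func=np.sum):
    dim = len(next(iter(glove_lookup.values())))
    vecs = [glove_lookup[w] for w in text.lower().split() if w in glove_lookup]
    if not vecs:
        return np.zeros(dim)
    return np_func(np.array(vecs), axis=0)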
def create_glove_embedding(vocab, glove_base_filename='glove.6B.50d.txt'):
    # Use `utils.glove2dict` to read in the GloVe file:
    ##### YOUR CODE HERE
    # (Named `glove_lookup` rather than `glove2dict` to avoid shadowing the
    # utility function's name.)
    glove_lookup = utils.glove2dict(
        os.path.join(GLOVE_HOME, glove_base_filename))

    # Use `utils.create_pretrained_embedding` to create the embedding.
    # This function will, by default, ensure that START_TOKEN,
    # END_TOKEN, and UNK_TOKEN are included in the embedding.
    ##### YOUR CODE HERE
    embedding, vocab = utils.create_pretrained_embedding(glove_lookup, vocab)

    # Be sure to return the embedding you create as well as the
    # vocabulary returned by `utils.create_pretrained_embedding`,
    # which is likely to have been modified from the input `vocab`.
    ##### YOUR CODE HERE
    return embedding, vocab
def __init__(self, opt, train):
    self.opt = opt
    self.train = train
    if self.opt.use_pretrained_embeddings:
        self.glv_dict = glove2dict(self.opt.glove_dir)
    self.preprocess_sentences()
    self.get_words()
    if self.opt.use_pretrained_embeddings:
        self.create_word_dict_glove()
        self.create_word_count_glove()
    else:
        self.create_word_dict()
        self.create_word_count()
    self.len = len(self.sentences)
    del self.sentences_all
def __init__(self, opt, train):
    self.opt = opt
    self.train = train
    self.file = open(
        self.opt.data_dir + self.opt.input_file, "r",
        encoding='utf-8', errors='ignore').read()
    if self.opt.use_pretrained_embeddings:
        self.glv_dict = glove2dict(self.opt.glove_dir)
    self.preprocess_sentences()
    self.get_words()
    if self.opt.use_pretrained_embeddings:
        self.create_word_dict_glove()
        self.create_word_count_glove()
    else:
        self.create_word_dict()
        self.create_word_count()
    self.len = len(self.sentences)
import utils
import random
import os
import tensorflow as tf
from tf_rnn_classifier import TfRNNClassifier
from collections import Counter

glove_dimensionality = 100
home = '..'
base_glove = 'glove.6B'
glove_home = os.path.join(home, base_glove)
glove_text_file = '.' + str(glove_dimensionality) + 'd.txt'
glove_lookup = utils.glove2dict(glove_home + glove_text_file)


def get_vocab(X, n_words=None):
    """Get the vocabulary for an RNN example matrix `X`,
    adding $UNK$ if it isn't already present.

    Parameters
    ----------
    X : list of lists of str
    n_words : int or None
        If this is `int > 0`, keep only the top `n_words` by frequency.

    Returns
    -------
    list of str
def fit_maxent_classifier(X, y):
    mod = LogisticRegression(
        fit_intercept=True, solver='liblinear', multi_class='auto')
    mod.fit(X, y)
    return mod


# ### GloVe inputs
#
# To illustrate this process, we'll use the general-purpose GloVe
# representations released by the GloVe team, at 300d:

# In[6]:

glove_lookup = utils.glove2dict(
    os.path.join(GLOVE_HOME, 'glove.6B.300d.txt'))


# In[7]:

def vsm_leaves_phi(tree, lookup, np_func=np.sum):
    """Represent `tree` as a combination of the vectors of its words.

    Parameters
    ----------
    tree : nltk.Tree
    lookup : dict
        From words to vectors.
    np_func : function (default: np.sum)
        A numpy matrix operation that can be applied columnwise, like
        `np.mean`, `np.sum`, or `np.prod`. The requirement is that
# Basically everything from the Jupyter notebook, but in a single runnable file.
import numpy as np
import os
import sst
import tf_trnn
import tf_lifted_trnn
import time
import utils

vsmdata_home = 'glove.6B'
glove_home = vsmdata_home

glove_lookup_100 = utils.glove2dict(
    os.path.join(glove_home, 'glove.6B.100d.txt'))
'''
glove_lookup_200 = utils.glove2dict(
    os.path.join(glove_home, 'glove.6B.200d.txt'))
glove_lookup_300 = utils.glove2dict(
    os.path.join(glove_home, 'glove.6B.300d.txt'))
'''
glove_lookup_200 = {}
glove_lookup_300 = {}


def run_experiment(eta, embed, model, phrase):
    print("===================================================")
    print("eta: %s, embed_dim: %s, model: %s, phrase-level: %s" %
          (eta, embed, model, phrase))
    print("===================================================")
    if embed == 100:
def randvec(w, n=50, lower=-1.0, upper=1.0):
    """Returns a random vector of length `n`. `w` is ignored."""
    return utils.randvec(n=n, lower=lower, upper=upper)


# In[18]:

# Any of the files in glove.6B will work here:
glove_dim = 50
glove_src = os.path.join(GLOVE_HOME, 'glove.6B.{}d.txt'.format(glove_dim))

# Creates a dict mapping strings (words) to GloVe vectors:
GLOVE = utils.glove2dict(glove_src)

def glove_vec(w):
    """Return `w`'s GloVe representation if available, else return
    a random vector."""
    return GLOVE.get(w, randvec(w, n=glove_dim))


# ### Combining words into inputs: vector_combo_func
#
# Here we decide how to combine the two word vectors into a single
# representation. In more detail, where `u` is a vector representation of the
# left word and `v` is a vector representation of the right word, we need a
# function `vector_combo_func` such that `vector_combo_func(u, v)` returns a
# new input vector `z` of dimension `m`. A simple example is concatenation:

# In[19]:
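# The original In[19] cell is not reproduced above; a minimal sketch of such a
# `vector_combo_func` (an assumption, not necessarily the original code):
# simple concatenation of the two word vectors.
def vec_concatenate(u, v):
    """Concatenate `u` and `v` into a single vector of dimension len(u) + len(v)."""
    return np.concatenate((u, v))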
def convertSentencesToIdx(
        relation2IdFileName="SemEval2010_task8_all_data/cleaned/cleaned_entity2Id.txt",
        trainFileName="SemEval2010_task8_all_data/SemEval2010_task8_training/small_TRAIN_FILE.TXT",
        gloveFileName="../../../../../../GitHub/cs224u/vsmdata/glove/glove.6B.50d.txt"):
    vocab = utils.glove2dict(gloveFileName)  # dict[word] -> numpy array(embed_dim,)
    word2Index = {}
    createWord2Index(vocab, word2Index)
    addUnk(vocab, word2Index)
    addPadding(vocab, word2Index)
    writeDictToFile(word2Index)
    relation2Id = {}
    readDictFromFile(relation2Id, relation2IdFileName)
    count = 0
    cleanedFile = open("SemEval2010_task8_all_data/cleaned/cleaned_train.txt", 'w+')
    with open(trainFileName) as f:
        lineNum = 0
        table = str.maketrans(
            {key: None for key in string.punctuation if key != "-"})
        for line in f:
            if lineNum % 4 == 2 or lineNum % 4 == 3:
                lineNum += 1
                continue
            elif lineNum % 4 == 1:
                lineNum += 1
                lineToWrite = relation2Id[line[:line.find("(")]] + lineToWrite
                cleanedFile.write(lineToWrite)
                continue
            lineCount = getLineCount(line)
            # Remove all the words in the middle of the brackets
            firstStartIdx = line.find('<e1>') + len("<e1>")
            firstEndIdx = line.find('</e1>')
            firstEntity = line[firstStartIdx:firstEndIdx + 5]
            line = line.replace(firstEntity, "")
            secondStartIdx = line.find('<e2>') + len("<e2>")
            secondEndIdx = line.find('</e2>')
            secondEntity = line[secondStartIdx:secondEndIdx + 5]
            line = line.replace(secondEntity, "")
            line = line.translate(table)
            line = line.lower()
            tokens = line.split()
            lineToWrite = ""
            # Write sentence id
            lineToWrite += " " + str(tokens[0])
            for token in tokens[1:]:
                lineToWrite += " "
                if "e1" in token:
                    lineToWrite += "<e1>"
                elif "e2" in token:
                    lineToWrite += "<e2>"
                elif token in word2Index:
                    lineToWrite += str(word2Index[token])
                else:
                    lineToWrite += str(word2Index["<UNK>"])
            lineToWrite += ("\n")
            lineNum += 1
            '''
            # Sanity check:
            if "<e1>" not in lineToWrite or "<e2>" not in lineToWrite:
                print("ERROR")
                print(lineToWrite)
            if lineCount != len(lineToWrite.split()):
                count += 1
                print(lineCount)
                print(len(lineToWrite.split()))
                print("ERROR")
                print(lineToWrite)
            '''
    cleanedFile.close()
    return vocab, word2Index, relation2Id
    processed = [process(piece) for piece in pieces]
    return torch.stack(processed, dim=2)  # (batch_size, output_dim, num_pieces)


if __name__ == "__main__":
    # Test PiecewiseCNN
    c1 = torch.rand(3, 4, 7, requires_grad=True)  # (batch_size, embed_size, sequence_len)
    c2 = torch.rand(3, 4, 2, requires_grad=True)
    pcnn = PiecewiseCNN(4, output_dim=10)
    out = pcnn(c1, c2)  # (batch_size, output_dim, num_pieces)
    print("PiecewiseCNN test:")
    print("--- out.size() == (3, 10, 2):", out.size() == (3, 10, 2))
    print()

    # Test RelationClassifier
    vocab = utils.glove2dict(
        "data/glove.6B.50d.txt")  # dict[word] -> numpy array(embed_dim,)
    rc = RelationClassifier(vocab, 50)
    X = [(["first", "piece"], ["second", "piece"], ["third", "piece"])]
    y = [0]
    print("RelationClassifier test:")
    print("--- _assemble_vec_seq:\n",
          rc._assemble_vec_seq(['apple', 'banana', 'coconut', 'durian', 'apple'])[:, :3])
    out = rc(X, y)
    print("--- forward:\n", out.size())
def load_glove50():
    glove_src = os.path.join(GLOVE_HOME, 'glove.6B.100d.txt')
    # Creates a dict mapping strings (words) to GloVe vectors:
    GLOVE = utils.glove2dict(glove_src)
    print("The number of items in glove is %d" % len(GLOVE))
    return GLOVE
train_filename = sys.argv[1]
f_train = open(train_filename, 'r')
cnt = 0
X = np.zeros(shape=(total_words, total_songs))
for line in f_train:
    args = line.split(',')
    row = np.array(args[1:])
    X[:, cnt] = row
    cnt += 1
    if cnt % 1000 == 0:
        print(cnt)

X = tfidf(X)

glove_home = 'glove.6B'
GLOVE = utils.glove2dict(os.path.join(glove_home, 'glove.6B.100d.txt'))

Y = np.zeros(shape=(total_songs, 100))
word_list = extract_top_words()
for i in range(total_songs):
    order = np.argsort(X[:, i])
    words = []
    for j in range(top_words):
        words.append(word_list[order[j]])
    allvecs = np.array([GLOVE[w] for w in words if w in GLOVE])
    feature = np.sum(allvecs, axis=0)
    Y[i, :] = feature

k_means = cluster.KMeans(n_clusters=10, n_init=4)
def test_glove2dict():
    src_filename = os.path.join("data", "glove.6B", "glove.6B.50d.txt")
    data = utils.glove2dict(src_filename)
    assert len(data) == 400000
def test_glove2dict():
    src_filename = os.path.join("vsmdata", "glove.6B", "glove.6B.50d.txt")
    utils.glove2dict(src_filename)
# In[11]:

wn_edges = get_wordnet_edges()


# ### Reproducing the WordNet synonym graph experiment

# For our VSM, let's use the 300d file included in this distribution from the
# GloVe team, as it is close to or identical to the one used in the paper:
#
# http://nlp.stanford.edu/data/glove.6B.zip
#
# If you download this archive, place it in `vsmdata`, and unpack it, then the
# following will load the file into a dictionary for you:

# In[12]:

glove_dict = utils.glove2dict(
    os.path.join(data_home, 'glove.6B', 'glove.6B.300d.txt'))


# This is the initial embedding space $\widehat{Q}$:

# In[13]:

X_glove = pd.DataFrame(glove_dict).T


# In[14]:

X_glove.T.shape


# Now we just need to replace all of the strings in `edges` with indices into `X_glove`:

# In[15]:
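# The In[15] cell is not reproduced above. A minimal sketch of one way to do
# the conversion (an assumption: `wn_edges` maps each word to a set of
# neighbor words): build a word -> row-index lookup from `X_glove` and keep
# only edges whose endpoints are both in the GloVe vocabulary.
word_to_row = {word: i for i, word in enumerate(X_glove.index)}

index_edges = {}
for word, neighbors in wn_edges.items():
    if word not in word_to_row:
        continue
    index_edges[word_to_row[word]] = {
        word_to_row[n] for n in neighbors if n in word_to_row}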
def __init__(self, dropout_prob=0.7, **kwargs):
    self.dropout_prob = dropout_prob
    self.glove_dim = 100
    glove_src = os.path.join(
        GLOVE_HOME, 'glove.6B.{}d.txt'.format(self.glove_dim))
    self.glove_dict = utils.glove2dict(glove_src)
    super().__init__(**kwargs)
def run():
    # train_dataset = NYT10Dataset('data/small_train.txt', 'data/relation2id.txt')
    # val_dataset = NYT10Dataset('data/small_val.txt', 'data/relation2id.txt')
    train_dataset = NYT10Dataset('data/train.txt', 'data/relation2id.txt')
    val_dataset = NYT10Dataset('data/val.txt', 'data/relation2id.txt')
    # test_dataset = NYT10Dataset('data/test.txt', 'data/relation2id.txt')

    # Use GPU if available, otherwise stick with cpu
    use_cuda = torch.cuda.is_available()
    torch.manual_seed(123)
    device = torch.device('cuda' if use_cuda else 'cpu')
    print(device)
    # if use_parallel:
    #     print("[Using all the available GPUs]")
    #     inception = nn.DataParallel(inception, device_ids=[0])

    vocab = utils.glove2dict("data/glove.6B.50d.txt")  # dict[word] -> numpy array(embed_dim,)
    rc_model = RelationClassifier(
        vocab, 50, train_dataset.num_relations(), device=device).to(device)
    rc_model.apply(init_weights)

    def collate_fn(batch):
        X, y = zip(*batch)
        return X, torch.LongTensor(y)

    trainset_loader = DataLoader(
        train_dataset, batch_size=50, shuffle=True, num_workers=20,
        collate_fn=collate_fn)
    valset_loader = DataLoader(
        val_dataset, batch_size=50, shuffle=False, num_workers=10,
        collate_fn=collate_fn)

    best_model_filepath = 'models/model_best.weighted.1e-1.pth.tar'
    stats_filepath = 'train_log.txt'

    dataloaders = {'train': trainset_loader, 'val': valset_loader}
    dataset_sizes = {'train': len(train_dataset), 'val': len(val_dataset)}

    loss_weights = torch.ones(train_dataset.num_relations(), device=device)
    loss_weights[0] = 1e-1
    # loss_weights[48] = 1e-2
    criterion = nn.CrossEntropyLoss(weight=loss_weights)

    optimizable_params = [
        param for param in rc_model.parameters() if param.requires_grad]
    optimizer = torch.optim.Adam(optimizable_params, lr=0.01)
    exp_lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)
    num_epochs = 20

    # load_saved_model(best_model_filepath, rc_model, optimizer)
    # best_model = rc_model
    best_model = train_model(
        rc_model, dataloaders, dataset_sizes, criterion, optimizer,
        exp_lr_scheduler, use_cuda, best_model_filepath, num_epochs)

    predictions = evaluate_model(
        best_model, valset_loader, len(val_dataset), use_cuda)
    true_y = [y for _, y in val_dataset]
    report = classification_report(true_y, predictions)
    with open(stats_filepath, 'a') as f:
        f.write(report)
    print(report)