def __init__(self, turkCount, addTurkerOneHot):
    """Set up the adjective-embedding model.

    Loads a GloVe subset restricted to our adjectives, freezes the
    embedding layer, and builds the dense layers.  The final linear
    layer consumes the dense output concatenated with (mu, sigma) —
    two extra dims — plus, when addTurkerOneHot is set, a one-hot
    turker vector (turkCount extra dims).
    """
    super(AdjEmb, self).__init__()
    # GloVe subset that only covers the adjectives we actually use.
    glove_csv = os.getcwd() + "/data/" + "glove_our_adj.csv"
    self.vocab, self.vec = torchwordemb.load_glove_text(glove_csv)
    self.noOfTurkers = turkCount
    vocab_size, embed_dim = self.vec.shape[0], self.vec.shape[1]
    self.embeddings = nn.Embedding(vocab_size, embed_dim)
    self.embeddings.weight.data.copy_(self.vec)
    # Keep the pretrained vectors fixed during training.
    self.embeddings.weight.requires_grad = False
    self.linear1 = nn.Linear(self.vec.size(1), dense1_size)
    # Classifier head: dense output ++ (mu, sigma) [++ one-hot turker].
    if addTurkerOneHot:
        self.fc = torch.nn.Linear(dense1_size + turkCount + 2, 1)
    else:
        self.fc = torch.nn.Linear(dense1_size + 2, 1)
def test_glove_text(self):
    """load_glove_text must yield one 300-d row per vocabulary word."""
    vocab, matrix = torchwordemb.load_glove_text("resource/glove.test.txt")
    # The fixture contains exactly ten 300-dimensional vectors.
    self.assertEqual(len(vocab), 10)
    self.assertEqual(matrix.size(0), 10)
    self.assertEqual(matrix.size(1), 300)
def vector_loader_new(text_field_words):
    """Return one embedding vector (list of floats) per input word.

    In-vocabulary words get their GloVe row; out-of-vocabulary words
    get the column-wise mean of all in-vocabulary vectors, rounded to
    six decimals (or a zero vector if nothing was in vocabulary).

    Args:
        text_field_words: iterable of word strings.
    Returns:
        list of lists of floats, aligned with text_field_words.
    """
    path = 'word_embedding/glove.sentiment.conj.pretrained.txt'
    words_dict, vec = torchwordemb.load_glove_text(path)
    embed_size = vec.size(1)

    vectors = []
    oov_positions = []
    for position, word in enumerate(text_field_words):
        if word in words_dict:
            # BUG FIX: words_dict maps word -> row index (see the
            # torchwordemb API); the original stored the bare index,
            # which made the column sums below crash.  Look up the row.
            vectors.append(vec[words_dict[word]].tolist())
        else:
            # Zero placeholder; replaced with the mean vector below.
            vectors.append([0.0] * embed_size)
            oov_positions.append(position)

    known_count = len(text_field_words) - len(oov_positions)
    if oov_positions and known_count > 0:
        # Column-wise mean over the known vectors; the zero placeholders
        # contribute nothing to the sums.  Rounding matches the original.
        mean_vec = []
        for col in range(embed_size):
            col_total = 0.0
            for row in vectors:
                col_total += row[col]
            mean_vec.append(round(float(col_total / known_count), 6))
        for position in oov_positions:
            # Copy per slot so callers can mutate rows independently
            # (the original aliased one list across every OOV slot).
            vectors[position] = list(mean_vec)
    return vectors
def read_embed(embed_path, LM):
    """Load pretrained embeddings; LM selects the on-disk text format.

    Returns the (vocab, vectors) pair produced by torchwordemb.
    """
    loader = (torchwordemb.load_glove_text if LM == "glove"
              else torchwordemb.load_word2vec_text)
    vocab, vec = loader(embed_path)
    return vocab, vec
def get_word_index(self, padding_marker='__PADDING__', unknown_marker='__UNK__',):
    """Build the vocabulary with padding/unknown entries prepended.

    GloVe indices are shifted by two so index 0 is the padding token
    (all-zero vector) and index 1 the unknown token (random normal
    vector).  Caches the result on self and returns (vocab, vectors).
    """
    glove_vocab, glove_vec = torchwordemb.load_glove_text(self.path_glove)
    shifted = {padding_marker: 0, unknown_marker: 1}
    for token, index in glove_vocab.items():
        shifted[token] = index + 2
    # Row 0 stays zero (padding); row 1 gets random values (unknown).
    extra_rows = torch.zeros((2, glove_vec.size(1)))
    extra_rows[1].normal_()
    self.vec = torch.cat((extra_rows, glove_vec))
    self.vocab = shifted
    return self.vocab, self.vec
def load_glove(path, input_dial, glove_dim, load_glove):
    """Populate input_dial's GloVe vocab/vectors.

    Either parses the raw GloVe text file at `path` (load_glove truthy)
    or falls back to the pickled caches for the 100-d / 50-d vectors.

    Raises:
        ValueError: if load_glove is falsy and no cache exists for the
            requested dimension (previously this fell through and died
            with a NameError on `voc`).
    """
    if load_glove:
        voc, vec = torchwordemb.load_glove_text(path)
    elif glove_dim == 100:
        # Use `with` so the cache files are closed promptly (the
        # original leaked the file handles).
        with open('./data/voc_100', 'rb') as f:
            voc = pickle.load(f)
        with open('./data/vec_100', 'rb') as f:
            vec = pickle.load(f)
    elif glove_dim == 50:
        with open('./data/voc', 'rb') as f:
            voc = pickle.load(f)
        with open('./data/vec', 'rb') as f:
            vec = pickle.load(f)
    else:
        raise ValueError(
            "No cached GloVe vectors for dimension %d" % glove_dim)
    input_dial.glove_voc.update(voc)
    input_dial.glove_vec = vec
def __init__(self, path, glove_path="./glove.6B.50d.txt"):
    """Open the data file, load GloVe vectors, and register user IDs."""
    self.reader = filereader.FileReader(path)
    # Strips every character that is neither a space nor a word char.
    self.pattern = re.compile('[^ \w]+')
    self.userdict = UserDict()
    print("Reading word vectors...")
    self.vocab, self.vec = torchwordemb.load_glove_text(glove_path)
    print("Collecting user IDs ...")
    # lookup() registers each user in the dictionary as a side effect;
    # the returned id is not needed here.
    for idx in range(len(self.reader)):
        record = json.loads(self.reader[idx])
        self.userdict.lookup(record["user_id"])
    print("Done!")
def main():
    """Build a 300-instance paragraph subset and feed it to get_word_ids."""
    # Keep only the target paragraphs of the first 300 instances.
    subset = []
    with open('../data/cb-small/instances.jsonl', 'r') as f:
        for _ in range(300):
            record = json.loads(f.readline())
            subset.append({'targetParagraphs': record['targetParagraphs']})
    # Write the reduced dataset, one JSON object per line.
    with open('word_vec_test.json', 'w') as f:
        for entry in subset:
            f.write(json.dumps(entry) + '\n')
    vocab, emb = torchwordemb.load_glove_text('glove.6B.50d.txt')
    # Read the file back and run the word-id extraction on it.
    with open('word_vec_test.json', 'r') as f:
        inputs = [json.loads(line) for line in f]
    get_word_ids(inputs, vocab, emb)
filename='/media/storage/word_vectors/checkpoint' + args.version + '.pth.tar'):
    # Continuation of a checkpoint-saving helper whose `def` line is
    # outside this view.  Saves the state dict; when this is the best
    # model so far, also keeps a copy under model_best<version>.
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(
            filename,
            '/media/storage/word_vectors/model_best' + args.version + '.pth.tar')


# Seed the RNGs for reproducibility (CUDA too when enabled).
torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

# Autoencoder-style dataset: each word vector is both input and target.
path = args.path_word_vectors
vocab, vec = torchwordemb.load_glove_text(path)
word_vec_dataset = torch.utils.data.TensorDataset(vec, vec)
train_loader = torch.utils.data.DataLoader(word_vec_dataset, shuffle=True,
                                           batch_size=args.batch_size,
                                           num_workers=10)
test_loader = torch.utils.data.DataLoader(word_vec_dataset, shuffle=False,
                                          batch_size=args.test_batch_size,
                                          num_workers=10)

# index -> word mapping, the inverse of the vocab dict.
inverse_vocab = dict()
for k in vocab.keys():
    inverse_vocab[vocab[k]] = k


class Encoder(nn.Module):
# Read the TSV dataset; column 1 holds the label, column 2 the text
# (see the X/y assignments below).
df = pd.read_csv(input_path, sep='\t', header=None, encoding="ISO-8859-1")
train, test = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED,
                               shuffle=True)
#X_train = (train.to_frame().T)
X_train = train[2]
y_train = train[1]
X_test = test[2]
y_test = test[1]

## Load pretrained word vector
# Pick the GloVe file that matches the requested embedding dimension.
if args['dim'] == 50:
    print('loading 50d glove embedding')
    vocab, vec = torchwordemb.load_glove_text(
        "/diskA/animesh/glove/glove.6B.50d.txt")
elif args['dim'] == 100:
    print('loading 100d glove embedding')
    vocab, vec = torchwordemb.load_glove_text(
        "/diskA/animesh/glove/glove.6B.100d.txt")
elif args['dim'] == 200:
    print('loading 200d glove embedding')
    vocab, vec = torchwordemb.load_glove_text(
        "/diskA/animesh/glove/glove.6B.200d.txt")
elif args['dim'] == 300:
    print('loading 300d glove embedding')
    vocab, vec = torchwordemb.load_glove_text(
        "/diskA/animesh/glove/glove.6B.300d.txt")
else:
    print("Embedding dimension not available. Defaulting to 50 dimensions")
    # NOTE(review): this statement is truncated in the visible chunk;
    # it presumably continues with the 50-d path on the next line.
    vocab, vec = torchwordemb.load_glove_text(
import utils
import settings
import re
import torchwordemb
from live_sentiment import text2vec

if __name__ == "__main__":
    # Load model
    model = utils.generate_model_from_settings()
    utils.load_model_params(model, settings.args.load_path)
    # Load glove
    print("Reading word vectors...")
    vocab, vec = torchwordemb.load_glove_text(
        settings.DATA_KWARGS["glove_path"])
    print("Done!")
    # Interactive loop: rate each line of text the user types.
    while True:
        text = input("What's on your mind? \n")
        # Convert the text into a packed, padded feature tensor.
        features = text2vec(text, vocab, vec)
        features = utils.pack_sequence([features])
        # NOTE(review): `torch` is not imported in this chunk; presumably
        # available transitively via utils/settings — confirm.
        (features, lengths) = torch.nn.utils.rnn.pad_packed_sequence(features)
        out = model(features, lengths)
        # Model output is interpreted as a star rating.
        stars = float(out[0, 0, 0])
        if stars < 1.1:
            print("Watch your language, kid")
        print("Your mind has the following rating: {}".format(stars))
# word embeddings # preparing ngrams of size n = 6 ngrams = [([train_data[i], train_data[i + 1], train_data[i + 2], train_data[i + 3], train_data[i + 4]], train_data[i + 5]) for i in range(len(train_data) - 5)] ngrams_valid = [([valid_data[i], valid_data[i + 1], valid_data[i + 2], valid_data[i + 3], valid_data[i + 4]], valid_data[i + 5]) for i in range(len(valid_data) - 5)] # load embeddings try: vocab, vec = torchwordemb.load_glove_text("../embeddings/glove.6b/glove.6B.50d.txt") except FileNotFoundError: vocab, vec = torchwordemb.load_glove_text("./embeddings/glove.6b/glove.6B.50d.txt") # vocab of treebank vocab_tb = data.dictionary.word2idx.keys() # mean vec of all embeddings mean_vec = torch.mean(vec, 0).view(1, 50) # mean vec for digits numvec = vec[vocab["0"], :].view(1, 50) numvec = torch.cat((vec[vocab["1"], :].view(1, 50), numvec), 0) numvec = torch.cat((vec[vocab["2"], :].view(1, 50), numvec), 0) numvec = torch.cat((vec[vocab["3"], :].view(1, 50), numvec), 0)
def make_dataset(categories, validation, test, cnn_style=False, word2vec=False, train_frac=None):
    """Build train/validation/test splits from the given category files.

    Args:
        categories: list of strings corresponding to files, e.g. 'dog'
            for file 'dog.npy'.
        validation: float in (0, 1), proportion of validation examples.
        test: float in (0, 1), proportion of test examples.
        cnn_style: if True, output shape will be [batch, channel, row, col].
        word2vec: if True, also return an [n, e] array of GloVe vectors,
            one per class (e = 100).
        train_frac: if provided, only keep this fraction of the training
            examples.

    Returns:
        (X_train, y_train, X_valid, y_valid, X_test, y_test[, emb]).
    """
    train_images, train_labels = [], []
    valid_images, valid_labels = [], []
    test_images, test_labels = [], []
    for label, cat in enumerate(categories):
        images = load_images(cat + ".npy", cnn_style=cnn_style)
        n_test = int(test * len(images))
        n_valid = int(validation * len(images))
        n_train = len(images) - n_valid - n_test
        # Split order within each category: train, then valid, then test.
        train_images.extend(images[:n_train])
        valid_images.extend(images[n_train:(n_train + n_valid)])
        test_images.extend(images[(n_train + n_valid):])
        train_labels.extend([label] * n_train)
        valid_labels.extend([label] * n_valid)
        test_labels.extend([label] * n_test)

    def _shuffled(features, labels):
        # Apply one shared random permutation to features and labels.
        features = np.array(features)
        labels = np.array(labels)
        assert (len(features) == len(labels))
        perm = np.arange(len(features))
        np.random.shuffle(perm)
        return features[perm], labels[perm]

    X_train, y_train = _shuffled(train_images, train_labels)
    X_valid, y_valid = _shuffled(valid_images, valid_labels)
    X_test, y_test = _shuffled(test_images, test_labels)

    # Normalize every split with the training-set statistics.
    mean = X_train.mean()
    std = X_train.std()
    X_train = (X_train - mean) / std
    X_valid = (X_valid - mean) / std
    X_test = (X_test - mean) / std

    if train_frac:
        keep = int(train_frac * len(X_train))
        X_train = X_train[:keep]
        y_train = y_train[:keep]

    if not word2vec:
        return X_train, y_train, X_valid, y_valid, X_test, y_test

    # One 100-d GloVe vector per class name.
    vocab, vec = torchwordemb.load_glove_text("glove.6B.100d.txt")
    emb = np.zeros((len(categories), 100))
    for label, cat in enumerate(categories):
        emb[label] = vec[vocab[cat]]
    return X_train, y_train, X_valid, y_valid, X_test, y_test, emb
def __init__(self, path, glove_path="./glove.6B.50d.txt"):
    """Open the file at `path` and load GloVe vectors from `glove_path`."""
    self.reader = filereader.FileReader(path)
    # Matches runs of characters that are neither spaces nor word chars.
    self.pattern = re.compile('[^ \w]+')
    print("Reading word vectors...")
    vocab, vectors = torchwordemb.load_glove_text(glove_path)
    self.vocab = vocab
    self.vec = vectors
    print("Done!")
def load_glove_vecs(path):
    """Thin wrapper: return (vocab, vectors) for the GloVe text file."""
    return torchwordemb.load_glove_text(path)