import numpy as np
import torch
from embeddings import GloveEmbedding


class Word2vecUtils():

    def __init__(self):
        super(Word2vecUtils, self).__init__()
        self.word_embed = GloveEmbedding('common_crawl_48', d_emb=300)
        self.initializer = lambda: np.random.normal(size=300).tolist()

    def load_embeddings(self, module, vocab, device='cpu'):
        """ Initialize the embedding module with GloVe vectors; OOV words get a random normal init. """
        emb_size = module.weight.data.size(-1)
        assert emb_size == 300, 'Embedding size is not 300, cannot be initialized by GloVe'
        outliers = 0
        for word in vocab.word2id:
            # PAD is the padding symbol, assumed to be defined by the surrounding project
            if word == PAD:  # PAD symbol is always the zero vector
                module.weight.data[vocab[PAD]] = torch.zeros(emb_size, dtype=torch.float, device=device)
                continue
            word_emb = self.word_embed.emb(word, default='none')
            if word_emb[0] is None:  # OOV word: fall back to a random normal vector
                word_emb = self.initializer()
                outliers += 1
            module.weight.data[vocab[word]] = torch.tensor(word_emb, dtype=torch.float, device=device)
        # fraction of vocabulary words covered by GloVe
        return 1 - outliers / float(len(vocab))

    def emb(self, word):
        word_emb = self.word_embed.emb(word, default='none')
        if word_emb[0] is None:
            return None
        else:
            return word_emb
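A minimal usage sketch (not from the original project): the Vocab wrapper, the PAD symbol, and the example words below are illustrative assumptions chosen to match the interface load_embeddings expects.

# Usage sketch: Vocab, PAD and the sample words are assumptions, not part of the original code.
import torch.nn as nn

PAD = '<pad>'

class Vocab:
    """Tiny stand-in vocabulary exposing the word2id / __getitem__ / __len__ interface used above."""
    def __init__(self, words):
        self.word2id = {w: i for i, w in enumerate(words)}

    def __getitem__(self, word):
        return self.word2id[word]

    def __len__(self):
        return len(self.word2id)

vocab = Vocab([PAD, 'the', 'city', 'zzqxv'])      # 'zzqxv' should be OOV and get a random init
embed_layer = nn.Embedding(len(vocab), 300)

utils = Word2vecUtils()
coverage = utils.load_embeddings(embed_layer, vocab, device='cpu')
print('GloVe coverage: %.2f' % coverage)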
def load_embedding(self):
    # set each row of the encoder's embedding matrix to the concatenation of the
    # word's GloVe and Kazuma character embeddings
    glove = GloveEmbedding()
    kazuma = KazumaCharEmbedding()
    embed = self.context_encoder.embedding.weight.data
    for word, idx in self.vocab.word2idx.items():
        embed[idx] = torch.tensor(
            glove.emb(word, default="zero") + kazuma.emb(word, default="zero"))
def init_word_embeddings(embed_file_name, word_set, edim):
    embeddings = {}
    tokens = embed_file_name.split('-')
    embedding = None
    if tokens[0] == 'glove':
        embedding = GloveEmbedding(tokens[1], d_emb=edim, show_progress=True)
    if embedding:
        for word in word_set:
            emb = embedding.emb(word)
            if emb is not None:
                embeddings[word] = emb
    return embeddings
def get_pretrained_embeddings(dataset, words, slots, intents):
    vocab = set(words + slots + intents)
    # BOS, EOS, UNK, EQUAL and EMBEDDING(...) are assumed to be defined by the surrounding project
    for symbol in [BOS, EOS, UNK, EQUAL]:
        vocab.add(symbol)
    # GK Embedding: GloVe (300-dim) concatenated with the Kazuma char embedding (100-dim)
    word_embed, char_embed = GloveEmbedding(default='zero'), KazumaCharEmbedding()
    embed_size = word_embed.d_emb + char_embed.d_emb
    progress = 0
    with open(EMBEDDING(dataset), 'w') as out_file:
        for word in vocab:
            progress += 1
            vector = word_embed.emb(word) + char_embed.emb(word)
            string = ' '.join([str(v) for v in vector])
            out_file.write(word + ' ' + string + '\n')
            if progress % 1000 == 0:
                print("Retrieved the 400-dim GK embedding for the", progress, "-th word ...")
    print('In total, processed %d words in %s' % (len(vocab), dataset))
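A hedged companion sketch for reading the embedding file written above back into memory; load_gk_embeddings and emb_path are illustrative names, not part of the original code.

# Hedged sketch: load_gk_embeddings and emb_path are illustrative, not from the original code.
import numpy as np

def load_gk_embeddings(emb_path):
    """Parse lines of the form '<word> v1 v2 ... v400' back into a word -> vector dict."""
    word2vec = {}
    with open(emb_path) as f:
        for line in f:
            parts = line.rstrip('\n').split(' ')
            word2vec[parts[0]] = np.array([float(v) for v in parts[1:]], dtype=np.float32)
    return word2vec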
import numpy as np
from embeddings import GloveEmbedding
from sklearn.cluster import KMeans


def cluster_kmeans_glove(sorted_sent_ids, id_sentence_map, num_clusters):
    d_emb = 300
    embeddings = GloveEmbedding('common_crawl_840', d_emb=d_emb, show_progress=True)
    vecs = np.zeros(shape=(len(sorted_sent_ids), d_emb))
    # represent each sentence as the average of its (lower-cased) token embeddings
    for idx, sent_id in enumerate(sorted_sent_ids):
        for token, _ in id_sentence_map[sent_id]:
            vecs[idx] += np.array(embeddings.emb(token.lower(), "zero"))
        vecs[idx] /= len(id_sentence_map[sent_id])
    if num_clusters is None:
        num_clusters = max(len(id_sentence_map) // 25, 2)
    # clusterer = AgglomerativeClustering(n_clusters=num_clusters)
    # clustering = clusterer.fit_predict(vecs)
    clustering = KMeans(n_clusters=num_clusters).fit_predict(vecs)
    return clustering
import pickle

import numpy as np
from embeddings import GloveEmbedding


def gen_slot_embed_for_each_dom_from_glove(dom2slots, slot2desc, save_file):
    ## 1. generate slot2embs
    slots = list(sorted(slot2desc.keys()))
    desps = [slot2desc[k] for k in slots]
    word2emb = {}
    # collect words
    for des in desps:
        splits = des.split()
        for word in splits:
            if word not in word2emb:
                word2emb[word] = []
    # load embeddings
    glove_emb = GloveEmbedding()
    # calculate slot embs: each slot is the sum of the GloVe vectors of its description words
    slot2embs = {}
    for i, slot in enumerate(slots):
        word_list = slot2desc[slot].split()
        embs = np.zeros(300)
        for word in word_list:
            embs = embs + glove_emb.emb(word, default='zero')
        slot2embs[slot] = embs
    ## 2. generate slot2embs based on each domain
    slot_embs_based_on_each_domain = {}
    for domain, slot_names in dom2slots.items():
        slot_embs = np.zeros((len(slot_names), 300))
        for i, slot in enumerate(slot_names):
            embs = slot2embs[slot]
            slot_embs[i] = embs
        slot_embs_based_on_each_domain[domain] = slot_embs
    with open(save_file, "wb") as f:
        pickle.dump(slot_embs_based_on_each_domain, f)
    return slot2embs
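A toy invocation sketch: the domain, slot names, descriptions and output path below are invented for illustration.

# Toy usage sketch: the domains, slots, descriptions and output path are invented examples.
slot2desc = {
    'city': 'destination city name',
    'time': 'departure time of the flight',
}
dom2slots = {
    'travel': ['city', 'time'],
}
slot2embs = gen_slot_embed_for_each_dom_from_glove(dom2slots, slot2desc, 'slot_embs.pkl')
print(slot2embs['city'].shape)   # (300,): sum of the GloVe vectors of the description words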
from embeddings import GloveEmbedding, FastTextEmbedding, KazumaCharEmbedding, ConcatEmbedding
import numpy as np

from read_rules import read_rul, read_csv

g = GloveEmbedding('common_crawl_840', d_emb=300, show_progress=True, default='zero')
k = KazumaCharEmbedding()
c = ConcatEmbedding([g, k])

for w in ['metal1', 'metal', 'M', 'EXT', '<', '0.035', 'MSMG2', 'MSMG', 'A']:
    word = np.array(g.emb(w))
    word1 = np.array(k.emb(w))
    if None in word:
        print(w, ":\tbad embedding")
    else:
        print(w, ":\tgood embedding")
    out = np.append(word1, word)
    print(out.shape)

diff1 = np.array(k.emb('metal1')) - np.array(k.emb('METAL1'))
diff2 = np.array(k.emb('metal1')) - np.array(k.emb('layer'))
# print(np.abs(np.mean(diff1)))
# print(np.abs(np.mean(diff2)))

pdk15_csv = read_csv("calibreDRC_15.csv")
pdk45_csv = read_csv("calibreDRC_45.csv")
dev = TabularDataset(path=args.dev_data, format='tsv',
                     fields=[('src', srcF), ('tgt', tgtF), ('tgt_be', tgt_beF),
                             ('dis', disF), ('label', labelF)])
tgt_beF.build_vocab(all_data, min_freq=1)
disF.build_vocab(all_data, min_freq=1)
srcF.build_vocab(all_data, min_freq=1)
vocab = srcF.vocab
tgtF.vocab = vocab
args.vocab_size = len(vocab)

# build a GloVe-initialised embedding matrix; OOV words fall back to a uniform random vector
g = GloveEmbedding('common_crawl_840', d_emb=300)
embedding = []
for i in range(len(vocab)):
    vec = g.emb(vocab.itos[i])
    if not vec[0]:  # OOV word (first entry is None)
        embedding.append(np.random.uniform(-0.25, 0.25, size=(1, 300))[0])
    else:
        embedding.append(np.array(vec))
embedding = np.array(embedding, dtype=np.float32)
args.pre_embedding = True
args.embedding = embedding
args.update_embedding = False

print('build batch iterator...')
train_batch_iterator = BucketIterator(dataset=train, batch_size=args.batch_size,
                                      sort=False, sort_within_batch=True,
                                      sort_key=lambda x: len(x.src), repeat=False)
"").replace("'", "").split(", "): tokens_as_string = tokens_as_string + tok + " " tweets[idx] = tokens_as_string idx = idx + 1 fp.close() """ Feature Engineering - add word embeddings of all words in a sentence -> 50 featured vector """ samples_list = list() labels_list = list() for idx, tweet in tweets.items(): sample = np.zeros([50, 1], dtype=np.float32) for tok in tweet.split(" "): embd = glove.emb(tok) if None in embd: embd = np.zeros([50, 1], dtype=np.float32) else: embd = np.asarray(embd) embd = embd.reshape([50, 1]) sample = sample + embd x = [[idx]] x.extend(sample.tolist()) samples_list.append(x) labels_list.append(labels[idx]) data_set = [np.asarray(samples_list).squeeze(), np.asarray(labels_list)] print("No of samples X No of features:", data_set[0].shape) print("No of samples X 1:", data_set[1].shape)
from embeddings import GloveEmbedding, FastTextEmbedding, KazumaCharEmbedding, ConcatEmbedding

g = GloveEmbedding('common_crawl_840', d_emb=300, show_progress=True)
f = FastTextEmbedding()
k = KazumaCharEmbedding()
c = ConcatEmbedding([g, f, k])
for w in ['canada', 'vancouver', 'toronto']:
    print('embedding {}'.format(w))
    print(g.emb(w))
    print(f.emb(w))
    print(k.emb(w))
    print(c.emb(w))
import numpy as np
from embeddings import GloveEmbedding, KazumaCharEmbedding


class RuleEmbedding:

    def __init__(self, embedding_type, inputs):
        if embedding_type == "char":
            k = KazumaCharEmbedding()
            self.wordEmbed = k.emb
            self.sentenceEmbed = self.embed_sentence
            self.size = 100
        elif embedding_type == "glove":
            g = GloveEmbedding('common_crawl_840', d_emb=300, show_progress=True, default='zero')
            self.wordEmbed = g.emb
            self.sentenceEmbed = self.embed_sentence
            self.size = 300
        elif embedding_type == "concat":
            self.k = KazumaCharEmbedding()
            self.g = GloveEmbedding('common_crawl_840', d_emb=300, show_progress=True, default='zero')
            self.wordEmbed = self.concatEmbed
            self.sentenceEmbed = self.embed_sentence
            self.size = 400
        # elif embedding_type == "bert":
        #     try:
        #         bertEmbed = SentenceTransformer('./src/bert-base-nli-mean-tokens')
        #     except OSError as e:
        #         print(e)
        #         print("Could not find model in current directory: %s" % os.getcwd())
        #         exit(1)
        #     self.sentenceEmbed = bertEmbed.encode
        #     self.size = 768
        # elif embedding_type == "bert-stsb":
        #     try:
        #         bertEmbed = SentenceTransformer('./src/bert-base-nli-stsb-mean-tokens')
        #     except OSError as e:
        #         print(e)
        #         print("Could not find model in current directory: %s" % os.getcwd())
        #         exit(1)
        #     self.sentenceEmbed = bertEmbed.encode
        #     self.size = 768
        # elif embedding_type == "universal":
        #     try:
        #         univEmbed = hub.load("./src/universal-sentence-encoder_4")
        #     except OSError as e:
        #         print(e)
        #         print("Could not find model in current directory: %s" % os.getcwd())
        #         exit(1)
        #     self.sentenceEmbed = univEmbed
        #     self.size = 512
        else:
            print("Error: Embedding type \"%s\" not recognized" % embedding_type)
            print("Supported types: \"char\", \"glove\", \"concat\"")
            exit(1)

        self.type = embedding_type
        self.pdk = inputs['pdk']
        self.features = inputs['features']
        self.weights = inputs['weights']
        self.word_counts = inputs['word_counts']
        self.a = inputs['a']
        self.number_replacement = inputs['number_replacement']
        self.remove_pc = inputs['remove_pc']
        self.weigh_capitals = inputs['weigh_capitals']

    ###############################################################################
    # concatEmbed():
    # Concatenates char and GloVe embeddings for a single word.
    ###############################################################################
    def concatEmbed(self, word):
        one = np.array(self.k.emb(word))
        two = np.array(self.g.emb(word))
        return np.append(one, two)

    ###############################################################################
    # embed_sentence():
    # Returns a list of embeddings for the provided sentences.
    # If self.word_counts is not None, computes a weighted average of the word embeddings.
    # Weighted average based on the paper by Arora et al.:
    # https://github.com/PrincetonML/SIF
    ###############################################################################
    def embed_sentence(self, text):
        embeddings = []
        N = len(text)
        for i in range(N):
            sentence = text[i]
            words = sentence.split(' ')
            num_words = len(words)
            total = np.zeros(self.size)
            for j in range(num_words):
                w = words[j].strip()
                # replace raw numbers with a placeholder token
                if self.number_replacement and w.replace('.', '', 1).isdigit():
                    w = self.number_replacement
                embed = np.array(self.wordEmbed(w))
                # add weight to words that are all caps
                if self.weigh_capitals and w.isalpha() and w.isupper():
                    embed = self.weigh_capitals * embed
                # weigh words by the inverse of their corpus probability (SIF weighting)
                if self.word_counts and w in self.word_counts.keys():
                    prob = self.word_counts[w] / self.word_counts['total-words']
                    weight = self.a / (self.a + prob)
                    embed = weight * embed
                total += embed
            result = total / num_words
            embeddings.append(result)
        return embeddings

    ###############################################################################
    # embed_key():
    # Returns a matrix of sentence embeddings for the designated rule feature.
    # This can be "rule", "description", layer, name, etc.
    # Embedding type is set by self.type.
    ###############################################################################
    def embed_key(self, key):
        pdk = self.pdk
        N = len(pdk)
        sentences = []
        for i in range(N):
            # in case we embed a feature like name, which is not a list
            if isinstance(pdk[i][key], list):
                s = ' '.join(pdk[i][key])
            else:
                s = pdk[i][key]
            sentences.append(s)
        result = np.array(self.sentenceEmbed(sentences))
        return result

    ###############################################################################
    # embed_all():
    # Computes rule embeddings as a weighted sum of the feature embeddings.
    # Weights are stored in self.weights and features are stored in self.features.
    # Removes the first principal component if self.remove_pc is True.
    ###############################################################################
    def embed_all(self):
        num_features = len(self.features)
        N = len(self.pdk)
        partial_embed = np.zeros((num_features, N, self.size))

        for i in range(num_features):
            result = self.embed_key(self.features[i])
            # remove the first principal component (SIF post-processing)
            if self.remove_pc:
                emb = remove_pc(result, 1)
                partial_embed[i] = emb
            else:
                partial_embed[i] = result

        # compute the weighted sum of the feature embeddings (f[1]*w[1] + f[2]*w[2] + ...)
        output = np.tensordot(partial_embed, self.weights, axes=(0, 0))
        return output
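A hedged usage sketch: the inputs dictionary mirrors the keys read in __init__, but the toy PDK rules, feature names, and weights are invented for illustration (remove_pc is disabled so the external remove_pc() helper is not needed).

# Hedged usage sketch: the toy rules, features and weights are invented examples.
toy_pdk = [
    {'name': 'M1.W.1', 'rule': ['metal1', 'width', '>=', '0.065'],
     'description': ['Minimum', 'width', 'of', 'metal1']},
    {'name': 'M1.S.1', 'rule': ['metal1', 'space', '>=', '0.07'],
     'description': ['Minimum', 'spacing', 'between', 'metal1', 'shapes']},
]
inputs = {
    'pdk': toy_pdk,
    'features': ['rule', 'description'],
    'weights': [0.5, 0.5],
    'word_counts': None,          # disable SIF frequency weighting
    'a': 1e-3,
    'number_replacement': None,   # keep numbers as they are
    'remove_pc': False,           # skip principal-component removal
    'weigh_capitals': None,       # no extra weight for all-caps words
}
rule_embed = RuleEmbedding('char', inputs)
vectors = rule_embed.embed_all()
print(vectors.shape)              # (2, 100) with the char embedding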
import sys

from embeddings import GloveEmbedding

if len(sys.argv) < 3:
    print("please provide embeddings and POS CoNLL file")
    exit(0)

embs = GloveEmbedding(sys.argv[1], default="random")
unk = "<UNK>"

outFile = open(sys.argv[2] + ".glove", "w")
curSent = ""
for line in open(sys.argv[2]):
    if len(line) < 2:  # empty line marks the end of a sentence
        outFile.write(curSent + "\n")
        curSent = ""
    else:
        tok = line.strip().split("\t")
        emb = embs.emb(tok[0])
        embStr = "emb=" + ",".join([str(x) for x in emb])
        curSent += "\t".join(tok + [embStr]) + "\n"
outFile.close()