def get_embedding_layer(tokenizer):
    word_index = tokenizer.word_index
    num_words = len(word_index) + 1
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
    print('about to get kz')
    kz = KazumaCharEmbedding()
    print('got kz')
    for word, i in word_index.items():
        if i >= MAX_NB_WORDS:
            continue
        embedding_vector = kz.emb(word)
        if embedding_vector is not None:
            if sum(embedding_vector) == 0:
                print("failed to find embedding for:" + word)
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
    print("Number of words:" + str(num_words))
    embedding_layer = Embedding(num_words,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)
    return embedding_layer
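# Hedged usage sketch (not part of the original source): wiring the frozen layer returned by
# get_embedding_layer() into a small Keras classifier. The model architecture, the
# tensorflow.keras import path, and the binary-classification head are assumptions for
# illustration only; tokenizer and MAX_SEQUENCE_LENGTH come from the snippet above.
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.models import Model

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded = get_embedding_layer(tokenizer)(sequence_input)  # frozen EMBEDDING_DIM-dim vectors
x = LSTM(64)(embedded)
preds = Dense(1, activation='sigmoid')(x)
model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy', optimizer='adam')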
def load_embedding(self):
    glove = GloveEmbedding()
    kazuma = KazumaCharEmbedding()
    embed = self.context_encoder.embedding.weight.data
    for word, idx in self.vocab.word2idx.items():
        embed[idx] = torch.tensor(
            glove.emb(word, default="zero") + kazuma.emb(word, default="zero"))
def get_embeddings(self):
    num_words = len(self.word2idx)
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
    print('about to get kz')
    kz = KazumaCharEmbedding()
    print('got kz')
    for word, i in self.word2idx.items():
        if i >= MAX_NB_WORDS:
            continue
        embedding_vector = kz.emb(word)
        if embedding_vector is not None:
            if sum(embedding_vector) == 0:
                print("failed to find embedding for:" + word)
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
    self.idx_to_embedding = embedding_matrix
def process_raw_dataset(
        self,
        models_path="models",
        train_path: Text = None,
        dev_path: Text = None,
        test_path: Text = None,
):
    """Annotate the raw splits, build the ontology and vocab, and dump embeddings.

    Args:
        models_path: output directory for ontology.json, vocab.json and emb.json.
        train_path: path to the raw training split (JSON).
        dev_path: path to the raw development split (JSON).
        test_path: path to the raw test split (JSON).
    """
    if not os.path.isdir(models_path):
        os.makedirs(models_path)

    splits_path = {}
    if train_path:
        splits_path.update({'train': train_path})
    if dev_path:
        splits_path.update({'dev': dev_path})
    if test_path:
        splits_path.update({'test': test_path})

    for name, path in splits_path.items():
        self.dataset[name] = Dataset.annotate_raw(path)
        self.dataset[name].numericalize_(self.vocab)
        self.ontology += self.dataset[name].extract_ontology()
        ann_path = path[:-5] + "_ann.json"  # replace the ".json" suffix
        with open(ann_path, 'wt') as f:
            json.dump(self.dataset[name].to_dict(), f, indent=4)

    self.ontology.numericalize_(self.vocab)
    with open(os.path.join(models_path, 'ontology.json'), 'wt') as f:
        json.dump(self.ontology.to_dict(), f, indent=4)
    with open(os.path.join(models_path, 'vocab.json'), 'wt') as f:
        json.dump(self.vocab.to_dict(), f, indent=4)

    # Generate embedding file: one GloVe + Kazuma row per vocabulary index
    embeddings = [GloveEmbedding(), KazumaCharEmbedding()]
    E = []
    for w in tqdm(self.vocab._index2word):
        e = []
        for emb in embeddings:
            e += emb.emb(w, default='zero')
        E.append(e)
    self.embeddings = E
    with open(os.path.join(models_path, 'emb.json'), 'wt') as f:
        json.dump(E, f)
def dump_pretrained_emb(word2index, index2word, dump_path):
    print("Dumping pretrained embeddings...")
    embeddings = [GloveEmbedding(), KazumaCharEmbedding()]
    E = []
    for i in tqdm(range(len(word2index.keys()))):
        w = index2word[i]
        e = []
        for emb in embeddings:
            e += emb.emb(w, default='zero')
        E.append(e)
    with open(dump_path, 'wt') as f:
        json.dump(E, f)
def dump_pretrained_emb(word2index, index2word, dump_path):
    print("Dumping pretrained embeddings...")
    # temporarily set HOME so the embeddings cache directory resolves on Windows
    os.environ["HOME"] = "D:/ANAHOME"
    embeddings = [GloveEmbedding(), KazumaCharEmbedding()]
    E = []
    for i in tqdm(range(len(word2index.keys()))):
        w = index2word[i]
        e = []
        for emb in embeddings:
            e += emb.emb(w, default='zero')
        E.append(e)
    with open(dump_path, 'wt') as f:
        json.dump(E, f)
def dump_pretrained_emb(word2index, index2word, dump_path):
    print("Dumping pretrained embeddings...")
    # import ssl
    # ssl._create_default_https_context = ssl._create_unverified_context
    embeddings = [GloveEmbedding(), KazumaCharEmbedding()]
    E = []
    for i in range(len(word2index.keys())):
        w = index2word[i]
        e = []
        for emb in embeddings:
            e += emb.emb(w, default='zero')
        E.append(e)
    with open(dump_path, 'wt') as f:
        json.dump(E, f)
def dump_pretrained_emb_new(tokenizer, dump_path):
    print("Dumping pretrained embeddings...")
    embeddings = [GloveEmbedding(), KazumaCharEmbedding()]
    E = []
    word_list = []
    for w, i in sorted(tokenizer.vocab.items(), key=lambda kv: kv[1]):
        word_list.append(w)
    for i in tqdm(range(len(word_list))):
        w = word_list[i]
        e = []
        for emb in embeddings:
            e += emb.emb(w, default='zero')
        E.append(e)
    with open(dump_path, 'wt') as f:
        json.dump(E, f)
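# Hedged sketch (not part of the original source): loading the JSON written by the
# dump_pretrained_emb* helpers above into a frozen PyTorch embedding table. Assumes the dump
# format shown above, i.e. a list with one GloVe(300) + Kazuma(100) = 400-dim row per
# vocabulary index, and that 'emb.json' is the path that was passed as dump_path.
import json
import torch
import torch.nn as nn

with open('emb.json') as f:
    E = json.load(f)
pretrained_embedding = nn.Embedding.from_pretrained(
    torch.tensor(E, dtype=torch.float), freeze=True)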
def get_pretrained_embeddings(dataset, words, slots, intents):
    vocab = set(words + slots + intents)
    for symbol in [BOS, EOS, UNK, EQUAL]:
        vocab.add(symbol)
    # GK (GloVe + Kazuma char) embedding
    word_embed, char_embed = GloveEmbedding(default='zero'), KazumaCharEmbedding()
    embed_size = word_embed.d_emb + char_embed.d_emb
    progress = 0
    with open(EMBEDDING(dataset), 'w') as out_file:
        for word in vocab:
            progress += 1
            vector = word_embed.emb(word) + char_embed.emb(word)
            string = ' '.join([str(v) for v in vector])
            out_file.write(word + ' ' + string + '\n')
            if progress % 1000 == 0:
                print("Retrieved %d-dim GK embedding for the %d-th word ..." % (embed_size, progress))
    print('In total, processed %d words in %s' % (len(vocab), dataset))
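# Hedged sketch (assumption, not part of the original source): parsing the text file written by
# get_pretrained_embeddings() above, i.e. one word per line followed by its space-separated
# float values, back into a word-to-vector dict.
def load_gk_embeddings(path):
    vectors = {}
    with open(path) as f:
        for line in f:
            parts = line.rstrip('\n').split(' ')
            vectors[parts[0]] = [float(v) for v in parts[1:]]
    return vectors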
from embeddings import GloveEmbedding, FastTextEmbedding, KazumaCharEmbedding, ConcatEmbedding
import numpy as np

from read_rules import read_rul, read_csv

g = GloveEmbedding('common_crawl_840', d_emb=300, show_progress=True, default='zero')
k = KazumaCharEmbedding()
c = ConcatEmbedding([g, k])

for w in ['metal1', 'metal', 'M', 'EXT', '<', '0.035', 'MSMG2', 'MSMG', 'A']:
    word = np.array(g.emb(w))
    word1 = np.array(k.emb(w))
    # with default='zero', out-of-vocabulary words come back as all-zero vectors
    if not word.any():
        print(w, ":\tbad embedding")
    else:
        print(w, ":\tgood embedding")
    out = np.append(word1, word)
    print(out.shape)

diff1 = np.array(k.emb('metal1')) - np.array(k.emb('METAL1'))
diff2 = np.array(k.emb('metal1')) - np.array(k.emb('layer'))
# print(np.abs(np.mean(diff1)))
# print(np.abs(np.mean(diff2)))

pdk15_csv = read_csv("calibreDRC_15.csv")
pdk45_csv = read_csv("calibreDRC_45.csv")
if not os.path.isdir(dann):
    os.makedirs(dann)

dataset = {}
ontology = Ontology()
vocab = Vocab()
vocab.word2index(['<sos>', '<eos>'], train=True)
for s in splits:
    fname = '{}.json'.format(s)
    logging.warn('Annotating {}'.format(s))
    dataset[s] = Dataset.annotate_raw(os.path.join(draw, fname))
    dataset[s].numericalize_(vocab)
    ontology = ontology + dataset[s].extract_ontology()
    with open(os.path.join(dann, fname), 'wt') as f:
        json.dump(dataset[s].to_dict(), f)

ontology.numericalize_(vocab)
with open(os.path.join(dann, 'ontology.json'), 'wt') as f:
    json.dump(ontology.to_dict(), f)
with open(os.path.join(dann, 'vocab.json'), 'wt') as f:
    json.dump(vocab.to_dict(), f)

logging.warn('Computing word embeddings')
embeddings = [GloveEmbedding(), KazumaCharEmbedding()]
E = []
for w in tqdm(vocab._index2word):
    e = []
    for emb in embeddings:
        e += emb.emb(w, default='zero')
    E.append(e)
with open(os.path.join(dann, 'emb.json'), 'wt') as f:
    json.dump(E, f)
texts1 = texts1[indices]
texts2 = texts2[indices]
texts3 = texts3[indices]

print('Preparing embedding matrix.')

# prepare embedding matrix
# num_words = min(MAX_NB_WORDS, len(word_index))
num_words = len(word_index) + 1  # word_index is indexed from 1-N
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
kz = KazumaCharEmbedding()
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = kz.emb(word)
    # i = 0
    # while sum(embedding_vector) == 0 and i <= 1000:
    #     embedding_vector = k.emb(word)
    #     i++;
    # if i == 1000:
    #     print("fail")
    if embedding_vector is not None:
        if sum(embedding_vector) == 0:
            print("failed to find embedding for:" + word)
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
from embeddings import GloveEmbedding, FastTextEmbedding, KazumaCharEmbedding, ConcatEmbedding

g = GloveEmbedding('common_crawl_840', d_emb=300, show_progress=True)
f = FastTextEmbedding()
k = KazumaCharEmbedding()
c = ConcatEmbedding([g, f, k])
for w in ['canada', 'vancouver', 'toronto']:
    print('embedding {}'.format(w))
    print(g.emb(w))
    print(f.emb(w))
    print(k.emb(w))
    print(c.emb(w))
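# Hedged follow-up sketch: the vector sizes behind the example above, assuming the usual
# defaults of the embeddings package (GloVe common_crawl_840 = 300-d, FastText = 300-d,
# KazumaCharEmbedding = 100-d), so the ConcatEmbedding vector is 700-d.
for name, e in [('glove', g), ('fasttext', f), ('kazuma', k), ('concat', c)]:
    print(name, len(e.emb('canada')))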
class RuleEmbedding:
    def __init__(self, embedding_type, inputs):
        if embedding_type == "char":
            k = KazumaCharEmbedding()
            self.wordEmbed = k.emb
            self.sentenceEmbed = self.embed_sentence
            self.size = 100
        elif embedding_type == "glove":
            g = GloveEmbedding('common_crawl_840', d_emb=300, show_progress=True, default='zero')
            self.wordEmbed = g.emb
            self.sentenceEmbed = self.embed_sentence
            self.size = 300
        elif embedding_type == "concat":
            self.k = KazumaCharEmbedding()
            self.g = GloveEmbedding('common_crawl_840', d_emb=300, show_progress=True, default='zero')
            self.wordEmbed = self.concatEmbed
            self.sentenceEmbed = self.embed_sentence
            self.size = 400
        # elif embedding_type == "bert":
        #     try:
        #         bertEmbed = SentenceTransformer('./src/bert-base-nli-mean-tokens')
        #     except OSError as e:
        #         print(e)
        #         print("Could not find model in current directory: %s" % os.getcwd())
        #         exit(1)
        #     self.sentenceEmbed = bertEmbed.encode
        #     self.size = 768
        # elif embedding_type == "bert-stsb":
        #     try:
        #         bertEmbed = SentenceTransformer('./src/bert-base-nli-stsb-mean-tokens')
        #     except OSError as e:
        #         print(e)
        #         print("Could not find model in current directory: %s" % os.getcwd())
        #         exit(1)
        #     self.sentenceEmbed = bertEmbed.encode
        #     self.size = 768
        # elif embedding_type == "universal":
        #     try:
        #         univEmbed = hub.load("./src/universal-sentence-encoder_4")
        #     except OSError as e:
        #         print(e)
        #         print("Could not find model in current directory: %s" % os.getcwd())
        #         exit(1)
        #     self.sentenceEmbed = univEmbed
        #     self.size = 512
        else:
            print("Error: Embedding type \"%s\" not recognized" % embedding_type)
            print("Supported types: \"char\", \"glove\", \"concat\"")
            exit(1)

        self.type = embedding_type
        self.pdk = inputs['pdk']
        self.features = inputs['features']
        self.weights = inputs['weights']
        self.word_counts = inputs['word_counts']
        self.a = inputs['a']
        self.number_replacement = inputs['number_replacement']
        self.remove_pc = inputs['remove_pc']
        self.weigh_capitals = inputs['weigh_capitals']

    ###########################################################################
    # Concatenates char and glove embeddings
    ###########################################################################
    def concatEmbed(self, word):
        one = np.array(self.k.emb(word))
        two = np.array(self.g.emb(word))
        return np.append(one, two)

    ###########################################################################
    # embed_sentence():
    #   Returns a list of embeddings for the provided sentences.
    #   If self.word_counts is not None, computes a weighted average of the
    #   word embeddings, following the SIF weighting of Arora et al.:
    #   https://github.com/PrincetonML/SIF
    ###########################################################################
    def embed_sentence(self, text):
        embeddings = []
        N = len(text)
        for i in range(N):
            sentence = text[i]
            words = sentence.split(' ')
            num_words = len(words)
            total = np.zeros(self.size)
            for j in range(num_words):
                w = words[j].strip()
                # replace numbers with a placeholder token
                if self.number_replacement and w.replace('.', '', 1).isdigit():
                    w = self.number_replacement
                embed = np.array(self.wordEmbed(w))
                # add weight to words that are all caps
                if self.weigh_capitals and w.isalpha() and w.isupper():
                    embed = self.weigh_capitals * embed
                # weigh words based on inverse of probability
                if self.word_counts and w in self.word_counts.keys():
                    prob = self.word_counts[w] / self.word_counts['total-words']
                    weight = self.a / (self.a + prob)
                    embed = weight * embed
                total += embed
            result = total / num_words
            embeddings.append(result)
        return embeddings

    ###########################################################################
    # embed_key():
    #   Returns a matrix of sentence embeddings for the designated rule feature.
    #   This can be "rule", "description", layer, name, etc.
    #   Embedding type is set by self.type.
    ###########################################################################
    def embed_key(self, key):
        pdk = self.pdk
        N = len(pdk)
        sentences = []
        for i in range(N):
            # in case we embed a feature like name, which is not a list
            if isinstance(pdk[i][key], list):
                s = ' '.join(pdk[i][key])
            else:
                s = pdk[i][key]
            sentences.append(s)
        result = np.array(self.sentenceEmbed(sentences))
        return result

    ###########################################################################
    # embed_all():
    #   Compute rule embeddings as a weighted sum of the features.
    #   Weights are stored in self.weights and features in self.features.
    #   Removes the first principal component if self.remove_pc is True.
    ###########################################################################
    def embed_all(self):
        num_features = len(self.features)
        N = len(self.pdk)
        partial_embed = np.zeros((num_features, N, self.size))
        for i in range(num_features):
            result = self.embed_key(self.features[i])
            # remove first principal component
            if self.remove_pc:
                emb = remove_pc(result, 1)
                partial_embed[i] = emb
            else:
                partial_embed[i] = result
        # compute weighted sum of embeddings (f[1]*w[1] + f[2]*w[2])
        output = np.tensordot(partial_embed, self.weights, axes=(0, 0))
        return output
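# Hedged usage sketch (all values below are hypothetical; the real rule list comes from the
# read_csv/read_rul helpers used elsewhere in this repo): build 400-d concat (GloVe + Kazuma)
# rule embeddings as an unweighted average of two features, with SIF reweighting and
# principal-component removal disabled.
inputs = {
    'pdk': pdk_rules,                # list of dicts with 'rule', 'description', ... keys (assumed)
    'features': ['rule', 'description'],
    'weights': [0.5, 0.5],
    'word_counts': None,             # disable SIF-style reweighting
    'a': 1e-3,
    'number_replacement': 'NUM',
    'remove_pc': False,
    'weigh_capitals': None,
}
rule_embed = RuleEmbedding('concat', inputs)
vectors = rule_embed.embed_all()     # shape: (len(pdk_rules), 400)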