def get_embedding_layer(tokenizer):
    word_index = tokenizer.word_index
    num_words = len(word_index) + 1
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
    print('about to get kz')
    kz = KazumaCharEmbedding()
    print('got kz')
    for word, i in word_index.items():
        if i >= MAX_NB_WORDS:
            continue
        embedding_vector = kz.emb(word)
        if embedding_vector is not None:
            if sum(embedding_vector) == 0:
                print("failed to find embedding for:" + word)
            # words not found in the embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
    print("Number of words:" + str(num_words))

    embedding_layer = Embedding(num_words,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)
    return embedding_layer
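
A minimal usage sketch for the helper above; the Keras import path and the MAX_NB_WORDS / MAX_SEQUENCE_LENGTH values are assumptions, while EMBEDDING_DIM = 100 matches the size of a KazumaCharEmbedding vector:

import numpy as np
from embeddings import KazumaCharEmbedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding

EMBEDDING_DIM = 100          # KazumaCharEmbedding vectors are 100-dimensional
MAX_NB_WORDS = 20000         # assumed vocabulary cap
MAX_SEQUENCE_LENGTH = 100    # assumed padded sequence length

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(["check metal1 spacing", "minimum width of metal2"])
embedding_layer = get_embedding_layer(tokenizer)  # frozen layer initialised with char embeddings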
Example #2
    def load_embedding(self):
        glove = GloveEmbedding()
        kazuma = KazumaCharEmbedding()
        embed = self.context_encoder.embedding.weight.data
        for word, idx in self.vocab.word2idx.items():
            embed[idx] = torch.tensor(
                glove.emb(word, default="zero") +
                kazuma.emb(word, default="zero"))

    def get_embeddings(self):
        num_words = len(self.word2idx)
        embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
        print('about to get kz')
        kz = KazumaCharEmbedding()
        print('got kz')

        for word, i in self.word2idx.items():
            if i >= MAX_NB_WORDS:
                continue
            embedding_vector = kz.emb(word)
            if embedding_vector is not None:
                if sum(embedding_vector) == 0:
                    print("failed to find embedding for:" + word)
                # words not found in embedding index will be all-zeros.
                embedding_matrix[i] = embedding_vector
        self.idx_to_embedding = embedding_matrix
Example #4
    def process_raw_dataset(
            self,
            models_path="models",
            train_path: Text = None,
            dev_path: Text = None,
            test_path: Text = None,
    ):
        """
        data path

        Args:
            train_path:
            dev_path:
            test_path:
            models_path:
        Returns:

        """
        if not os.path.isdir(models_path):
            os.makedirs(models_path)
        splits_path = {}

        if train_path:
            splits_path.update({'train': train_path})
        if dev_path:
            splits_path.update({'dev': dev_path})
        if test_path:
            splits_path.update({'test': test_path})

        for name, path in splits_path.items():
            self.dataset[name] = Dataset.annotate_raw(path)
            self.dataset[name].numericalize_(self.vocab)
            self.ontology += self.dataset[name].extract_ontology()

            ann_path = path[:-5] + "_ann.json"
            with open(ann_path, 'wt') as f:
                json.dump(self.dataset[name].to_dict(), f, indent=4)

        self.ontology.numericalize_(self.vocab)
        with open(os.path.join(models_path, 'ontology.json'), 'wt') as f:
            json.dump(self.ontology.to_dict(), f, indent=4)
        with open(os.path.join(models_path, 'vocab.json'), 'wt') as f:
            json.dump(self.vocab.to_dict(), f, indent=4)

        # Generate embedding file
        embeddings = [GloveEmbedding(), KazumaCharEmbedding()]
        E = []
        for w in tqdm(self.vocab._index2word):
            e = []
            for emb in embeddings:
                e += emb.emb(w, default='zero')
            E.append(e)
        self.embeddings = E
        with open(os.path.join(models_path, 'emb.json'), 'wt') as f:
            json.dump(E, f)
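
A hedged sketch of how the dumped emb.json could be read back; the 400-dim layout (300 GloVe + 100 Kazuma per word) follows from the two embedders concatenated above, and the torch usage is just one option:

import json
import torch

with open('models/emb.json') as f:
    E = torch.tensor(json.load(f))  # (vocab_size, 400): 300 GloVe + 100 Kazuma
embedding = torch.nn.Embedding.from_pretrained(E, freeze=True)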
Example #5
def dump_pretrained_emb(word2index, index2word, dump_path):
    print("Dumping pretrained embeddings...")
    embeddings = [GloveEmbedding(), KazumaCharEmbedding()]
    E = []
    for i in tqdm(range(len(word2index.keys()))):
        w = index2word[i]
        e = []
        for emb in embeddings:
            e += emb.emb(w, default='zero')
        E.append(e)
    with open(dump_path, 'wt') as f:
        json.dump(E, f)
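
A minimal call sketch for dump_pretrained_emb; the two dictionaries are toy stand-ins for whatever vocabulary the surrounding project builds:

word2index = {'hello': 0, 'world': 1}
index2word = {0: 'hello', 1: 'world'}
dump_pretrained_emb(word2index, index2word, 'emb_dump.json')  # writes a 2 x 400 list of lists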
Example #6
def dump_pretrained_emb(word2index, index2word, dump_path):
    print("Dumping pretrained embeddings...")
    os.environ["HOME"] = "D:/ANAHOME"  # add HOME directory temporarily
    embeddings = [GloveEmbedding(), KazumaCharEmbedding()]
    E = []
    for i in tqdm(range(len(word2index.keys()))):
        w = index2word[i]
        e = []
        for emb in embeddings:
            e += emb.emb(w, default='zero')
        E.append(e)
    with open(dump_path, 'wt') as f:
        json.dump(E, f)
def dump_pretrained_emb(word2index, index2word, dump_path):
    print("Dumping pretrained embeddings...")
    # import ssl
    # ssl._create_default_https_context = ssl._create_unverified_context
    embeddings = [GloveEmbedding(), KazumaCharEmbedding()]
    E = []
    for i in range(len(word2index.keys())):
        w = index2word[i]
        e = []
        for emb in embeddings:
            e += emb.emb(w, default='zero')
        E.append(e)
    with open(dump_path, 'wt') as f:
        json.dump(E, f)
def dump_pretrained_emb_new(tokenizer, dump_path):
    print("Dumping pretrained embeddings...")
    embeddings = [GloveEmbedding(), KazumaCharEmbedding()]
    E = []
    word_list = []
    for w, i in sorted(tokenizer.vocab.items(), key=lambda i: i[1]):
        word_list.append(w)
    for i in tqdm(range(len(word_list))):
        w = word_list[i]
        e = []
        for emb in embeddings:
            e += emb.emb(w, default='zero')
        E.append(e)
    with open(dump_path, 'wt') as f:
        json.dump(E, f)
Example #9
def get_pretrained_embeddings(dataset, words, slots, intents):
    vocab = set(words + slots + intents)
    for symbol in [BOS, EOS, UNK, EQUAL]:
        vocab.add(symbol)

    # GK Embedding
    word_embed, char_embed = GloveEmbedding(
        default='zero'), KazumaCharEmbedding()
    embed_size = word_embed.d_emb + char_embed.d_emb
    progress = 0
    with open(EMBEDDING(dataset), 'w') as out_file:
        for word in vocab:
            progress += 1
            vector = word_embed.emb(word) + char_embed.emb(word)
            string = ' '.join([str(v) for v in vector])
            out_file.write(word + ' ' + string + '\n')
            if progress % 1000 == 0:
                print("Retrieve 400-dim GK Embedding for the", progress,
                      "-th word ...")
    print('In total, process %d words in %s' % (len(vocab), dataset))
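
A sketch of reading the file that get_pretrained_embeddings writes, assuming EMBEDDING(dataset) resolves to a plain-text path with one "word v1 ... v400" entry per line:

def load_gk_embeddings(path):
    # parse the word2vec-style text file written above into a dict: word -> 400-dim vector
    vectors = {}
    with open(path) as f:
        for line in f:
            parts = line.rstrip('\n').split(' ')
            vectors[parts[0]] = [float(v) for v in parts[1:]]
    return vectors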
Example #10
from embeddings import GloveEmbedding, FastTextEmbedding, KazumaCharEmbedding, ConcatEmbedding
import numpy as np

from read_rules import read_rul, read_csv

g = GloveEmbedding('common_crawl_840',
                   d_emb=300,
                   show_progress=True,
                   default='zero')
k = KazumaCharEmbedding()
c = ConcatEmbedding([g, k])

for w in ['metal1', 'metal', 'M', 'EXT', '<', '0.035', 'MSMG2', 'MSMG', 'A']:
    word = np.array(g.emb(w))
    word1 = np.array(k.emb(w))
    # with default='zero', out-of-vocabulary words come back as all-zero vectors
    if not np.any(word):
        print(w, ":\tbad embedding")
    else:
        print(w, ":\tgood embedding")
    out = np.append(word1, word)
    print(out.shape)

diff1 = np.array(k.emb('metal1')) - np.array(k.emb('METAL1'))
diff2 = np.array(k.emb('metal1')) - np.array(k.emb('layer'))

# print(np.abs(np.mean(diff1)))
# print(np.abs(np.mean(diff2)))

pdk15_csv = read_csv("calibreDRC_15.csv")
pdk45_csv = read_csv("calibreDRC_45.csv")

        if not os.path.isdir(dann):
            os.makedirs(dann)
        dataset = {}
        ontology = Ontology()
        vocab = Vocab()
        vocab.word2index(['<sos>', '<eos>'], train=True)
        for s in splits:
            fname = '{}.json'.format(s)
            logging.warn('Annotating {}'.format(s))
            dataset[s] = Dataset.annotate_raw(os.path.join(draw, fname))
            dataset[s].numericalize_(vocab)
            ontology = ontology + dataset[s].extract_ontology()
            with open(os.path.join(dann, fname), 'wt') as f:
                json.dump(dataset[s].to_dict(), f)
        ontology.numericalize_(vocab)
        with open(os.path.join(dann, 'ontology.json'), 'wt') as f:
            json.dump(ontology.to_dict(), f)
        with open(os.path.join(dann, 'vocab.json'), 'wt') as f:
            json.dump(vocab.to_dict(), f)

        logging.warn('Computing word embeddings')
        embeddings = [GloveEmbedding(), KazumaCharEmbedding()]
        E = []
        for w in tqdm(vocab._index2word):
            e = []
            for emb in embeddings:
                e += emb.emb(w, default='zero')
            E.append(e)
        with open(os.path.join(dann, 'emb.json'), 'wt') as f:
            json.dump(E, f)
texts1 = texts1[indices]
texts2 = texts2[indices]
texts3 = texts3[indices]

print('Preparing embedding matrix.')

# prepare embedding matrix
# num_words = min(MAX_NB_WORDS, len(word_index))
num_words = len(word_index) + 1                     # word_index is indexed from 1-N
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
kz = KazumaCharEmbedding()

for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = kz.emb(word)

    # i = 0
    # while sum(embedding_vector) == 0 and i <= 1000:
    #     embedding_vector = k.emb(word)
    #     i++;
    #     if i == 1000:
    #         print("fail")
    if embedding_vector is not None:
        if sum(embedding_vector) == 0:
            print("failed to find embedding for:" + word)
        # words not found in the embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
Example #13
from embeddings import GloveEmbedding, FastTextEmbedding, KazumaCharEmbedding, ConcatEmbedding

g = GloveEmbedding('common_crawl_840', d_emb=300, show_progress=True)
f = FastTextEmbedding()
k = KazumaCharEmbedding()
c = ConcatEmbedding([g, f, k])
for w in ['canada', 'vancouver', 'toronto']:
    print('embedding {}'.format(w))
    print(g.emb(w))
    print(f.emb(w))
    print(k.emb(w))
    print(c.emb(w))
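
A small follow-up check on the concatenation sizes (GloVe common_crawl_840 is 300-dimensional and Kazuma is 100-dimensional; the fastText width is left unasserted because it depends on the downloaded model):

w = 'canada'
print(len(g.emb(w)))  # 300
print(len(k.emb(w)))  # 100
print(len(c.emb(w)))  # 300 + len(f.emb(w)) + 100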
Example #14
    def __init__(self, embedding_type, inputs):
        if embedding_type == "char":
            k = KazumaCharEmbedding()
            self.wordEmbed = k.emb
            self.sentenceEmbed = self.embed_sentence
            self.size = 100
        elif embedding_type == "glove":
            g = GloveEmbedding('common_crawl_840',
                               d_emb=300,
                               show_progress=True,
                               default='zero')
            self.wordEmbed = g.emb
            self.sentenceEmbed = self.embed_sentence
            self.size = 300
        elif embedding_type == "concat":
            self.k = KazumaCharEmbedding()
            self.g = GloveEmbedding('common_crawl_840',
                                    d_emb=300,
                                    show_progress=True,
                                    default='zero')
            self.wordEmbed = self.concatEmbed
            self.sentenceEmbed = self.embed_sentence
            self.size = 400
        # elif embedding_type == "bert":
        #     try:
        #         bertEmbed = SentenceTransformer('./src/bert-base-nli-mean-tokens')
        #     except OSError as e:
        #         print(e)
        #         print("Could not find model in current directory: %s" % os.getcwd())
        #         exit(1)
        #     self.sentenceEmbed = bertEmbed.encode
        #     self.size = 768

        # elif embedding_type == "bert-stsb":
        #     try:
        #         bertEmbed = SentenceTransformer('./src/bert-base-nli-stsb-mean-tokens')
        #     except OSError as e:
        #         print(e)
        #         print("Could not find model in current directory: %s" % os.getcwd())
        #         exit(1)
        #     self.sentenceEmbed = bertEmbed.encode
        #     self.size = 768

        # elif embedding_type == "universal":
        #     try:
        #         univEmbed = hub.load("./src/universal-sentence-encoder_4")
        #     except OSError as e:
        #         print(e)
        #         print("Could not find model in current directory: %s" % os.getcwd())
        #         exit(1)
        #     self.sentenceEmbed = univEmbed
        #     self.size = 512

        else:
            print("Error: Embedding type \"%s\" not recognized" %
                  embedding_type)
            print(
                "Supported types: \"char\", \"glove\", \"concat\""
            )
            exit(1)

        self.type = embedding_type
        self.pdk = inputs['pdk']
        self.features = inputs['features']
        self.weights = inputs['weights']
        self.word_counts = inputs['word_counts']
        self.a = inputs['a']
        self.number_replacement = inputs['number_replacement']
        self.remove_pc = inputs['remove_pc']
        self.weigh_capitals = inputs['weigh_capitals']
Example #15
class RuleEmbedding:
    def __init__(self, embedding_type, inputs):
        if embedding_type == "char":
            k = KazumaCharEmbedding()
            self.wordEmbed = k.emb
            self.sentenceEmbed = self.embed_sentence
            self.size = 100
        elif embedding_type == "glove":
            g = GloveEmbedding('common_crawl_840',
                               d_emb=300,
                               show_progress=True,
                               default='zero')
            self.wordEmbed = g.emb
            self.sentenceEmbed = self.embed_sentence
            self.size = 300
        elif embedding_type == "concat":
            self.k = KazumaCharEmbedding()
            self.g = GloveEmbedding('common_crawl_840',
                                    d_emb=300,
                                    show_progress=True,
                                    default='zero')
            self.wordEmbed = self.concatEmbed
            self.sentenceEmbed = self.embed_sentence
            self.size = 400
        # elif embedding_type == "bert":
        #     try:
        #         bertEmbed = SentenceTransformer('./src/bert-base-nli-mean-tokens')
        #     except OSError as e:
        #         print(e)
        #         print("Could not find model in current directory: %s" % os.getcwd())
        #         exit(1)
        #     self.sentenceEmbed = bertEmbed.encode
        #     self.size = 768

        # elif embedding_type == "bert-stsb":
        #     try:
        #         bertEmbed = SentenceTransformer('./src/bert-base-nli-stsb-mean-tokens')
        #     except OSError as e:
        #         print(e)
        #         print("Could not find model in current directory: %s" % os.getcwd())
        #         exit(1)
        #     self.sentenceEmbed = bertEmbed.encode
        #     self.size = 768

        # elif embedding_type == "universal":
        #     try:
        #         univEmbed = hub.load("./src/universal-sentence-encoder_4")
        #     except OSError as e:
        #         print(e)
        #         print("Could not find model in current directory: %s" % os.getcwd())
        #         exit(1)
        #     self.sentenceEmbed = univEmbed
        #     self.size = 512

        else:
            print("Error: Embedding type \"%s\" not recognized" %
                  embedding_type)
            print(
                "Supported types: \"char\", \"glove\", \"concat\""
            )
            exit(1)

        self.type = embedding_type
        self.pdk = inputs['pdk']
        self.features = inputs['features']
        self.weights = inputs['weights']
        self.word_counts = inputs['word_counts']
        self.a = inputs['a']
        self.number_replacement = inputs['number_replacement']
        self.remove_pc = inputs['remove_pc']
        self.weigh_capitals = inputs['weigh_capitals']

    ###############################################################################
    # Concatenates char and glove embeddings
    ###############################################################################
    def concatEmbed(self, word):
        one = np.array(self.k.emb(word))
        two = np.array(self.g.emb(word))
        return np.append(one, two)

    ###############################################################################
    # embed_sentence():
    # Returns a list of embeddings, one per sentence in the provided text.
    # If self.word_counts is not None, computes a weighted average of the word
    # embeddings, following the SIF weighting of Arora et al.:
    # https://github.com/PrincetonML/SIF
    ###############################################################################
    def embed_sentence(self, text):
        embeddings = []
        N = len(text)
        for i in range(N):
            sentence = text[i]
            words = sentence.split(' ')
            num_words = len(words)
            total = np.zeros(self.size)

            for j in range(num_words):
                w = words[j].strip()

                # replace bare numbers with the configured placeholder token
                if self.number_replacement and w.replace('.', '', 1).isdigit():
                    w = self.number_replacement

                embed = np.array(self.wordEmbed(w))

                # up-weight words that are all caps
                if self.weigh_capitals and w.isalpha() and w.isupper():
                    embed = self.weigh_capitals * embed

                # weigh words by the inverse of their corpus probability (SIF)
                if self.word_counts and w in self.word_counts.keys():
                    prob = self.word_counts[w] / self.word_counts['total-words']
                    weight = self.a / (self.a + prob)
                    embed = weight * embed

                total += embed

            result = total / num_words
            embeddings.append(result)
        return embeddings

    ###############################################################################
    # embed_key():
    # Returns a matrix of sentence embeddings for the designated rule feature.
    # This can be "rule", "description", "layer", "name", etc.
    # The embedding type is the one selected at construction (self.type).
    ###############################################################################
    def embed_key(self, key):
        pdk = self.pdk
        N = len(pdk)

        sentences = []
        for i in range(N):
            # in case we embed a feature like name, which is not a list
            if isinstance(pdk[i][key], list):
                s = ' '.join(pdk[i][key])
            else:
                s = pdk[i][key]
            sentences.append(s)

        result = np.array(self.sentenceEmbed(sentences))
        return result

    ###############################################################################
    # embed_all():
    # Compute rule embeddings as a weighted sum of the feature embeddings.
    # Weights are stored in self.weights and features in self.features.
    # Removes the first principal component if self.remove_pc is truthy.
    ###############################################################################
    def embed_all(self):
        num_features = len(self.features)
        N = len(self.pdk)

        partial_embed = np.zeros((num_features, N, self.size))

        for i in range(num_features):
            result = self.embed_key(self.features[i])

            # remove the first principal component
            if self.remove_pc:
                emb = remove_pc(result, 1)
                partial_embed[i] = emb
            else:
                partial_embed[i] = result

        # compute the weighted sum of the feature embeddings (f[1]*w[1] + f[2]*w[2] + ...)
        output = np.tensordot(partial_embed, self.weights, axes=(0, 0))
        return output
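
A hedged usage sketch for RuleEmbedding; the inputs keys mirror what __init__ reads, and the single pdk entry is a made-up rule dict with 'rule' and 'description' fields:

inputs = {
    'pdk': [{'rule': ['metal1 width >= 0.065'],
             'description': ['Minimum width of a metal1 shape']}],
    'features': ['rule', 'description'],
    'weights': [0.5, 0.5],
    'word_counts': None,           # no SIF weighting
    'a': 1e-3,
    'number_replacement': 'NUM',   # bare numbers become this token
    'remove_pc': False,            # skip principal-component removal
    'weigh_capitals': 1.5,
}
rules = RuleEmbedding('char', inputs)
vectors = rules.embed_all()        # shape: (1, 100) with the char embedding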