Example 1
import numpy as np
import torch
from embeddings import GloveEmbedding

# PAD is the padding symbol; its definition comes from elsewhere in the original project.


class Word2vecUtils():
    def __init__(self):
        super(Word2vecUtils, self).__init__()
        self.word_embed = GloveEmbedding('common_crawl_48', d_emb=300)
        self.initializer = lambda: np.random.normal(size=300).tolist()

    def load_embeddings(self, module, vocab, device='cpu'):
        """ Initialize the embedding with glove and char embedding
        """
        emb_size = module.weight.data.size(-1)
        assert emb_size == 300, 'Embedding size is not 300, cannot be initialized by GLOVE'
        outliers = 0
        for word in vocab.word2id:
            if word == PAD:  # PAD symbol is always 0-vector
                module.weight.data[vocab[PAD]] = torch.zeros(emb_size,
                                                             dtype=torch.float,
                                                             device=device)
                continue
            word_emb = self.word_embed.emb(word, default='none')
            if word_emb[0] is None:  # oov
                word_emb = self.initializer()
                outliers += 1
            module.weight.data[vocab[word]] = torch.tensor(word_emb,
                                                           dtype=torch.float,
                                                           device=device)
        return 1 - outliers / float(len(vocab))

    def emb(self, word):
        word_emb = self.word_embed.emb(word, default='none')
        if word_emb[0] is None:
            return None
        else:
            return word_emb
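A minimal usage sketch for the class above; the Vocab stub, the sample words, and the PAD value are illustrative assumptions (the original project supplies its own vocabulary object and PAD constant):

import torch.nn as nn

PAD = '<pad>'  # assumed value for illustration

class Vocab:
    """Tiny stand-in exposing the word2id mapping and item lookup that load_embeddings expects."""
    def __init__(self, words):
        self.word2id = {w: i for i, w in enumerate(words)}

    def __getitem__(self, word):
        return self.word2id[word]

    def __len__(self):
        return len(self.word2id)

vocab = Vocab([PAD, 'select', 'where', 'city'])
module = nn.Embedding(len(vocab), 300)
coverage = Word2vecUtils().load_embeddings(module, vocab, device='cpu')
print('GloVe coverage: %.1f%%' % (100 * coverage))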
Example 2
    def load_embedding(self):
        glove = GloveEmbedding()
        kazuma = KazumaCharEmbedding()
        embed = self.context_encoder.embedding.weight.data
        for word, idx in self.vocab.word2idx.items():
            embed[idx] = torch.tensor(
                glove.emb(word, default="zero") +
                kazuma.emb(word, default="zero"))
Example 3
from embeddings import GloveEmbedding


def init_word_embeddings(embed_file_name, word_set, edim):
    embeddings = {}

    tokens = embed_file_name.split('-')
    embedding = None

    if tokens[0] == 'glove':
        embedding = GloveEmbedding(tokens[1], d_emb=edim, show_progress=True)

    if embedding:
        for word in word_set:
            emb = embedding.emb(word)
            # emb() returns a vector of Nones for OOV words, so check the first entry
            if emb is not None and emb[0] is not None:
                embeddings[word] = emb
    return embeddings
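A hedged usage sketch for the helper above; the file-name convention ('glove-' followed by a corpus name that GloveEmbedding accepts) is inferred from the split on '-', and the word set is illustrative:

word_set = {'canada', 'vancouver', 'toronto'}
# 'glove-common_crawl_840' splits into ('glove', 'common_crawl_840'); the second
# token is passed to GloveEmbedding as the corpus name.
word_vectors = init_word_embeddings('glove-common_crawl_840', word_set, 300)
print('%d of %d words found in GloVe' % (len(word_vectors), len(word_set)))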
Example 4
def get_pretrained_embeddings(dataset, words, slots, intents):
    vocab = set(words + slots + intents)
    for symbol in [BOS, EOS, UNK, EQUAL]:
        vocab.add(symbol)

    # GK Embedding
    word_embed, char_embed = GloveEmbedding(
        default='zero'), KazumaCharEmbedding()
    embed_size = word_embed.d_emb + char_embed.d_emb
    progress = 0
    with open(EMBEDDING(dataset), 'w') as out_file:
        for word in vocab:
            progress += 1
            vector = word_embed.emb(word) + char_embed.emb(word)
            string = ' '.join([str(v) for v in vector])
            out_file.write(word + ' ' + string + '\n')
            if progress % 1000 == 0:
                print("Retrieved 400-dim GK embedding for the %d-th word ..." % progress)
    print('In total, processed %d words in %s' % (len(vocab), dataset))
Example 5
import numpy as np
from embeddings import GloveEmbedding
from sklearn.cluster import KMeans


def cluster_kmeans_glove(sorted_sent_ids, id_sentence_map, num_clusters):
    d_emb = 300
    embeddings = GloveEmbedding('common_crawl_840',
                                d_emb=d_emb,
                                show_progress=True)

    vecs = np.zeros(shape=(len(sorted_sent_ids), 300))
    for idx, sent_id in enumerate(sorted_sent_ids):
        for token, _ in id_sentence_map[sent_id]:
            vecs[idx] += np.array(embeddings.emb(token.lower(), "zero"))
        vecs[idx] /= len(id_sentence_map[sent_id])

    if num_clusters is None:
        num_clusters = max(len(id_sentence_map) // 25, 2)

    #clusterer = AgglomerativeClustering(n_clusters=num_clusters)

    #clustering = clusterer.fit_predict(vecs)
    clustering = KMeans(n_clusters=num_clusters).fit_predict(vecs)

    return clustering
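An illustrative call, assuming the data shapes the loop above implies: id_sentence_map maps a sentence id to a list of (token, tag) pairs, and sorted_sent_ids fixes the row order of the result:

id_sentence_map = {
    0: [('the', 'DT'), ('match', 'NN'), ('ended', 'VBD')],
    1: [('stocks', 'NNS'), ('fell', 'VBD'), ('sharply', 'RB')],
    2: [('the', 'DT'), ('game', 'NN'), ('was', 'VBD'), ('tied', 'VBN')],
    3: [('markets', 'NNS'), ('rallied', 'VBD')],
}
cluster_ids = cluster_kmeans_glove(sorted(id_sentence_map), id_sentence_map, num_clusters=2)
print(cluster_ids)  # one cluster id per sentence, in the order of sorted_sent_ids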
Example 6
import pickle

import numpy as np
from embeddings import GloveEmbedding


def gen_slot_embed_for_each_dom_from_glove(dom2slots, slot2desc, save_file):
    ## 1. generate slot2embs
    slots = list(sorted(slot2desc.keys()))
    desps = [slot2desc[k] for k in slots]
    word2emb = {}
    # collect words
    for des in desps:
        splits = des.split()
        for word in splits:
            if word not in word2emb:
                word2emb[word] = []
    
    # load embeddings
    glove_emb = GloveEmbedding()

    # calculate slot embs
    slot2embs = {}
    for i, slot in enumerate(slots):
        word_list = slot2desc[slot].split()
        embs = np.zeros(300)
        for word in word_list:
            embs = embs + glove_emb.emb(word, default='zero')
        slot2embs[slot] = embs

    ## 2. generate slot2embs based on each domain
    slot_embs_based_on_each_domain = {}
    for domain, slot_names in dom2slots.items():
        slot_embs = np.zeros((len(slot_names), 300))
        for i, slot in enumerate(slot_names):
            embs = slot2embs[slot]
            slot_embs[i] = embs
        slot_embs_based_on_each_domain[domain] = slot_embs
    
    with open(save_file, "wb") as f:
        pickle.dump(slot_embs_based_on_each_domain, f)
    return slot2embs
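An illustrative call for the function above; the domains, slots, descriptions, and pickle file name are assumptions:

dom2slots = {'hotel': ['area', 'stars'], 'train': ['departure']}
slot2desc = {'area': 'area of the hotel', 'stars': 'star rating of the hotel',
             'departure': 'departure station of the train'}
slot2embs = gen_slot_embed_for_each_dom_from_glove(dom2slots, slot2desc, 'slot_embs.pkl')
print(slot2embs['area'].shape)  # (300,): summed GloVe vectors of the description words

with open('slot_embs.pkl', 'rb') as f:
    per_domain = pickle.load(f)
print(per_domain['hotel'].shape)  # (2, 300): one row per slot in the domain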
Example 7
from embeddings import GloveEmbedding, FastTextEmbedding, KazumaCharEmbedding, ConcatEmbedding
import numpy as np

from read_rules import read_rul, read_csv

g = GloveEmbedding('common_crawl_840',
                   d_emb=300,
                   show_progress=True,
                   default='zero')
k = KazumaCharEmbedding()
c = ConcatEmbedding([g, k])

for w in ['metal1', 'metal', 'M', 'EXT', '<', '0.035', 'MSMG2', 'MSMG', 'A']:
    word = np.array(g.emb(w))
    word1 = np.array(k.emb(w))
    if not word.any():  # with default='zero', OOV words come back as all-zero GloVe vectors
        print(w, ":\tbad embedding")
    else:
        print(w, ":\tgood embedding")
    out = np.append(word1, word)
    print(out.shape)

diff1 = np.array(k.emb('metal1')) - np.array(k.emb('METAL1'))
diff2 = np.array(k.emb('metal1')) - np.array(k.emb('layer'))

# print(np.abs(np.mean(diff1)))
# print(np.abs(np.mean(diff2)))

pdk15_csv = read_csv("calibreDRC_15.csv")
pdk45_csv = read_csv("calibreDRC_45.csv")
Example 8
dev = TabularDataset(path=args.dev_data,
                     format='tsv',
                     fields=[('src', srcF), ('tgt', tgtF), ('tgt_be', tgt_beF),
                             ('dis', disF), ('label', labelF)])

tgt_beF.build_vocab(all_data, min_freq=1)
disF.build_vocab(all_data, min_freq=1)
srcF.build_vocab(all_data, min_freq=1)
vocab = srcF.vocab
tgtF.vocab = vocab
args.vocab_size = len(vocab)

g = GloveEmbedding('common_crawl_840', d_emb=300)
embedding = []
for i in range(len(vocab)):
    vec = g.emb(vocab.itos[i])
    if vec[0] is None:  # OOV word: fall back to a random vector
        embedding.append(np.random.uniform(-0.25, 0.25, size=(1, 300))[0])
    else:
        embedding.append(np.array(vec))
embedding = np.array(embedding, dtype=np.float32)
args.pre_embedding = True
args.embedding = embedding
args.update_embedding = False

print('build batch iterator...')
train_batch_iterator = BucketIterator(dataset=train,
                                      batch_size=args.batch_size,
                                      sort=False,
                                      sort_within_batch=True,
                                      sort_key=lambda x: len(x.src),
                                      repeat=False)
Example 9
                                                 "").replace("'",
                                                             "").split(", "):
        tokens_as_string = tokens_as_string + tok + " "
    tweets[idx] = tokens_as_string
    idx = idx + 1
fp.close()
"""
    Feature Engineering
        - add word embeddings of all words in a sentence -> 50 featured vector
"""
samples_list = list()
labels_list = list()
for idx, tweet in tweets.items():
    sample = np.zeros([50, 1], dtype=np.float32)
    for tok in tweet.split(" "):
        embd = glove.emb(tok)
        if None in embd:
            embd = np.zeros([50, 1], dtype=np.float32)
        else:
            embd = np.asarray(embd)
            embd = embd.reshape([50, 1])
        sample = sample + embd
    x = [[idx]]
    x.extend(sample.tolist())
    samples_list.append(x)
    labels_list.append(labels[idx])

data_set = [np.asarray(samples_list).squeeze(), np.asarray(labels_list)]

print("No of samples X No of features:", data_set[0].shape)
print("No of samples X 1:", data_set[1].shape)
Example 10
from embeddings import GloveEmbedding, FastTextEmbedding, KazumaCharEmbedding, ConcatEmbedding

g = GloveEmbedding('common_crawl_840', d_emb=300, show_progress=True)
f = FastTextEmbedding()
k = KazumaCharEmbedding()
c = ConcatEmbedding([g, f, k])
for w in ['canada', 'vancouver', 'toronto']:
    print('embedding {}'.format(w))
    print(g.emb(w))
    print(f.emb(w))
    print(k.emb(w))
    print(c.emb(w))
Example 11
import numpy as np
from embeddings import GloveEmbedding, KazumaCharEmbedding

# remove_pc (SIF principal-component removal) is provided elsewhere in the original project.


class RuleEmbedding:
    def __init__(self, embedding_type, inputs):
        if embedding_type == "char":
            k = KazumaCharEmbedding()
            self.wordEmbed = k.emb
            self.sentenceEmbed = self.embed_sentence
            self.size = 100
        elif embedding_type == "glove":
            g = GloveEmbedding('common_crawl_840',
                               d_emb=300,
                               show_progress=True,
                               default='zero')
            self.wordEmbed = g.emb
            self.sentenceEmbed = self.embed_sentence
            self.size = 300
        elif embedding_type == "concat":
            self.k = KazumaCharEmbedding()
            self.g = GloveEmbedding('common_crawl_840',
                                    d_emb=300,
                                    show_progress=True,
                                    default='zero')
            self.wordEmbed = self.concatEmbed
            self.sentenceEmbed = self.embed_sentence
            self.size = 400
        # elif embedding_type == "bert":
        #     try:
        #         bertEmbed = SentenceTransformer('./src/bert-base-nli-mean-tokens')
        #     except OSError as e:
        #         print(e)
        #         print("Could not find model in current directory: %s" % os.getcwd())
        #         exit(1)
        #     self.sentenceEmbed = bertEmbed.encode
        #     self.size = 768

        # elif embedding_type == "bert-stsb":
        #     try:
        #         bertEmbed = SentenceTransformer('./src/bert-base-nli-stsb-mean-tokens')
        #     except OSError as e:
        #         print(e)
        #         print("Could not find model in current directory: %s" % os.getcwd())
        #         exit(1)
        #     self.sentenceEmbed = bertEmbed.encode
        #     self.size = 768

        # elif embedding_type == "universal":
        #     try:
        #         univEmbed = hub.load("./src/universal-sentence-encoder_4")
        #     except OSError as e:
        #         print(e)
        #         print("Could not find model in current directory: %s" % os.getcwd())
        #         exit(1)
        #     self.sentenceEmbed = univEmbed
        #     self.size = 512

        else:
            print("Error: Embedding type \"%s\" not recognized" %
                  embedding_type)
            print("Supported types: \"char\", \"glove\", \"concat\"")
            exit(1)

        self.type = embedding_type
        self.pdk = inputs['pdk']
        self.features = inputs['features']
        self.weights = inputs['weights']
        self.word_counts = inputs['word_counts']
        self.a = inputs['a']
        self.number_replacement = inputs['number_replacement']
        self.remove_pc = inputs['remove_pc']
        self.weigh_capitals = inputs['weigh_capitals']

    ###############################################################################
    # Concatenates char and glove embeddings
    ###############################################################################
    def concatEmbed(self, word):
        one = np.array(self.k.emb(word))
        two = np.array(self.g.emb(word))
        return np.append(one, two)

    ###############################################################################
    # embed_sentence():
    # Returns list of embeddings for the provided sentences
    # If self.word_counts != None, computes a weighted average of the word embeddings
    # Weighted average based on paper by Arora et al. https://github.com/PrincetonML/SIF
    ###############################################################################
    def embed_sentence(self, text):
        embeddings = []
        N = len(text)
        for i in range(N):
            sentence = text[i]
            words = sentence.split(' ')
            num_words = len(words)
            total = np.zeros(self.size)

            for i in range(num_words):
                w = words[i].strip()

                # remove numbers
                if self.number_replacement and w.replace('.', '', 1).isdigit():
                    w = self.number_replacement

                embed = np.array(self.wordEmbed(w))

                # add weight to words that are all caps
                if self.weigh_capitals and w.isalpha() and w.isupper():
                    embed = self.weigh_capitals * embed

                # weigh words based on inverse of probability
                if self.word_counts and w in self.word_counts.keys():
                    prob = self.word_counts[w] / self.word_counts['total-words']
                    weight = self.a / (self.a + prob)
                    embed = weight * embed

                total += embed

            result = total / num_words
            embeddings.append(result)
        return embeddings

    ###############################################################################
    # embed_key():
    # Returns a matrix of sentence embeddings for the designated rule feature.
    # This can be "rule", "description", layer, name, etc.
    # Embedding type is set by self.embedding_type
    ###############################################################################
    def embed_key(self, key):
        pdk = self.pdk
        N = len(pdk)

        sentences = []
        for i in range(N):
            # in case we embed a feature like name, which is not a list
            if isinstance(pdk[i][key], list):
                s = ' '.join(pdk[i][key])
            else:
                s = pdk[i][key]
            sentences.append(s)

        result = np.array(self.sentenceEmbed(sentences))
        return result

    ###############################################################################
    # embed_all():
    # Compute rule embeddings using a weighted sum of the features.
    # Weights are stored in self.weights and features are stored in self.features.
    # Remove the first principal component if self.remove_pc is set.
    ###############################################################################
    def embed_all(self):
        num_features = len(self.features)
        N = len(self.pdk)

        partial_embed = np.zeros((num_features, N, self.size))

        for i in range(num_features):
            result = self.embed_key(self.features[i])

            # remove the first principal component
            if self.remove_pc:
                emb = remove_pc(result, 1)
                partial_embed[i] = emb
            else:
                partial_embed[i] = result

        # compute the weighted sum of embeddings (f[1]*w[1] + f[2]*w[2])
        output = np.tensordot(partial_embed, self.weights, axes=(0, 0))
        return output
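A hedged construction sketch for the class above; every value in the inputs dict is illustrative, and remove_pc is left off so the SIF principal-component step (provided elsewhere in the original project) is not required:

rules = [
    {'name': 'M1.S.1', 'rule': 'metal1 space < 0.035',
     'description': ['minimum', 'metal1', 'spacing']},
    {'name': 'M1.W.1', 'rule': 'metal1 width < 0.030',
     'description': ['minimum', 'metal1', 'width']},
]
inputs = {
    'pdk': rules,
    'features': ['rule', 'description'],
    'weights': np.array([0.6, 0.4]),       # one weight per feature
    'word_counts': None,                   # disable the SIF-style reweighting
    'a': 1e-3,
    'number_replacement': 'NUM',           # numeric tokens are replaced by this word
    'remove_pc': False,                    # skip principal-component removal
    'weigh_capitals': None,                # no extra weight for all-caps tokens
}
rule_embedder = RuleEmbedding('char', inputs)
rule_vectors = rule_embedder.embed_all()
print(rule_vectors.shape)  # (number of rules, embedding size)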
Example 12
import sys
from embeddings import GloveEmbedding

if len(sys.argv) < 3:
    print("please provide embeddings and POS CoNLL file")
    exit(1)

embs = GloveEmbedding(sys.argv[1], default="random")

unk = "<UNK>"

outFile = open(sys.argv[2] + ".glove", "w")
curSent = ""
for line in open(sys.argv[2]):
    if len(line) < 2:
        outFile.write(curSent + "\n")
        curSent = ""
    else:
        tok = line.strip().split("\t")
        emb = embs.emb(tok[0])

        embStr = "emb=" + ",".join([str(x) for x in emb])
        curSent += "\t".join(tok + [embStr]) + "\n"

if curSent:  # flush the last sentence if the file does not end with a blank line
    outFile.write(curSent + "\n")
outFile.close()