Example #1
class Word2vecUtils():
    def __init__(self):
        super(Word2vecUtils, self).__init__()
        self.word_embed = GloveEmbedding('common_crawl_48', d_emb=300)
        self.initializer = lambda: np.random.normal(size=300).tolist()

    def load_embeddings(self, module, vocab, device='cpu'):
        """ Initialize the embedding with glove and char embedding
        """
        emb_size = module.weight.data.size(-1)
        assert emb_size == 300, 'Embedding size is not 300, cannot be initialized by GLOVE'
        outliers = 0
        for word in vocab.word2id:
            if word == PAD:  # PAD symbol is always 0-vector
                module.weight.data[vocab[PAD]] = torch.zeros(emb_size,
                                                             dtype=torch.float,
                                                             device=device)
                continue
            word_emb = self.word_embed.emb(word, default='none')
            if word_emb[0] is None:  # oov
                word_emb = self.initializer()
                outliers += 1
            module.weight.data[vocab[word]] = torch.tensor(word_emb,
                                                           dtype=torch.float,
                                                           device=device)
        return 1 - outliers / float(len(vocab))

    def emb(self, word):
        word_emb = self.word_embed.emb(word, default='none')
        if word_emb[0] is None:
            return None
        else:
            return word_emb
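A minimal usage sketch for the class above; the vocab object (exposing word2id and integer lookup, as load_embeddings assumes) and the nn.Embedding module are illustrative, not part of the original snippet:

import torch.nn as nn

utils = Word2vecUtils()
embedding_layer = nn.Embedding(num_embeddings=len(vocab), embedding_dim=300)  # `vocab` is a hypothetical Vocab instance
coverage = utils.load_embeddings(embedding_layer, vocab, device='cpu')
print('GloVe coverage: {:.2%}'.format(coverage))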
Example #2
    def load_embedding(self):
        glove = GloveEmbedding()
        kazuma = KazumaCharEmbedding()
        embed = self.context_encoder.embedding.weight.data
        for word, idx in self.vocab.word2idx.items():
            embed[idx] = torch.tensor(
                glove.emb(word, default="zero") +
                kazuma.emb(word, default="zero"))
Example #3
def init_word_embeddings(embed_file_name, word_set, edim):
    embeddings = {}

    tokens = embed_file_name.split('-')
    embedding = None

    if tokens[0] == 'glove':
        embedding = GloveEmbedding(tokens[1], d_emb=edim, show_progress=True)

    if embedding:
        for word in word_set:
            emb = embedding.emb(word)
            # With the library default of 'none', OOV words come back as a vector of
            # None entries (see Example #1), so check the first component as well.
            if emb is not None and emb[0] is not None:
                embeddings[word] = emb
    return embeddings
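An illustrative call, assuming the 'glove-<name>' file-name convention implied by the token parsing above; the word set is arbitrary:

vectors = init_word_embeddings('glove-common_crawl_840', {'canada', 'vancouver', 'xyzzy'}, 300)
print('%d of 3 words found in GloVe' % len(vectors))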
def cluster_partioning_glove(sorted_sent_ids, id_sentence_map, num_clusters):
    d_emb = 300
    embeddings = GloveEmbedding('common_crawl_840',
                                d_emb=d_emb,
                                show_progress=True)
    sents = []
    for sent_id in sorted_sent_ids:
        sents.append([t[0] for t in id_sentence_map[sent_id]])
    sent_vecs = sents_to_embeddings(embeddings, d_emb, sents)

    n_vecs = 18
    base_vecs = random_unit_vecs(d_emb, n_vecs)

    sims = cosine_similarity(sent_vecs, base_vecs)
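    # The sign of the similarity to each random unit vector gives every sentence a
    # boolean signature (random-hyperplane, LSH-style hashing); sentences that share
    # a signature are assigned to the same partition below.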

    sims = sims >= 0

    partition_ids = {}
    sent_partitions = {}
    curr_partition_id = 0

    for idx, sim_vec in enumerate(sims):
        part_id = partition_ids.get(tuple(sim_vec))
        if part_id is None:
            part_id = curr_partition_id
            curr_partition_id += 1
            partition_ids[tuple(sim_vec)] = part_id
        sent_partitions[idx] = part_id

    return sent_partitions
    def __init__(self, id_sentence_map, threshold=0.5):
        self.id_sentence_map = id_sentence_map

        sorted_sent_ids = sorted(id_sentence_map.keys())

        d_emb = 300
        embeddings = GloveEmbedding('common_crawl_840',
                                    d_emb=d_emb,
                                    show_progress=True)

        model = TfidfVectorizer(stop_words="english",
                                max_features=5000,
                                min_df=2)
        tf_idf = model.fit_transform([
            " ".join([t[0] for t in id_sentence_map[sid]])
            for sid in sorted_sent_ids
        ])

        #sent_vecs = fast_sents_to_embeddings(embeddings, d_emb, )

        self.tf_idf = tf_idf

        #self.id_vec_map = dict((sorted_sent_ids[idx], vec) for idx, vec in enumerate(sent_vecs))

        self.threshold = threshold
    def from_sentences(cls,
                       rewards,
                       doc_sents,
                       id_sentence_map,
                       normalize=False):
        #model = TfidfVectorizer(stop_words="english")
        start_time = time.time()

        sorted_sent_ids = sorted(id_sentence_map.keys())

        d_emb = 300
        embeddings = GloveEmbedding('common_crawl_840',
                                    d_emb=d_emb,
                                    show_progress=True)
        sent_vecs = fast_sents_to_embeddings(
            embeddings, d_emb,
            [s.as_token_attr_sequence("form_lowercase") for s in doc_sents])

        n_vecs = 16
        base_vecs = random_unit_vecs(d_emb, n_vecs)

        doc_sims = cosine_similarity(sent_vecs, base_vecs)
        doc_hashes = doc_sims >= 0
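        # The sign pattern against the random base vectors acts as an LSH-style
        # signature; `buckets` below counts how many document sentences share
        # each signature.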

        logger.debug(
            "Computed doc sentences hashes (time: {}s)".format(time.time() -
                                                               start_time))
        start_time = time.time()

        buckets = Counter(map(tuple, doc_hashes))

        candidate_vecs = fast_sents_to_embeddings(
            embeddings, d_emb,
            [[t[0] for t in id_sentence_map[sid]] for sid in sorted_sent_ids])

        cand_sims = cosine_similarity(candidate_vecs, base_vecs)

        cand_hashes = [tuple(h) for h in cand_sims >= 0]

        logger.debug(
            "Computed candidate hashes (time: {}s)".format(time.time() -
                                                           start_time))
        start_time = time.time()

        precomputed_hash_sims = fast_precompute_hash_sims(cand_hashes, buckets)

        overlaps = {}
        per_cand_hashes = {}

        for sent_id, hash_ in zip(sorted_sent_ids, cand_hashes):
            #overlaps[sent_id] = precomputed_hash_sims[hash_]
            per_cand_hashes[sent_id] = hash_

        return BucketedRedundancyFactor(rewards,
                                        buckets,
                                        per_cand_hashes,
                                        precomputed_hash_sims,
                                        normalize=normalize)
Example #7
    def process_raw_dataset(
            self,
            models_path="models",
            train_path: Text = None,
            dev_path: Text = None,
            test_path: Text = None,
    ):
        """
        data path

        Args:
            train_path:
            dev_path:
            test_path:
            models_path:
        Returns:

        """
        if not os.path.isdir(models_path):
            os.makedirs(models_path)
        splits_path = {}

        if train_path:
            splits_path.update({'train': train_path})
        if dev_path:
            splits_path.update({'dev': dev_path})
        if test_path:
            splits_path.update({'test': test_path})

        for name, path in splits_path.items():
            self.dataset[name] = Dataset.annotate_raw(path)
            self.dataset[name].numericalize_(self.vocab)
            self.ontology += self.dataset[name].extract_ontology()

            ann_path = path[:-5] + "_ann.json"
            with open(ann_path, 'wt') as f:
                json.dump(self.dataset[name].to_dict(), f, indent=4)

        self.ontology.numericalize_(self.vocab)
        with open(os.path.join(models_path, 'ontology.json'), 'wt') as f:
            json.dump(self.ontology.to_dict(), f, indent=4)
        with open(os.path.join(models_path, 'vocab.json'), 'wt') as f:
            json.dump(self.vocab.to_dict(), f, indent=4)

        # Generate embedding file
        embeddings = [GloveEmbedding(), KazumaCharEmbedding()]
        E = []
        for w in tqdm(self.vocab._index2word):
            e = []
            for emb in embeddings:
                e += emb.emb(w, default='zero')
            E.append(e)
        self.embeddings = E
        with open(os.path.join(models_path, 'emb.json'), 'wt') as f:
            json.dump(E, f)
Example #8
def get_pretrained_embeddings(dataset, words, slots, intents):
    vocab = set(words + slots + intents)
    for symbol in [BOS, EOS, UNK, EQUAL]:
        vocab.add(symbol)

    # GK Embedding
    word_embed, char_embed = GloveEmbedding(
        default='zero'), KazumaCharEmbedding()
    embed_size = word_embed.d_emb + char_embed.d_emb
    progress = 0
    with open(EMBEDDING(dataset), 'w') as out_file:
        for word in vocab:
            progress += 1
            vector = word_embed.emb(word) + char_embed.emb(word)
            string = ' '.join([str(v) for v in vector])
            out_file.write(word + ' ' + string + '\n')
            if progress % 1000 == 0:
                print("Retrieved 400-dim GK embeddings for %d words ..." % progress)
    print('In total, processed %d words in %s' % (len(vocab), dataset))
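A companion sketch (not part of the original example) for reading the file written above back into a word-to-vector map; the path argument is whatever EMBEDDING(dataset) resolves to:

import numpy as np

def load_gk_embedding_file(path):
    word2vec = {}
    with open(path) as in_file:
        for line in in_file:
            parts = line.rstrip('\n').split(' ')
            word2vec[parts[0]] = np.array([float(v) for v in parts[1:]], dtype=np.float32)
    return word2vec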
def cluster_kmeans_glove(sorted_sent_ids, id_sentence_map, num_clusters):
    d_emb = 300
    embeddings = GloveEmbedding('common_crawl_840',
                                d_emb=d_emb,
                                show_progress=True)

    vecs = np.zeros(shape=(len(sorted_sent_ids), 300))
    for idx, sent_id in enumerate(sorted_sent_ids):
        for token, _ in id_sentence_map[sent_id]:
            vecs[idx] += np.array(embeddings.emb(token.lower(), "zero"))
        vecs[idx] /= len(id_sentence_map[sent_id])

    if num_clusters is None:
        num_clusters = max(len(id_sentence_map) // 25, 2)

    #clusterer = AgglomerativeClustering(n_clusters=num_clusters)

    #clustering = clusterer.fit_predict(vecs)
    clustering = KMeans(n_clusters=num_clusters).fit_predict(vecs)

    return clustering
Example #10
def dump_pretrained_emb(word2index, index2word, dump_path):
    print("Dumping pretrained embeddings...")
    embeddings = [GloveEmbedding(), KazumaCharEmbedding()]
    E = []
    for i in tqdm(range(len(word2index.keys()))):
        w = index2word[i]
        e = []
        for emb in embeddings:
            e += emb.emb(w, default='zero')
        E.append(e)
    with open(dump_path, 'wt') as f:
        json.dump(E, f)
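A toy invocation of the function above; the two-word vocabulary is purely illustrative, and each dumped row is a 400-dimensional GloVe+Kazuma vector:

word2index = {'hello': 0, 'world': 1}
index2word = {i: w for w, i in word2index.items()}
dump_pretrained_emb(word2index, index2word, 'emb.json')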
Example #11
def dump_pretrained_emb(word2index, index2word, dump_path):
    print("Dumping pretrained embeddings...")
    os.environ["HOME"] = "D:/ANAHOME"  # add HOME directory temporarily
    embeddings = [GloveEmbedding(), KazumaCharEmbedding()]
    E = []
    for i in tqdm(range(len(word2index.keys()))):
        w = index2word[i]
        e = []
        for emb in embeddings:
            e += emb.emb(w, default='zero')
        E.append(e)
    with open(dump_path, 'wt') as f:
        json.dump(E, f)
def dump_pretrained_emb(word2index, index2word, dump_path):
    print("Dumping pretrained embeddings...")
    # import ssl
    # ssl._create_default_https_context = ssl._create_unverified_context
    embeddings = [GloveEmbedding(), KazumaCharEmbedding()]
    E = []
    for i in range(len(word2index.keys())):
        w = index2word[i]
        e = []
        for emb in embeddings:
            e += emb.emb(w, default='zero')
        E.append(e)
    with open(dump_path, 'wt') as f:
        json.dump(E, f)
Example #13
def gen_slot_embed_for_each_dom_from_glove(dom2slots, slot2desc, save_file):
    ## 1. generate slot2embs
    slots = list(sorted(slot2desc.keys()))
    desps = [slot2desc[k] for k in slots]
    word2emb = {}
    # collect words
    for des in desps:
        splits = des.split()
        for word in splits:
            if word not in word2emb:
                word2emb[word] = []
    
    # load embeddings
    glove_emb = GloveEmbedding()

    # calculate slot embs
    slot2embs = {}
    for i, slot in enumerate(slots):
        word_list = slot2desc[slot].split()
        embs = np.zeros(300)
        for word in word_list:
            embs = embs + glove_emb.emb(word, default='zero')
        slot2embs[slot] = embs

    ## 2. generate slot2embs based on each domain
    slot_embs_based_on_each_domain = {}
    for domain, slot_names in dom2slots.items():
        slot_embs = np.zeros((len(slot_names), 300))
        for i, slot in enumerate(slot_names):
            embs = slot2embs[slot]
            slot_embs[i] = embs
        slot_embs_based_on_each_domain[domain] = slot_embs
    
    with open(save_file, "wb") as f:
        pickle.dump(slot_embs_based_on_each_domain, f)
    return slot2embs
def dump_pretrained_emb_new(tokenizer, dump_path):
    print("Dumping pretrained embeddings...")
    embeddings = [GloveEmbedding(), KazumaCharEmbedding()]
    E = []
    word_list = []
    for w, i in sorted(tokenizer.vocab.items(), key=lambda i: i[1]):
        word_list.append(w)
    for i in tqdm(range(len(word_list))):
        w = word_list[i]
        e = []
        for emb in embeddings:
            e += emb.emb(w, default='zero')
        E.append(e)
    with open(dump_path, 'wt') as f:
        json.dump(E, f)
Example #15
    def __init__(self, device, use_glove=True, use_elmo=True):
        glove_size = 300 if use_glove else 0
        elmo_size = 1024 if use_elmo else 0
        super(WordEmbeddingModel, self).__init__(glove_size + elmo_size)

        if not use_glove and not use_elmo:
            raise ValueError("Should use at least one form of embedding.")

        if use_elmo:
            self._elmo = ElmoEmbedding(device=device)
        if use_glove:
            self._glove = GloveEmbedding(GLOVE_TRAIN_FILE, device=device)
        # if use_bert:
        #     self._bert = BertEmbedding(model_type='bert-large-cased', device=device)

        self._use_elmo = use_elmo
        # self._use_bert = use_bert
        self._use_glove = use_glove
def cluster_db_scan(sorted_sent_ids, id_sentence_map, num_clusters,
                    cluster_id_map):
    clusters = cluster_id_map

    d_emb = 300
    embeddings = GloveEmbedding('common_crawl_840',
                                d_emb=d_emb,
                                show_progress=True)

    X = fast_sents_to_embeddings(embeddings, 300,
                                 [[t[0] for t in id_sentence_map[sid]]
                                  for sid in sorted_sent_ids])

    avg_distances = []
    for cl, ids in clusters.items():
        X_cl = X[ids, :]
        dists = np.abs(euclidean_distances(X_cl))
        avg_distances.append(
            np.sum(dists) / max(1, X_cl.shape[0]**2 - X_cl.shape[0]))

    avg_intra_cl_dist = sum(avg_distances) / len(avg_distances)
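    # The mean intra-cluster distance of the seed clustering is reused as the DBSCAN eps below.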

    return DBSCAN(avg_intra_cl_dist, n_jobs=-1).fit_predict(X)
def cluster_clustering_kmeans(sorted_sent_ids, id_sentence_map, num_clusters,
                              cluster_id_map):
    clusters = cluster_id_map

    d_emb = 300
    embeddings = GloveEmbedding('common_crawl_840',
                                d_emb=d_emb,
                                show_progress=True)

    X = fast_sents_to_embeddings(embeddings, 300,
                                 [[t[0] for t in id_sentence_map[sid]]
                                  for sid in sorted_sent_ids])

    avg_distances = []
    X_cls = []
    for cl, ids in sorted(clusters.items()):
        X_cl = X[ids, :]
        dists = np.abs(euclidean_distances(X_cl))
        avg_distances.append(
            np.sum(dists) / max(1, X_cl.shape[0]**2 - X_cl.shape[0]))

        X_cls.append(X_cl.sum(axis=0))

    X_cls = np.stack(X_cls)

    avg_intra_cl_dist = sum(avg_distances) / len(avg_distances)

    cluster_clusters = KMeans(len(cluster_id_map) // 5).fit_predict(X_cls)

    clusters = [0 for _ in range(len(sorted_sent_ids))]

    for cl_id, cl_cluster_id in zip(sorted(cluster_id_map), cluster_clusters):
        for sent_id in cluster_id_map[cl_id]:
            clusters[sent_id] = cl_cluster_id

    return clusters
Example #18
def dump_pretrained_emb(word2index, index2word, dump_path, mode='en'):
    print("Dumping pretrained embeddings...")

    if mode == 'cn':
        embeddings = [CNEmbedding()]
    else:
        # embeddings = [GloveEmbedding(), KazumaCharEmbedding()]
        embeddings = [GloveEmbedding()]
    E = []
    count = [0., 0.]
    for i in tqdm(range(len(word2index.keys()))):
        w = index2word[i]
        e = []
        for emb in embeddings:
            e += emb.emb(w, default='zero')
        # track whether the word exists in the embedding vocabulary
        count[1] += 1.
        if w in embeddings[0].word2vec:
            count[0] += 1.
        # e += [0.] * 300
        E.append(e)
    with open(dump_path, 'wt') as f:
        json.dump(E, f)
    print(f'words found in the embedding matrix: {count[0] / count[1] * 100:.2f}%')
Example #19
from embeddings import GloveEmbedding, FastTextEmbedding, KazumaCharEmbedding, ConcatEmbedding
import numpy as np

from read_rules import read_rul, read_csv

g = GloveEmbedding('common_crawl_840',
                   d_emb=300,
                   show_progress=True,
                   default='zero')
k = KazumaCharEmbedding()
c = ConcatEmbedding([g, k])

for w in ['metal1', 'metal', 'M', 'EXT', '<', '0.035', 'MSMG2', 'MSMG', 'A']:
    # Ask for 'none' here so out-of-vocabulary words are detectable; the constructor
    # default of 'zero' would silently return an all-zero vector instead.
    word = np.array(g.emb(w, default='none'))
    word1 = np.array(k.emb(w))
    if word[0] is None:
        print(w, ":\tbad embedding")
        word = np.zeros(300)  # fall back to zeros so the concatenation stays numeric
    else:
        print(w, ":\tgood embedding")
    out = np.append(word1, word)
    print(out.shape)

diff1 = np.array(k.emb('metal1')) - np.array(k.emb('METAL1'))
diff2 = np.array(k.emb('metal1')) - np.array(k.emb('layer'))

# print(np.abs(np.mean(diff1)))
# print(np.abs(np.mean(diff2)))

pdk15_csv = read_csv("calibreDRC_15.csv")
pdk45_csv = read_csv("calibreDRC_45.csv")
Example #20
class RuleEmbedding:
    def __init__(self, embedding_type, inputs):
        if embedding_type == "char":
            k = KazumaCharEmbedding()
            self.wordEmbed = k.emb
            self.sentenceEmbed = self.embed_sentence
            self.size = 100
        elif embedding_type == "glove":
            g = GloveEmbedding('common_crawl_840',
                               d_emb=300,
                               show_progress=True,
                               default='zero')
            self.wordEmbed = g.emb
            self.sentenceEmbed = self.embed_sentence
            self.size = 300
        elif embedding_type == "concat":
            self.k = KazumaCharEmbedding()
            self.g = GloveEmbedding('common_crawl_840',
                                    d_emb=300,
                                    show_progress=True,
                                    default='zero')
            self.wordEmbed = self.concatEmbed
            self.sentenceEmbed = self.embed_sentence
            self.size = 400
        # elif embedding_type == "bert":
        #     try:
        #         bertEmbed = SentenceTransformer('./src/bert-base-nli-mean-tokens')
        #     except OSError as e:
        #         print(e)
        #         print("Could not find model in current directory: %s" % os.getcwd())
        #         exit(1)
        #     self.sentenceEmbed = bertEmbed.encode
        #     self.size = 768

        # elif embedding_type == "bert-stsb":
        #     try:
        #         bertEmbed = SentenceTransformer('./src/bert-base-nli-stsb-mean-tokens')
        #     except OSError as e:
        #         print(e)
        #         print("Could not find model in current directory: %s" % os.getcwd())
        #         exit(1)
        #     self.sentenceEmbed = bertEmbed.encode
        #     self.size = 768

        # elif embedding_type == "universal":
        #     try:
        #         univEmbed = hub.load("./src/universal-sentence-encoder_4")
        #     except OSError as e:
        #         print(e)
        #         print("Could not find model in current directory: %s" % os.getcwd())
        #         exit(1)
        #     self.sentenceEmbed = univEmbed
        #     self.size = 512

        else:
            print("Error: Embedding type \"%s\" not recognized" %
                  embedding_type)
            print(
                "Supported types: \"char\", \"bert\", \"bert-stsb\", \"universal\""
            )
            exit(1)

        self.type = embedding_type
        self.pdk = inputs['pdk']
        self.features = inputs['features']
        self.weights = inputs['weights']
        self.word_counts = inputs['word_counts']
        self.a = inputs['a']
        self.number_replacement = inputs['number_replacement']
        self.remove_pc = inputs['remove_pc']
        self.weigh_capitals = inputs['weigh_capitals']

    ###############################################################################
    # Concatenates char and glove embeddings
    ###############################################################################
    def concatEmbed(self, word):
        one = np.array(self.k.emb(word))
        two = np.array(self.g.emb(word))
        return np.append(one, two)

    ###############################################################################
    # embed_sentence():
    # Returns a list of embeddings for the provided sentences.
    # If self.word_counts is not None, computes a weighted average of the word embeddings,
    # following the SIF weighting of Arora et al.: https://github.com/PrincetonML/SIF
    ###############################################################################
    def embed_sentence(self, text):
        embeddings = []
        N = len(text)
        for i in range(N):
            sentence = text[i]
            words = sentence.split(' ')
            num_words = len(words)
            total = np.zeros(self.size)

            for i in range(num_words):
                w = words[i].strip()

                # remove numbers
                if self.number_replacement and w.replace('.', '', 1).isdigit():
                    w = self.number_replacement

                embed = np.array(self.wordEmbed(w))

                # add weight to words that are all caps
                if self.weigh_capitals and w.isalpha() and w.isupper():
                    embed = self.weigh_capitals * embed

                # weigh words based on inverse of probability
                if self.word_counts and w in self.word_counts.keys():
                    prob = self.word_counts[w] / self.word_counts['total-words']
                    weight = self.a / (self.a + prob)
                    embed = weight * embed

                total += embed

            result = total / num_words
            embeddings.append(result)
        return embeddings

    ###############################################################################
    # embed_key():
    # Returns a matrix of sentence embeddings for the designated rule feature,
    # e.g. "rule", "description", "layer" or "name".
    # The sentence embedder (self.sentenceEmbed) is selected in __init__ based on self.type.
    ###############################################################################
    def embed_key(self, key):
        pdk = self.pdk
        N = len(pdk)

        sentences = []
        for i in range(N):
            # in case we embed a feature like name, which is not a list
            if isinstance(pdk[i][key], list):
                s = ' '.join(pdk[i][key])
            else:
                s = pdk[i][key]
            sentences.append(s)

        result = np.array(self.sentenceEmbed(sentences))
        return result

    ###############################################################################
    # embed_all():
    # Compute rule embeddings using a weighted sum of the features.
    # Weights are stored in self.weights and features are stored in self.features.
    # Remove the first principal component if self.remove_pc is True
    ###############################################################################
    def embed_all(self):
        num_features = len(self.features)
        N = len(self.pdk)

        partial_embed = np.zeros((num_features, N, self.size))

        for i in range(num_features):
            result = self.embed_key(self.features[i])

            # remove the first principal component
            if self.remove_pc:
                emb = remove_pc(result, 1)
                partial_embed[i] = emb
            else:
                partial_embed[i] = result

        # compute the weighted sum of feature embeddings (f[1]*w[1] + f[2]*w[2] + ...)
        output = np.tensordot(partial_embed, self.weights, axes=(0, 0))
        return output
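A toy instantiation sketch for the RuleEmbedding class above; the single-rule pdk entry and the remaining inputs values are illustrative assumptions only:

inputs = {
    'pdk': [{'name': 'M1.S.1',
             'rule': ['metal1 spacing >= 0.065'],
             'description': ['minimum metal1 spacing']}],
    'features': ['rule', 'description'],
    'weights': [0.5, 0.5],
    'word_counts': None,
    'a': 1e-3,
    'number_replacement': 'NUM',
    'remove_pc': False,
    'weigh_capitals': None,
}
rule_embedder = RuleEmbedding('char', inputs)
rule_matrix = rule_embedder.embed_all()  # shape (1, 100) for the char embedding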
        if not os.path.isdir(dann):
            os.makedirs(dann)
        dataset = {}
        ontology = Ontology()
        vocab = Vocab()
        vocab.word2index(['<sos>', '<eos>'], train=True)
        for s in splits:
            fname = '{}.json'.format(s)
            logging.warn('Annotating {}'.format(s))
            dataset[s] = Dataset.annotate_raw(os.path.join(draw, fname))
            dataset[s].numericalize_(vocab)
            ontology = ontology + dataset[s].extract_ontology()
            with open(os.path.join(dann, fname), 'wt') as f:
                json.dump(dataset[s].to_dict(), f)
        ontology.numericalize_(vocab)
        with open(os.path.join(dann, 'ontology.json'), 'wt') as f:
            json.dump(ontology.to_dict(), f)
        with open(os.path.join(dann, 'vocab.json'), 'wt') as f:
            json.dump(vocab.to_dict(), f)

        logging.warn('Computing word embeddings')
        embeddings = [GloveEmbedding(), KazumaCharEmbedding()]
        E = []
        for w in tqdm(vocab._index2word):
            e = []
            for emb in embeddings:
                e += emb.emb(w, default='zero')
            E.append(e)
        with open(os.path.join(dann, 'emb.json'), 'wt') as f:
            json.dump(E, f)
Example #23
from embeddings import GloveEmbedding, FastTextEmbedding, KazumaCharEmbedding, ConcatEmbedding

g = GloveEmbedding('common_crawl_840', d_emb=300, show_progress=True)
f = FastTextEmbedding()
k = KazumaCharEmbedding()
c = ConcatEmbedding([g, f, k])
for w in ['canada', 'vancouver', 'toronto']:
    print('embedding {}'.format(w))
    print(g.emb(w))
    print(f.emb(w))
    print(k.emb(w))
    print(c.emb(w))
Example #24
import json
import sys
sys.path.append('..')
import mgnn.config_train as args
import paths
import re
import csv
import pickle
import numpy as np
from nltk.tokenize import word_tokenize
from pretreatment.DataExtract import EntityLinking, GetPredicateList, Entity_Link_Falcon
from pretreatment.QueryFilter import *
from torchnlp.word_to_vector import FastText, GloVe
fasttext = FastText()
from embeddings import GloveEmbedding
g = GloveEmbedding('common_crawl_840', d_emb=300)
import math

def get_ngram(text, n):
    word_list = text
    res = []
    for i in range(len(word_list)):
        if i+n > len(word_list):
            break
        res.append(word_list[i:i+n])
    return res


def get_ngram_embedding(text, n):
    embeddings = []
    for i in range(len(text)):
Example #26
from embeddings import GloveEmbedding

embeddings_name = 'common_crawl_840'
embeddings_dimension = 300
glove_embeddings = GloveEmbedding(name=embeddings_name,
                                  d_emb=embeddings_dimension,
                                  show_progress=True)
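Once constructed, lookups go through emb(); the default argument controls what is returned for out-of-vocabulary words, as in the other examples on this page:

vector = glove_embeddings.emb('vancouver', default='zero')
print(len(vector))  # 300 components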
Example #27
from embeddings import GloveEmbedding
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score
from collections import OrderedDict
import numpy as np

seed = 7
np.random.seed(seed)
ID_TEXT_DELIMITER = " <:sep:> "
"""
    0. initialize twitter word embeddings
"""
glove = GloveEmbedding(name="twitter", d_emb=50, show_progress=True)
"""
    1. Index tweets and their labels for error analysis
"""
labels = OrderedDict()
tweets = OrderedDict()
fp = open("../../../../resources/n-tweets-id_tokens.txt", 'r')
idx = 0
for sample in fp.readlines():
    label, tweet = sample.split(ID_TEXT_DELIMITER)
    if label.strip() == 'YES':
        labels[idx] = 1
    elif label.strip() == 'NO':
        labels[idx] = 0
    tokens_as_string = ''
    for tok in tweet.strip().replace("[",
                                     "").replace(']',
Example #28
                       fields=[('src', srcF), ('tgt', tgtF),
                               ('tgt_be', tgt_beF), ('dis', disF),
                               ('label', labelF)])
dev = TabularDataset(path=args.dev_data,
                     format='tsv',
                     fields=[('src', srcF), ('tgt', tgtF), ('tgt_be', tgt_beF),
                             ('dis', disF), ('label', labelF)])

tgt_beF.build_vocab(all_data, min_freq=1)
disF.build_vocab(all_data, min_freq=1)
srcF.build_vocab(all_data, min_freq=1)
vocab = srcF.vocab
tgtF.vocab = vocab
args.vocab_size = len(vocab)

g = GloveEmbedding('common_crawl_840', d_emb=300)
embedding = []
for i in range(len(vocab)):
    vec = g.emb(vocab.itos[i])
    if vec[0] is None:  # OOV in GloVe: fall back to a small random vector
        embedding.append(np.random.uniform(-0.25, 0.25, size=(1, 300))[0])
    else:
        embedding.append(np.array(vec))
embedding = np.array(embedding, dtype=np.float32)
args.pre_embedding = True
args.embedding = embedding
args.update_embedding = False

print('build batch iterator...')
train_batch_iterator = BucketIterator(dataset=train,
                                      batch_size=args.batch_size,
                                      sort=False,
Example #29
import sys
from embeddings import GloveEmbedding

if len(sys.argv) < 3:
    print("please provide embeddings and pos conl file")
    exit(0)

embs = GloveEmbedding(sys.argv[1], default="random")

unk = "<UNK>"

outFile = open(sys.argv[2] + ".glove", "w")
curSent = ""
for line in open(sys.argv[2]):
    if len(line) < 2:
        outFile.write(curSent + "\n")
        curSent = ""
    else:
        tok = line.strip().split("\t")
        emb = embs.emb(tok[0])

        embStr = "emb=" + ",".join([str(x) for x in emb])
        curSent += "\t".join(tok + [embStr]) + "\n"

outFile.close()
    def from_sentences(cls,
                       doc_sents,
                       id_sentence_map,
                       id_date_map,
                       num_date_anchors=100,
                       num_base_vecs=100,
                       normalize=False):
        start_time = time.time()

        date_freqs = Counter(s.predicted_date for s in doc_sents)

        sorted_dates_with_freq = sorted(date_freqs.items())
        num_sents = len(doc_sents)

        sents_per_bucket = num_sents // num_date_anchors

        date_buckets = {}
        curr_bucket = 0
        curr_freq_sum = 0
        for date, freq in sorted_dates_with_freq:
            date_buckets[date] = curr_bucket
            curr_freq_sum += freq

            if curr_freq_sum >= sents_per_bucket and curr_bucket < num_date_anchors:
                curr_bucket += 1
                curr_freq_sum = 0

        sent_and_date_buckets = defaultdict(lambda: ([], set()))

        for sent in doc_sents:
            date = sent.predicted_date
            bucket_sents, bucket_dates = sent_and_date_buckets[
                date_buckets[date]]

            bucket_sents.append(sent)
            bucket_dates.add(date)

        logger.debug("Computed date buckets (time: {}s)".format(time.time() -
                                                                start_time))
        start_time = time.time()

        d_emb = 300
        embeddings = GloveEmbedding('common_crawl_840',
                                    d_emb=d_emb,
                                    show_progress=True)

        checkpoints = []
        for sents, dates in sent_and_date_buckets.values():
            sent_vecs = sents_to_embeddings(
                embeddings, d_emb,
                [s.as_token_attr_sequence("form_lowercase") for s in sents])
            base_vecs, num_matches = create_compressed_sent_repr(
                sent_vecs, num_base_vecs)

            center_date = min(dates) + (max(dates) - min(dates)) / 2

            checkpoints.append(
                (center_date, base_vecs, num_matches, len(sents)))

        logger.debug("Computed checkpoints (time: {}s)".format(time.time() -
                                                               start_time))
        start_time = time.time()

        sents_by_date = defaultdict(list)
        for sent_id, sent in id_sentence_map.items():
            sent_date = id_date_map[sent_id]
            sents_by_date[sent_date].append(sent_id)

        sent_scores = {}
        for idx, (sents_date, sent_ids) in enumerate(sents_by_date.items()):
            print("{}/{}".format(idx, len(sents_by_date)))
            sent_sims = np.zeros(len(sent_ids))
            factor_sum = 0.0
            sent_vecs = fast_sents_to_embeddings(
                embeddings, d_emb,
                [[t[0] for t in id_sentence_map[id_]] for id_ in sent_ids])
            for check_date, base_vecs, num_matches, num_members in checkpoints:
                factor = 1.0 / (abs((check_date - sents_date).days) + 1)

                sent_signatures = (cosine_similarity(sent_vecs, base_vecs) >=
                                   0.0).astype(np.float32)
                sent_signatures *= num_matches.reshape(1, len(base_vecs))
                sent_signatures /= num_members

                cosine_sims = np.average(sent_signatures, axis=1)
                sent_sims += cosine_sims * factor

                factor_sum += factor
            sent_sims /= factor_sum

            for sid, score in zip(sent_ids, sent_sims):
                sent_scores[sid] = score

        logger.debug("Computed scores (time: {}s)".format(time.time() -
                                                          start_time))

        return BucketedCoverageFactor(sent_scores, normalize=normalize)