Example No. 1
    def cache(self, name, cache, url=None, max_vectors=None):
        if self.emb_format in ['polyglot', 'glove']:
            from polyglot.mapping import Embedding
            if self.emb_format == 'polyglot':
                embeddings = Embedding.load(name)
            else:
                embeddings = Embedding.from_glove(name)
            self.itos = embeddings.vocabulary.id_word
            self.stoi = embeddings.vocabulary.word_id
            self.dim = embeddings.shape[1]
            self.vectors = torch.Tensor(embeddings.vectors).view(-1, self.dim)
        elif self.emb_format in ['word2vec', 'fasttext']:
            try:
                from gensim.models import KeyedVectors
            except ImportError:
                logging.error('Please install `gensim` package first.')
                return None

            embeddings = KeyedVectors.load_word2vec_format(
                name, unicode_errors='ignore', binary=self.binary)
            self.itos = embeddings.index2word
            self.stoi = dict(zip(self.itos, range(len(self.itos))))
            self.dim = embeddings.vector_size
            self.vectors = torch.Tensor(embeddings.vectors).view(-1, self.dim)
        elif self.emb_format == 'fonseca':
            import numpy as np
            import os
            embeddings = np.load(os.path.join(name, 'types-features.npy'))
            with open(os.path.join(name, 'vocabulary.txt'), 'r') as f:
                texts = f.read()
            # keep the file's line order so tokens stay aligned with the rows of the .npy matrix
            words = [w.strip() for w in texts.split('\n') if w.strip()]
            self.itos = words
            self.stoi = dict(zip(self.itos, range(len(self.itos))))
            self.dim = embeddings.shape[1]
            self.vectors = torch.Tensor(embeddings).view(-1, self.dim)
        self.unk_vector = self.vectors.mean(0).unsqueeze(0)
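Below is a minimal, self-contained sketch of the same caching idea, not part of the original class: it builds the itos/stoi lookups and a mean "unknown" vector from a tiny in-memory vocabulary. The names toy_vectors and build_lookup are illustrative only, and torch is assumed to be installed.

import torch

# Toy vocabulary; real code would read these from polyglot/gensim/numpy as above.
toy_vectors = {
    'the': [0.1, 0.2, 0.3],
    'cat': [0.4, 0.5, 0.6],
    'sat': [0.7, 0.8, 0.9],
}

def build_lookup(word_to_vec):
    itos = list(word_to_vec.keys())                     # index -> token
    stoi = dict(zip(itos, range(len(itos))))            # token -> index
    vectors = torch.Tensor(list(word_to_vec.values()))  # shape (vocab, dim)
    unk_vector = vectors.mean(0).unsqueeze(0)           # fallback for OOV tokens
    return itos, stoi, vectors, unk_vector

itos, stoi, vectors, unk = build_lookup(toy_vectors)
print(vectors.shape, unk.shape)  # torch.Size([3, 3]) torch.Size([1, 3])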
Example No. 2
from polyglot.mapping import Embedding


def _load_glove(glove_file, verbose=1):
    global glove
    glove = Embedding.from_glove(glove_file)
    if verbose == 2:
        print('GloVe shape:', glove.shape)
        print('GloVe first 10:', glove.head(n=10))
    elif verbose == 1:
        print('GloVe shape:', glove.shape)
    return glove
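A hypothetical call to the loader above; the file path is a placeholder, and the snippet requires the polyglot package plus a GloVe-format text file on disk.

glove = _load_glove('embeddings/glove.6B.100d.txt', verbose=1)  # prints the shape
print(glove.nearest_neighbors('king', top_k=5))  # polyglot: closest tokens by distance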
Example No. 3
    def cache(self, name, cache, url=None, max_vectors=None):
        if self.emb_format in ['polyglot', 'glove']:
            try:
                from polyglot.mapping import Embedding
            except ImportError:
                logger.error('Please install `polyglot` package first.')
                return None
            if self.emb_format == 'polyglot':
                embeddings = Embedding.load(name)
            else:
                embeddings = Embedding.from_glove(name)
            self.itos = embeddings.vocabulary.id_word
            self.stoi = embeddings.vocabulary.word_id
            self.dim = embeddings.shape[1]
            self.vectors = torch.Tensor(embeddings.vectors).view(-1, self.dim)

        elif self.emb_format in ['word2vec', 'fasttext']:
            try:
                from gensim.models import KeyedVectors
            except ImportError:
                logger.error('Please install `gensim` package first.')
                return None
            embeddings = KeyedVectors.load_word2vec_format(
                name, unicode_errors='ignore', binary=self.binary
            )
            self.itos = embeddings.index2word
            self.stoi = dict(zip(self.itos, range(len(self.itos))))
            self.dim = embeddings.vector_size
            self.vectors = torch.Tensor(embeddings.vectors).view(-1, self.dim)

        elif self.emb_format == 'text':
            tokens = []
            vectors = []
            if self.binary:
                import pickle

                # vectors should be a dict mapping str keys to numpy arrays
                with open(name, 'rb') as f:
                    d = pickle.load(f)
                    tokens = list(d.keys())
                    vectors = list(d.values())
            else:
                # each line should contain a token and its following fields
                # <token> <vector_value_1> ... <vector_value_n>
                with open(name, 'r', encoding='utf8') as f:
                    for line in f:
                        if line.strip():  # ignore blank lines
                            fields = line.rstrip().split()
                            tokens.append(fields[0])
                            vectors.append(list(map(float, fields[1:])))
            self.itos = tokens
            self.stoi = dict(zip(self.itos, range(len(self.itos))))
            self.vectors = torch.Tensor(vectors)
            self.dim = self.vectors.shape[1]
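The 'text' branch above expects one token per line followed by its vector values. Here is a self-contained sketch that writes a tiny file in that format and parses it the same way; the file name is arbitrary and torch is assumed to be installed.

import torch

sample = 'hello 0.1 0.2 0.3\nworld 0.4 0.5 0.6\n'
with open('tiny_embeddings.txt', 'w', encoding='utf8') as f:
    f.write(sample)

tokens, vectors = [], []
with open('tiny_embeddings.txt', 'r', encoding='utf8') as f:
    for line in f:
        if line.strip():                                  # skip blank lines
            fields = line.rstrip().split()
            tokens.append(fields[0])                      # <token>
            vectors.append(list(map(float, fields[1:])))  # <vector_value_1> ... <vector_value_n>

stoi = dict(zip(tokens, range(len(tokens))))
emb = torch.Tensor(vectors)
print(tokens, emb.shape)  # ['hello', 'world'] torch.Size([2, 3])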
Example No. 4
def load_embedding(fname,
                   format="word2vec_bin",
                   normalize=True,
                   lower=False,
                   clean_words=False,
                   load_kwargs={}):
    """
    Loads embeddings from file

    Parameters
    ----------
    fname: string
      Path to file containing embedding

    format: string
      Format of the embedding. Possible values are:
      'word2vec_bin', 'word2vec', 'glove', 'dict'

    normalize: bool, default: True
      If true, will normalize all vectors to unit length

    lower: bool, default: False
      If true, will lowercase all words

    clean_words: bool, default: False
      If true, will only keep alphanumeric characters and "_", "-"
      Warning: shouldn't be applied to embeddings with non-ascii characters

    load_kwargs:
      Additional parameters passed to load function. Mostly useful for 'glove' format where you
      should pass vocab_size and dim.
    """
    assert format in ['word2vec_bin', 'word2vec', 'glove',
                      'dict'], "Unrecognized format"
    if format == "word2vec_bin":
        #        w = Embedding.from_word2vec(fname, binary=True)
        #        w = KeyedVectors.load_word2vec_format('/home/boros/web_data/embeddings/GoogleNews-vectors-negative300.bin.gz', binary=True)
        w = KeyedVectors.load_word2vec_format(fname, binary=True)
    elif format == "word2vec":
        w = Embedding.from_word2vec(fname, binary=False)
    elif format == "glove":
        w = Embedding.from_glove(fname, **load_kwargs)
    elif format == "dict":
        d = pickle.load(open(fname, "rb"), encoding='latin1')
        w = Embedding.from_dict(d)


#    if normalize:
#        w.normalize_words(inplace=True)
#    if lower or clean_words:
#        w.standardize_words(lower=lower, clean_words=clean_words, inplace=True)
    return w
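A hypothetical usage of load_embedding; the path and sizes are placeholders. Per the docstring, the 'glove' format forwards load_kwargs to Embedding.from_glove, which needs the vocabulary size and dimensionality; adjust the keyword names if the loader's actual signature differs.

# Placeholder path and sizes; change them to match the GloVe file actually on disk.
w = load_embedding('embeddings/glove.6B.100d.txt',
                   format='glove',
                   load_kwargs={'vocab_size': 400000, 'dim': 100})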
Example No. 5
    def __init__(self, coherence_measure, num_top_tokens, language=None):
        """

        :param coherence_measure: Coherence measure to be used. Supported values are: 'u_mass', 'embedding_similarities', 'embedding_variances'.
        :type coherence_measure: str
        :param num_top_tokens: Number of top tokens to extract from every topic. The terms will be used to determine the coherence of the topics.
        :type num_top_tokens: int
        :param language: Either 'german' or 'english'. It is required when the selected coherence measure is 'embedding_similarities' or 'embedding_variances'
        :type language: str


        """

        if coherence_measure not in [
                'u_mass', 'embedding_similarities', 'embedding_variances'
        ]:
            raise Exception('{} is not a supported coherence measure'.format(
                coherence_measure))

        self.coherence_measure = coherence_measure
        self.num_top_tokens = num_top_tokens
        self._embeddings = None

        if coherence_measure in [
                'embedding_similarities', 'embedding_variances'
        ]:
            if language is None:
                raise Exception(
                    'For word embedding based coherence measures a language has to be provided.'
                    ' Either "german" or "english". ')
            if language == 'german':
                pass
            elif language == 'english':
                self._embeddings = Embedding.from_glove(
                    "D:/Bachelorarbeit/Projekte/tm-maria/models/word_embeddings/glove.6B/glove.6B.100d.txt"
                )
            else:
                raise Exception(
                    'Language {} is not supported. Either "german" or "english".'
                    .format(language))
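A hypothetical instantiation of the constructor above; the class name TopicCoherence is an assumption, since the snippet only shows __init__, and language='english' triggers loading the GloVe file from the hard-coded path.

# TopicCoherence is a placeholder name for the class this __init__ belongs to.
scorer = TopicCoherence(coherence_measure='embedding_similarities',
                        num_top_tokens=10,
                        language='english')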
Example No. 6
from src.Automati_Topic_Labeling_Wordnet.extrinsic_topic_labler import ExtrensicTopicLabeler
from src.Automati_Topic_Labeling_Wordnet.wordnet_embeddings import Wordnet
from src.Automati_Topic_Labeling_Wordnet.polyglot_embeddings import get_topic_labels as pl
from itertools import combinations
from polyglot.mapping import Embedding
from itertools import chain
from src.models import topic_models as tm
"""
    Select the words out of a topic which have the smallest distance to each other. 
    This kind of preprocessing shall improve the topic labelling.

"""
#embeddings = Embedding.load("D:/Bachelorarbeit/Projekte/polyglot_data/embeddings2/en/embeddings_pkl.tar.bz2")
embeddings = Embedding.from_glove(
    "D:/Bachelorarbeit/Projekte/tm-maria/models/word_embeddings/glove.6B/glove.6B.100d.txt"
)


def topic_word_distance(word1, word2):
    """
    Calculate the distance with word embeddings between two words

    :param word1: string
    :param word2: string
    :return: return the words and the distance
    """
    try:
        dist = embeddings.distances(word1, [word2])
    except KeyError:
        return
    return ((word1, word2), dist)
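An illustrative call to topic_word_distance; it relies on the module-level GloVe embeddings loaded above, and the example words are arbitrary.

pair = topic_word_distance('music', 'guitar')
if pair is None:
    print('one of the words is out of vocabulary')
else:
    (w1, w2), dist = pair
    print(w1, w2, dist[0])  # distances() returns an array with a single entry here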
Example No. 7
from polyglot.mapping import Embedding as PolyglotEmbedding


def GloveEmbedding(embedding_dim):
    #res = PolyglotEmbedding.from_glove("/home/is/seiya-ka/embedding_vector/glove.twitter.27B."+str(embedding_dim)+"d.txt")
    res = PolyglotEmbedding.from_glove(
        "/home/is/seiya-ka/embedding_vector/glove.6B." + str(embedding_dim) +
        "d.txt")
    return res
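An illustrative call to GloveEmbedding; it assumes the hard-coded GloVe file exists on disk with the requested dimensionality.

emb = GloveEmbedding(100)   # loads glove.6B.100d.txt from the hard-coded directory
print(emb.shape)            # (vocab_size, 100)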