def cache(self, name, cache, url=None, max_vectors=None):
    if self.emb_format in ['polyglot', 'glove']:
        from polyglot.mapping import Embedding
        if self.emb_format == 'polyglot':
            embeddings = Embedding.load(name)
        else:
            embeddings = Embedding.from_glove(name)
        self.itos = embeddings.vocabulary.id_word
        self.stoi = embeddings.vocabulary.word_id
        self.dim = embeddings.shape[1]
        self.vectors = torch.Tensor(embeddings.vectors).view(-1, self.dim)
    elif self.emb_format in ['word2vec', 'fasttext']:
        try:
            from gensim.models import KeyedVectors
        except ImportError:
            logging.error('Please install `gensim` package first.')
            return None
        embeddings = KeyedVectors.load_word2vec_format(
            name, unicode_errors='ignore', binary=self.binary)
        self.itos = embeddings.index2word
        self.stoi = dict(zip(self.itos, range(len(self.itos))))
        self.dim = embeddings.vector_size
        self.vectors = torch.Tensor(embeddings.vectors).view(-1, self.dim)
    elif self.emb_format == 'fonseca':
        import numpy as np
        import os
        embeddings = np.load(os.path.join(name, 'types-features.npy'))
        with open(os.path.join(name, 'vocabulary.txt'), 'r') as f:
            texts = f.read()
        # keep the file order so that itos lines up with the embedding rows
        # (building a set here would scramble the word-to-vector mapping)
        words = [w.strip() for w in texts.split('\n') if w.strip()]
        self.itos = words
        self.stoi = dict(zip(self.itos, range(len(self.itos))))
        self.dim = embeddings.shape[1]
        self.vectors = torch.Tensor(embeddings).view(-1, self.dim)
    self.unk_vector = self.vectors.mean(0).unsqueeze(0)
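`cache` ends by storing the mean of all loaded vectors as `unk_vector`. A minimal sketch of how a lookup method could use it as an out-of-vocabulary fallback; the `lookup` method itself is an assumption, not part of the original class:

# Hypothetical OOV lookup built on the attributes set by cache();
# this method is an illustration, not shown in the original class.
def lookup(self, token):
    idx = self.stoi.get(token)
    if idx is None:
        # fall back to the mean vector for unknown tokens
        return self.unk_vector.squeeze(0)
    return self.vectors[idx]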
def _load_glove(glove_file, verbose=1):
    global glove
    glove = Embedding.from_glove(glove_file)
    if verbose == 2:
        print('GloVe shape:', glove.shape)
        print('GloVe first 10:', glove.head(n=10))
    elif verbose == 1:
        print('GloVe shape:', glove.shape)
    return glove
def cache(self, name, cache, url=None, max_vectors=None):
    if self.emb_format in ['polyglot', 'glove']:
        try:
            from polyglot.mapping import Embedding
        except ImportError:
            logger.error('Please install `polyglot` package first.')
            return None
        if self.emb_format == 'polyglot':
            embeddings = Embedding.load(name)
        else:
            embeddings = Embedding.from_glove(name)
        self.itos = embeddings.vocabulary.id_word
        self.stoi = embeddings.vocabulary.word_id
        self.dim = embeddings.shape[1]
        self.vectors = torch.Tensor(embeddings.vectors).view(-1, self.dim)
    elif self.emb_format in ['word2vec', 'fasttext']:
        try:
            from gensim.models import KeyedVectors
        except ImportError:
            logger.error('Please install `gensim` package first.')
            return None
        embeddings = KeyedVectors.load_word2vec_format(
            name, unicode_errors='ignore', binary=self.binary
        )
        self.itos = embeddings.index2word
        self.stoi = dict(zip(self.itos, range(len(self.itos))))
        self.dim = embeddings.vector_size
        self.vectors = torch.Tensor(embeddings.vectors).view(-1, self.dim)
    elif self.emb_format == 'text':
        tokens = []
        vectors = []
        if self.binary:
            import pickle
            # vectors should be a dict mapping str keys to numpy arrays
            with open(name, 'rb') as f:
                d = pickle.load(f)
            tokens = list(d.keys())
            vectors = list(d.values())
        else:
            # each line should contain a token and its following fields:
            # <token> <vector_value_1> ... <vector_value_n>
            with open(name, 'r', encoding='utf8') as f:
                for line in f:
                    if line.strip():  # ignore empty lines
                        fields = line.rstrip().split()
                        tokens.append(fields[0])
                        vectors.append(list(map(float, fields[1:])))
        self.itos = tokens
        self.stoi = dict(zip(self.itos, range(len(self.itos))))
        self.vectors = torch.Tensor(vectors)
        self.dim = self.vectors.shape[1]
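To make the 'text' branch concrete, here is a tiny round-trip sketch of the plain-text format it parses; the file name is made up, and the expected results are stated as comments under the assumption of `emb_format='text'` and `binary=False`:

# Illustrative only: write a two-word embedding file in the expected
# plain-text layout, one token plus its float components per line.
with open('toy_vectors.txt', 'w', encoding='utf8') as f:
    f.write('hello 0.1 0.2 0.3\n')
    f.write('world 0.4 0.5 0.6\n')

# After cache('toy_vectors.txt', cache=None) one would expect:
#   self.itos == ['hello', 'world']
#   self.dim == 3
#   self.vectors.shape == (2, 3)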
import pickle

from gensim.models import KeyedVectors
# `Embedding` is assumed to come from the `web` package
# (word-embeddings-benchmarks); adjust the import to your project.
from web.embedding import Embedding


def load_embedding(fname, format="word2vec_bin", normalize=True,
                   lower=False, clean_words=False, load_kwargs={}):
    """
    Loads embeddings from file

    Parameters
    ----------
    fname: string
      Path to file containing embedding

    format: string
      Format of the embedding. Possible values are:
      'word2vec_bin', 'word2vec', 'glove', 'dict'

    normalize: bool, default: True
      If true will normalize all vectors to unit length

    lower: bool, default: False
      If true will lowercase all words

    clean_words: bool, default: False
      If true will only keep alphanumeric characters and "_", "-"
      Warning: shouldn't be applied to embeddings with non-ascii characters

    load_kwargs:
      Additional parameters passed to the load function. Mostly useful for
      the 'glove' format, where you should pass vocab_size and dim.
    """
    assert format in ['word2vec_bin', 'word2vec', 'glove', 'dict'], "Unrecognized format"
    if format == "word2vec_bin":
        # w = Embedding.from_word2vec(fname, binary=True)
        w = KeyedVectors.load_word2vec_format(fname, binary=True)
    elif format == "word2vec":
        w = Embedding.from_word2vec(fname, binary=False)
    elif format == "glove":
        w = Embedding.from_glove(fname, **load_kwargs)
    elif format == "dict":
        d = pickle.load(open(fname, "rb"), encoding='latin1')
        w = Embedding.from_dict(d)
    # if normalize:
    #     w.normalize_words(inplace=True)
    # if lower or clean_words:
    #     w.standardize_words(lower=lower, clean_words=clean_words, inplace=True)
    return w
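Usage might look like the following; the paths are placeholders, and `load_kwargs` matters mainly for the GloVe loader, since raw GloVe text files carry no header line announcing the vocabulary size and dimensionality:

# Hypothetical paths. glove.6B.100d.txt has 400000 entries of
# dimension 100, which from_glove needs to know up front.
w2v = load_embedding("GoogleNews-vectors-negative300.bin",
                     format="word2vec_bin")
glove = load_embedding("glove.6B.100d.txt", format="glove",
                       load_kwargs={"vocab_size": 400000, "dim": 100})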
def __init__(self, coherence_measure, num_top_tokens, language=None):
    """
    :param coherence_measure: Coherence measure to be used. Supported values
        are: 'u_mass', 'embedding_similarities', 'embedding_variances'.
    :type coherence_measure: str
    :param num_top_tokens: Number of top tokens to extract from every topic.
        These tokens are used to determine the coherence of the topics.
    :type num_top_tokens: int
    :param language: Either 'german' or 'english'. Required when the selected
        coherence measure is 'embedding_similarities' or 'embedding_variances'.
    :type language: str
    """
    if coherence_measure not in [
            'u_mass', 'embedding_similarities', 'embedding_variances'
    ]:
        raise Exception('{} is not a supported coherence measure'.format(
            coherence_measure))
    self.coherence_measure = coherence_measure
    self.num_top_tokens = num_top_tokens
    self._embeddings = None
    if coherence_measure in [
            'embedding_similarities', 'embedding_variances'
    ]:
        if language is None:
            raise Exception(
                'For word embedding based coherence measures a language has '
                'to be provided. Either "german" or "english".')
        if language == 'german':
            pass  # no embeddings are loaded for German here
        elif language == 'english':
            self._embeddings = Embedding.from_glove(
                "D:/Bachelorarbeit/Projekte/tm-maria/models/word_embeddings/glove.6B/glove.6B.100d.txt"
            )
        else:
            raise Exception(
                'Language {} is not supported. Either "german" or "english".'
                .format(language))
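Constructing the evaluator could look like this; the surrounding class is not shown above, so `CoherenceEvaluator` is a placeholder name:

# `CoherenceEvaluator` is a placeholder for the enclosing class.
scorer = CoherenceEvaluator('embedding_similarities',
                            num_top_tokens=10,
                            language='english')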
from itertools import chain, combinations

from polyglot.mapping import Embedding

from src.Automati_Topic_Labeling_Wordnet.extrinsic_topic_labler import ExtrensicTopicLabeler
from src.Automati_Topic_Labeling_Wordnet.wordnet_embeddings import Wordnet
from src.Automati_Topic_Labeling_Wordnet.polyglot_embeddings import get_topic_labels as pl
from src.models import topic_models as tm

"""
Select the words of a topic that have the smallest distance to each other.
This kind of preprocessing is meant to improve the topic labelling.
"""

# embeddings = Embedding.load("D:/Bachelorarbeit/Projekte/polyglot_data/embeddings2/en/embeddings_pkl.tar.bz2")
embeddings = Embedding.from_glove(
    "D:/Bachelorarbeit/Projekte/tm-maria/models/word_embeddings/glove.6B/glove.6B.100d.txt"
)


def topic_word_distance(word1, word2):
    """
    Calculate the distance between two words using word embeddings.

    :param word1: string
    :param word2: string
    :return: the word pair and their distance, or None if a word is
        out of vocabulary
    """
    try:
        dist = embeddings.distances(word1, [word2])
    except KeyError:
        return None
    return ((word1, word2), dist)
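Given `topic_word_distance`, selecting the closest word pair within a topic could be done as sketched below; the topic word list is made up for illustration:

# Illustrative only: rank all word pairs of a topic by embedding
# distance and keep the closest one; the topic below is invented.
topic = ['car', 'engine', 'banana', 'wheel']
pairs = [topic_word_distance(w1, w2) for w1, w2 in combinations(topic, 2)]
pairs = [p for p in pairs if p is not None]  # drop out-of-vocabulary pairs
closest = min(pairs, key=lambda p: p[1])
print(closest)  # e.g. (('car', 'wheel'), array([...]))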
def GloveEmbedding(embedding_dim):
    # res = PolyglotEmbedding.from_glove("/home/is/seiya-ka/embedding_vector/glove.twitter.27B." + str(embedding_dim) + "d.txt")
    res = PolyglotEmbedding.from_glove(
        "/home/is/seiya-ka/embedding_vector/glove.6B." + str(embedding_dim) +
        "d.txt")
    return res