def cache(self, name, cache, url=None, max_vectors=None):
    if self.emb_format in ['polyglot', 'glove']:
        from polyglot.mapping import Embedding
        if self.emb_format == 'polyglot':
            embeddings = Embedding.load(name)
        else:
            embeddings = Embedding.from_glove(name)
        self.itos = embeddings.vocabulary.id_word
        self.stoi = embeddings.vocabulary.word_id
        self.dim = embeddings.shape[1]
        self.vectors = torch.Tensor(embeddings.vectors).view(-1, self.dim)
    elif self.emb_format in ['word2vec', 'fasttext']:
        try:
            from gensim.models import KeyedVectors
        except ImportError:
            logging.error('Please install `gensim` package first.')
            return None
        embeddings = KeyedVectors.load_word2vec_format(
            name, unicode_errors='ignore', binary=self.binary)
        self.itos = embeddings.index2word
        self.stoi = dict(zip(self.itos, range(len(self.itos))))
        self.dim = embeddings.vector_size
        self.vectors = torch.Tensor(embeddings.vectors).view(-1, self.dim)
    elif self.emb_format == 'fonseca':
        import numpy as np
        import os
        embeddings = np.load(os.path.join(name, 'types-features.npy'))
        texts = open(os.path.join(name, 'vocabulary.txt'), 'r').read()
        words = set([w.strip() for w in texts.split('\n')])
        self.itos = list(words)
        self.stoi = dict(zip(self.itos, range(len(self.itos))))
        self.dim = embeddings.shape[1]
        self.vectors = torch.Tensor(embeddings).view(-1, self.dim)
    self.unk_vector = self.vectors.mean(0).unsqueeze(0)
def cache(self, name, cache, url=None, max_vectors=None):
    if self.emb_format in ['polyglot', 'glove']:
        try:
            from polyglot.mapping import Embedding
        except ImportError:
            logger.error('Please install `polyglot` package first.')
            return None
        if self.emb_format == 'polyglot':
            embeddings = Embedding.load(name)
        else:
            embeddings = Embedding.from_glove(name)
        self.itos = embeddings.vocabulary.id_word
        self.stoi = embeddings.vocabulary.word_id
        self.dim = embeddings.shape[1]
        self.vectors = torch.Tensor(embeddings.vectors).view(-1, self.dim)
    elif self.emb_format in ['word2vec', 'fasttext']:
        try:
            from gensim.models import KeyedVectors
        except ImportError:
            logger.error('Please install `gensim` package first.')
            return None
        embeddings = KeyedVectors.load_word2vec_format(
            name, unicode_errors='ignore', binary=self.binary
        )
        self.itos = embeddings.index2word
        self.stoi = dict(zip(self.itos, range(len(self.itos))))
        self.dim = embeddings.vector_size
        self.vectors = torch.Tensor(embeddings.vectors).view(-1, self.dim)
    elif self.emb_format == 'text':
        tokens = []
        vectors = []
        if self.binary:
            import pickle
            # vectors should be a dict mapping str keys to numpy arrays
            with open(name, 'rb') as f:
                d = pickle.load(f)
            tokens = list(d.keys())
            vectors = list(d.values())
        else:
            # each line should contain a token and its following fields
            # <token> <vector_value_1> ... <vector_value_n>
            with open(name, 'r', encoding='utf8') as f:
                for line in f:
                    if line:  # ignore empty lines
                        fields = line.rstrip().split()
                        tokens.append(fields[0])
                        vectors.append(list(map(float, fields[1:])))
        self.itos = tokens
        self.stoi = dict(zip(self.itos, range(len(self.itos))))
        self.vectors = torch.Tensor(vectors)
        self.dim = self.vectors.shape[1]
def load_embedding(fname, format="word2vec_bin", normalize=True,
                   lower=False, clean_words=False, load_kwargs={}):
    """
    Loads embeddings from file

    Parameters
    ----------
    fname: string
      Path to file containing embedding

    format: string
      Format of the embedding. Possible values are:
      'word2vec_bin', 'word2vec', 'glove', 'dict'

    normalize: bool, default: True
      If true will normalize all vector to unit length

    clean_words: bool, default: True
      If true will only keep alphanumeric characters and "_", "-"
      Warning: shouldn't be applied to embeddings with non-ascii characters

    load_kwargs:
      Additional parameters passed to load function. Mostly useful for 'glove'
      format where you should pass vocab_size and dim.
    """
    assert format in ['word2vec_bin', 'word2vec', 'glove', 'dict'], "Unrecognized format"
    if format == "word2vec_bin":
        # w = Embedding.from_word2vec(fname, binary=True)
        # w = KeyedVectors.load_word2vec_format('/home/boros/web_data/embeddings/GoogleNews-vectors-negative300.bin.gz', binary=True)
        w = KeyedVectors.load_word2vec_format(fname, binary=True)
    elif format == "word2vec":
        w = Embedding.from_word2vec(fname, binary=False)
    elif format == "glove":
        w = Embedding.from_glove(fname, **load_kwargs)
    elif format == "dict":
        d = pickle.load(open(fname, "rb"), encoding='latin1')
        w = Embedding.from_dict(d)
    # if normalize:
    #     w.normalize_words(inplace=True)
    # if lower or clean_words:
    #     w.standardize_words(lower=lower, clean_words=clean_words, inplace=True)
    return w
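# Usage sketch for load_embedding() above. The file names are hypothetical, and the
# calls assume gensim's KeyedVectors and the Embedding wrapper used by this function
# are importable in the same module:
w2v = load_embedding("GoogleNews-vectors-negative300.bin", format="word2vec_bin")
glove = load_embedding("glove.6B.100d.txt", format="glove",
                       load_kwargs={"vocab_size": 400000, "dim": 100})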
def train_mimic_model(polyglot_embedding_path: str, mimic_model_path: str,
                      max_word_length: int, num_epochs: int,
                      learning_rate: float, use_dev_set: bool):
    full_embedding = PolyEmbedding.load(str(polyglot_embedding_path))
    embedding_size = len(full_embedding.zero_vector())
    all_X, all_Y = compose_dataset(full_embedding, max_word_length)
    if use_dev_set:
        train_X = all_X[TEST_SET_SIZE:]
        train_Y = all_Y[TEST_SET_SIZE:]
        validation_data = (all_X[:TEST_SET_SIZE], all_Y[:TEST_SET_SIZE])
    else:
        train_X, train_Y = all_X, all_Y
        validation_data = None
    model = create_mimic_model(max_word_length, embedding_size)
    optimizer = optimizers.Adam(lr=learning_rate)
    model.compile(optimizer, loss=mse_loss)
    if os.path.exists(mimic_model_path):
        model.load_weights(mimic_model_path)
    loss_to_monitor = 'val_loss' if use_dev_set else 'loss'
    save_model = ModelCheckpoint(mimic_model_path, verbose=1,
                                 monitor=loss_to_monitor, save_best_only=True)
    lr_reducer = ReduceLROnPlateau(verbose=1, factor=0.2, min_lr=1e-7,
                                   monitor=loss_to_monitor, cooldown=100)
    model.fit(train_X, train_Y, batch_size=1024, epochs=num_epochs,
              callbacks=[save_model, lr_reducer],
              validation_data=validation_data)
def __init__(self, student_summary=[]):
    self.punctuations = ['.', ',', '[', ']', '(', ')']
    self.stop_words = [
        "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you",
        "your", "yours", "yourself", "yourselves", "he", "him", "his",
        "himself", "she", "her", "hers", "herself", "it", "its", "itself",
        "they", "them", "their", "theirs", "themselves", "what", "which",
        "who", "whom", "this", "that", "these", "those", "am", "is", "are",
        "was", "were", "be", "been", "being", "have", "has", "had", "having",
        "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if",
        "or", "because", "as", "until", "while", "of", "at", "by", "for",
        "with", "about", "against", "between", "into", "through", "during",
        "before", "after", "above", "below", "to", "from", "up", "down",
        "in", "out", "on", "off", "over", "under", "again", "further",
        "then", "once", "here", "there", "when", "where", "why", "how",
        "all", "any", "both", "each", "few", "more", "most", "other",
        "some", "such", "no", "nor", "not", "only", "own", "same", "so",
        "than", "too", "very", "s", "t", "can", "will", "just", "don",
        "should", "now"
    ]
    self.student_summary = [self.clean_doc(s) for s in student_summary]
    self.embeddings = Embedding.load("data/embeddings_pkl.tar.bz2")
    self.summary_vetors = []
    for summary in self.student_summary:
        self.summary_vetors.append(self.calculate_doc2vec(summary))
def __init__(self, config):
    self.config = config
    self.line_dict = {}
    self.mentions = []
    self.sentences = []
    self.all_word_average = 0
    self.embeddings = Embedding()
    self.max_as_count = 0
    self.test_rs = []
    self.test_r_answers = []
    self.test_answer_indices = []
    self.test_r_antecedents = []
    self.t_count = 0
    self.t_dict = {}
    self.r_list = []
    self.data = None
    self.init_data()
def getEmbeddings(lng):
    if lng not in EMBEDDINGS:
        home = expanduser("~")
        embeddings = Embedding.load(home + "/polyglot_data/embeddings2/" +
                                    lng + "/embeddings_pkl.tar.bz2")
        embeddings.apply_expansion(CaseExpander)
        EMBEDDINGS[lng] = embeddings
    return EMBEDDINGS[lng]
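# Usage sketch for getEmbeddings() above. Assumes a module-level cache such as
# EMBEDDINGS = {} and that the polyglot embeddings for the language have already
# been downloaded to ~/polyglot_data:
en_emb = getEmbeddings("en")           # first call loads from disk and caches
assert getEmbeddings("en") is en_emb   # later calls reuse the cached object
print(en_emb.nearest_neighbors("green")[:5])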
def _load_glove(glove_file, verbose=1):
    global glove
    glove = Embedding.from_glove(glove_file)
    if verbose == 2:
        print 'GloVe shape:', glove.shape
        print 'GloVe first 10:', glove.head(n=10)
    elif verbose == 1:
        print 'GloVe shape:', glove.shape
    return glove
def external_polygot_embedding(ver):
    ver = ver.replace('hn', 'hi')
    home = path.expanduser('~')
    emb = Embedding.load(
        path.join(home, 'polyglot_data/embeddings2/%s/embeddings_pkl.tar.bz2' % ver))
    word_idx = {w: i for i, w in enumerate(emb.words)}
    if (ver == 'hi'):
        word_idx = transdict_stl(word_idx)
    embedding = emb.vectors
    return embedding, word_idx
def loadembedding(filename):
    """Loads a precomputed embedding into memory

    Input: filename: of the model file
    Output: embedding object
    """
    embedding = Embedding.load(filename)
    # Apply useful extensions
    embedding.apply_expansion(DigitExpander)
    # We might need this if we want to ignore case
    # embedding.apply_expansion(CaseExpander)
    return embedding
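# Usage sketch for loadembedding() above (hypothetical path). DigitExpander expands
# the vocabulary so tokens containing digits can fall back to a digit-normalised entry:
from os.path import expanduser
emb = loadembedding(expanduser("~/polyglot_data/embeddings2/en/embeddings_pkl.tar.bz2"))
print(emb.get("2016"))  # a vector via the digit expansion, or None if still unknown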
def train_word_embeddings(train, polyglot_data):
    embeddings = Embedding.load(polyglot_data)
    zpadd = [0] * 64
    train_embds = []
    for t in train:
        t_e = []
        for w in t:
            if w == u'*':
                t_e.append(zpadd)
            else:
                e = embeddings.get(w)
                if e is not None:
                    t_e.append(e)
                else:
                    t_e.append(zpadd)
        train_embds.append(t_e)
    return train_embds
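# Usage sketch for train_word_embeddings() above (hypothetical path). '*' padding
# tokens and out-of-vocabulary words are both mapped to a 64-dimensional zero vector:
from os.path import expanduser
sentences = [[u'this', u'is', u'*', u'test']]
embedded = train_word_embeddings(
    sentences, expanduser("~/polyglot_data/embeddings2/en/embeddings_pkl.tar.bz2"))
print(len(embedded[0]), len(embedded[0][0]))  # 4 tokens, each a 64-dim vector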
def train_word_embeddings(train):
    embeddings = Embedding.load(
        "/home/amir/polyglot_data/embeddings2/fa/embeddings_pkl.tar.bz2")
    zpadd = [0] * 64
    train_embds = []
    for t in train:
        t_e = []
        for w in t:
            if w == u'*':
                t_e.append(zpadd)
            else:
                e = embeddings.get(w)
                if e is not None:
                    t_e.append(e)
                else:
                    t_e.append(zpadd)
        train_embds.append(t_e)
    return train_embds
def __init__(self, coherence_measure, num_top_tokens, language=None):
    """
    :param coherence_measure: Coherence measure to be used. Supported values are:
        'u_mass', 'c_v', 'c_uci', 'c_npmi',
    :type coherence_measure: str
    :param num_top_tokens: Number of top tokens to extract from every topic.
        The terms will be used to determine the coherence of the topics.
    :type num_top_tokens: int
    :param language: Either 'german' or 'english'. It is required when the selected
        coherence measure is 'embedding_similarities' or 'embedding_variances'
    :type language: str
    """
    if coherence_measure not in [
            'u_mass', 'embedding_similarities', 'embedding_variances'
    ]:
        raise Exception('{} is not a supported coherence measure'.format(
            coherence_measure))
    self.coherence_measure = coherence_measure
    self.num_top_tokens = num_top_tokens
    self._embeddings = None
    if coherence_measure in [
            'embedding_similarities', 'embedding_variances'
    ]:
        if language is None:
            raise Exception(
                'For word embedding based coherence measures a language has to be provided.'
                ' Either "german" or "english". ')
        if language == 'german':
            pass
        elif language == 'english':
            self._embeddings = Embedding.from_glove(
                "D:/Bachelorarbeit/Projekte/tm-maria/models/word_embeddings/glove.6B/glove.6B.100d.txt"
            )
        else:
            raise Exception(
                'Language {} is not supported. Either "german" or "english".'
                .format(language))
def create_list_file():
    try:
        embeddings = Embedding.load(
            os.path.join(DOWNLOAD_DIR, "embeddings2/en/embeddings_pkl.tar.bz2"))
    except Exception as e:
        print e.message
        ActivityLog.objects.create_log(
            None,
            level='C',
            view_name='scrappers_miners.utils.utils.create_list_file',
            message='Error in loading library (polyglot) - %s' % e.message,
            traceback=traceback.format_exc())
        return False
    else:
        neighbors = []
        for word in FILTER_LIST_WORDS:
            try:
                neighbors += embeddings.nearest_neighbors(
                    word, top_k=NEAREST_NEIGHBORS)
            except Exception as e:
                ActivityLog.objects.create_log(
                    None,
                    level='W',
                    view_name='scrappers_miners.utils.utils.create_list_file',
                    message='Error in finding neighbors of a word in FILTER_LIST_WORDS with a message - %s' % e.message,
                    traceback=traceback.format_exc())
        filter_words_file = open(FILTER_WORD_FILE_PATH, 'w')
        for n in set(neighbors + FILTER_LIST_WORDS):
            filter_words_file.write(n.lower() + '\n')
        filter_words_file.close()
        return True
def _extract_we_polyglot(output_file, vocab_file, we_dic):
    # vocabulary
    vocabf = codecs.open(vocab_file, "r", "utf-8")
    vocab = []
    vecList = []
    for line in vocabf:
        vocab.append(line.split(" ")[0])
    # export
    embeddings = Embedding.load(we_dic)
    f = codecs.open(output_file, "w", "utf-8")
    for token in vocab:
        token = token.decode("utf-8")
        if token in embeddings:
            vector = embeddings[token].tolist()
            vector.insert(0, token)
            vecList.append(vector)
        else:
            print "====", token
    f.write("\n".join(" ".join(map(str, x)) for x in vecList))
    f.close()
    vocabf.close()
def loadExternalTools(self):
    ### Load external tools ###

    # get ContoPt
    wordnetLoadTimeStart = time.time()
    wordnet = ContoPtReader.ContoPtLoader()
    elapsedTimeWordnetLoad = time.time() - wordnetLoadTimeStart
    print "\nWordnet loaded in " + str(elapsedTimeWordnetLoad) + " sec.]\n"

    # get word2vec model
    wordEmbeddingLoadTimeStart = time.time()
    wordEmbeddingsModel = Embedding.load(
        parameters.paths["wordEmbeddings"] + "/polyglot-pt.pkl")
    #wordEmbeddingsModel = (self.wordEmbeddingsModel).normalize_words()
    elapsedTimeWordEmbeddingLoad = time.time() - wordEmbeddingLoadTimeStart
    print "\nWord2vec model loaded in " + str(
        elapsedTimeWordEmbeddingLoad) + " sec.]\n"

    return (wordnet, wordEmbeddingsModel)
def __init__(self, embedding: PolyEmbedding, mimic_model: Model):
    self.embedding = embedding
    self.mimic_model = mimic_model
    self.max_word_length = K.int_shape(mimic_model.input)[1]
    self.embedding_size = K.int_shape(mimic_model.output)[1]
    assert len(embedding.zero_vector()) == self.embedding_size
from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
from os import curdir, sep
from word2vec import transform_text, getKthNeighbour, closest_k_points_tsne
from polyglot.mapping import Embedding
import json
from tsne import tsne
from word2vec import transform_text
# from sklearn.manifold import TSNE

PORT_NUMBER = 8080

polish_embeddings = Embedding.load("polyglot-pl.pkl")

# ------------- t-SNE init ------------------------------
# model = TSNE(n_components=2, random_state=0)
# np.set_printoptions(suppress=True)
# tsne_rep = tsne(polish_embeddings.vectors)

# This class will handles any incoming request from
# the browser
print json.dumps(closest_k_points_tsne(polish_embeddings, "Beata", 10))


class myHandler(BaseHTTPRequestHandler):
    def do_POST(self):
        if self.path == "/sentence-find-near":
from src.Automati_Topic_Labeling_Wordnet.extrinsic_topic_labler import ExtrensicTopicLabeler
from src.Automati_Topic_Labeling_Wordnet.wordnet_embeddings import Wordnet
from src.Automati_Topic_Labeling_Wordnet.polyglot_embeddings import get_topic_labels as pl
from itertools import combinations
from polyglot.mapping import Embedding
from itertools import chain
from src.models import topic_models as tm
"""
Select the words out of a topic which have the smallest distance to each other.
This kind of preprocessing shall improve the topic labelling.
"""

#embeddings = Embedding.load("D:/Bachelorarbeit/Projekte/polyglot_data/embeddings2/en/embeddings_pkl.tar.bz2")
embeddings = Embedding.from_glove(
    "D:/Bachelorarbeit/Projekte/tm-maria/models/word_embeddings/glove.6B/glove.6B.100d.txt"
)


def topic_word_distance(word1, word2):
    """
    Calculate the distance with word embeddings between two words

    :param word1: string
    :param word2: string
    :return: return the words and the distance
    """
    try:
        dist = embeddings.distances(word1, [word2])
    except KeyError:
        return
    return ((word1, word2), dist)
def embedding(text, embeddingPATH):
    embeddings = Embedding.load(embeddingPATH)
    neighbors = embeddings.nearest_neighbors(text)
    for w, d in zip(neighbors, embeddings.distances(text, neighbors)):
        print("{}\n{}".format(w, d))
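# Usage sketch for embedding() above (hypothetical path to the downloaded English
# polyglot archive); prints each nearest neighbour of "green" with its distance:
from os.path import expanduser
embedding("green", expanduser("~/polyglot_data/embeddings2/en/embeddings_pkl.tar.bz2"))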
def cache(self, name, cache, url=None, max_vectors=None):
    if self.emb_format == 'polyglot':
        try:
            from polyglot.mapping import Embedding
        except ImportError:
            logger.error('Please install `polyglot` package first.')
            return None
        embeddings = Embedding.load(name)
        self.itos = embeddings.vocabulary.id_word
        self.stoi = embeddings.vocabulary.word_id
        self.dim = embeddings.shape[1]
        self.vectors = torch.tensor(embeddings.vectors).view(-1, self.dim)
    elif self.emb_format == 'glove':
        itos = []
        vectors = []
        with open(name, 'r', encoding='utf8') as f:
            for line in f:
                try:
                    values = line.rstrip().split()
                    itos.append(values[0])
                    vectors.append([float(x) for x in values[1:]])
                except ValueError as e:
                    # ignore entries that look like:
                    # by [email protected] 0.6882 -0.36436 ...
                    continue
        self.itos = itos
        self.stoi = dict(zip(self.itos, range(len(self.itos))))
        self.dim = len(vectors[0])
        self.vectors = torch.tensor(vectors).view(-1, self.dim)
    elif self.emb_format == 'fasttext':
        try:
            from gensim.models import FastText
        except ImportError:
            logger.error('Please install `gensim` package first.')
            return None
        self.vectors = FastText.load_fasttext_format(name)
        self.itos = list(self.vectors.wv.vocab.keys())
        self.stoi = dict(zip(self.itos, range(len(self.itos))))
        self.unk_vector = self.vectors['<unk>']
        self.dim = self.vectors.vector_size
    elif self.emb_format == 'word2vec':
        try:
            from gensim.models import KeyedVectors
        except ImportError:
            logger.error('Please install `gensim` package first.')
            return None
        embeddings = KeyedVectors.load_word2vec_format(
            name, unicode_errors='ignore', binary=self.binary)
        self.itos = embeddings.index2word
        self.stoi = dict(zip(self.itos, range(len(self.itos))))
        self.dim = embeddings.vector_size
        self.vectors = torch.tensor(embeddings.vectors).view(-1, self.dim)
    elif self.emb_format == 'text':
        tokens = []
        vectors = []
        if self.binary:
            import pickle
            # vectors should be a dict mapping str keys to numpy arrays
            with open(name, 'rb') as f:
                d = pickle.load(f)
            tokens = list(d.keys())
            vectors = list(d.values())
        else:
            # each line should contain a token and its following fields
            # <token> <vector_value_1> ... <vector_value_n>
            with open(name, 'r', encoding='utf8') as f:
                for line in f:
                    if line:  # ignore empty lines
                        fields = line.rstrip().split()
                        tokens.append(fields[0])
                        vectors.append(list(map(float, fields[1:])))
        self.itos = tokens
        self.stoi = dict(zip(self.itos, range(len(self.itos))))
        self.vectors = torch.tensor(vectors)
        self.dim = self.vectors.shape[1]
    elif self.emb_format == 'fonseca':
        import numpy as np
        import os
        embeddings = np.load(os.path.join(name, 'types-features.npy'))
        texts = open(os.path.join(name, 'vocabulary.txt'), 'r').read()
        words = set([w.strip() for w in texts.split('\n')])
        self.itos = list(words)
        self.stoi = dict(zip(self.itos, range(len(self.itos))))
        self.dim = embeddings.shape[1]
        self.vectors = torch.tensor(embeddings).view(-1, self.dim)
    if self.unk_vector is None:
        self.unk_vector = self.vectors.mean(0).unsqueeze(0)
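# Minimal, self-contained sketch of the itos/stoi/vectors bookkeeping that cache()
# builds, shown on a toy vocabulary (no embedding file required; only torch needed):
import torch

itos = ['the', 'cat', 'sat']                               # index -> token
stoi = dict(zip(itos, range(len(itos))))                   # token -> index
vectors = torch.tensor([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]])
unk_vector = vectors.mean(0).unsqueeze(0)                  # fallback for unknown tokens
assert vectors[stoi['cat']].shape[0] == 2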
class DataUtil: def __init__(self, config): self.config = config self.line_dict = {} self.mentions = [] self.sentences = [] self.all_word_average = 0 self.embeddings = Embedding() self.max_as_count = 0 self.test_rs = [] self.test_r_answers = [] self.test_answer_indices = [] self.test_r_antecedents = [] self.t_count = 0 self.t_dict = {} self.r_list = [] self.data = None self.init_data() def get_embeddings(self): return self.embeddings def init_data(self): if os.path.exists(self.config.data_pkl_path): pkl_file = open(self.config.data_pkl_path, 'rb') self.data = pickle.load(pkl_file) self.max_as_count = len(self.data['mistake_lists'][0]) pkl_file.close() print('load pkl finished') else: self.build_line_dict() self.parse_data() self.compute_r_a_tuples() all_words = self.get_all_words() self.calc_word_average(all_words) def mention_pos(self, mention): line = self.sentences[mention[0]] m_count = len([ word[0] for word in line if word and ('n' in word[1] or word[1] == 't') ]) return [(mention[1] + 1) / m_count * 0.1] def distance_mentions(self, m1, m2): # [0,1,2,3,4,5-7,8-15,16-31,32-63,64+] d = abs(m2[1] - m1[1]) if d == 0: return [1, 0, 0, 0, 0, 0, 0, 0, 0, 0] elif d == 1: return [0, 1, 0, 0, 0, 0, 0, 0, 0, 0] elif d == 2: return [0, 0, 1, 0, 0, 0, 0, 0, 0, 0] elif d == 3: return [0, 0, 0, 1, 0, 0, 0, 0, 0, 0] elif d == 4: return [0, 0, 0, 0, 1, 0, 0, 0, 0, 0] elif d >= 5 and d <= 7: return [0, 0, 0, 0, 0, 1, 0, 0, 0, 0] elif d >= 8 and d <= 15: return [0, 0, 0, 0, 0, 0, 1, 0, 0, 0] elif d >= 16 and d <= 31: return [0, 0, 0, 0, 0, 0, 0, 1, 0, 0] elif d >= 32 and d <= 63: return [0, 0, 0, 0, 0, 0, 0, 0, 1, 0] else: return [0, 0, 0, 0, 0, 0, 0, 0, 0, 1] def mention_equals(self, m1, m2): return m1[0] == m2[0] and m1[1] == m2[1] and m1[2] == m2[2] and m1[ 3] == m2[3] def distance_intervening_mentions(self, m1, m2): d = 0 start = False for m in self.mentions: if self.mention_equals(m, m1): start = True if self.mention_equals(m, m2): break if start: d += 1 if d == 0: return [1, 0, 0, 0, 0, 0, 0, 0, 0, 0] elif d == 1: return [0, 1, 0, 0, 0, 0, 0, 0, 0, 0] elif d == 2: return [0, 0, 1, 0, 0, 0, 0, 0, 0, 0] elif d == 3: return [0, 0, 0, 1, 0, 0, 0, 0, 0, 0] elif d == 4: return [0, 0, 0, 0, 1, 0, 0, 0, 0, 0] elif d >= 5 and d <= 7: return [0, 0, 0, 0, 0, 1, 0, 0, 0, 0] elif d >= 8 and d <= 15: return [0, 0, 0, 0, 0, 0, 1, 0, 0, 0] elif d >= 16 and d <= 31: return [0, 0, 0, 0, 0, 0, 0, 1, 0, 0] elif d >= 32 and d <= 63: return [0, 0, 0, 0, 0, 0, 0, 0, 1, 0] else: return [0, 0, 0, 0, 0, 0, 0, 0, 0, 1] def is_overlap(self, m1, m2): if m1[2] == m2[2]: return [1] return [0] def get_all_words(self): all_words = [] for sent in self.sentences: if sent: all_words += sent return all_words def calc_word_average(self, words): words = [word for word in words if word != ''] if len(words) == 0: return [np.float32(0.0)] * self.config.embedding_size average = sum([ self.embeddings.get(word[0], word[1], default=np.asarray([np.float32(0.0)] * self.config.embedding_size)) for word in words ]) / len(words) return nd.tolist(average) def build_line_dict(self): with open(self.config.result_path) as f: lines = f.readlines() for line in lines: line_num, word_index = line.strip().split() self.line_dict[int(line_num)] = int(word_index) def find_first_word_embedding(self, mention): line = self.sentences[mention[0]] assert line != [] return self.embeddings.get(line[0][0], line[0][1], default=np.asarray( [0.0] * self.config.embedding_size)) def find_last_word_embedding(self, mention): line = self.sentences[mention[0]] assert line != 
[] return self.embeddings.get(line[-1][0], line[-1][1], default=np.asarray( [0.0] * self.config.embedding_size)) def find_following(self, mention, word_num): line = copy.copy(self.sentences[mention[0]]) assert line != [] word_index = mention[1] for i in range(word_num): line.append('') following = line[word_index + 1:word_index + word_num + 1] del line return following def find_proceding(self, mention, word_num): line = copy.copy(self.sentences[mention[0]]) assert line != [] word_index = mention[1] for i in range(word_num): line = [''] + line proceding = line[word_index:word_index + word_num] del line return proceding def find_following_embeddings(self, mention, word_num): line = copy.copy(self.sentences[mention[0]]) assert line != [] word_index = mention[1] for i in range(word_num): line.append('None') following = line[word_index + 1:word_index + word_num + 1] follow_embed = [] for follow in following: if follow == "None": follow_embed.append([0.0] * self.config.embedding_size) else: follow_embed.append( nd.tolist( self.embeddings.get( follow[0], follow[1], default=np.asarray([0.0] * self.config.embedding_size)))) del line # return flatten(follow_embed) return functools.reduce(lambda x, y: x + y, follow_embed) def find_proceding_embeddings(self, mention, word_num): line = copy.copy(self.sentences[mention[0]]) assert line != [] word_index = mention[1] for i in range(word_num): line = ['None'] + line proceding = line[word_index:word_index + word_num] proced_embed = [] for proced in proceding: if proced == "None": proced_embed.append([0.0] * self.config.embedding_size) else: proced_embed.append( nd.tolist( self.embeddings.get( proced[0], proced[1], default=np.asarray([0.0] * self.config.embedding_size)))) del line print('type of proced_embed is: ', type(proced_embed)) # return flatten(proced_embed) return functools.reduce(lambda x, y: x + y, proced_embed) def average_sent(self, mention): line = self.sentences[mention[0]] assert line != [] return self.calc_word_average(line) def parse_data(self): with open(self.config.data_path) as o_f: lines = o_f.readlines() self.lines = lines for line in lines: line = line.encode().decode('utf-8').strip().split( '---------->') words = line[0].split() r = line[1].strip() tups = [tuple(word.split('/')) for word in words] target_r = '' for tup_i in range(len(tups)): if tups[tup_i][1] == 'r' and tups[tup_i][0] == r: target_r = (tups[tup_i][0], tup_i) assert target_r self.r_list.append(target_r) self.sentences.append(tups) for line_num, word_index in self.line_dict.items(): target_mention_tup = (line_num, word_index, self.sentences[line_num][word_index][0], self.sentences[line_num][word_index][1]) if word_index > -1 and ('n' in target_mention_tup[3] or target_mention_tup[3] == 't') and len( self.sentences[line_num]) <= 50: line_mention = [] words = self.sentences[line_num] r = self.r_list[line_num] for i in range(len(words)): w_tup = words[i] word_type = w_tup[1] word_span = w_tup[0] # if not self.t_dict.has_key(word_type): if word_type not in self.t_dict: self.t_dict[word_type] = self.t_count self.t_count += 1 if 'n' in word_type or word_type == 't': mention_tup = (line_num, i, word_span, word_type) self.mentions.append(mention_tup) line_mention.append(mention_tup) elif word_span == r[0] and i == r[1]: r_tup = (line_num, i, word_span, 'r') if target_mention_tup[1] < i: self.test_rs.append(r_tup) self.test_r_answers.append(target_mention_tup) if line_mention: if len(line_mention) > self.max_as_count: self.max_as_count = len(line_mention) 
self.test_r_antecedents.append( copy.copy(line_mention)) else: print("shud not reach here", line_num) self.test_r_antecedents.append( [self.config.NA]) assert len(self.test_r_answers) == len( self.test_r_antecedents) == len( self.test_rs) def compute_r_a_tuples(self): for i in range(len(self.test_rs)): r = self.test_rs[i] ans = self.test_r_answers[i] found = False for k in range(len(self.test_r_antecedents[i])): test_ante = self.test_r_antecedents[i][k] if test_ante != self.config.NA and self.mention_equals( test_ante, ans): found = True self.test_answer_indices.append(k) if not found: print(r[0], r[1], r[2]) print(ans[0], ans[1], ans[2], ans[3]) def h(self, a, m): if a == 0 and m == 0: result = [np.float32(0.0)] * self.config.I return result if a == '#': a = m embed_a = nd.tolist( self.embeddings.get(a[2], a[3], default=np.asarray( [0.0] * self.config.embedding_size))) embed_m = nd.tolist( self.embeddings.get(m[2], m[3], default=np.asarray( [0.0] * self.config.embedding_size))) # print len(embed_m) first_aw_embed = nd.tolist(self.find_first_word_embedding(a)) # print len(first_aw_embed) first_mw_embed = nd.tolist(self.find_first_word_embedding(m)) # print len(first_mw_embed) last_aw_embed = nd.tolist(self.find_last_word_embedding(a)) # print len(last_aw_embed) last_mw_embed = nd.tolist(self.find_last_word_embedding(m)) # print len(last_mw_embed) proced2_a_embed = self.find_proceding_embeddings(a, 2) follow2_a_embed = self.find_following_embeddings(a, 2) proced2_m_embed = self.find_proceding_embeddings(m, 2) follow2_m_embed = self.find_following_embeddings(m, 2) avg5f_a = self.calc_word_average(self.find_following(a, 5)) # print len(avg5f_a) avg5p_a = self.calc_word_average(self.find_proceding(a, 5)) # print len(avg5p_a) avg5f_m = self.calc_word_average(self.find_following(m, 5)) # print len(avg5f_m) avg5p_m = self.calc_word_average(self.find_proceding(m, 5)) # print len(avg5p_m) avgsent_a = self.average_sent(a) # print len(avgsent_a) avgsent_m = self.average_sent(m) # print len(avgsent_m) avg_all = [self.all_word_average] # print len(avg_all) type_a = [self.t_dict[a[3]]] # self.type_dict[a[3]] type_m = [self.t_dict[m[3]]] # self.type_dict[m[3]] mention_pos_a = self.mention_pos(a) mention_pos_m = self.mention_pos(m) mention_len_a = [len(a[2])] mention_len_m = [len(m[2])] distance = self.distance_mentions(a, m) distance_m = self.distance_intervening_mentions(a, m) result = embed_a + first_aw_embed + last_aw_embed + proced2_a_embed + follow2_a_embed + avg5f_a + avg5p_a + avgsent_a + type_a + mention_pos_a + mention_len_a + embed_m + first_mw_embed + last_mw_embed + proced2_m_embed + follow2_m_embed + avg5f_m + avg5p_m + avgsent_m + type_m + mention_pos_m + mention_len_m + avg_all + distance + distance_m if len(result) != self.config.I: print(len(proced2_a_embed)) print(len(follow2_a_embed)) print(len(proced2_m_embed)) print(len(follow2_m_embed)) print(len(result)) # print sys.exit(0) return result def get_test_data(self, size, mode): if mode == 'test': if self.data: r_answers, h_r_antecedents = self.data['answer_indices'][ -size:], self.data['encoded_anted_lists'][-size:] return r_answers, h_r_antecedents rs_batch = self.test_rs[-size:] r_answers = self.test_answer_indices[-size:] r_antecedents = self.test_r_antecedents[-size:] else: if self.data: r_answers, h_r_antecedents = self.data[ 'answer_indices'][:size], self.data[ 'encoded_anted_lists'][:size] return r_answers, h_r_antecedents rs_batch = self.test_rs[:size] r_answers = self.test_answer_indices[:size] r_antecedents = 
self.test_r_antecedents[:size] h_r_antecedents = [] for combo_i in range(len(rs_batch)): combo_r = rs_batch[combo_i] combo_as = r_antecedents[combo_i] combos = [self.h(combo_a, combo_r) for combo_a in combo_as] padding = [np.float32(0.0)] * self.config.I combos.extend([padding] * (self.max_as_count - len(combos))) h_r_antecedents.append(combos) return r_answers, h_r_antecedents def mistake(self, a, T): if a == self.config.NA and T != self.config.NA: return self.config.a_fn if a != self.config.NA and T == self.config.NA: return self.config.a_fa if a != self.config.NA and a != T: return self.config.a_wl if a == T: return 0 def encode_mention_pairs(self, batch_Rs, batch_Ts, batch_As): if self.data: return batch_As, batch_Ts # batch_HTs = [] # for j in range(len(batch_Ts)): # t = batch_Ts[j] # r = batch_Rs[j] # ht = self.h(t, r) # batch_HTs.append(ht) # # hts = np.array(hts) batch_HAs = [] for z in range(len(batch_Rs)): As = batch_As[z] As = [A for A in As if A != self.config.NA] r = batch_Rs[z] HA = [self.h(a, r) for a in As] padding = [np.float32(0.0)] * self.config.I HA.extend([padding] * (self.max_as_count - len(HA))) batch_HAs.append(HA) return batch_HAs def get_shuffled_data_set(self): if self.data: seed = random.random() Rs, As, Ts, mistakes, Ans_indices = self.data[ 'rs'][:-self.config.test_batch_size], self.data[ 'encoded_anted_lists'][:-self.config.test_batch_size], self.data[ 'encoded_answer_pairs'][:-self.config.test_batch_size], self.data[ 'mistake_lists'][:-self.config. test_batch_size], self.data[ 'answer_indices'][:-self. config. test_batch_size] random.shuffle(Rs, lambda: seed) random.shuffle(As, lambda: seed) random.shuffle(Ts, lambda: seed) random.shuffle(mistakes, lambda: seed) random.shuffle(Ans_indices, lambda: seed) return Rs, As, Ts, mistakes, Ans_indices random_indices = [ randi for randi in range( len(self.test_rs) - self.config.test_batch_size) ] random.shuffle(random_indices) Rs = [] As = [] Ts = [] Ans_indices = [] for rand_i in random_indices: Rs.append(self.test_rs[rand_i]) As.append(self.test_r_antecedents[rand_i]) Ts.append(self.test_r_answers[rand_i]) Ans_indices.append(self.test_answer_indices[rand_i]) mistakes = [] for k in range(len(Ts)): T = Ts[k] A = As[k] mistake = [np.float32(self.mistake(a, T)) for a in A] mistake.extend([np.float32(0.0)] * (self.max_as_count - len(mistake))) mistakes.append(mistake) return Rs, As, Ts, mistakes, Ans_indices def pre_encode_data(self): assert len(self.test_r_antecedents) == len(self.test_rs) == len( self.test_r_answers) == len(self.test_answer_indices) encoded_anted_lists = [] encoded_answer_pairs = [] mistake_lists = [] for i in range(len(self.test_rs)): r = self.test_rs[i] anteds = self.test_r_antecedents[i] answer = self.test_r_answers[i] encoded_anteds = [self.h(anted, r) for anted in anteds] padding = [np.float32(0.0)] * self.config.I encoded_anteds.extend([padding] * (self.max_as_count - len(encoded_anteds))) encoded_anted_lists.append(encoded_anteds) encoded_answer = self.h(answer, r) encoded_answer_pairs.append(encoded_answer) mistakes = [self.mistake(anted, answer) for anted in anteds] mistakes.extend([np.float32(0.0)] * (self.max_as_count - len(mistakes))) mistake_lists.append(mistakes) assert len(encoded_answer_pairs) == len(encoded_anted_lists) == len( self.test_rs) == len( self.test_answer_indices) == len(mistake_lists) pickle_dict = { 'encoded_anted_lists': encoded_anted_lists, 'encoded_answer_pairs': encoded_answer_pairs, 'mistake_lists': mistake_lists, 'rs': self.test_rs, 'answer_indices': 
self.test_answer_indices, 'r_antecedents': self.test_r_antecedents, 'answers': self.test_r_answers } output = open('data.pkl', 'wb') pickle.dump(pickle_dict, output) output.close()
def computewordembedding(texts):
    """Creates a new word embedding using a given list of tokenized texts"""
    # Compute model
    model = Word2Vec(texts, size=100, window=5, min_count=5, workers=8)
    # Transform to polyglot model
    return Embedding.from_gensim(model)
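# Usage sketch for computewordembedding() above on a toy corpus. With min_count=5,
# a word needs at least 5 occurrences to receive a vector, hence the repetition:
corpus = [["the", "cat", "sat"], ["the", "dog", "sat"]] * 10
emb = computewordembedding(corpus)
print(emb.get("cat"))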
def load_embeddings(data_root, languages):
    return {
        l: Embedding.load(data_root + (f"embeddings/{l}.tar.bz2"))
        for l in languages
    }
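# Usage sketch for load_embeddings() above, assuming a hypothetical data_root layout
# with one polyglot archive per language code (e.g. data/embeddings/en.tar.bz2):
embeddings_by_lang = load_embeddings("data/", ["en", "de"])
print(embeddings_by_lang["en"].nearest_neighbors("green")[:3])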
def categorize_tweets(currentTwitterAccount, n_max_tweets=5, settings=None):
    if not settings:
        settings = load_from_config()
    subscription_key = settings["subscription_key"]
    api_url = "https://westcentralus.api.cognitive.microsoft.com/text/analytics/v2.0/"
    key_phrase_api_url = api_url + "keyPhrases"
    language_api_url = api_url + "languages"
    embeddings = Embedding.load(settings["model_location"])

    consumer_key = settings["consumer_key"]
    consumer_secret = settings["consumer_secret"]
    access_token = settings["access_token"]
    access_token_secret = settings["access_token_secret"]
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth)

    # Fetch swedish tweets
    def language_check(string):
        headers = {"Ocp-Apim-Subscription-Key": subscription_key}
        response = requests.post(language_api_url, headers=headers,
                                 json={"documents": [{"id": 1, "text": string}]})
        if response.ok:
            return response.json()["documents"][0]["detectedLanguages"][0]["iso6391Name"]
        else:
            if response.status_code == 429:
                time.sleep(1)
                return language_check(string)
            response.raise_for_status()

    documents = {"documents": []}
    tweets_raw = []
    i = 0
    for tweet in tweepy.Cursor(api.user_timeline, id=currentTwitterAccount,
                               tweet_mode="extended").items(n_max_tweets):
        # removing the http link at the end of the text
        result = re.sub(r"http\S+", "", tweet.full_text)
        if language_check(result) == "sv":
            documents['documents'].append({'id': i, 'language': 'sv', 'text': result})
            tweets_raw.append((result, tweet.created_at))
            i += 1

    ### Extract key words
    headers = {"Ocp-Apim-Subscription-Key": subscription_key}
    response = requests.post(key_phrase_api_url, headers=headers, json=documents)
    key_phrases = response.json()

    # Parse key words
    key_words = [[y for y in x.values()][0] for x in key_phrases["documents"]]
    key_words = [[y.split(" ") for y in x] for x in key_words]
    key_words = [[y.strip() for sublist in l for y in sublist] for l in key_words]

    ### Determine closest category for the sets of key words
    def embedding_distances(word, category):
        # Adapter to handle missing words for embedding model
        try:
            return embeddings.distances(word, category)
        except:
            return [1e16]  # If word is not present, return big integer..

    def topic(word):
        # Determine category score for word
        topic_list = [embedding_distances(word.lower(), category)
                      for category in CATEGORIES]  # compute distances to categories
        topic_list = [min(l) for l in topic_list]  # compute average of each sublist
        min_value = min(topic_list)
        return topic_list.index(min_value), min_value

    topic_dists = [[topic(word) for word in l] for l in key_words]

    def cluster_topics(topic_dist):
        topic_dict = {}
        for t in topic_dist:
            if t[0] in topic_dict:
                topic_dict[t[0]] = (min(topic_dict[t[0]][0], t[1]), topic_dict[t[0]][1] + 1)
            else:
                topic_dict[t[0]] = (t[1], 1)
        topics = [(key, value[0]) for key, value in topic_dict.items()]
        values = [x[1] for x in topics]
        return topics[values.index(min(values))]

    categorized_tweets = [{"text": tweets_raw[i][0],
                           "category": CATEGORY_NAMES[cluster_topics(topic_dists[i])[0]],
                           "time": str(tweets_raw[i][1])}
                          for i in range(len(topic_dists))]
    return categorized_tweets
import os
import glob
import sqlite3

from polyglot.text import Text, Word
from polyglot.downloader import downloader
from polyglot.mapping import Embedding

downloader.download("embeddings2.pt")
downloader.download("pos2.pt")
downloader.download("morph2.pt")
downloader.supported_tasks(lang="pt")

embeddings = Embedding.load(
    "/Users/emersonantonio/polyglot_data/embeddings2/pt/embeddings_pkl.tar.bz2"
)

#neighbors = embeddings.nearest_neighbors("verde")
#for w,d in zip(neighbors, embeddings.distances("green", neighbors)):
#    print("{:<8}{:.4f}".format(w,d))

# Create the database
con = sqlite3.connect('./db/dadosDipolNLTK.db')
cur = con.cursor()

sql_create = 'CREATE TABLE IF NOT EXISTS miniDicionario '\
             '(' \
             '  id integer primary key AUTOINCREMENT, '\
             '  word varchar(50), ' \
             '  radical varchar(50), ' \
             '  tag varchar(50)' \
             ')'
def GloveEmbedding(embedding_dim):
    #res = PolyglotEmbedding.from_glove("/home/is/seiya-ka/embedding_vector/glove.twitter.27B."+str(embedding_dim)+"d.txt")
    res = PolyglotEmbedding.from_glove(
        "/home/is/seiya-ka/embedding_vector/glove.6B." + str(embedding_dim) +
        "d.txt")
    return res
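# Usage sketch for GloveEmbedding() above. The hard-coded path means it only works
# where that file exists; embedding_dim selects among the 50/100/200/300-d GloVe files:
glove = GloveEmbedding(100)
print(glove.nearest_neighbors("green")[:5])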
def langmodelload(language):
    ########################
    global stop_words
    global question_words
    global embeddings
    global model
    global lang_dict
    ########################
    LibLocLang = "./udpipe-ud/"
    ########################
    if language == "en":
        model = Model(LibLocLang + 'english-ewt-ud-2.5-191206.udpipe')
    elif language == "ar":
        model = Model(LibLocLang + 'arabic-padt-ud-2.5-191206.udpipe')
    elif language == "zh":
        model = Model(LibLocLang + 'chinese-gsdsimp-ud-2.5-191206.udpipe')
    elif language == "id":
        model = Model(LibLocLang + 'indonesian-gsd-ud-2.5-191206.udpipe')
    elif language == "ko":
        model = Model(LibLocLang + 'korean-gsd-ud-2.5-191206.udpipe')
    elif language == "pt":
        model = Model(LibLocLang + 'portuguese-gsd-ud-2.5-191206.udpipe')
    elif language == "vi":
        model = Model(LibLocLang + 'vietnamese-vtb-ud-2.5-191206.udpipe')
    elif language == "hi":
        model = Model(LibLocLang + 'hindi-hdtb-ud-2.5-191206.udpipe')
    elif language == "jp":
        model = Model(LibLocLang + 'japanese-gsd-ud-2.5-191206.udpipe')
    elif language == 'es':
        model = Model(LibLocLang + 'spanish-gsd-ud-2.5-191206.udpipe')
    ########################
    base_question_words = [
        'where', 'which', "who", "why", "what", "when", "please", "how",
        "is", "are", "will", "could", "should", "was", "were", "do", "did",
        "can"
    ]
    question_words = []
    for i in range(0, len(base_question_words)):
        question_words.append(
            Text(base_question_words[i]).transliterate(language))
    ########################
    if stopwords.has_lang(
            language
    ) and language != "hi" and language != "ar" and language != "zh" and language != "vi" and language != "ko" and language != "jp" and language != "id" and language != "ms":
        ########################
        stop_words = list(stopwords.stopwords(language))
        stop_words_list = []
        ########################
        for i in range(0, len(stop_words)):
            try:
                text = Text(stop_words[i], hint_language_code=language)
                ########################
                if (text.pos_tags[0][1] != "NOUN") and (
                        text.pos_tags[0][1] != "VERB") and (
                            text.pos_tags[0][1] != "PRON"):
                    stop_words_list.append(text.pos_tags[0][0])
            except Exception as e:
                print(e)
        stop_words = stop_words_list
    else:
        print(language + " has errors.")
        stop_words = []
    ########################
    ########################
    embeddings = Embedding.load("./polyglot_data/embeddings2/" + language +
                                "/embeddings_pkl.tar.bz2")
    lang_dict[language] = {
        'model': model,
        'embeddings': embeddings,
        'stop_words': stop_words
    }
def load_embedding(self):
    path = os.path.join(self.c["data_root"], "embeddings",
                        self.c["language"] + ".tar.bz2")
    return PolyglotEmbedding.load(path)
def load(cls, polyglot_embedding_path: str, mimic_model_path: str):
    e = PolyEmbedding.load(polyglot_embedding_path)
    model = load_model(mimic_model_path, compile=False)
    return cls(e, model)
#This gives only the polyglot embeddings.
import numpy as np
from polyglot.mapping import Embedding
import pickle
from pos_helper import *
from nltk import pos_tag

src_embeddings = Embedding.load(
    "/home/krishna/polyglot_data/embeddings2/en/embeddings_pkl.tar.bz2")
tar_embeddings = Embedding.load(
    "/home/krishna/polyglot_data/embeddings2/de/embeddings_pkl.tar.bz2")


def make_align_dict(inp, nwords):
    inplist = inp.split()
    aldict = {}
    for j in range(nwords):
        aldict[j] = []
    for j in inplist:
        a, b = j.split('-')
        a, b = int(a), int(b)
        if b not in aldict:
            aldict[b] = []
        aldict[b].append(a)
    return aldict


def get_target_embedding(ind, inlist):
    try:
        e2 = tar_embeddings[inlist[ind]]
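# Usage sketch for make_align_dict() above. The input string is a word alignment in
# the common "src-tgt" pair format (as produced by aligners such as fast_align);
# nwords is the number of target words:
print(make_align_dict("0-0 1-2 2-2", 3))
# -> {0: [0], 1: [], 2: [1, 2]}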
def test_polyglot(self):
    from polyglot.mapping import Embedding
    embeddings = Embedding.load(
        "/home/rmyeid/polyglot_data/embeddings2/en/embeddings_pkl.tar.bz2")
    neighbors = embeddings.nearest_neighbors("green")
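# Standalone variant of the check above (a sketch; assumes the English polyglot
# embeddings were downloaded to the default ~/polyglot_data location). It mirrors
# the neighbour/distance printing pattern used in the other snippets:
from os.path import expanduser
from polyglot.mapping import Embedding

embeddings = Embedding.load(
    expanduser("~/polyglot_data/embeddings2/en/embeddings_pkl.tar.bz2"))
neighbors = embeddings.nearest_neighbors("green")
for w, d in zip(neighbors, embeddings.distances("green", neighbors)):
    print("{:<8}{:.4f}".format(w, d))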