def test_embedding(): from gensim.models import KeyedVectors from sematch.utility import FileIO from sematch.semantic.relatedness import WordRelatedness model_wiki = KeyedVectors.load_word2vec_format(FileIO.filename('models/w2v-model-enwiki_w2vformat'), binary=True) model_news = KeyedVectors.load_word2vec_format(FileIO.filename('models/googlenews.bin'), binary=True) rel = WordRelatedness(model_news) print(rel.word_similarity('happy','sad'))
def setUp(self): self.source_word_vec_file = datapath("EN.1-10.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt") self.target_word_vec_file = datapath("IT.1-10.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt") self.word_pairs = [("one", "uno"), ("two", "due"), ("three", "tre"), ("four", "quattro"), ("five", "cinque"), ("seven", "sette"), ("eight", "otto"), ("dog", "cane"), ("pig", "maiale"), ("fish", "cavallo"), ("birds", "uccelli"), ("apple", "mela"), ("orange", "arancione"), ("grape", "acino"), ("banana", "banana") ] self.test_word_pairs = [("ten", "dieci"), ("cat", "gatto")] self.source_word_vec = KeyedVectors.load_word2vec_format(self.source_word_vec_file, binary=False) self.target_word_vec = KeyedVectors.load_word2vec_format(self.target_word_vec_file, binary=False)
def __init__(self): print("Loading in word vectors...") self.word_vectors = KeyedVectors.load_word2vec_format( '../large_files/GoogleNews-vectors-negative300.bin', binary=True ) print("Finished loading in word vectors")
def load(self, *args, **kwargs) -> KeyedVectors: """ Load dict of embeddings from given file Args: *args: arguments **kwargs: arguments Returns: """ # Check that header with n_words emb_dim present with open(self.load_path, encoding='utf8') as f: header = f.readline() if len(header.split()) != 2: raise RuntimeError('The GloVe file must start with number_of_words embeddings_dim line! ' 'For example "40000 100" for 40000 words vocabulary and 100 embeddings ' 'dimension.') if self.load_path and self.load_path.is_file(): log.info("[loading embeddings from `{}`]".format(self.load_path)) model_file = str(self.load_path) model = KeyedVectors.load_word2vec_format(model_file) else: log.error('No pretrained GloVe model provided or provided load_path "{}" is incorrect.' .format(self.load_path)) sys.exit(1) return model
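The header check above rejects raw GloVe files, which have no "<n_words> <dim>" first line. A minimal sketch of preparing such a file so this loader accepts it, using gensim's bundled glove2word2vec script; the paths are placeholders.

from gensim.scripts.glove2word2vec import glove2word2vec

glove_path = 'glove.6B.100d.txt'      # raw GloVe: "word v1 v2 ... v100" per line, no header
w2v_path = 'glove.6B.100d.w2v.txt'    # same vectors in word2vec text format

glove2word2vec(glove_path, w2v_path)  # prepends the "<n_words> <dim>" header line

with open(w2v_path, encoding='utf8') as f:
    print(f.readline().strip())       # e.g. "400000 100"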
def testConversion(self): word2vec2tensor(word2vec_model_path=self.datapath, tensor_filename=self.output_folder) with smart_open(self.metadata_file, 'rb') as f: metadata = f.readlines() with smart_open(self.tensor_file, 'rb') as f: vectors = f.readlines() # check if number of words and vector size in tensor file line up with word2vec with smart_open(self.datapath, 'rb') as f: first_line = f.readline().strip() number_words, vector_size = map(int, first_line.split(b' ')) self.assertTrue(len(metadata) == len(vectors) == number_words, ('Metadata file %s and tensor file %s imply different number of rows.' % (self.metadata_file, self.tensor_file))) # grab metadata and vectors from written file metadata = [word.strip() for word in metadata] vectors = [vector.replace(b'\t', b' ') for vector in vectors] # get the originaly vector KV model orig_model = KeyedVectors.load_word2vec_format(self.datapath, binary=False) # check that the KV model and tensor files have the same values key-wise for word, vector in zip(metadata, vectors): word_string = word.decode("utf8") vector_string = vector.decode("utf8") vector_array = np.array(list(map(float, vector_string.split()))) np.testing.assert_almost_equal(orig_model[word_string], vector_array, decimal=5)
def get_model(): """ Download model :return: `gensim` model """ return KeyedVectors.load_word2vec_format(_download(), binary=True)
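A hypothetical usage of this helper, showing typical queries on the returned KeyedVectors ('happy', 'sad' and 'coffee' are arbitrary probe words):

kv = get_model()
print(kv.similarity('happy', 'sad'))       # cosine similarity between two words
print(kv.most_similar('coffee', topn=3))   # nearest neighbours in the vector space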
def testAnnoyIndexingOfKeyedVectors(self): from gensim.similarities.index import AnnoyIndexer keyVectors_file = datapath('lee_fasttext.vec') model = KeyedVectors.load_word2vec_format(keyVectors_file) index = AnnoyIndexer(model, 10) self.assertEqual(index.num_trees, 10) self.assertVectorIsSimilarToItself(model, index) self.assertApproxNeighborsMatchExact(model, model, index)
def initModel(self): path = self.getModelFilePath() modelFull = self.config.getBooleanConfig("common.model.full")[0] if modelFull: if self.model is None: self.model = Word2Vec.load(path) self.wv = self.model.wv else: if self.wv is None: self.wv = KeyedVectors.load(path, mmap='r')
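A minimal sketch of producing the two artifacts that initModel() above expects: a full model (which can be trained further) and a bare KeyedVectors file that worker processes can memory-map read-only. The corpus and file names are placeholders.

from gensim.models import Word2Vec

sentences = [['hello', 'world'], ['goodbye', 'world']]   # toy corpus
model = Word2Vec(sentences, min_count=1)

model.save('model.full')    # read back with Word2Vec.load(); supports continued training
model.wv.save('model.kv')   # read back with KeyedVectors.load('model.kv', mmap='r')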
def load_embeddings(self, file_path): # Embeddings must be in fastText format, either binary (.bin) or text print('Loading embeddings...') if file_path.endswith('.bin'): from gensim.models.wrappers import FastText embeddings = FastText.load_fasttext_format(file_path) else: from gensim.models import KeyedVectors embeddings = KeyedVectors.load_word2vec_format(file_path) return embeddings

def test_add_single(self): """Test that adding entity in a manual way works correctly.""" entities = ['___some_entity{}_not_present_in_keyed_vectors___'.format(i) for i in range(5)] vectors = [np.random.randn(self.vectors.vector_size) for _ in range(5)] # Test `add` on already filled kv. for ent, vector in zip(entities, vectors): self.vectors.add(ent, vector) for ent, vector in zip(entities, vectors): self.assertTrue(np.allclose(self.vectors[ent], vector)) # Test `add` on empty kv. kv = EuclideanKeyedVectors(self.vectors.vector_size) for ent, vector in zip(entities, vectors): kv.add(ent, vector) for ent, vector in zip(entities, vectors): self.assertTrue(np.allclose(kv[ent], vector))
def __init__(self): self.cmdpairs = { "!similar": self.execute_cnb, "!similaryle": self.execute_yle, "!similarn": self.execute_n_cnb, "!similarnyle": self.execute_n_yle, "!similarnr": self.execute_n_cnb_r, "!similarnyler": self.execute_n_yle_r, "!xminusyplusz": self.execute_xyz_cnb, "!xminusypluszyle": self.execute_xyz_yle, # "!xminusyplusz": self.execute_x_minus_y_plus_z } self.cnb_wv = gensim.models.Word2Vec.load("./Resources/word2vec_2014-2019_04.model").wv self.yle_wv = KeyedVectors.load("./Resources/word2vec_yle_dersb")
def load_word2vec_embeddings(filepath, tokenizer, max_features, embedding_size): model = KeyedVectors.load_word2vec_format(filepath, binary=True) emb_mean, emb_std = model.vectors.mean(), model.vectors.std() # .vectors replaces the deprecated .wv.syn0 word_index = tokenizer.word_index nb_words = min(max_features, len(word_index)) embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embedding_size)) for word, i in word_index.items(): if i >= nb_words: continue # keep indices inside the nb_words-row matrix try: embedding_vector = model[word] embedding_matrix[i] = embedding_vector except KeyError: continue return embedding_matrix
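A hypothetical way to plug the returned matrix into a Keras model, mirroring the weights=[...] pattern used elsewhere in this collection; filepath, tokenizer, max_features and embedding_size are assumed to come from the surrounding script.

from keras.layers import Embedding

embedding_matrix = load_word2vec_embeddings(filepath, tokenizer, max_features, embedding_size)
embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],
                            output_dim=embedding_matrix.shape[1],
                            weights=[embedding_matrix],
                            trainable=False)   # keep the pretrained vectors frozen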
def load(cls, np2vec_model_file, binary=False, word_ngrams=0): """ Load the np2vec model. Args: np2vec_model_file (str): the file containing the np2vec model to load binary (bool): boolean indicating whether the np2vec model to load is in binary format word_ngrams (int {1,0}): If 1, np2vec model to load uses word vectors with subword ( ngrams) information. Returns: np2vec model to load """ if word_ngrams == 0: return KeyedVectors.load_word2vec_format( np2vec_model_file, binary=binary) elif word_ngrams == 1: return FastText.load(np2vec_model_file) else: logger.error('invalid value for \'word_ngrams\'')
def wv(w1, w2, t): # lazy load the wordvector model... global wvmodel if wvmodel is None: print(' * loading wordvector model (', modelFile, ')...') wvmodel = KeyedVectors.load_word2vec_format(modelFile, binary=False) wvmodel.init_sims(replace=True) # no more updates, prune memory try: # # since we've got wordnet synset objects (like cat.n.01), we # must turn this back into a regular word ('cat') because the # word vector GloVe models are plain words with spaces turned # into hyphens on phrases (e.g. climate-change, black-and-white) # wv_w1, wv_w2 = _mk_wv_word(w1), _mk_wv_word(w2) distance = wvmodel.similarity(wv_w1, wv_w2) return distance if abs(distance) >= t else 0 except KeyError: # word not in the vocabulary return 0
def load_embeddings(pytorch_embedding, word2idx, filename, embedding_size): print("Copying pretrained word embeddings from ", filename, flush=True) en_model = KeyedVectors.load_word2vec_format(filename) """ Fetching all of the words in the vocabulary. """ pretrained_words = set() for word in en_model.vocab: pretrained_words.add(word) arr = [0] * len(word2idx) for word in word2idx: index = word2idx[word] if word in pretrained_words: arr[index] = en_model[word] else: arr[index] = np.random.uniform(-1.0, 1.0, embedding_size) """ Creating a numpy dictionary for the index -> embedding mapping """ arr = np.array(arr) """ Add the word embeddings to the empty PyTorch Embedding object """ pytorch_embedding.weight.data.copy_(torch.from_numpy(arr)) return pytorch_embedding
def fit(self, X, y=None): dw_params = self.get_params() print(dw_params) if False: #exists(self.output_file): model = KeyedVectors.load_word2vec_format(self.output_file) else: model = run_gensim(dw_params) nb_vecs = len(model.wv.vocab) # Map nodes to their features (note: assumes nodes are labeled as integers 1:N) features_matrix = np.asarray([model[str(node)] for node in range(nb_vecs)]) #features_matrix = np.random.randn((4,2)) if self.normalize: norms = np.linalg.norm(features_matrix, axis=1) if self.verbose: print(norms) print(norms.shape) assert norms.shape[0] == features_matrix.shape[0] for i in range(features_matrix.shape[0]): features_matrix[i,:] /= norms[i] norms = np.linalg.norm(features_matrix, axis=1) if self.verbose: print(norms) if self.verbose: print('features_matrix.shape = %s' % str(features_matrix.shape)) self.dw_params_ = dw_params self.gs_model_ = model self.features_matrix_ = features_matrix print('fit', self.features_matrix_.shape) return self
For more information on this file, see https://docs.djangoproject.com/en/1.10/topics/settings/ For the full list of settings and their values, see https://docs.djangoproject.com/en/1.10/ref/settings/ """ import os from gensim.models import KeyedVectors # Build paths inside the project like this: os.path.join(BASE_DIR, ...) BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) settings_dir = os.path.dirname(__file__) PROJECT_ROOT = os.path.abspath(os.path.dirname(settings_dir)) MODEL_PATH = os.path.join(PROJECT_ROOT, 'apollo/w2v/GoogleNews-vectors-negative300.bin.gz') MODEL = KeyedVectors.load_word2vec_format(MODEL_PATH, unicode_errors='replace', binary=True, limit=10000) # binary must be a bool, not the string 'True' # Quick-start development settings - unsuitable for production # See https://docs.djangoproject.com/en/1.10/howto/deployment/checklist/ # SECURITY WARNING: keep the secret key used in production secret! SECRET_KEY = ')p1#0dnupk$xc59wdfl^%!7)4myi--la+xd4=$krk&a55$%0rz' # SECURITY WARNING: don't run with debug turned on in production! DEBUG = True ALLOWED_HOSTS = [] # Application definition
def eval_blogcat(embeddings_file, labels_matrix=None, G=None, verbose=1, normalize=1, training_percents=[0.1, 0.6, 0.9]): # 0. Files #embeddings_file = "/mnt/raid1/deepwalk/blogcatalog.vec" if labels_matrix is None and G is None: G, labels_matrix = load_blogcat() # 1. Load Embeddings model = KeyedVectors.load_word2vec_format(embeddings_file, binary=False) labels = np.argwhere(labels_matrix) label_cnts = pd.Series(labels[:,1]).value_counts() if verbose > 1: print('\nLabel counts:') print(label_cnts) # delete the least frequent labels, which causes balancing problems labels_matrix = labels_matrix[:, :-2] # Map nodes to their features (note: assumes nodes are labeled as integers 1:N) features_matrix = np.asarray([model[str(node)] for node in range(len(G))]) if normalize: norms = np.linalg.norm(features_matrix, axis=1) if verbose: print norms print norms.shape assert norms.shape[0] == features_matrix.shape[0] for i in range(features_matrix.shape[0]): features_matrix[i,:] /= norms[i] norms = np.linalg.norm(features_matrix, axis=1) if verbose: print norms if verbose: print('-'*100) print(embeddings_file) print('features_matrix.shape = %s' % str(features_matrix.shape)) print('labels_matrix.shape = %s' % str(labels_matrix.shape)) # 2. Shuffle, to create train/test groups shuffles = [] number_shuffles = 1 for x in range(number_shuffles): # if we just have one group, make the split the same every time if number_shuffles == 1: shuffles.append(skshuffle(features_matrix, labels_matrix, random_state=123)) else: shuffles.append(skshuffle(features_matrix, labels_matrix)) # 3. to score each train/test group all_results = defaultdict(list) # uncomment for all training percents #training_percents = np.asarray(range(1,10))*.1 for train_percent in training_percents: # print('-'*100) # print('pct_train: %.2f' % train_percent) for shuf in shuffles: X, y = shuf training_size = int(train_percent * X.shape[0]) X_train = X[:training_size, :] y_train = y[:training_size] X_test = X[training_size:, :] y_test = y[training_size:] clf = TopKRanker(LogisticRegression()) clf.fit(X_train, y_train) # find out how many labels should be predicted #top_k_list = [len(l) for l in y_test] top_k_list = np.array(np.sum(y_test, axis=1).flatten()[0])[0].astype(np.int32) preds = clf.predict(X_test, top_k_list) if y_test.shape[1] != preds.shape[1]: raise Exception("imbalance of class dims") #continue results = OrderedDict() averages = ["micro", "macro", "samples", "weighted"] for average in averages: results[average] = f1_score(y_test, preds, average=average) all_results[train_percent].append(results) #break if verbose: print '-------------------' for train_percent in sorted(all_results.keys()): print 'Train percent:', train_percent for x in all_results[train_percent]: print x print '-------------------' return all_results
''' [[ 3.35454009e-03 -2.96757789e-03 8.95642443e-04 ..., 4.16836003e-03 -3.26405023e-03 -1.91481831e-03] ..., [ 7.19302261e-05 1.70022575e-03 3.59526509e-03 ..., 1.11010019e-03 3.70053225e-03 -3.61868995e-03]] ''' # 3. Persist the model model.save('sample.en.text.model') model.wv.save_word2vec_format('sample.en.text.vector', binary=True) ''' save() stores the complete model; wv.save_word2vec_format() stores only the vocabulary and the corresponding vectors, dropping the internal tree/training state, so the exported vectors cannot be trained incrementally ''' # 4. Load the persisted model (must match the format saved above) - method 1 new_model = Word2Vec.load('sample.en.text.model') print(new_model) # 4. Load the persisted vectors - method 2 from gensim.models import KeyedVectors filename = 'sample.en.text.vector' new_model = KeyedVectors.load_word2vec_format(filename, binary=True) # References: # [word2vec study notes](https://www.jianshu.com/p/418f27df3968) # [How to Develop Word Embeddings in Python with Gensim](https://machinelearningmastery.com/develop-word-embeddings-python-gensim/) # [gensim.model.word2vec API](https://radimrehurek.com/gensim/models/word2vec.html)
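A short sketch of the difference described above: only the full model can be trained further, while the exported vectors support lookup only. more_sentences is a placeholder corpus.

from gensim.models import Word2Vec, KeyedVectors

full = Word2Vec.load('sample.en.text.model')
more_sentences = [['another', 'english', 'sentence']]
full.build_vocab(more_sentences, update=True)    # extend the vocabulary
full.train(more_sentences, total_examples=len(more_sentences), epochs=full.epochs)

vectors_only = KeyedVectors.load_word2vec_format('sample.en.text.vector', binary=True)
# vectors_only has no train(): the training state was dropped at export time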
def calcfeatures(stancesFile, bodiesFile): path = os.path.abspath("") #gensim.models.KeyedVectors.load_word2vec_format #wmd_model = Word2Vec.load_word2vec_format('/data/w2v_googlenews/GoogleNews-vectors-negative300.bin.gz', binary=True) wmd_model = KeyedVectors.load_word2vec_format(path+'/data/GoogleNews-vectors-negative300.bin', binary=True) wmd_model.init_sims(replace=True) tknzr = TweetTokenizer() count = 0 features = [] classes = [] #N = getDocCount(path+'/data/training/train_bodies.csv') keys = {'agree': 0, 'disagree': 1, 'discuss': 2, 'unrelated': 3} bodies = loadBodies(bodiesFile) bigram_vectorizer = CountVectorizer(tokenizer=tknzr.tokenize, ngram_range=(1, 2), binary=False, lowercase=True, stop_words='english', min_df=1) vectorizer = TfidfVectorizer(tokenizer=tknzr.tokenize, ngram_range=(1, 1), binary=False, lowercase=True, stop_words='english', min_df=1) tfidfMat = vectorizer.fit_transform(list(bodies.values())) tfidfMat = vectorizer.transform(list(bodies.values())) tfidfMat = tfidfMat.toarray() vocab = vectorizer.get_feature_names() k = list(bodies.keys()) bodiesTokens = loadBodiesTokens(bodiesFile) with open(stancesFile, 'r', encoding='UTF-8') as csvDataFile1: csvReader1 = csv.reader(csvDataFile1) first = 1 for row in csvReader1: f = [] if first == 1: first = 0 else: print(count) count = count + 1 #class classes.append(keys[row[2]]) #canberra distance f.append(feat.canberraDist(row[0],bodies[row[1]], bigram_vectorizer)) #polarity scores neg, neu, pos = feat.polarityScores(row[0], bodies[row[1]]) f.append(neg) f.append(neu) f.append(pos) tokens1 = tknzr.tokenize(row[0]) tokens1=[token.lower() for token in tokens1 if (token.isalpha() and token not in stop_words)] tokens2 = bodiesTokens[row[1]] #word movers distance f.append(feat.wmd(tokens1, tokens2,wmd_model)) #common words common = (set(tokens1) & set(tokens2)) f.append(feat.overlap(common)) #tfidf f.append(feat.tfidf(tfidfMat, common,vocab,k.index(row[1]))) #negations f.append(feat.negWords(tokens1,tokens2)) #add all features features.append(f) return np.array(features), np.array(classes)
# -*- coding:utf-8 -*- # First load the required libraries import tensorflow as tf import numpy as np # gensim is used to load the pretrained word vectors from gensim.models import KeyedVectors import jieba import matplotlib.pyplot as plt import re import warnings warnings.filterwarnings("ignore") # Use gensim to load the pretrained Chinese word embeddings; this may take 1-2 minutes cn_model = KeyedVectors.load_word2vec_format('../static/embeddings/sgns.zhihu.bigram', binary=False, unicode_errors="ignore") # Every word maps to a vector of length 300 embedding_dim = cn_model['山东大学'].shape[0] print("embedding_dim:", embedding_dim) # Index the samples import pandas as pd data_neg = pd.read_excel('../static/data/neg9.xlsx') print('total number of samples: '+str(len(data_neg))) print("data_neg.head(1)", data_neg.head(1)) # Put all review texts into one list train_texts_orig = [] # The labels corresponding to the texts train_target = []
def main(): parser = ArgumentParser("scoring", formatter_class=ArgumentDefaultsHelpFormatter, conflict_handler='resolve') parser.add_argument("--emb", default='result.embeddings', help='Embeddings file') parser.add_argument( "--network", default='p2p.edgelist', help= 'A .mat file containing the adjacency matrix and node labels of the input network.' ) parser.add_argument( "--adj-matrix-name", default='network', help='Variable name of the adjacency matrix inside the .mat file.') parser.add_argument( "--label-matrix-name", default='group', help='Variable name of the labels matrix inside the .mat file.') parser.add_argument("--num-shuffles", default=2, type=int, help='Number of shuffles.') parser.add_argument( "--all", default=False, action='store_true', help= 'The embeddings are evaluated on all training percents from 10 to 90 when this flag is set to true. ' 'By default, only training percents of 10, 50 and 90 are used.') args = parser.parse_args() # 0. Files embeddings_file = args.emb matfile = args.network # 1. Load Embeddings model = KeyedVectors.load_word2vec_format(embeddings_file, binary=False) # 2. Load labels mat = loadmat(matfile) A = mat[args.adj_matrix_name] graph = sparse2graph(A) labels_matrix = mat[args.label_matrix_name] labels_count = labels_matrix.shape[1] mlb = MultiLabelBinarizer(range(labels_count)) # Map nodes to their features (note: assumes nodes are labeled as integers 1:N) features_matrix = numpy.asarray( [model[str(node)] for node in range(len(graph))]) # 2. Shuffle, to create train/test groups shuffles = [] for x in range(args.num_shuffles): shuffles.append(skshuffle(features_matrix, labels_matrix)) # 3. to score each train/test group all_results = defaultdict(list) if args.all: training_percents = numpy.asarray(range(1, 10)) * .1 else: training_percents = [0.1, 0.5, 0.9] for train_percent in training_percents: for shuf in shuffles: X, y = shuf training_size = int(train_percent * X.shape[0]) X_train = X[:training_size, :] y_train_ = y[:training_size] y_train = [[] for x in range(y_train_.shape[0])] cy = y_train_.tocoo() for i, j in zip(cy.row, cy.col): y_train[i].append(j) assert sum(len(l) for l in y_train) == y_train_.nnz X_test = X[training_size:, :] y_test_ = y[training_size:] y_test = [[] for _ in range(y_test_.shape[0])] cy = y_test_.tocoo() for i, j in zip(cy.row, cy.col): y_test[i].append(j) clf = TopKRanker(LogisticRegression()) clf.fit(X_train, y_train_) # find out how many labels should be predicted top_k_list = [len(l) for l in y_test] preds = clf.predict(X_test, top_k_list) results = {} averages = ["micro", "macro"] for average in averages: results[average] = f1_score(mlb.fit_transform(y_test), mlb.fit_transform(preds), average=average) all_results[train_percent].append(results) print('Results, using embeddings of dimensionality', X.shape[1]) print('-------------------') for train_percent in sorted(all_results.keys()): print('Train percent:', train_percent) for index, result in enumerate(all_results[train_percent]): print('Shuffle #%d: ' % (index + 1), result) avg_score = defaultdict(float) for score_dict in all_results[train_percent]: for metric, score in iteritems(score_dict): avg_score[metric] += score for metric in avg_score: avg_score[metric] /= len(all_results[train_percent]) print('Average score:', dict(avg_score)) print('-------------------')
import random import string import numpy as np import pandas as pd from sklearn.utils import shuffle from sklearn.model_selection import train_test_split from nltk.corpus import stopwords from gensim.models import KeyedVectors from tensorflow.keras.preprocessing.text import Tokenizer from keras.preprocessing.sequence import pad_sequences print("Loading embedder.......") # Embedder embedder = KeyedVectors.load_word2vec_format('/Users/petergramaglia/Documents/GitHub/new_connected/connected_journaling/data/GoogleNews-vectors-negative300.bin',binary=True) word_vectors = embedder # KeyedVectors is already the vector store; .wv is a deprecated self-alias print("Reading dataset........") dataset = pd.read_csv(data_path) # 5k samples print("Y stuff") new_train_y = np.zeros(len(train_y)) new_test_y = np.zeros(len(test_y)) for i in range(0,len(train_y)):
import pandas as pd import numpy as np from keras.preprocessing.text import Tokenizer from keras.preprocessing.sequence import pad_sequences from gensim.models import KeyedVectors from keras.utils import to_categorical from keras.layers import Conv1D, GlobalMaxPooling1D from keras.models import Sequential from keras.layers import Activation, Dropout, Flatten, Dense import keras.optimizers from keras.layers import Embedding from collections import Counter import operator from sklearn.model_selection import train_test_split modelw2v = KeyedVectors.load_word2vec_format('./data/w2v_model_M.bin', binary=True) embeddings_index = {} for word in modelw2v.wv.vocab.keys(): embeddings_index[word] = modelw2v.wv[word] file_path = './data/train_tweets_mod.csv' dfs = pd.read_csv(file_path) num_entries = 0 users = dfs['User'].values.tolist() user_tweet_dict = Counter(users) sorted_user_tweet_dict = sorted(user_tweet_dict.items(), key=operator.itemgetter(1), reverse=True) values = user_tweet_dict.values() count_dict = Counter(values)
path_root = './data/' + dataset_name + '/' path_to_batches = path_root + '/batches_' + dataset_name + '/' model_path = "./V1/agg=sum_bidir=True_discount=1_cutgradient=False/" + dataset_name + "/run1/" path_to_save = './' path_to_functions = './' path_to_weights = model_path n_runs = 4 nb_epochs_train = 150 my_prec = 5 # nb of decimals to keep in history files runs = ['run%i' % i for i in range(n_runs)] # Loading vectors gensim_obj = KeyedVectors.load(path_root + 'word_vectors.kv', mmap='r') # needs an absolute path! word_vecs = gensim_obj.vectors # raw embedding matrix; .wv.syn0 is deprecated # add Gaussian initialized vector on top of embedding matrix (for padding) pad_vec = np.random.normal(size=word_vecs.shape[1]) word_vecs = np.insert(word_vecs,0,pad_vec,0) # Defining Network ## Inputs sent_ints = Input(shape=(None,)) sent_wv = Embedding(input_dim=word_vecs.shape[0], output_dim=word_vecs.shape[1], weights=[word_vecs], input_length=None, # sentence size varies from batch to batch trainable=True )(sent_ints)
#from gensim.models import KeyedVectors #filename = 'GoogleNews-vectors-negative300.bin' #model = KeyedVectors.load_word2vec_format(filename, binary=True) from gensim.scripts.glove2word2vec import glove2word2vec glove_input_file = 'intents/glove.6B.100d.txt' word2vec_output_file = 'intents/glove.6B.100d.txt.word2vec' glove2word2vec(glove_input_file, word2vec_output_file) # In[3]: from gensim.models import KeyedVectors # load the Stanford GloVe model filename = 'intents/glove.6B.100d.txt.word2vec' model = KeyedVectors.load_word2vec_format(filename, binary=False) word_vec = model # the KeyedVectors object itself exposes get_vector(); .wv is a deprecated alias #word_vec.get_vector('cab') # In[4]: x = word_vec.get_vector('pub') #x.shape # In[6]: import numpy as np cab_file = open('intents/cab.dat', 'r') stopwords = open('intents/stopwords.txt', 'r')
transform = TfidfTransformer() Y = transform.fit_transform(X) # the input here is the count matrix of the documents above print(Y.toarray()) # print the Y matrix after the tf-idf transform """ # print(content_train_src) EMBEDDING_DIM = 200 # word-vector length EMBEDDING_length = 8824330 word2vec_path = '/public/ycdswork/dnswork/glove/Tencent_AILab_ChineseEmbedding.txt' stopwords_path = "/public/ycdswork/dnswork/stopwords/cn_stopwords.txt" webfilepath = "/public/ycdswork/dnswork/httpwebdata/" file_dir = "/home/yangc/myclass/" modelsave_path = "/public/ycdswork/modeldir/LSTMmodel" tc_wv_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=False) # EMBEDDING_length = 8824330 EMBEDDING_length = len(tc_wv_model.key_to_index) print('Found %s word vectors.' % EMBEDDING_length) embeddings_index = {} embedding_matrix = np.zeros((EMBEDDING_length + 1, EMBEDDING_DIM)) # tc_wv_model.key_to_index # for counter, key in enumerate(tc_wv_model.vocab.keys()): for counter, key in enumerate(tc_wv_model.key_to_index): # print(counter,key) embeddings_index[key] = counter + 1 coefs = np.asarray(tc_wv_model[key], dtype='float32') embedding_matrix[counter + 1] = coefs del tc_wv_model
'articleType': 'AIDaily', 'method': 'zh_NER_TF', 'contentMode': [1, 1, 0], 'useExpanded': [1, 0, 1], 'similarity': 50, 'title_weight': 0.8, 'cut_method': 'tfidf', 'top_k': 8, 'normalize_title_content': True, 'file': "./media/data.json", 'use': False } # feed a keyword into the model, output the closest domain words and record their magnitude # load the Tencent word vectors model = KeyedVectors.load_word2vec_format( './model/dictionary/Tencent_AILab_ChineseEmbedding.txt', binary=False) def remove_text(text, type='number'): """ Remove specific content from the text, e.g. digits or punctuation :param text: the text :param type: type of content to remove, one of 'number', 'punc', 'both' :return: the cleaned text """ from zhon.hanzi import punctuation import string text = str(text) #text = re.sub("<>".format(punctuation, string.punctuation), " ", text) text = re.sub('<.*?>', '', text)
def get_model(): ''' :return: Downloads the `gensim` model.''' return KeyedVectors.load_word2vec_format(download(), binary=False)
import datetime import matplotlib.pyplot as plt import pickle as pkl import gzip # File paths TRAIN_CSV = 'data/train.csv' TEST_CSV = 'data/test.csv' EMBEDDING_FILE = 'data/GoogleNews-vectors-negative300.bin.gz' PROCESSED_DATA_FILE = 'data/processed_data.pkl.gz' MODEL_FILE = 'model/sensim_adadelta_model_weights.h5' # Load training and test set train_df = pd.read_csv(TRAIN_CSV) test_df = pd.read_csv(TEST_CSV) word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True) embedding_dim = 300 embeddings, vocabulary = init_data_as_vectors(word2vec, [train_df, test_df], embedding_dim) del word2vec max_seq_length = max( train_df.question1.map(lambda x: len(x)).max(), train_df.question2.map(lambda x: len(x)).max(), test_df.question1.map(lambda x: len(x)).max(), test_df.question2.map(lambda x: len(x)).max()) # save processed data data = {
def pega_dados(vecfile, target, ant, syn): import csv from gensim.models import KeyedVectors cosine_ant = [] cosine_syn = [] subcos_ant = [] subcos_syn = [] mod = KeyedVectors.load_word2vec_format("/home/bthalenberg/ic/novos novos/"+vecfile, binary=False) i = 0 while i != len(target): #getting cosine similary between target and antonym try: cos = mod.similarity(target[i], ant[i]) except KeyError: cos = None cosine_ant.append(cos) #getting cosine similary between target and synonym try: cos_s = mod.similarity(target[i], syn[i]) except KeyError: cos_s = None cosine_syn.append(cos_s) #subtracting the antonym cosine similarity from the synonym similarity for syn input try: subcos_syn.append(cos_s - cos) except TypeError: subcos_syn.append(None) #negating subtracted values for ant input try: subcos_ant.append(-(cos_s - cos)) except TypeError: subcos_ant.append(None) i += 1 dirname = vecfile[:-4] with open(dirname+"/db_ant.csv", "w", encoding="utf-8") as f: writer = csv.writer(f) i = 0 while i != len(target): writer.writerow([target[i], ant[i], cosine_ant[i]]) i += 1 with open(dirname+"/db_syn.csv", "w", encoding="utf-8") as f: writer = csv.writer(f) i = 0 while i != len(target): writer.writerow([target[i], syn[i], cosine_syn[i]]) i += 1 with open(dirname+"/db_sub_ant.csv", "w", encoding="utf-8") as f: writer = csv.writer(f) i = 0 while i != len(target): writer.writerow([target[i], ant[i], subcos_ant[i]]) i += 1 with open(dirname+"/db_sub_syn.csv", "w", encoding="utf-8") as f: writer = csv.writer(f) i = 0 while i != len(target): writer.writerow([target[i], syn[i], subcos_syn[i]]) i += 1
from gensim.test.utils import common_texts, get_tmpfile from gensim.models import Word2Vec, KeyedVectors model = KeyedVectors.load_word2vec_format("/content/drive/My Drive/bio_files/PubMed-shuffle-win-30.bin", binary=True) def words_to_vector(words): vector = None # returned unchanged if the lookup below fails try: vector = sum(model.get_vector(w.text) for w in words)/len(words) except Exception as e: #print(e, type(words), words) pass return vector def get_cosine_similarity(term, entity): cosine_similarities = [] try: cosine_similarities.append(model.cosine_similarities(words_to_vector(entity[0]), [words_to_vector(term[0])])[0]) except Exception as e: pass for synonym in term[1]['synonyms']: try: cosine_similarities.append( model.cosine_similarities(words_to_vector(entity[0]), [words_to_vector(synonym)])[0]) except Exception as e: pass return max(cosine_similarities) if cosine_similarities else None # max() on an empty list would raise
from gensim.models import Word2Vec, KeyedVectors # lst=[['hello', 'this', 'is', 'the', 'sample', 'text']] # # sentences = gensim.models.word2vec.LineSentence("new_fol.txt") # # model = gensim.models.Word2Vec() # model.build_vocab(lst, min_count=1) # model.train(lst, epochs=model.epochs, total_examples=model.corpus_count) model=Word2Vec.load('thousand.txt') # print(tmp) cv=['trichy', 'chennai', 'gokul', 'klm', 'fog', 'mist', 'cloud', 'google', 'fb'] model1=KeyedVectors.load('thousand.txt') # unpickles the same saved model as above; model1 is not used below lstt=[['trichy', 'chennai', 'gokul', 'klm', 'fog', 'mist', 'cloud', 'google', 'fb']] model.build_vocab(lstt, update=True) model.train(lstt, epochs=model.epochs, total_examples=model.corpus_count) tmp=0 for i in cv: t=model.wv.get_vector(i) tmp=tmp+t import numpy as np model_word_vector = np.array( tmp, dtype='f') print(model.wv.most_similar([model_word_vector],[],topn=20000)) print(model.wv.most_similar(positive=cv, negative=[], topn=1)) x=model.wv.similar_by_word('issu',topn=1000, restrict_vocab=None)
print("creating word sequences...") ws, ys = [], [] fin = codecs.open(INPUT_FILE, "r", encoding='utf-8') for line in fin: label, sent = line.strip().split("\t") ys.append(int(label)) words = [x.lower() for x in nltk.word_tokenize(sent)] wids = [word2index[word] for word in words] ws.append(wids) fin.close() W = pad_sequences(ws, maxlen=maxlen) Y = np_utils.to_categorical(ys) # GloVe 벡터 불러오기 print("loading word2vec vectors...") word2vec = KeyedVectors.load_word2vec_format(WORD2VEC_MODEL, binary=True) print("transferring embeddings...") X = np.zeros((W.shape[0], EMBED_SIZE)) for i in range(W.shape[0]): E = np.zeros((EMBED_SIZE, maxlen)) words = [index2word[wid] for wid in W[i].tolist()] for j in range(maxlen): try: E[:, j] = word2vec[words[j]] except KeyError: pass X[i, :] = np.sum(E, axis=1) Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y,
import pandas as pd import numpy as np import math import os import itertools from tqdm import tqdm import csv import re import multiprocessing as mp from joblib import Parallel, delayed from gensim.models import KeyedVectors server_path = './' model = KeyedVectors.load_word2vec_format('./word2vec.6B.300d.txt', binary=False) global wd_labels wd_prop_label_path = './' # def parallelize(row, prefixes, df_GT): # global df_ling # if row[2] <= 0.0: # return # pred1 = [prefix+row[0] for prefix in prefixes] # pred2 = [prefix+row[1] for prefix in prefixes] # for i, j in itertools.product(pred1, pred2): # if len(df_GT.loc[(df_GT['predE'] == i) & (df_GT['predC'] == j)]) > 0: # df_ling = df_ling.append({'predE': i, 'predC': j, 'cosine_sim': row[2]}, ignore_index=True) # if len(df_GT.loc[(df_GT['predE'] == j) & (df_GT['predC'] == i)]) > 0: # df_ling = df_ling.append({'predE': j, 'predC': i, 'cosine_sim': row[2]}, ignore_index=True) # if len(df_GT.loc[(df_GT['predE'] == i+'_inv') & (df_GT['predC'] == j)]) > 0:
def main(emb_path='glove.6B.100d.txt', data_path='data/msdialogue/'): device = 'cpu' if torch.cuda.is_available(): device = 'cuda' print(f'DEVICE : {device}') params = {'batch_size': 128, 'shuffle': True} # 1) Data loading # # Пока так для дебага # X, y = load_from_json(data_path) # # 1. One-Hot Encode # labels = {'O': 0, 'FQ': 1, 'IR': 2, # 'OQ': 3, 'GG': 4, 'FD': 5, # 'JK': 6, 'NF': 7, 'PF': 8, # 'RQ': 9, 'CQ': 10, 'PA': 11} # y_train = [] # for l in y: # l = l.split('_') # cur_y = [0] * len(labels) # for un_l in l: # cur_y[labels[un_l]] = 1 # y_train.append(cur_y) # y_train = torch.tensor(y_train) # # 2. Нужный вид # X_train = [] # for i in range(len(X)): # for j in range(len(X[i])): # X_train.append(X[i][j]) print('Building Embedding') if emb_path == 'glove.6B.100d.txt': tmp_file = get_tmpfile("test_word2vec.txt") _ = glove2word2vec(emb_path, tmp_file) word2vec = KeyedVectors.load_word2vec_format(tmp_file) else: word2vec = gensim.models.KeyedVectors.load_word2vec_format(emb_path, binary=True) EMB_DIM = word2vec.vectors.shape[1] word2vec.add('<UNK>', np.mean(word2vec.vectors.astype('float32'), axis=0)) word2vec.add('<PAD>', np.array(np.zeros(EMB_DIM))) tokenizer = Vocab() tokenizer.build(word2vec) print('Loading Data') X_train = pd.read_csv(data_path + "train.tsv", sep="\t", header=None, index_col=None) y_train = encode_label(X_train[0].to_numpy()) X_train = tokenizer.tokenize(X_train[1].to_numpy(), max_len=MAX_SEQ_LEN) X_val = pd.read_csv(data_path + "valid.tsv", sep="\t", header=None, index_col=None) y_val = encode_label(X_val[0].to_numpy()) X_val = tokenizer.tokenize(X_val[1].to_numpy(), max_len=MAX_SEQ_LEN) X_test = pd.read_csv(data_path + "test.tsv", sep="\t", header=None, index_col=None) y_test = encode_label(X_test[0].to_numpy()) X_test = tokenizer.tokenize(X_test[1].to_numpy(), max_len=MAX_SEQ_LEN) # 2. 
padding pad_val = tokenizer.get_pad() X_train = pad_sequence( X_train, batch_first=True, padding_value=pad_val).to( torch.long)[1:, :MAX_SEQ_LEN] # size: tensor(batch, max_seq_len) X_val = pad_sequence(X_val, batch_first=True, padding_value=pad_val).to( torch.long)[1:, :MAX_SEQ_LEN] X_test = pad_sequence(X_test, batch_first=True, padding_value=pad_val).to( torch.long)[1:, :MAX_SEQ_LEN] # 3) Batch iterator training = data.DataLoader(MSDialog(X_train, y_train), **params) validation = data.DataLoader(MSDialog(X_val, y_val), **params) testing = data.DataLoader(MSDialog(X_test, y_test), **params) # 4) Model, criterion and optimizer model = BaseCNN(word2vec, tokenizer.get_pad(), emb_dim=EMB_DIM).to(device) optimizer = Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08) criterion = nn.BCELoss() # 5) training process treshold = 0.5 print('Train') # for X, y in training: # X, y = X.to(device), y.to(device) # break for ep in range(N_EPOCHS): if ep == 10: optimizer = Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.999), eps=1e-08) print(f'epoch: {ep}') # j = 0 # # model.train() # losses = [] # for i in range(50): # optimizer.zero_grad() # output = model(X) # loss = torch.tensor(0.0).to(output) # for i in range(output.shape[1]): # criterion = nn.BCELoss() # loss += criterion(output[:, i].unsqueeze(1), y[:, i].unsqueeze(1).to(torch.float32)) # losses.append(float(loss.cpu())/output.shape[1]) # loss.backward() # optimizer.step() # # print(f'iter: {j}, loss: {loss}') # j += 1 # print(f'train loss={np.mean(losses)}') j = 0 model.train() losses = [] for X, y in training: optimizer.zero_grad() X, y = X.to(device), y.to(device) output = model(X) loss = torch.tensor(0.0).to(output) for i in range(output.shape[1]): criterion = nn.BCELoss() loss += criterion(output[:, i].unsqueeze(1), y[:, i].unsqueeze(1).to(torch.float32)) loss.backward() losses.append(float(loss.cpu())) optimizer.step() # print(f'iter: {j}, loss: {loss}') j += 1 print(f'train loss={np.mean(losses)}') with torch.no_grad(): model.eval() # print('EVALUATION________') losses = [] f1_scores = [] precisions = [] recalls = [] accuracies = [] for X, y in validation: criterion = nn.MultiLabelSoftMarginLoss() X, y = X.to(device), y.to(device) output = model(X) loss = torch.tensor(0.0).to(output) for i in range(output.shape[1]): criterion = nn.BCELoss() loss += criterion(output[:, i].unsqueeze(1), y[:, i].unsqueeze(1).to(torch.float32)) losses.append(float(loss.cpu())) output = output.cpu().numpy() for i in range(len(output)): pred = output[i] > treshold if sum(pred) == 0: pred = output[i].max(axis=0, keepdims=1) == output[i] output[i] = pred precisions.append(get_f1(y, output)[0]) recalls.append(get_f1(y, output)[1]) f1_scores.append(get_f1(y, output)[2]) accuracies.append(get_accuracy(y, output)) print('VAL:') print(f'val_loss={np.mean(losses)}') print(f'accuracy={np.mean(accuracies)}') print(f'precision={np.mean(precisions)}') print(f'recall={np.mean(recalls)}') print(f'f1-score={np.mean(f1_scores)}') print('__________________') torch.save(model.state_dict(), SAVE_PATH)
default=False, dest='double_cycle', help='Use double cycle') ap.add_argument('-man', type=str, default=None) ap.add_argument('-woman', type=str, default=None) ap.add_argument('-king', type=str, default=None) args = vars(ap.parse_args()) color = args['color'] word = args['word'] man = args['man'] woman = args['woman'] king = args['king'] double_cycle = args['double_cycle'] word_vectors = KeyedVectors.load_word2vec_format(args['vectors'], binary=False, unicode_errors='ignore') print("vectors loaded") def solid_shape(dim:int, color:str, word:str): """Draws a solid shape based on word for given vector . Better with 50 dimensions or less.""" se = word_vectors[word] r_s = [] thetas = [] for x in range(0, dim): thetas.append(2 * np.pi * x / dim) r_s.append(se[x]) thetas.append(2 * np.pi * 0 / dim) r_s.append(se[0]) data = [go.Scatterpolar(r=r_s, theta=thetas, thetaunit="radians", mode='lines', marker=dict(color='peru'),
def main(): # 读取命令行参数 config_file = 'config/story_config.ini' switch = ['server_pmr_clf', 'testing', '2_17'] parser = argparse.ArgumentParser() config = ConfigParser() config.read(config_file) parser.add_argument('--ref_file', default='result/ref_' + switch[0] + '_' + switch[2] + '.txt', help='self_test') parser.add_argument('--hypo_file', default='result/hypo_' + switch[0] + '_' + switch[2] + '.txt', help='self_test') parser.add_argument('--self_test', default=False, help='self_test') parser.add_argument('--test_story', default=False, help='self_test') parser.add_argument('--config_file', default=config_file, type=str, help='Select cuda number') parser.add_argument('--switch', default=switch, type=str, help='Select cuda number') parser.add_argument('--use_cuda', default=config.getboolean(switch[0], 'use_cuda'), type=str, help='Select cuda number') parser.add_argument('--device', default=config.get(switch[0], 'device'), type=str, help='Select cuda number') parser.add_argument('--gpu_para', action='store_true', default=config.getboolean(switch[0], 'gpu_para'), help='Whether load checkpoint') # gpu parallel parser.add_argument('--log_path', default=config.get(switch[0], 'log_path').format( switch[1], switch[0], switch[2]), type=str, required=False, help='训练日志存放位置') parser.add_argument( '--data_path', default=config.get(switch[0], 'data_path'), help='load data file path' ) # train_sen_char_idx / train_gpt2_idx_12_24 / train_plutchik_12_26 parser.add_argument('--raw_data_path', default=config.get(switch[0], 'raw_data_path'), help='load data file path') parser.add_argument('--num_epochs', type=int, default=config.getint(switch[0], 'num_epochs'), help='num_epochs') parser.add_argument('--seed', type=int, default=config.getint(switch[0], 'seed'), help='设置种子用于生成随机数,以使得训练的结果是确定的') # None parser.add_argument('--batch_size', type=int, default=config.getint(switch[0], 'batch_size'), help='number of batch_size') # batch_size parser.add_argument('--num_workers', type=int, default=config.getint(switch[0], 'num_workers'), help='number of workers') parser.add_argument('--lr', type=float, default=config.getfloat(switch[0], 'lr'), help='size of learning rate') parser.add_argument('--dropout', type=float, default=config.getfloat(switch[0], 'dropout'), help='size of dropout') parser.add_argument('--max_grad_norm', type=float, default=config.getfloat(switch[0], 'max_grad_norm'), help='size of dropout') # 1. / 5. 
parser.add_argument('--embedding_dim', type=int, default=config.getint(switch[0], 'embedding_dim'), help='embedding_dim') # 128 / 768 / 50 / 300 parser.add_argument('--hidden_size', type=int, default=config.getint(switch[0], 'hidden_size'), help='hidden_size') parser.add_argument('--max_oovs', type=int, default=config.getint(switch[0], 'max_oovs'), help='number of max_oovs') parser.add_argument('--char_num', type=int, default=config.getint(switch[0], 'char_num'), help='number of character') parser.add_argument('--pmr_size', type=int, default=config.getint(switch[0], 'pmr_size'), help='number of pmr') parser.add_argument('--p_size', type=int, default=config.getint(switch[0], 'p_size'), help='number of plutchik') parser.add_argument('--m_size', type=int, default=config.getint(switch[0], 'm_size'), help='number of maslow') parser.add_argument('--r_size', type=int, default=config.getint(switch[0], 'r_size'), help='number of reiss') parser.add_argument('--embed', default=config.get(switch[0], 'embed'), help='Select 50d or 300d embedding file path') parser.add_argument('--word_dict', default=config.get(switch[0], 'word_dict'), help='Select word_dict file path') parser.add_argument('--glove', action='store_true', default=config.getboolean(switch[0], 'glove'), help='Whether use glove') # model parser.add_argument('--opt', action='store_true', default=config.getboolean(switch[0], 'opt'), help='Select Adam or SGD optimizer. True is Adam') parser.add_argument('--gpt2', action='store_true', default=config.getboolean(switch[0], 'gpt2'), help='Whether use gpt2') parser.add_argument('--bigru', action='store_true', default=config.getboolean(switch[0], 'bigru'), help='Whether use bigru') parser.add_argument('--bilstm', action='store_true', default=config.getboolean(switch[0], 'bilstm'), help='Whether use bilstm') parser.add_argument('--gate', action='store_true', default=config.getboolean(switch[0], 'gate'), help='Whether use gate mechanism') parser.add_argument('--copy', action='store_true', default=config.getboolean(switch[0], 'copy'), help='Whether use copy mechanism') parser.add_argument('--teacher_force', action='store_true', default=config.getboolean(switch[0], 'teacher_force'), help='Whether use teacher force') # pmr & char parser.add_argument('--fix_encoder', action='store_true', default=config.getboolean(switch[0], 'fix_encoder')) parser.add_argument('--encoder_merge', action='store_true', default=config.getboolean(switch[0], 'encoder_merge')) parser.add_argument('--baseline', action='store_true', default=config.getboolean(switch[0], 'baseline')) parser.add_argument('--fix_decoder', action='store_true', default=config.getboolean(switch[0], 'fix_decoder')) parser.add_argument('--psy_clf', action='store_true', default=config.getboolean(switch[0], 'psy_clf')) parser.add_argument('--seq_attn', action='store_true', default=config.getboolean(switch[0], 'seq_attn')) parser.add_argument('--context', action='store_true', default=config.getboolean(switch[0], 'context'), help='Whether add context') parser.add_argument('--only_plutchik', action='store_true', default=config.getboolean(switch[0], 'only_plutchik'), help='Whether add pmr_input') parser.add_argument('--dynamic', action='store_true', default=config.getboolean(switch[0], 'dynamic'), help='Whether add pmr_input') parser.add_argument('--pmr_input', action='store_true', default=config.getboolean(switch[0], 'pmr_input'), help='Whether add pmr_input') parser.add_argument('--rep_inp_attn', action='store_true', default=config.getboolean(switch[0], 
'rep_inp_attn'), help='Whether add rep_inp_attn') parser.add_argument('--pmr_attn', action='store_true', default=config.getboolean(switch[0], 'pmr_attn'), help='Whether use pmr_attn') parser.add_argument('--char_attn', action='store_true', default=config.getboolean(switch[0], 'char_attn'), help='Whether use char_attn') # load & save model parser.add_argument('--load_ckpt', action='store_true', default=config.getboolean(switch[0], 'load_ckpt'), help='Whether load checkpoint') # load checkpoint parser.add_argument('--save_ckpt', action='store_true', default=config.getboolean(switch[0], 'save_ckpt'), help='Whether save checkpoint') # save checkpoint parser.add_argument('--load_ckpt_file', default=config.get(switch[0], 'load_ckpt_file').format( switch[2], switch[0]), help='Set checkpoint file path') # ckpt_path parser.add_argument('--ckpt_path', default=config.get(switch[0], 'ckpt_path').format(switch[2]), help='Set checkpoint file path') # ckpt_path parser.add_argument('--ckpt_file', default=config.get(switch[0], 'ckpt_file').format(switch[0]), help='Set checkpoint file name') # ckpt_file args = parser.parse_args() random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) # torch.backends.cudnn.deterministic = True global logger logger = create_logger(args) logger.info('start game!') logger.info('switch: {}'.format(switch)) logger.info(args) if args.use_cuda: device = torch.device("cuda" if torch.cuda.is_available() else "cpu") else: device = torch.device('cpu') logger.info(device) if args.gpt2 == False: # 加载word_dict with open(args.word_dict) as f: for line in f: word_dict = json.loads(line) if args.glove: # inv_dict = {v: k for k, v in word_dict.items()} # 加载glove预训练词向量 logger.info('loading: {}'.format(args.embed)) tmp_word2vec = args.embed glove_model = KeyedVectors.load_word2vec_format(tmp_word2vec) args.embedding_dim = glove_model.vector_size embedding_matrix = np.zeros( (len(word_dict), args.embedding_dim)) # [vocab_size,embedding_dim] for i in range(len(word_dict)): embedding_matrix[i, :] = glove_model[glove_model.index2word[i]] embedding_matrix = torch.from_numpy(embedding_matrix).float().to( device) else: embedding_matrix = None p_np, m_np, r_np = get_pmr(args, word_dict) # (32, 1) vocab_size = len(word_dict) # model model = PMRClf(args, device, embedding_matrix, vocab_size, word_dict, args.embedding_dim, args.hidden_size, dropout=args.dropout) if args.use_cuda and args.gpu_para: # model = nn.DataParallel(model, device_ids=[0, 1]) # multi-GPU model = nn.DataParallel(model, device_ids=[ int(i) for i in args.device.split(',') ]) # multi-GPU torch.backends.cudnn.benchmark = True model = model.to(device) model.load_state_dict(torch.load(args.load_ckpt_file + '.pkl')) logger.info('loading checkpoint file {}'.format(args.load_ckpt_file)) dataset = MyDataset_clf(args.data_path) train_loader, dev_loader = train_test_split(dataset, test_size=0.1, random_state=1) logger.info("loading {} data".format('dev_loader')) inv_dict = {v: k for k, v in word_dict.items()} test(args, model, dev_loader, inv_dict, word_dict, device)
#!/usr/bin/python # -*- coding: utf-8 -*- from gensim.models import KeyedVectors model = KeyedVectors.load_word2vec_format('/home/tj/big_data/data/talk/2j3s.vec', binary=False) model.save_word2vec_format('/home/tj/big_data/data/talk/2j3s.vec.bin', binary=True)
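A small follow-up sketch: reloading the binary copy written above, which parses much faster than the original text .vec. The index_to_key attribute assumes gensim 4.x (older versions call it index2word).

import time
from gensim.models import KeyedVectors

start = time.time()
kv = KeyedVectors.load_word2vec_format('/home/tj/big_data/data/talk/2j3s.vec.bin', binary=True)
print('loaded %d vectors in %.1f s' % (len(kv.index_to_key), time.time() - start))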
def setUp(self): self.vectors = EuclideanKeyedVectors.load_word2vec_format( datapath('euclidean_vectors.bin'), binary=True, datatype=np.float64)
"""Script to rewrite Google's word2vec into a format that loads faster.""" import os from gensim.models import KeyedVectors if __name__ == "__main__": path = os.path.join("data", "GoogleNews-vectors-negative300.bin.gz") w2v = KeyedVectors.load_word2vec_format(path, binary=True) w2v.init_sims(replace=True) w2v.save(path)
def load_google_vec(): import os from gensim.models import KeyedVectors return KeyedVectors.load_word2vec_format(os.path.expanduser('~/nlp/w2v/GoogleNews-vectors-negative300.bin.gz'), binary=True) # '~' is not expanded by the loader itself
from gensim.models import KeyedVectors en_vectors = KeyedVectors.load_word2vec_format('data/wiki-news-300d-1M.vec', binary=False) from gensim.models import Word2Vec vi_vectors = Word2Vec.load('data/vi.bin').wv # # Note: a GloVe model must first be converted to word2vec format # # Example # from gensim.scripts.glove2word2vec import glove2word2vec # glove2word2vec('data/glove.6B.50d.txt', 'data/en.vec') en_vectors.vocab en_vectors["cat"] print ("vector size: ", en_vectors.vector_size) print ("vocab size: ", len(en_vectors.vocab)) print ("vector size: ", vi_vectors.vector_size) print ("vocab size: ", len(vi_vectors.vocab)) en_vectors.most_similar("cat") vi_vectors.most_similar("mèo") sim_words = en_vectors.most_similar(positive=['queen', 'man'], negative=['king']) print('Queen is a: ', sim_words[0][0]) sim_words = en_vectors.most_similar(negative=['king'], positive=['kings', 'queen'])
import numpy as np import gensim import pymorphy2 from sklearn.neighbors import KNeighborsClassifier from functools import lru_cache from gensim.models import KeyedVectors import pickle import os import re model = KeyedVectors.load('my_model') morph = pymorphy2.MorphAnalyzer() clf_file = 'trained_knn.clf' clf = None @lru_cache(maxsize=10000) def get_normal_form(i): return morph.normal_forms(i)[0] def normalize_text(x): return ' '.join([get_normal_form(i) for i in re.findall('\w+', x)]) def get_question_vector(question): question_vect = np.zeros(300) try: for word in re.findall('\w+', question): question_vect += model.wv.__getitem__(word)
def _get_embedding(self, embedding_path): model = KeyedVectors.load_word2vec_format(embedding_path) vocab = model.vocab vocab_len = len(vocab) return np.array([model.word_vec(k) for k in vocab.keys()])
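For reference, the stacked matrix the method above rebuilds word by word is already stored on the KeyedVectors object; a sketch assuming gensim >= 1.0 (the attribute was .syn0 in very old versions, and the row order matches the vocabulary order).

from gensim.models import KeyedVectors

def get_embedding_matrix(embedding_path):
    model = KeyedVectors.load_word2vec_format(embedding_path)
    return model.vectors   # the (vocab_size, dim) embedding matrix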
def loadGoogleModel(self, file_name): self.model = KeyedVectors.load_word2vec_format(file_name, binary=True) self.is_w2v = True
def do_keras_textcnn_w2v(text,stars,trainable): #转换成词袋序列 max_document_length=200 embedding_dims = 300 #获取已经训练好的词向量 model = KeyedVectors.load_word2vec_format(word2vec_file, binary=True) print model['word'].shape #设置分词最大个数 即词袋的单词个数 tokenizer = Tokenizer(num_words=max_features,lower=True) tokenizer.fit_on_texts(text) sequences = tokenizer.texts_to_sequences(text) x=pad_sequences(sequences, maxlen=max_document_length) #我们可以使用从scikit-learn LabelEncoder类。 # 这个类通过 fit() 函数获取整个数据集模型所需的编码,然后使用transform()函数应用编码来创建一个新的输出变量。 encoder=LabelEncoder() encoder.fit(stars) encoded_y = encoder.transform(stars) #labels = to_categorical(np.asarray(labels))也可以进行数据处理 #获取word到对应数字编号的映射关系 word_index = tokenizer.word_index print('Found %s unique tokens.' % len(word_index)) #获取词向量的映射矩阵 embedding_matrix = np.zeros((max_features + 1, embedding_dims)) for word, i in word_index.items(): #编号大于max_features的忽略 该字典是按照字典顺序 所以对应的id不一定是顺序的 if i > max_features: continue try: embedding_matrix[i] = model[word].reshape(embedding_dims) except: print "%s not found!" % (word) #构造神经网络 def baseline_model(): #CNN参数 #filters个数通常与文本长度相当 便于提取特征 filters = max_document_length # Inputs input = Input(shape=[max_document_length]) # 词向量层,本文使用了预训练word2vec词向量,把trainable设为False x = Embedding(max_features + 1, embedding_dims, weights=[embedding_matrix], trainable=trainable)(input) # conv layers convs = [] for filter_size in [3,4,5]: l_conv = Conv1D(filters=filters, kernel_size=filter_size, activation='relu')(x) l_pool = MaxPooling1D()(l_conv) l_pool = Flatten()(l_pool) convs.append(l_pool) merge = concatenate(convs, axis=1) out = Dropout(0.2)(merge) output = Dense(32, activation='relu')(out) output = Dense(units=2, activation='softmax')(output) #输出层 model = Model([input], output) model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) #可视化 plot_model(model, to_file='yelp-cnn-model-textcnn.png',show_shapes=True) model.summary() return model #在 scikit-learn 中使用 Keras 的模型,我们必须使用 KerasClassifier 进行包装。这个类起到创建并返回我们的神经网络模型的作用。 # 它需要传入调用 fit()所需要的参数,比如迭代次数和批处理大小。 # 最新接口指定训练的次数为epochs clf = KerasClassifier(build_fn=baseline_model, epochs=10, batch_size=50, verbose=1) #使用5折交叉验证 scores = cross_val_score(clf, x, encoded_y, cv=5, scoring='f1_micro') # print scores print("f1_micro: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
for filename in glob.glob(os.path.join(negative_path, '*.txt')): with open(filename, 'r') as f: dataset.append((neg_label, f.read())) shuffle(dataset) return dataset # In[2]: from nltk.tokenize import TreebankWordTokenizer from gensim.models import KeyedVectors word_vectors = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True, limit=200000) def tokenize_and_vectorize(dataset): tokenizer = TreebankWordTokenizer() vectorized_data = [] expected = [] for sample in dataset: tokens = tokenizer.tokenize(sample[1]) sample_vecs = [] for token in tokens: try: sample_vecs.append(word_vectors[token]) except KeyError: pass # No matching token in the Google w2v vocab
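A hypothetical call to the helper above, assuming it finishes by returning vectorized_data (the visible excerpt is cut off before the return); the sample texts are made up.

sample = [(1, 'A surprisingly touching film with a clever script.'),
          (0, 'Flat characters and a plot that never gets going.')]
vectorized = tokenize_and_vectorize(sample)
print(len(vectorized), len(vectorized[0]), len(vectorized[0][0]))   # docs, tokens per doc, 300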
import random import tensorflow as tf import numpy as np from gensim.models import KeyedVectors from subjectivity.utils import get_data_from_list from subjectivity.utils import is_objective, is_subjective _bucket_size = 10 _path = os.path.dirname(__file__) _saving_dir = os.path.join(_path, '../data/save') _subjective_filename = os.path.join( _path, '../data/subj_dataset/subjective_test.txt') _objective_filename = os.path.join(_path, '../data/subj_dataset/objective_test.txt') _model = KeyedVectors.load_word2vec_format( os.path.join(_path, '../data/word_embeddings/glove.6B.50d.txt')) def count_true_and_false_positives_and_negatives(prediction, expected): true_positives = sum([ prediction[i] == is_subjective and expected[i] == is_subjective for i in range(len(expected)) ]) false_positives = sum([ prediction[i] == is_subjective and expected[i] != is_subjective for i in range(len(expected)) ]) true_negatives = sum([ prediction[i] != is_subjective and expected[i] != is_subjective for i in range(len(expected)) ])
import pandas as pd from sklearn.feature_extraction.text import TfidfVectorizer import matplotlib.pyplot as plt from gensim.models import Word2Vec from gensim.test.utils import datapath from gensim.models import KeyedVectors from sklearn.metrics import confusion_matrix import itertools from sklearn.model_selection import train_test_split import sys if (len(sys.argv) != 3): print("Usage: script.py <full path to w2v> <path to dataset>") print("importing word2vec") wv_from_bin = KeyedVectors.load_word2vec_format(datapath(sys.argv[1]), binary=True) # C binary format print("imported word2vec") df = pd.read_csv(sys.argv[2], sep=",", index_col=0, header=0, names=["body", "isAdHominem"]) train, test = train_test_split(df, test_size=0.3, random_state=3) print("In total, the train contains", sum(train["isAdHominem"] == True), "ad hominems") print("In total, the test contains", sum(test["isAdHominem"] == True), "ad hominems")
#model = Word2Vec.load(r"C:\Users\Colouree\Desktop\Colouree\word2vec.model") import time start = time.time() ####from gensim.test.utils import datapath, get_tmpfile from gensim.models import KeyedVectors ####from gensim.scripts.glove2word2vec import glove2word2vec #### #### #####glove_file = datapath(r'C:\Users\Colouree\Desktop\Colouree\glove.840B.300d.txt') #####tmp_file = get_tmpfile(r"glove.840B.300d_word2vec.txt") #####_ = glove2word2vec(glove_file, tmp_file) #####model = KeyedVectors.load_word2vec_format(tmp_file) #####model.save(r"C:\Users\Colouree\Desktop\Colouree\word2vec.model") #model=KeyedVectors.load(r"C:\Users\Colouree\Desktop\Colouree\word2vec.model") model1 = KeyedVectors.load( r"C:\Users\Colouree\Desktop\Colouree\google_word2vec.model") print("took {} secs to load the model".format(time.time() - start)) #start = time.time() #def compare_two_words(tag1,tag2): # result=model.similarity(tag1, tag2) # return result final_tags = [] for word in keys: x = '' for ij in word.split(): if ij in model1.vocab: x += ij + ' ' if not x == '': final_tags.append(x) import pandas as pd
Frequency threshold = 10 2000 iterations """ # Word frequency = 10 frequency_threshold = 10 # Exclude stopwords stop_words = set(stopwords.words('english')) # Load embedding model glove_file = datapath('/Users/jonabenja/Desktop/glove.twitter.27B/glove.twitter.27B.200d.txt') tmp_file = get_tmpfile('test_word2vec.txt') wordembeddings = glove2word2vec(glove_file, tmp_file) word_embedding_model = KeyedVectors.load_word2vec_format(tmp_file) # This model has 200 dimensions so we set the number of features to 200 num_features = 200 # Transform training data to use filepath = 'data/MELD/train_sent_emo.csv' dftrain = pd.read_csv(filepath) dftrain['Utterance'] = dftrain['Utterance'].str.replace("\x92|\x97|\x91|\x93|\x94|\x85", "'") filepath = './data/MELD/test_sent_emo.csv' dftest = pd.read_csv(filepath) dftest['Utterance'] = dftest['Utterance'].str.replace("\x92|\x97|\x91|\x93|\x94|\x85", "'") training_instances = tokenize_data(dftrain['Utterance'])
pad_id = word_to_token_map["<pad>"] # 对句子进行token转换,对于未在词典中出现过的词用unk的token填充 tokens = [word_to_token_map.get(word, unk_id) for word in sentence] if len(tokens) < limit_size: #补齐 tokens.extend([0] * (limit_size - len(tokens))) else: #截断 tokens = tokens[:limit_size] return tokens x_data = [convert_text_to_token(sentence) for sentence in x] x_data = np.array(x_data) wvmodel = KeyedVectors.load_word2vec_format('word60.vector') static_embeddings = np.zeros([VOCAB_SIZE, EMBEDDING_SIZE]) for word, token in tqdm(a.items()): if word in wvmodel.vocab.keys(): static_embeddings[token, :] = wvmodel[word] elif word == '<pad>': static_embeddings[token, :] = np.zeros(EMBEDDING_SIZE) else: static_embeddings[ token, :] = 0.2 * np.random.random(EMBEDDING_SIZE) - 0.1 print(static_embeddings.shape) X_train, X_test, y_train, y_test = train_test_split(x_data, y, test_size=0.3)
num_dense = np.random.randint(100, 150) rate_drop_lstm = 0.15 + np.random.rand() * 0.25 rate_drop_dense = 0.15 + np.random.rand() * 0.25 act = 'relu' re_weight = True # whether to re-weight classes to fit the 17.5% share in test set STAMP = 'lstm_%d_%d_%.2f_%.2f'%(num_lstm, num_dense, rate_drop_lstm, \ rate_drop_dense) ######################################## ## index word vectors ######################################## print('Indexing word vectors') word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, \ binary=True) print('Found %s word vectors of word2vec' % len(word2vec.vocab)) ######################################## ## process texts in datasets ######################################## print('Processing text dataset') # The function "text_to_wordlist" is from # https://www.kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text def text_to_wordlist(text, remove_stopwords=False, stem_words=False): # Clean the text, with the option to remove stopwords and to stem words. # Convert words to lower case and split them text = text.lower().split() # Optionally, remove stop words if remove_stopwords:
# Initialize the word embedding emb(x) with pretrained word vectors (e.g. vectors trained on the Google News dataset, roughly 100 billion words) and train the model. from gensim.models import KeyedVectors modelvec = KeyedVectors.load_word2vec_format("/content/drive/My Drive/2020年度/勉強会/GoogleNews-vectors-negative300.bin", binary=True) # fetch the pretrained word vectors VOCAB_SIZE = len(set(ids)) + 1 EMB_SIZE = 300 weights = np.zeros((VOCAB_SIZE, EMB_SIZE)) words_in_pretrained = 0 for i, word in enumerate(ids.keys()): try: weights[i] = modelvec[word] words_in_pretrained += 1 except KeyError: weights[i] = np.random.normal(scale=0.4, size=(EMB_SIZE,)) weights = torch.from_numpy(weights.astype(np.float32)) print(f'words covered by pretrained vectors: {words_in_pretrained} / {VOCAB_SIZE}') print(weights.size()) class RNN(nn.Module): def __init__(self, vocab_size, emb_size, padding_idx, output_size, hidden_size, num_layers, emb_weights=None, bidirectional=False): super().__init__() self.hidden_size = hidden_size self.num_layers = num_layers self.num_directions = bidirectional + 1 # unidirectional: 1, bidirectional: 2 if emb_weights is not None: # initialize the embedding layer weights with emb_weights when given self.emb = nn.Embedding.from_pretrained(emb_weights, padding_idx=padding_idx) else: self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=padding_idx)
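A hypothetical instantiation of the RNN class above with the pretrained weights; output_size, hidden_size, num_layers and padding_idx are placeholder values.

model = RNN(vocab_size=VOCAB_SIZE, emb_size=EMB_SIZE, padding_idx=0,
            output_size=4, hidden_size=50, num_layers=1,
            emb_weights=weights, bidirectional=False)
print(model)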
continue X = np.array(X)/np.linalg.norm(X) Y = np.array(Y)/np.linalg.norm(Y) o = np.dot(X, Y.T)/np.linalg.norm(X)/np.linalg.norm(Y) scores.append(o) scores = np.asarray(scores) return np.mean(scores), 1.96*np.std(scores)/float(len(scores)), np.std(scores) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('ground_truth', help="ground truth text file, one example per line") parser.add_argument('predicted', help="predicted text file, one example per line") parser.add_argument('embeddings', help="embeddings bin file") args = parser.parse_args() print("loading embeddings file...") w2v = KeyedVectors.load_word2vec_format(args.embeddings, binary=True) r = average(args.ground_truth, args.predicted, w2v) print("Embedding Average Score: %f +/- %f ( %f )" %(r[0], r[1], r[2])) r = greedy_match(args.ground_truth, args.predicted, w2v) print("Greedy Matching Score: %f +/- %f ( %f )" %(r[0], r[1], r[2])) r = extrema_score(args.ground_truth, args.predicted, w2v) print("Extrema Score: %f +/- %f ( %f )" %(r[0], r[1], r[2]))
from gensim.models import KeyedVectors model = KeyedVectors.load_word2vec_format( './data/GoogleNews-vectors-negative300.bin.gz', binary=True ) # collect the country names countries = set() with open('data/analogy_data_add.txt', 'r') as f: for line in f: line = line.split() if line[0] in ['capital-common-countries', 'capital-world']: countries.add(line[2]) elif line[0] in ['currency', 'gram6-nationality-adjective']: countries.add(line[1]) countries = list(countries) # fetch the word vectors countries_vec = [model[country] for country in countries] from sklearn.cluster import KMeans import numpy as np # k-means clustering kmeans = KMeans(n_clusters=5) kmeans.fit(countries_vec) for i in range(5): cluster = np.where(kmeans.labels_ == i)[0] print('cluster', i)
from future.utils import iteritems from builtins import range # Note: you may need to update your version of future # sudo pip install -U future from gensim.models import KeyedVectors # warning: takes quite awhile # https://code.google.com/archive/p/word2vec/ # direct link: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing # 3 million words and phrases # D = 300 word_vectors = KeyedVectors.load_word2vec_format( '../large_files/GoogleNews-vectors-negative300.bin', binary=True ) # convenience # result looks like: # [('athens', 0.6001024842262268), # ('albert', 0.5729557275772095), # ('holmes', 0.569324254989624), # ('donnie', 0.5690680742263794), # ('italy', 0.5673537254333496), # ('toni', 0.5666348338127136), # ('spain', 0.5661854147911072), # ('jh', 0.5661597847938538), # ('pablo', 0.5631559491157532), # ('malta', 0.5620371103286743)]
import h5py from keras.preprocessing.text import Tokenizer, text_to_word_sequence from keras.preprocessing.sequence import pad_sequences from keras.utils import to_categorical from keras.optimizers import Adam from keras.layers import Input, LSTM, GlobalMaxPool1D, Dense, Dropout, Embedding from keras.callbacks import ModelCheckpoint, EarlyStopping from keras.models import Model from gensim.models import KeyedVectors # %% word2vec_model = KeyedVectors.load_word2vec_format( 'GoogleNews-vectors-negative300.bin', binary=True, limit=1000000) # %% FILTERS = 60 MAXLEN = 100 MAX_FEAUTURE = 50000 DROPOUT_RATE = 0.1 DENSE_UNITS = 50 EMBED_SIZE = 128 LIST_CLASSES = [ "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate" ] # %% def get_tokenizer(texts):