def runWord2Vec(condition_one, condition_two):
    num_features = 300
    model_path = "../models/W2V/" + str(num_features) + "features_20minwords_10context"
    try:
        model = Word2Vec.load(model_path)
    except Exception:
        # Train the model first if it cannot be loaded from disk
        Create_W2V_model.trainWorld2Vec(num_features)
        model = Word2Vec.load(model_path)
def _load_vector_space_mapper(model_1_path, model_2_path, bilingual_path):
    """Build a vector space mapper from models 1, 2 and a bilingual dict."""
    model_1 = Word2Vec.load(model_1_path)
    model_2 = Word2Vec.load(model_2_path)
    bilingual_dict = bg.load_bilingual_dictionary(bilingual_path)
    tvecs_vm = VectorSpaceMapper(model_1, model_2, bilingual_dict)
    tvecs_vm.map_vector_spaces()
    return tvecs_vm
def query(question):
    model = Word2Vec.load('/home/jcoreyes/news_model')
    extractor = Rake()
    words = extractor.run(question)
    keywords = [words[i][0] for i in xrange(len(words))]
    return model.most_similar(positive=keywords)[0][0]
def out_of_core_x_normalisation(data_dir=HEP_TRAIN_PATH, batch_size=1024, persist=False): """ Get all the word2vec vectors in a 2D matrix and fit the scaler on it. This scaler can be used afterwards for normalizing feature matrices. """ doc_generator = get_documents(data_dir=data_dir) word2vec_model = Word2Vec.load(WORD2VEC_MODELPATH) scaler = StandardScaler(copy=False) no_more_samples = False while not no_more_samples: batch = [] for i in xrange(batch_size): try: batch.append(doc_generator.next()) except StopIteration: no_more_samples = True break vectors = [] for doc in batch: for word in doc.get_all_words(): if word in word2vec_model: vectors.append(word2vec_model[word]) matrix = np.array(vectors) print "Matrix shape: {}".format(matrix.shape) scaler.partial_fit(matrix) if persist: save_to_disk(SCALER_PATH, scaler) return scaler
def train(train_dir, test_dir=None, nn='cnn', nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, persist=False, no_of_labels=NO_OF_LABELS, verbose=1): model = MagpieModel( word2vec_model=Word2Vec.load(WORD2VEC_PATH), scaler=load_from_disk(SCALER_PATH), ) logger = CustomLogger(nn) model_checkpoint = ModelCheckpoint( os.path.join(logger.log_dir, 'keras_model'), save_best_only=True, ) history = model.train( train_dir, get_labels(no_of_labels), test_dir=test_dir, nn_model=nn, callbacks=[logger, model_checkpoint], batch_size=batch_size, nb_epochs=nb_epochs, verbose=verbose, ) finish_logging(logger, history, model.keras_model, persist=persist) return history, model
def build_model_for_corpus(corpus): """ Build an appropriate Keras NN model depending on the corpus """ if corpus == 'keywords': keras_model = cnn(embedding_size=100, output_length=10000) elif corpus == 'categories': keras_model = cnn(embedding_size=100, output_length=14) elif corpus == 'experiments': keras_model = cnn(embedding_size=100, output_length=500) else: raise ValueError('The corpus is not valid') model_path = os.path.join(DATA_DIR, corpus, 'model.pickle') keras_model.load_weights(model_path) w2v_model = Word2Vec.load(WORD2VEC_PATH) scaler = load_from_disk(SCALER_PATH) labels = get_labels(keras_model.output_shape[1]) model = MagpieModel( keras_model=keras_model, word2vec_model=w2v_model, scaler=scaler, labels=labels, ) return model
def test_ofm_word2vec_cosine_selection(self):
    model = Word2Vec.load(self.brownFilePath)
    ofmPredictor = OFMPredictions()
    testData = self.getOFMTestData()
    pred = ofmPredictor.word2VecSimilaritySelectionCosine(testData, model)
    optionSentences = [option['sent'] for option in testData['word1']['options']]
    self.assertTrue(pred['word1']['solution'] in optionSentences)
def word2vec_model(argument):
    try:
        return Word2Vec.load(argument)
    except Exception:
        raise ArgumentTypeError(
            'Could not read embeddings from {}'.format(argument)
        )
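# Usage sketch (an addition, not from the original source): because the function
# above raises argparse.ArgumentTypeError, it appears intended as an argparse
# type= converter. The parser and flag name below are hypothetical.
import argparse

parser = argparse.ArgumentParser(description='Hypothetical example')
parser.add_argument('--embeddings', type=word2vec_model,
                    help='path to a saved gensim Word2Vec model')
args = parser.parse_args()  # if --embeddings is given, args.embeddings is a loaded Word2Vec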
def mineAbbreviation():
    print 'mining abbreviation'
    jieba.load_userdict("../../../data/jieba_userdict.txt")
    stopword_set = text_process.getStopword('../../../data/stopword.txt')
    word2vec_model = Word2Vec.load('../../../data/word2vec.model')
    word_set = getWords()
    word_syn_dict = {}
    for word in word_set:
        word_syn_dict.setdefault(word, set([word]))
        if len(word) != 2:
            continue
        try:
            for simi_word_tuple in word2vec_model.most_similar(positive=[word], topn=20):
                simi_word = simi_word_tuple[0]
                simi_value = simi_word_tuple[1]
                reverse_word = word[1] + word[0]
                if reverse_word == simi_word:
                    pass
                else:
                    if len(set(word) & set(simi_word)) != len(word) or simi_value < 0.5 or word in simi_word or reverse_word in simi_word:
                        continue
                    word_syn_dict[word].add(simi_word)
        except:
            pass
            # print word
    outfile = open('abbreviation.txt', 'wb')
    for word in word_syn_dict.keys():
        if len(word_syn_dict[word]) >= 2:
            outfile.write(word + '@' + ','.join(word_syn_dict[word]) + '\r\n')
def __main__(): logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) parser = argparse.ArgumentParser(description='') parser.add_argument('-f', action='store', dest='filename', help='Data filename') parser.add_argument('-d', action='store', nargs="+", dest='dataset', help='Dataset name') parser.add_argument('-c', action='store', nargs="+", dest='categories', help='Dataset name') parser.add_argument('--topn', action='store', nargs="+", dest='topn', default='0', help='Dataset name') parser.add_argument('--model', action='store', nargs="+", dest='modelname', help='Similarity dictionary name') parser.add_argument('--lda', action='store_true', dest='test_lda', help='If on test lda features') parser.add_argument('--sd', action='store_true', dest='test_simdict', help='knn similarity') parser.add_argument('--w2v', action='store_true', dest='test_w2v', help='If on test w2v features') parser.add_argument('--w2v-topn', action='store_true', dest='test_w2v_topn', help='If on test w2v features') parser.add_argument('--pword', action='store_true', dest='perword', help='whether similar words taken per word') parser.add_argument('--kt', action='store_true', dest='kt', help='kenyan twits') arguments = parser.parse_args() print arguments datasets, filenames = prep_arguments(arguments) topns = map(int, arguments.topn) perword = arguments.perword if arguments.modelname is not None and not arguments.test_simdict: w2v_model_name = arguments.modelname[0] print w2v_model_name w2v_model = Word2Vec.load(w2v_model_name) w2v_model.init_sims(replace=True) else: w2v_model = None for dataset, filename in zip(datasets, filenames): for topn in topns: print dataset, filename, topn test_one_file(filename, dataset, topn, perword, w2v_model, arguments)
def train(train_dir, test_dir=None, nn='berger_cnn', nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, verbose=1): # Figure out whether we're predicting categories or keywords if NO_OF_LABELS == 14: scaler_path = CATEGORY_SCALER w2v_path = CATEGORY_WORD2VEC else: scaler_path = KEYWORD_SCALER w2v_path = KEYWORD_WORD2VEC model = MagpieModel( word2vec_model=Word2Vec.load(w2v_path), scaler=load_from_disk(scaler_path), ) logger = CustomLogger(nn) model_checkpoint = ModelCheckpoint( os.path.join(logger.log_dir, 'keras_model'), save_best_only=True, ) history = model.train( train_dir, get_labels(NO_OF_LABELS), test_dir=test_dir, nn_model=nn, callbacks=[logger, model_checkpoint], batch_size=batch_size, nb_epochs=nb_epochs, verbose=verbose, ) finish_logging(logger, history) return history, model
def __init__(self,
             model_file_path=''.join(config['model_file_path']),
             label_file_path=''.join(config['label_file_path']),
             word2vec_to_solve_oov=config['word2vec_to_solve_oov']):
    '''
    :param word2vec_to_solve_oov: whether to use word2vec to resolve OOV words
    '''
    self._word2vec_to_solve_oov = word2vec_to_solve_oov
    self._model_file_path = model_file_path
    self._full_mode = config['full_mode']
    logging.debug('Use full segmentation mode? %s...' % (self._full_mode))
    logging.debug('=' * 20)
    logging.debug('Loading classifier model and encoders...')
    model_in_file = open(model_file_path, 'r')
    self._model = pickle.load(model_in_file)
    self._bow_encoder = pickle.load(model_in_file)
    self._cnn_encoder = pickle.load(model_in_file)
    self._index_to_label = np.load(open(label_file_path, 'r'))
    self._keywords = self._bow_encoder.get_feature_names()
    # test
    logging.debug('=' * 20)
    logging.debug('Testing...')
    logging.debug('-' * 20)
    logging.debug('Loading word2vec model...')
    logging.debug('-' * 20)
    logging.debug('=' * 20)
    if config['word2vec_to_solve_oov']:
        self._word2vec_model = Word2Vec.load(config['word2vec_model_file_path'])
def load(filename):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    # *.bin.gz files are in the original C word2vec binary format
    if filename.endswith("bin.gz"):
        model = Word2Vec.load_word2vec_format(filename, binary=True)
    else:
        model = Word2Vec.load(filename)
    return model
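# Note (an addition, not from the original snippets): in recent gensim releases the
# Word2Vec.load_word2vec_format helper used above has been removed; the equivalent
# loader lives on KeyedVectors. A minimal sketch with a hypothetical path:
from gensim.models import KeyedVectors

vectors = KeyedVectors.load_word2vec_format("embeddings.bin.gz", binary=True)
print(vectors.most_similar("word", topn=5))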
def main(nouns_loc, word2vec_loc, n_nouns, out_loc): logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) # Load trained Word2Vec model model = Word2Vec.load(word2vec_loc) logger.info('Word2Vec object loaded') logger.info('Keeping %s nouns', n_nouns) # Empty dictionary for noun to vector mapping noun_to_vect_dict = {} # Counter to know when to stop counter = 0 with open(nouns_loc, 'r') as f: while counter < int(n_nouns): line = make_tuple(f.readline()) # Add noun and vector to mapping dictionary noun = line[0] noun_to_vect_dict[noun] = model[noun] # Increment counter counter += 1 logger.info('Pickling noun to vector dictionary') # Pickle dictionary with open(path.join(out_loc, 'noun_to_vect_dict_' + n_nouns + '.pkl'), 'w') as f: pickle.dump(noun_to_vect_dict, f)
def get_model():
    '''lazy initialization for w2v model so it works in pool'''
    global model
    if model is None:
        print 'loading the w2v model...'
        model = Word2Vec.load('w2v/lemma_stopwords')
    return model
def write(self, model_path, dim): model = W.load(model_path) words = model.vocab.keys() conn = self.conn cur = self.cur create_q = "CREATE TABLE %s (word text," % 'array' for i in range(dim): create_q += "D%d real," % (i) create_q = create_q[:-1]+")" insert_q = "INSERT INTO %s VALUES (" % 'array' for i in range(dim+1):insert_q += "?," insert_q = insert_q[:-1]+")" cur.execute(create_q) conn.commit() inp =[] for idx,word in enumerate(words): inp.append((word,) + tuple(model[word].tolist())) if idx % 10000 == 0: cur.execute(insert_q,inp) conn.commit() del inp inp=[] continue cur.execute("CREATE UNIQUE INDEX idx_word ON %s(word)" % 'array') cur.close() conn.close()
def train():
    t0 = time.time()
    filename = './data/seg20180327.txt'
    if not os.path.exists(filename):
        for tsv in ['labeledTrainData.tsv', 'unlabeledTrainData.tsv', 'testData.tsv']:
            logger.info("loading %s ...." % tsv)
            load_tsv('./data/' + tsv, filename)
    sents = word2vec.Text8Corpus(filename)
    t1 = time.time()
    logger.info("load text takes %s" % (time.time() - t0))

    model_path = './data/model.w2v'
    if not os.path.exists(model_path):
        num_features, num_workers = 300, 4
        min_word_count, context = 20, 10
        downsampling = 1e-3
        model = word2vec.Word2Vec(
            sents,
            workers=num_workers,
            size=num_features,
            min_count=min_word_count,
            window=context,
            sample=downsampling
        )
        model.init_sims(replace=True)
    else:
        model = Word2Vec.load(model_path)
        # model.save_word2vec_format(output_vec, binary=False)
        model.build_vocab(sents, update=True)
        model.train(sents, total_examples=model.corpus_count, epochs=model.iter)
    # the generated vocabulary
    # model.vocab
    logger.info('w2v train takes %s' % (time.time() - t1))
    model.save('./data/model.w2v')
def predict(algorithm='rf'): train = get_reviews('data/imdb/train_data.csv') if not os.path.exists(model_name): #unlabeled_train = get_reviews('data/unlabeledTrainData.tsv') sentences = get_sentences(train['review'])# + get_sentences(unlabeled_train['review']) train_word2vec(sentences) model = Word2Vec.load(model_name) clean_train_reviews = get_clean_reviews(train['review']) train_features = get_features(clean_train_reviews, model, num_features, with_idf=False) classifier = train_classifier(algorithm, train_features, train) # Free memory ! del train del clean_train_reviews del train_features test = get_reviews('data/imdb/test_data.csv') clean_test_reviews = get_clean_reviews(test['review']) test_features = get_features(clean_test_reviews, model, num_features) evaluate(test_features, test, classifier)
def getAllFeatures(train, mapper): print "this is getAllFeatures" # every record has a cluster value calculated by lda w2c_f, w2c_w = 10, 14 lda_dict_1 = util.read_dict(util.features_prefix + 'id_lda_256.pkl') lda_dict_2 = util.read_dict(util.features_prefix + 'id_lda_512.pkl') k_mean_dict_1 = util.read_dict(util.features_prefix + 'c_k_all_64.pkl') k_mean_dict_2 = util.read_dict(util.features_prefix + 'c_k_all_128.pkl') sentence_dict_path = util.txt_prefix + 'id_sentences.pkl' word2vec_path = util.txt_prefix + str(w2c_f) + 'features_1minwords_' + str(w2c_w) + 'context.pkl' sentence_dic = util.read_dict(sentence_dict_path) model = Word2Vec.load(word2vec_path) train_X = train[features] train_X = mapper.transform(train_X) # .values new_train_X = [] for i in xrange(len(train_X)): id = train_X[i][0] lda_1 = lda_dict_1[id] lda_2 = lda_dict_2[id] s = sentence_dic.get(id) f = np.concatenate(([train_X[i][1:].astype(np.float32)], [sentence_to_matrix_vec(s, model, w2c_f, k_mean_dict_1, k_mean_dict_2)]), axis=1)[0] f = np.concatenate(([f], [[lda_1, lda_2]]), axis=1)[0] new_train_X.append(f) new_train_X = np.array(new_train_X) return new_train_X
def __init__(
        self,
        train_data=None,
        word2vec_model_file_path=None,
        word_embedding_length=None,
        full_mode=True,
        remove_stopword=True,
        sentence_padding=7,
        verbose=0,
):
    """
    :param train_data: training sentences
    :type train_data: np.array([])
    """
    self.__full_mode__ = full_mode
    self.__remove_stopword__ = remove_stopword
    self.__verbose__ = verbose
    self.__sentence_padding__ = sentence_padding
    self.__word_embedding_length__ = word_embedding_length
    self.__word2vec_model__ = Word2Vec.load(word2vec_model_file_path % word_embedding_length)
    if train_data is not None:
        self.__train_data__ = train_data
        self.build_encoder(train_data)
def compare(dataset, model_name, pre_model_name): # build model if(os.path.isfile(model_name)): model = Word2Vec.load(model_name) logger.debug("model %s already exist, stop training wordvector", model_name) else: logger.info("start trainning word vector") start_time = timeit.default_timer() model = wordvector.build_word_vector(dataset, save=True, save_file=model_name) logger.info("model %s trained in %.4lfs", model_name, timeit.default_timer() - start_time) # find most similar words: for word in keywords: print word print model.most_similar(word, topn=10); # load pre-trained google news model logger.info("start loading pre-trained dataset") start_time = timeit.default_timer() pre_model = Word2Vec.load_word2vec_format(pre_model_name, binary=True) logger.info("pre-trained dataset loaded in %.4lfs", timeit.default_timer() - start_time) # find most similar words: for word in keywords: print word print pre_model.most_similar(word, topn=10);
def fit_scaler(data_dir, word2vec_model=WORD2VEC_MODELPATH, batch_size=1024, persist_to_path=SCALER_PATH): if type(word2vec_model) == str: word2vec_model = Word2Vec.load(word2vec_model) doc_generator = get_documents(data_dir) scaler = StandardScaler(copy=False) no_more_samples = False while not no_more_samples: batch = [] for i in xrange(batch_size): try: batch.append(doc_generator.next()) except StopIteration: no_more_samples = True break vectors = [] for doc in batch: for word in doc.get_all_words(): if word in word2vec_model: vectors.append(word2vec_model[word]) matrix = np.array(vectors) print "Fitted to {} vectors".format(matrix.shape[0]) scaler.partial_fit(matrix) if persist_to_path: save_to_disk(persist_to_path, scaler) return scaler
def main(): """ main function to make prediction use random forest :return: """ train = pd.read_csv("/path/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3) test = pd.read_csv("/path/testData.tsv", header=0, delimiter="\t", quoting=3) modelName = "/path/Word2VectforNLPTraining" model = Word2Vec.load(modelName) print("Processing training data...") cleaned_training_data = processData.clean_data(train) trainingDataFV = getAvgFeatureVecs(cleaned_training_data,model) print("Processing test data...") cleaned_test_data = processData.clean_data(test) testDataFV = getAvgFeatureVecs(cleaned_test_data,model) n_estimators = 100 result = randomForestClassifier.rfClassifer(n_estimators, trainingDataFV, train["sentiment"],testDataFV) output = pd.DataFrame(data={"id": test["id"], "sentiment": result}) output.to_csv("Word2Vec_AvgVecPredict.csv", index=False, quoting=3)
def fit_scaler(data_dir, word2vec_model, batch_size=1024, persist_to_path=None):
    """
    Get all the word2vec vectors in a 2D matrix and fit the scaler on it.
    This scaler can be used afterwards for normalizing feature matrices.
    """
    if type(word2vec_model) == str:
        word2vec_model = Word2Vec.load(word2vec_model)

    doc_generator = get_documents(data_dir)
    scaler = StandardScaler(copy=False)

    no_more_samples = False
    while not no_more_samples:
        batch = []
        for i in range(batch_size):
            try:
                batch.append(six.next(doc_generator))
            except StopIteration:
                no_more_samples = True
                break

        vectors = []
        for doc in batch:
            for word in doc.get_all_words():
                if word in word2vec_model:
                    vectors.append(word2vec_model[word])

        matrix = np.array(vectors)
        print("Fitted to {} vectors".format(matrix.shape[0]))
        scaler.partial_fit(matrix)

    if persist_to_path:
        save_to_disk(persist_to_path, scaler)

    return scaler
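# Usage sketch (an addition; only fit_scaler's own signature is taken from the
# snippet above, the directory and file names are hypothetical):
scaler = fit_scaler(
    data_dir='data/train',                     # directory scanned by get_documents()
    word2vec_model='models/word2vec.gensim',   # a path string is loaded via Word2Vec.load
    batch_size=1024,
    persist_to_path='models/scaler.pkl',       # saved with save_to_disk for later reuse
)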
def refine_senses(model, words): """ Determine a more accurate number of senses for each word based on the most_similar senses of each sense of each word in the model """ model = Word2Vec.load(model) gold_word_senses = {} for w in words: senses = get_senses(model, w) if len(senses) > 1: sense_overlaps = find_overlaps(senses) if sense_overlaps: gold_word_senses[w] = determine_num_senses(sense_overlaps) #print gold_word_senses[w], w else: gold_word_senses[w] = add_senses(senses) #print gold_word_senses[w], w elif len(senses) == 1: gold_word_senses[w] = add_senses(senses) #print gold_word_senses[w], w else: gold_word_senses[w] = 2 with open('/Users/adamberger/Desktop/CLMasters/Word_Representation_WSD/gold_word_senses.txt', 'w') as f: for word in gold_word_senses: f.write(word + ' ' + str(gold_word_senses[word]) + '\n') return gold_word_senses
def initialize(fword, tword, modelfn, start, debug): juman = Juman() # parse and check from_word ms_f = juman.analysis(fword).mrph_list() if len(ms_f) > 1: print(u'{} is parsed multiple words'.format(fword)) exit(1) wm_f = ms_f[0] if not wm_f.repname: print(u'no repname with {}'.format(fword)) exit(1) fword = wm_f.repname # parse and check to_word ms_t = juman.analysis(tword).mrph_list() if len(ms_t) > 1: print(u'{} is parsed multiple words'.format(tword)) exit(1) wm_t = ms_t[0] if not wm_t.repname: print(u'no repname with {}'.format(tword)) exit(1) tword = wm_t.repname # load and check model print(u'loading model...') if modelfn.split('.')[-1] == 'model': model = Word2Vec.load(modelfn) elif modelfn.split('.')[-1] == 'bin': model = Word2Vec.load_word2vec_format(modelfn, binary=True, unicode_errors='ignore') if fword not in model.vocab: raise KeyError(u'{} is not found in the model'.format(fword)) exit(1) elif tword not in model.vocab: raise KeyError(u'{} is not found in the model'.format(tword)) exit(1) model.save('hs0.100m.500.5.18mgt100.model') t1 = time.clock() - start if debug: printtime(t1) print(u'constructing id2vocab map...') id2vocab = {} for i, v in enumerate(model.vocab): id2vocab[i] = v t2 = time.clock() - t1 if debug: printtime(t2) print(u'constructing V...') V = [] for v in model.vocab: V.append(model[v]) V = np.vstack(V) t3 = time.clock() - t2 if debug: printtime(t3) return fword, tword, model, V, id2vocab, t3
def main():
    mymodel = Word2Vec.load('./model/model1_2_5')
    freq = freqWord(4, 0.01, stopwordsFile='../config/stopWords1.txt')
    data = get_array_data('../data/news_lines_splited.txt')[:5000]
    num = 0
    for line in data:
        num += len(line)
    print(num)
def most_similar(file, word, num):
    '''Get the most similar words for a single word.'''
    file = os.path.join(os.getcwd(), file)
    if not os.path.exists(file):
        raise gg.NameError('The trained model file does not exist')
    models = Word2Vec.load(file)
    list1 = models.most_similar(str(word), topn=int(num))
    return str(list1)
def show_tsne():
    model = Word2Vec.load(model_name)
    embeddings = np.zeros((len(model.index2word), num_features), dtype="float32")
    for i, word in enumerate(model.index2word):
        if (i + 1) % 1000 == 0:
            print('Embeddings {}'.format(i + 1))
        embeddings[i, :] = model[word]
    plot_tsne(model.index2word, embeddings)
def test_vector(n=0, mincount=1):
    sbcs = texeval_corpus.test_subcorpora
    sbc = sbcs[n]
    fname = 'WIKI_' + '.10epochs.singletok.min' + str(mincount) + '.deep'
    model = Word2Vec.load(fname)
    for termid, term in texeval_corpus.terms('test', sbc):
        if len(term.split()) == 1 and term in model:
            print termid, term, model[term]
# import modules & set up logging
import gensim, logging
from fileObject import FileObj
from gensim.models import Word2Vec

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

if __name__ == '__main__':
    file_obj = FileObj(r"testSet/data")
    sentences = file_obj.read_lines_1_words()
    #model = Word2Vec(sentences, sg=1, size=100, window=5, min_count=5, negative=3, sample=0.001, hs=1, workers=4)
    #model.save('w2v_model')
    model = Word2Vec.load('w2v_model')
    print(model.most_similar(['怀孕']))
    print(model.similarity('怀孕', '孕妇'))
from tries_Harshil import Trie from tries_Harshil import TrieNode import autocomplete2 from autocomplete2 import helpers import json from gensim.models import Word2Vec app = Flask(__name__) huge_file = "/Users/harshitg/github/autocomplete/autocomplete/everything_combined_processed.txt" huge_list = [] with open(huge_file, "r") as f: for line in f: huge_list.extend(line.split()) keys = list(helpers.chunks(huge_list, 2)) # keys to form the trie structure. new_list = [] model = Word2Vec.load( "/Users/harshitg/github/autocomplete/autocomplete/bigrams_fasttext_processed_cb.model" ) model_vectors = model.wv for i in range(0, len(keys)): curr_list = keys[i] new_list.append(curr_list[0] + ' ' + curr_list[1]) trie = Trie() trie.formTrie(new_list) @app.route('/', methods=['GET', 'POST']) def print_suggestions(): if request.method == 'POST': auto_suggestions = trie.printAutoSuggestions( request.get_json().get('item'), new_list, 10) if auto_suggestions == 0 or auto_suggestions == -1:
# give each word an index def toindex(words): data = [] for word in words: try: data.append(word_index[word]) except: continue return data if __name__ == '__main__': # import trained words vector model = Word2Vec.load("skipgram.model") wordvectors = model.wv vocab_list = [word for word, Vocab in wordvectors.vocab.items()] word_index = {" ": 0} word_vector = {} embedding_dim = model.vector_size embeddings_matrix = np.zeros((len(vocab_list) + 1, embedding_dim)) for i in range(len(vocab_list)): word = vocab_list[i] word_index[word] = i + 1 word_vector[word] = wordvectors[word] embeddings_matrix[i + 1] = wordvectors[word] cvscores = []
embeddings_path=embeddings_path, vocab_path=vocab_path, min_count=W2V_MINCOUNT, size=EMBEDDING_SIZE, sg=1, batch_words=W2V_BATCHWORDS, iter=W2V_ITERS, workers=multiprocessing.cpu_count()) else: print("Loading embeddings...\n") # load vocab todo: does this work??? vocab, _ = load_vocab(vocab_path) # load embedding model todo: does this work??? model = Word2Vec.load(embeddings_path) # load the data. print("Loading data...\n") f_sents = codecs.open(sents_file, 'rb', encoding='utf8') f_classes = codecs.open(label_file, 'rb', encoding='utf8') sents = [sent.strip() for sent in f_sents.readlines()] labels = [label.strip() for label in f_classes.readlines()] # number of labels num_labels = len(set(labels)) # fit vectorizers print("Fitting tokenizer...\n") # get count vectors
# emotion lists emotion_dict = { 6: sorted(['love', 'anger', 'surprise', 'joy', 'sadness', 'fear']), 8: sorted([ 'amusement', 'awe', 'contentment', 'excitement', 'anger', 'disgust', 'fear', 'sadness' ]) } # load word2vec model print("Adobe Word2Vec loading") model_folder = "/nfs/bigfovea/add_disk0/eugenia/Emotion/wordembedding_models/" model_file = "w2v_adobe.model" model = Word2Vec.load(os.path.join(model_folder, model_file)) for num in [6, 8]: for method in ['tag-emo', 'all_tags-emo']: print("--> Baselines %s %s-class:" % (method, num)) data_set = [] data_counts = {} k = 0 for img_path in glob.glob(img_folder + "*.jpg"): k += 1 if k % 1000 == 0: print(k // 1000, "K images processed") # save the data_set save2pickle( os.path.join(dataset_dir, '%s_%s.pkl' % (method, num)), data_set)
import keras from keras.models import Model from keras import backend as K from gensim.models import Word2Vec import W2VProcessing as processing import pytextvec as pytextvec model = keras.models.load_model('data/model.h5') print("Loading Gensim Model...") word_vectors = Word2Vec.load('data/word2vec/500features_10minwords_10context') maxlen = 10 X = processing.loadXTrain( '/home/quelibrio/Work/Bevrage/BevBox/receiptcomprehension/data/SamplePrintScans/bevmo_sample_receipt_01.txt' ) x_test = processing.comments2Matrix(X, word_vectors, maxlen) print(x_test) prediction = model.predict(x=x_test, batch_size=1000) print(prediction) #fpr_keras, tpr_keras, thresholds_keras = roc_curve(ytest.argmax(),prediction.argmax()) #from sklearn.metrics import auc #auc_keras = auc(fpr_keras, tpr_keras) #import tensorflow as tf #prediction1=tf.argmax(logits,1) #print(prediction[0]) #print(prediction)
predicted = model.predict(data)[0] predictedY = predicted.argmax(axis=-1) return predictedY if __name__ == "__main__": filterSizes = [3, 4, 5] numOfFilters = 100 # tested with 10, 20 dropout = 0.5 batchSize = 1000 epochs = 20 sequenceLength = 20 # Twitter max length is 140 chars embeddingDim = 50 numOfLabels = 5 drop = 0.5 wvModel = Word2Vec.load('vectors.bin') # sentencesTrain, emojisTrain = obtainData() # dataTrain, labelsTrain, wordIdTrain = obtainData() # dataTest, labelsTest, wordIdTest = obtainData("test") dataTrain, dataTest, labelsTrain, labelsTest, wordIdMap, maxLength, idEmojiMap = buildDataFull() packedData = {"len": maxLength, "dic": wordIdMap, "emo": idEmojiMap} js = json.dumps(packedData) fp = open("datacnn.json", "w") fp.write(js) fp.close() embeddingMatrix = np.zeros((len(wordIdMap)+1, embeddingDim)) for word, i in wordIdMap.items(): try: vector = wvModel.wv[word] embeddingMatrix[i] = vector
['and', 'the', 'final', 'sentence']] # train model model = Word2Vec(sentences, min_count=1) # summarize the loaded model print(model) # summarize vocabulary words = list(model.wv.vocab) print(words) # acces to the vector associated to "sentence" print(model['sentence']) # save model model.save('model.bin') # load model new_model = Word2Vec.load('model.bin') print(new_model) # fit a 2d PCA model to the vectors X = [] for i in words: X.append(model[i]) pca = PCA(n_components=2) result = pca.fit_transform(X) # create a scatter plot of the projection pyplot.scatter(result[:, 0], result[:, 1]) words = list(model.wv.vocab) for i, word in enumerate(words): pyplot.annotate(word, xy=(result[i, 0], result[i, 1])) pyplot.show()
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors, Word2Vec
from gensim.models.word2vec import LineSentence, PathLineSentences

model = Word2Vec.load("word2vec.model")
score, predictions = model.wv.evaluate_word_analogies(
    './data/questions-words.txt')
print(score)

model = KeyedVectors.load_word2vec_format(
    "./data/GoogleNews-vectorsnegative300.bin", binary=True, limit=60000)
score, predictions = model.evaluate_word_analogies(
    './data/questions-words.txt')
print(score)
save_id = args.save_id
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
print('' * 100)
print('Running test ' + test)

n_batch = n_steps * n_cpu
n_updates = n_iters // n_batch

desc = os.path.join(
    "QuickDraw/embeddings/embeddings/",
    "act2vec_dataset:" + dataset + "_wordlen:" + str(stroke_length) +
    "_dim:" + str(embedding_dim) + "_win:" + str(window))

act2vec_model = Word2Vec.load(desc + ".model")
print('Found {} words in model'.format(len(list(act2vec_model.wv.vocab))))

env = SubprocVecEnv([
    lambda: PainterGym(width=256,
                       height=256,
                       embd_dim=embedding_dim,
                       action_translator=action_translator,
                       square_size=args.square_size,
                       act2vec_model=act2vec_model,
                       action_buff_size=1,
                       action_space='discrete',
                       emb_type=test)
    for i in range(n_cpu)
])
sentence = r.sub('', str(item)) seg_list = tokenizer(sentence) to_csv_content.append(seg_list) target_word = [ '芯片', '紫光', '行业', '农业', '券商', '电动车', '电池', '服务器', '军工', '白酒', '医药', '健康', '医疗', '水泥', '上涨', '北京' ] target_ner = [ '紫光国微', '五粮液', '中兵红箭', '宁德时代', '三一重工', '东方航空', '恒瑞医药', '山东药玻', '太极实业', '中船防务', '中国平安', '招商轮船', '中兴通讯', '浪潮信息', '东华软件', '东山精密', '旋极信息' ] #############寻找最相似词汇###################### text = data['content'][12] model = Word2Vec.load("D:/nlp_learning/sinavoacb.model") r = re.compile("[\s+\.\!\/_,$%^*(+\"\']+|[+——!;「」》::“”·‘’《,。?、~@#¥%……&*()()]+") #########停用词############## def stopwordslist(filepath): stopwords = [ line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines() ] return stopwords stopwords = stopwordslist('D:/nlp_learning/停用词表/characters-master/stop_words') score_list = []
def load_model(iter):
    model = Word2Vec.load(output_modelPath + str(iter) + 'model.bin')
    return model
test_tokens = test['tokens'] test_counts = test['counts'] args.num_docs_test = len(test_tokens) test_1_tokens = test['tokens_1'] test_1_counts = test['counts_1'] args.num_docs_test_1 = len(test_1_tokens) test_2_tokens = test['tokens_2'] test_2_counts = test['counts_2'] args.num_docs_test_2 = len(test_2_tokens) emb_type = 'none' #bert embeddings = None if (emb_type == 'w2v'): from gensim.models import Word2Vec model = Word2Vec.load("/content/word2vec/w2v_10eps_model.model") vectors = model.wv print('loaded') elif (emb_type == 'bert'): from sentence_transformers import SentenceTransformer from sentence_transformers import models, losses import scipy.spatial import pickle as pkl word_embedding_model = models.BERT("/content/models") # Apply mean pooling to get one fixed sized sentence vector pooling_model = models.Pooling( word_embedding_model.get_word_embedding_dimension(), pooling_mode_mean_tokens=True, pooling_mode_cls_token=False, pooling_mode_max_tokens=False) model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
def w2v_pad(df_train, df_test, col, maxlen_, victor_size): tokenizer = text.Tokenizer(num_words=args.num_words, lower=False, filters="") tokenizer.fit_on_texts( list(df_train[col].values) + list(df_test[col].values)) train_ = sequence.pad_sequences(tokenizer.texts_to_sequences( df_train[col].values), maxlen=maxlen_) test_ = sequence.pad_sequences(tokenizer.texts_to_sequences( df_test[col].values), maxlen=maxlen_) word_index = tokenizer.word_index count = 0 nb_words = len(word_index) print(nb_words) all_data = pd.concat([df_train[col], df_test[col]]) file_name = '../embedding/' + 'Word2Vec_' + col + "_" + str( victor_size) + '.model' if not os.path.exists(file_name): model = Word2Vec([[word for word in document.split(' ')] for document in all_data.values], size=victor_size, window=5, iter=10, workers=11, seed=2018, min_count=2) model.save(file_name) else: model = Word2Vec.load(file_name) print("add word2vec finished....") glove_model = {} with open("../embedding/glove_vectors_word.txt", encoding='utf8') as f: for line in f: values = line.rstrip().rsplit(' ') word = values[0] coefs = np.asarray(values[1:], dtype='float32') glove_model[word] = coefs print("add glove finished....") embedding_word2vec_matrix = np.zeros((nb_words + 1, victor_size)) for word, i in word_index.items(): embedding_vector = model[word] if word in model else None if embedding_vector is not None: count += 1 embedding_word2vec_matrix[i] = embedding_vector else: unk_vec = np.random.random(victor_size) * 0.5 unk_vec = unk_vec - unk_vec.mean() embedding_word2vec_matrix[i] = unk_vec glove_count = 0 embedding_glove_matrix = np.zeros((nb_words + 1, victor_size)) for word, i in word_index.items(): embedding_glove_vector = glove_model[ word] if word in glove_model else None if embedding_glove_vector is not None: glove_count += 1 embedding_glove_matrix[i] = embedding_glove_vector else: unk_vec = np.random.random(victor_size) * 0.5 unk_vec = unk_vec - unk_vec.mean() embedding_glove_matrix[i] = unk_vec embedding_matrix = np.concatenate( (embedding_word2vec_matrix, embedding_glove_matrix), axis=1) print(embedding_matrix.shape, train_.shape, test_.shape, count * 1.0 / embedding_matrix.shape[0], glove_count * 1.0 / embedding_matrix.shape[0]) return train_, test_, word_index, embedding_matrix
from gensim.models import Word2Vec

# test model
print('loading model...')
model = Word2Vec.load("assets/gay_seattle.w2v")
print("seattle", model.wv.most_similar('seattle', topn=50))
print(model.wv.distances('seattle', ('news', 'june', 'times', 'march')))
# seattle [('news', 0.9989323616027832), ('june', 0.998815655708313), ('times', 0.9987982511520386), ('march', 0.9987823963165283), ('apr', 0.9987049102783203), ('july', 0.9985809326171875), ('nov', 0.9984444379806519),

# print("model details: ", model)
# print('similar words to seattle:')
# print("capitol", model.wv.most_similar('capitol'))
# # print("gay", model.wv.most_similar('gay', topn=50))
# print(model.wv.most_similar('lesbian'))
# print(model.wv.most_similar('considered'))
# print(model.wv.most_similar('number'))
# print("=================")
# print(model.wv.distances('seattle', ('gay', 'renton', 'lesbian', 'rain')))
# print(model.wv.distance('seattle', 'civil'))
# print(model.wv.distance('seattle', 'lesbian'))
# print(model.wv.rank('seattle', 'gay'))
# print(model.wv.rank('seattle', 'lesbian'))
# print(model.wv.distances('seattle'))
model_num = 0     # which of the models to use
similar_qty = 10  # number of similar products to return

models_list = [
    'models/word2vec/w2v_mymodel_33_min50_sg0_i220_window5_size300',
    # 'models/word2vec/w2v_mymodel_33_min1000_sg0_i200_window5_size300',
    # ------------
    'models/word2vec/w2v_mymodel_33_min5_sg0_i250_window3_size300_transSplit',
    # 'models/word2vec/w2v_mymodel_33_min1000_sg0_i200_window5_size300',
    # ------------
    'models/word2vec/w2v_mymodel_33_mincount1_min1_sg0_i230_window5_size300',
    # # 'models/word2vec/w2v_mymodel_33_mincount1_min1_sg0_i400_window10_size300',
    # 'models/word2vec/test_29 of 81 nodes_250 alpha_ 0.0025 epochs_ 100 windows_ 4 time_0_02_20_155266'
]
# General requirements: do not use sg=1.
# Learning rate and epoch count: a rate of 0.005 has given the best result so far,
# and 220 epochs is enough.

bdd_rms = pd.read_excel('library/BDD.xlsx',
                        names=[
                            'product', 'department', 'model_adeo', 'model_name',
                            'date_created', 'product_name', 'is_stm'
                        ])
bdd_rms['product'] = bdd_rms['product'].astype(str)

model = Word2Vec.load(models_list[model_num])
#%%
import os import time import gc import random from keras.preprocessing import text, sequence import torch from torch import nn from torch.utils import data from torch.nn import functional as F from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score from gensim.models import KeyedVectors, word2vec, Word2Vec import pickle, json wv_from_bin = pickle.load(open("GloVe_50.pkl", 'rb')) ###GLOVE wv_from_scratch = Word2Vec.load('word2vec.model') ##word2vec from scratch wordVectors = np.load( "/home/wzh/wzh/glove/wordVectors.npy") ##word2vec delta training tokens = json.load( open("/home/wzh/wzh/glove/tokens.json")) ##word2vec delta training cate2id = json.load(open("label.json", "rb")) NUM_LABLES = len(cate2id) NUM_MODELS = 1 LSTM_UNITS = 128 DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS MAX_LEN = 40 BATCH_SIZE = 32 EPOCH = 20 MODE = "delta_word2vec" ###GLOVE, word2vec, delta_word2vec DATASET = "embed_eval_data_sample_general.csv"
scores = {}
for word in test_words:
    if word not in positive + negative:
        test_word = unitvec(np.array(model[word]))
        # Cosine similarity
        scores[word] = np.dot(test_word, mean)
print(sorted(scores, key=scores.get, reverse=True)[:10])

from gensim.models import Word2Vec

model = Word2Vec.load('wiki.en.word2vec.model')
positive_words = ["", ""]
negative_words = [""]

# Test Word2vec
print("Testing Word2vec")
#model = word2vec.getModel()
test(model, positive_words, negative_words, model.wv.vocab)

# # Test Fasttext
# print("Testing Fasttext")
# model = fasttxt.getModel()
# test(model,positive_words,negative_words,model.words)
def query_model(word): results = list() most_similar = list() global model global model_path global num_of_words global stopwords if list(word)[0].isupper(): firstupper = True else: firstupper = False if model == None: # if model is not initialised print("Loading Natural Language Processing Model...") try: start = time() model = w2v.load(model_path) print("Model Loaded Successfully! Took " + str(round(time()-start,2)) + " seconds.") except FileNotFoundError: print("Error: Model does not exist in path. Exiting...") exit() except ValueError: print("Path is not a valid model. Perhaps it is corrupt? Exiting...") exit() word = word.lower() # convert word to lowercase s = remove_nums_after_s(word) s = remove_nums_before_s(s) if s not in stopwords: # only queries the model if word is not a stopword try: print("Querying model for: " + s) most_similar = model.wv.most_similar(s, topn=num_of_words) except KeyError: word_found = False for ret_word in remove_substitution(word,removeall=True): try: most_similar = model.wv.most_similar(ret_word,topn=num_of_words) word_found = True # becomes true if exception does not occur print("Word stripped to: " + ret_word) break except KeyError: try: ret_word = remove_nums_after_s(ret_word) ret_word = remove_nums_before_s(ret_word) most_similar = model.wv.most_similar(ret_word,topn=num_of_words) word_found = True print("Word stripped to: " + ret_word) break except KeyError: continue if word_found == False: print(word + " - Could not find result for this word in NLP model. This word will be skipped") for item in most_similar: if item[1] > 0.5: # if likeness > 0.7 if firstupper: results.append(capitalise_first_char(item[0])) else: results.append(item[0]) return results
from gensim.models import Word2Vec import sys model_name = sys.argv[1] model = Word2Vec.load(model_name) # Get wordvectors for all words in vocabulary. word_vectors = model.wv.syn0 print(model) print(word_vectors.shape) words = list(model.wv.vocab) print(words) print(model['yellow']) # https://machinelearningmastery.com/develop-word-embeddings-python-gensim/ # model.wv['word'] #sentences = [['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec'], # ['this', 'is', 'the', 'second', 'sentence'], # ['yet', 'another', 'sentence'], # ['one', 'more', 'sentence'], # ['and', 'the', 'final', 'sentence']] # train model #model = Word2Vec(sentences, min_count=1) # summarize the loaded model # print(model) # summarize vocabulary # words = list(model.wv.vocab)
subjects_eng = [ 'location_traffic_convenience', 'location_distance_from_business_district', 'location_easy_to_find', 'service_wait_time', 'service_waiters_attitude', 'service_parking_convenience', 'service_serving_speed', 'price_level', 'price_cost_effective', 'price_discount', 'environment_decoration', 'environment_noise', 'environment_space', 'environment_cleaness', 'dish_portion', 'dish_taste', 'dish_look', 'dish_recommendation', 'others_overall_experience', 'others_willing_to_consume_again' ] subjects_dict = OrderedDict(zip(subjects_eng, subjects)) config.max_aspect_len = len(subjects[0].split(' ')) print('building word vector...') print('构造 word2id 映射') config.w2v_path, config.w2v_word2id_txt = '../data/all_content_no_punc_100_8_mc2_fnl.w2v', 'word2id_map_mc2_fnl' w2v = Word2Vec.load(config.w2v_path) print(len(w2v.wv.vocab)) word2id, max_context_len, max_aspect_len = get_word2id( '../data/', subjects, 'ai_challenger_sentiment_analysis_trainingset_20180816/sentiment_analysis_trainingset', 'ai_challenger_sentiment_analysis_validationset_20180816/sentiment_analysis_validationset', 'ai_challenger_sentiment_analysis_testa_20180816/sentiment_analysis_testa', w2v, pre_processed=False, save_fname=config.w2v_word2id_txt, suffix='_cut_word_rst.txt') print(len(word2id), max_context_len, max_aspect_len) config.max_context_len, config.max_aspect_len = max_context_len, max_aspect_len print('对评论编码')
word1, word2 = h.split("-", 1) h2 = word1 + "_" + word2 else: h2 = h res[h] = retrieve_vector(sem_model, h2) return res ###################################################################### ############################ Main script ############################# ###################################################################### spanish = False if spanish: new_model = Word2Vec.load("spanish_word2vec.model") train_hypos = "SemEval2018-Task9/training/data/1C.spanish.training.data.txt" train_hypers = "SemEval2018-Task9/training/gold/1C.spanish.training.gold.txt" test_hypos = "SemEval2018-Task9/test/data/1C.spanish.test.data.txt" test_hypers = "SemEval2018-Task9/test/gold/1C.spanish.test.gold.txt" output_file = "SemEval2018-Task9/output_spanish.txt" else: new_model = Word2Vec.load("english_word2vec.model") train_hypos = "SemEval2018-Task9/training/data/1A.english.training.data.txt" train_hypers = "SemEval2018-Task9/training/gold/1A.english.training.gold.txt" test_hypos = "SemEval2018-Task9/test/data/1A.english.test.data.txt" test_hypers = "SemEval2018-Task9/test/gold/1A.english.test.gold.txt" output_file = "SemEval2018-Task9/output.txt" hypos_hypers_train = get_hypos_hypers(train_hypos, train_hypers) hypos_train = hypos_hypers_train.keys()
def get_w2v_model(self):
    # Load the word2vec model trained earlier
    self.embedding = Word2Vec.load(self.w2v_path)
    self.embedding_dim = self.embedding.vector_size
import re from gensim.models.phrases import Phrases, Phraser import pandas as pd from gensim.models import Word2Vec import numpy as np import matplotlib.pyplot as plt import seaborn as sns sns.set_style("darkgrid") from sklearn.decomposition import PCA from sklearn.manifold import TSNE #Load model and start predicting. model = Word2Vec.load("word2vec_TDS_14.model") def tsnescatterplot(model, word, list_names): """ Plot in seaborn the results from the t-SNE dimensionality reduction algorithm of the vectors of a query word, its list of most similar words, and a list of words. """ arrays = np.empty((0, 300), dtype='f') word_labels = [word] color_list = ['red'] # adds the vector of the query word arrays = np.append(arrays, model.wv.__getitem__([word]), axis=0) # gets list of most similar words close_words = model.wv.most_similar([word])
summary = summary + sentences[sent_id[0]] + '\n<br>' sent_count += 1 # intermediate2 = updateScores(sent_id[0]) else: sent_count += 1 print(len(summary.split())) print(sent_count) print(len(sentences) - 1) print('============================') return [summary, len(input_str.split()), len(summary.split())] app = Flask(__name__) #create the Flask app model = Word2Vec.load('resources/w2v_300cleaned_phrases+word2vec.bin') @app.route('/summarize', methods=['POST']) def query_example(): if request.method == 'POST': if request.form.get('method') == 1: res = summarize_base(request.form.get('input'), request.form.get('len')) else: res = summarize_advanced(request.form.get('input'), request.form.get('len')) return jsonify({ 'summary': res[0], 'input_count': res[1], 'summary_count': res[2]
import jieba_fast as jieba
from gensim.models import Word2Vec
import numpy as np
import re, os
import codecs
import editdistance
import warnings

warnings.filterwarnings("ignore")  # suppress the flood of Keras warnings

mode = 0
char_size = 128
maxlen = 256
min_count = 16

word2vec = Word2Vec.load('../word2vec_baike/word2vec_baike')

id2word = {i + 1: j for i, j in enumerate(word2vec.wv.index2word)}
word2id = {j: i for i, j in id2word.items()}
word2vec = word2vec.wv.syn0
word_size = word2vec.shape[1]
word2vec = np.concatenate([np.zeros((1, word_size)), word2vec])

for w in word2id:
    if w not in jieba.dt.FREQ:
        jieba.add_word(w)

def tokenize(s):
    return jieba.lcut(s, HMM=False)
def __init__(self, path_to_word2vec_model):
    self.model = Word2Vec.load(path_to_word2vec_model)
import numpy as np from gensim.models import Word2Vec import _pickle as cPickle np.random.seed(123) #item_emb contains neural embeddings of all items with open('./item_embed', 'rb') as f: item_list=cPickle.load(f) new_list=np.load('tot_x_seq1.npy') model = Word2Vec.load('word2vec_model') y_tot=np.load('tot_y1.npy') # Appending product vectors with their neural embeddings def w2v_data_ext(new_list): w2v_data=[] for i in range(0,len(new_list)): seq_vec=[] for j in range(0,len(new_list[i])): q = np.concatenate([model.wv[new_list[i][j]], item_list[new_list[i][j]]]) if len(q)==82: seq_vec.append(q) if len(seq_vec)==5: w2v_data.append(seq_vec) return np.asarray(w2v_data) # Train and test split def train_test_split(w2v_data,y_tot): train_x=w2v_data[0:69349] test_x=w2v_data[69349:]
from gensim.models import Word2Vec

model = Word2Vec.load("./model/predictNewsTitle.model")
print(model["苹果"])
print(model.similarity('范冰冰', '李晨'))
def load_from(self, file):
    self.model = Word2Vec.load(file)
    self.model_initialized = True