def __init__(self, corpus, relationtype, modelname="scikit_classifier"):
    super(ScikitRE, self).__init__()
    self.modelname = relationtype + "_" + modelname
    self.relationtype = relationtype
    self.pairtype = relationtype
    self.corpus = corpus
    self.pairs = []
    self.features = []
    self.labels = []
    self.pred = []
    self.clusters = word2vec.load_clusters("corpora/Thaliana/documents-processed-clusters.txt")
    self.posfmeasure = make_scorer(f1_score, average='binary', pos_label=True)
    self.generate_data(corpus, modelname, relationtype)
    self.text_clf = Pipeline([
        ('vect', CountVectorizer(analyzer='char_wb', ngram_range=(3, 20), min_df=0.0, max_df=0.7)),
        #('vect', CountVectorizer(ngram_range=(1, 3), binary=False, max_features=None)),
        #('tfidf', TfidfTransformer(use_idf=True, norm="l2")),
        #('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)),
        #('clf', SGDClassifier()),
        #('clf', svm.NuSVC(nu=0.01)),
        #('clf', RandomForestClassifier(class_weight={False: 1, True: 2}, n_jobs=-1)),
        ('clf', MultinomialNB(alpha=0.01, fit_prior=False)),
        #('clf', DummyClassifier(strategy="constant", constant=True)),
    ])
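# A minimal, hypothetical sketch (not part of the original class) of how a
# pipeline configured like self.text_clf above could be fit and queried; the
# toy strings and labels below are invented for illustration.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

text_clf = Pipeline([
    ('vect', CountVectorizer(analyzer='char_wb', ngram_range=(3, 20), min_df=0.0, max_df=0.7)),
    ('clf', MultinomialNB(alpha=0.01, fit_prior=False)),
])
train_texts = ["protein A activates gene B", "gene B is mentioned near protein C"]
train_labels = [True, False]
text_clf.fit(train_texts, train_labels)                  # char n-gram counts -> naive Bayes
print(text_clf.predict(["protein A regulates gene B"]))  # e.g. [ True]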
def main():
    print('== PREDICTIONS ==')
    model = word2vec.load('./text8.bin')
    print("model.vocab")
    print(model.vocab)
    print("model.vectors.shape")
    print(model.vectors.shape)
    print("model.vectors")
    print(model.vectors)
    print("model['dog'].shape")
    print(model['dog'].shape)
    print("model['dog'][:10]")
    print(model['dog'][:10])
    print("model.distance('dog', 'cat', 'fish')")
    print(model.distance('dog', 'cat', 'fish'))

    print('== SIMILARITY ==')
    indexes, metrics = model.similar('dog')
    print('indexes')
    print(indexes)
    print('metrics')
    print(metrics)
    print("model.vocab[indexes]")
    print(model.vocab[indexes])
    print("model.generate_response(indexes, metrics)")
    print(model.generate_response(indexes, metrics))
    print("model.generate_response(indexes, metrics).tolist()")
    print(model.generate_response(indexes, metrics).tolist())

    print('== PHRASES ==')
    indexes, metrics = model.similar('los_angeles')
    print('model.generate_response(indexes, metrics).tolist()')
    print(model.generate_response(indexes, metrics).tolist())

    print('== ANALOGIES ==')
    indexes, metrics = model.analogy(pos=['king', 'woman'], neg=['man'])
    print('indexes')
    print(indexes)
    print('metrics')
    print(metrics)
    print('model.generate_response(indexes, metrics).tolist()')
    print(model.generate_response(indexes, metrics).tolist())

    print('== CLUSTERS ==')
    clusters = word2vec.load_clusters('./text8-clusters.txt')
    print('clusters.vocab')
    print(clusters.vocab)
    print('clusters.get_words_on_cluster(90).shape')
    print(clusters.get_words_on_cluster(90).shape)
    print('clusters.get_words_on_cluster(90)[:10]')
    print(clusters.get_words_on_cluster(90)[:10])
    model.clusters = clusters
    indexes, metrics = model.analogy(pos=['paris', 'germany'], neg=['france'])
    print('model.generate_response(indexes, metrics).tolist()')
    print(model.generate_response(indexes, metrics).tolist())
def testWord2Clusters():
    """Cluster"""
    # Cluster the vectors based on the trained model.
    # This creates a text8-clusters file with the cluster for every word in the vocabulary.
    word2vec.word2clusters('/D/test/text8/text8', '/D/test/text8/text8-clusters', 100, verbose=True)
    clusters = word2vec.load_clusters('/Users/drodriguez/Downloads/text8-clusters')
    print(clusters['dog'])
    print(clusters.get_words_on_cluster(90).shape)
    print(clusters.get_words_on_cluster(90)[:10])
def create_model():
    in_file = open(sys.argv[1])
    out_file = open(sys.argv[2], "w")
    json_data = json.load(in_file)
    final_hash = {}
    model = word2vec.load(sys.argv[3])
    clusters = word2vec.load_clusters(sys.argv[4])
    for loc in json_data:
        count = 0
        keywords = []
        final_hash[loc] = {}
        final_hash[loc]["doc_length"] = json_data[loc]["len"]
        final_hash[loc]["keywords"] = []
        final_hash[loc]["centroids"] = []
        word_vectors = {}     # "word" => [vector]
        word_clusters = {}    # "cluster_no" => [words]
        cluster_centroids = {}
        for word in json_data[loc]["keywords"]:
            if len(word.split()) > 1:
                continue
            count += 1
            try:
                vec = model[word]
                cluster_no = clusters[word]
            except KeyError:
                #print("No entry in word2vec for " + word)
                continue
            word_vectors[word] = vec
            if cluster_no not in word_clusters:
                word_clusters[cluster_no] = []
                cluster_centroids[cluster_no] = len(vec) * [0.0]
            word_clusters[cluster_no].append(word)
            for i in range(len(vec)):
                cluster_centroids[cluster_no][i] += word_vectors[word][i]
        for cluster_no in word_clusters:
            cluster_len = len(word_clusters[cluster_no])
            for i in range(len(cluster_centroids[cluster_no])):
                cluster_centroids[cluster_no][i] = cluster_centroids[cluster_no][i] / cluster_len
        for cluster_no in word_clusters:
            keys = []
            for word in word_clusters[cluster_no]:
                keys.append((word, json_data[loc]["keywords"][word]))
            final_hash[loc]["keywords"].append(keys)
            final_hash[loc]["centroids"].append(cluster_centroids[cluster_no])
        #print("Total keywords in " + loc + " : " + str(count))
        #print("Total word vectors in " + loc + " : " + str(len(word_vectors)))
    json.dump(final_hash, out_file)
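# The per-cluster averaging above can be written more compactly with numpy.
# A minimal sketch under the same assumptions as create_model (model[word]
# returns a 1-D vector, clusters[word] returns an integer cluster id).
import numpy as np

def compute_centroids(words, model, clusters):
    """Group word vectors by cluster id and average each group."""
    groups = {}
    for word in words:
        try:
            vec, cluster_no = model[word], clusters[word]
        except KeyError:
            continue  # no entry in word2vec for this word
        groups.setdefault(cluster_no, []).append(vec)
    return {c: np.mean(vecs, axis=0) for c, vecs in groups.items()}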
def test_model_with_clusters():
    clusters = word2vec.load_clusters(output_clusters)
    model = word2vec.load(output_txt)
    assert clusters.vocab.shape == model.vocab.shape
    model.clusters = clusters
    indexes, metrics = model.analogy(pos=["the", "the"], neg=["the"], n=30)
    assert indexes.shape == (30,)
    assert indexes.shape == metrics.shape
    py_response = model.generate_response(indexes, metrics).tolist()
    assert len(py_response) == 30
    assert len(py_response[0]) == 3
def test_model_with_clusters():
    clusters = word2vec.load_clusters(output_clusters)
    model = word2vec.load(output_bin)
    assert clusters.vocab.shape == model.vocab.shape
    model.clusters = clusters
    indexes, metrics = model.analogy(pos=["the", "the"], neg=["the"], n=30)
    assert indexes.shape == (30,)
    assert indexes.shape == metrics.shape
    py_response = model.generate_response(indexes, metrics).tolist()
    assert len(py_response) == 30
    assert len(py_response[0]) == 3
def main():
    """Main method."""
    k = 35
    # write the ground truth vocabulary to gt_input.txt and get the ground truth dictionary
    ldict = aggregate_input_and_ground_truths()
    logging.info("Done generating ldict and ground truth text file.")
    # if the file containing clusters hasn't already been created, create it
    if not os.path.isfile("./clusters.txt"):
        preprocess()
        # train word2vec and cluster the output from the full vocabulary
        word2vec.word2clusters("./text8-phrases-extra", "./clusters.txt", k,
                               verbose=True, min_count=1)
        logging.info("Done training.")
        logging.info("Done creating clusters.")
    # load clusters
    clusters = word2vec.load_clusters("./clusters.txt")
    # build a cluster dictionary from the full vocabulary
    cdict = {}
    for i in range(0, k):
        for word in clusters.get_words_on_cluster(i):
            cdict[word] = set([i])
    logging.info("Done generating cdict.")
    # trim the cluster dictionary down to only keys included in the ground truths
    trimmed_cdict = {}
    for key in ldict.keys():
        try:
            trimmed_cdict[key] = cdict[key]
        except KeyError:
            pass
    logging.info("Done trimming cdict; beginning scoring.")
    # compute the B-cubed score
    precision = bcubed.precision(trimmed_cdict, ldict)
    recall = bcubed.recall(trimmed_cdict, ldict)
    fscore = bcubed.fscore(precision, recall)
    print("precision: {p}, \t recall: {r}, \t fscore: {f}".format(p=precision, r=recall, f=fscore))
    logging.info("Done scoring.")
def pre_process(model, input_data):
    sent_arr = []
    train_set = []
    train_set_x = []
    train_set_y = []
    fopen = open(input_data)
    lines = fopen.readlines()
    fopen.close()
    unlogin_arr = []
    word_arr = []
    # cluster model
    cluster_model = word2vec.load_clusters(cluster_txt)
    for i in range(len(lines)):
        if lines[i] != '\n':
            arr = lines[i].split(" ")
            f = [0] * 15  # label of classes based on word2vec
            c_vec = [0] * 100
            if arr[0] not in pu_arr:
                word_arr.append(arr[0])
                f[1] = 1 if is_enum(arr[0]) + is_cnum(arr[0]) > 0 else 0
                f[2], f[3] = (prefix(arr[0]), suffix(arr[0]))
                try:
                    f[4] = 1 if lines[i - 1].split(' ')[0] in pu_arr else 0
                except Exception:
                    f[4] = 0
                try:
                    f[6] = 1 if lines[i + 1].split(' ')[0] in pu_arr else 0
                except Exception:
                    f[6] = 0
                f[5] = 1 if f[4] + f[6] == 0 else 0
                f[7 + word_count(arr[0])] = 1
                #f[1], f[2], f[3], f[4] = (is_enum(arr[0]), is_cnum(arr[0]), prefix(arr[0]), suffix(arr[0]))
                pre_fvec, pre_fvec2, fvec, suf_fvec, suf_fvec2 = (None, None, None, None, None)
                try:
                    pre_fvec = model[lines[i - 1].split(" ")[0]].tolist()
                except Exception:
                    pre_fvec = [0] * 400
                try:
                    fvec = model[arr[0]].tolist()
                except Exception:
                    fvec = [0] * 400
                    unlogin_arr.append(arr[0])
def active_learn_main(engine, initial_term, user_id, concept_id=False):
    '''
    engine is a SQLAlchemy engine connected to the concepts database
    initial_term is the seed term selected by the user
    user_id is the id of the user running the session
    concept_id is the id of an existing concept, or False to start from scratch
    '''
    # The user selects a term, which is run through the word2vec model to find similar terms.
    # If it is an existing concept, pull the existing data from the db; else start from scratch.
    # (Query results are materialized as sets so they can be updated below.)
    if concept_id:
        term_list = set(row[0] for row in engine.execute(
            select([ConceptTerms.c.term]).where(ConceptTerms.c.concept_id == concept_id)))
        term_exc = set(row[0] for row in engine.execute(
            select([ConceptTerms_reject.c.term]).where(ConceptTerms_reject.c.concept_id == concept_id)))
        pred_list = set(row[0] for row in engine.execute(
            select([ConceptPredictors.c.predictor]).where(ConceptPredictors.c.concept_id == concept_id)))
        pred_exc = set(row[0] for row in engine.execute(
            select([ConceptPredictorsReject.c.predictor]).where(ConceptPredictorsReject.c.concept_id == concept_id)))
    else:
        term_list = set([initial_term])
        term_exc = set()
        pred_list = set()
        pred_exc = set()
    # load the model and clusters
    #model = word2vec.load('/groups/clinicaltrials/clinicaltrials/data/criteria.bin')
    #clusters = word2vec.load_clusters('/groups/clinicaltrials/clinicaltrials/data/criteria-clusters.txt')
    model = word2vec.load('../data/criteria.bin')
    clusters = word2vec.load_clusters('../data/criteria-clusters.txt')
    # add clusters to the model
    model.clusters = clusters
    # add skip terms to term_exc and pred_exc
    skip_term, skip_pred = skip_terms()
    term_exc.update(skip_term)
    pred_exc.update(skip_pred)
    term_list, pred_list = run_active_learning(term_list, term_exc, pred_list, pred_exc,
                                               engine, concept_id, user_id, model)
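# A hypothetical sketch (run_active_learning itself is not shown in the source)
# of the candidate-generation step it might perform: ask the model for terms
# similar to each accepted term and filter out already-seen or rejected ones.
def suggest_terms(model, term_list, term_exc, n=10):
    suggestions = set()
    for term in term_list:
        try:
            indexes, metrics = model.cosine(term, n=n)
        except Exception:
            continue  # term is not in the model vocabulary
        for row in model.generate_response(indexes, metrics).tolist():
            word = row[0]  # row is (word, metric) or (word, metric, cluster)
            if word not in term_list and word not in term_exc:
                suggestions.add(word)
    return suggestions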
def __init__(self, corpus, ptype, test=False, modelname="crfre_classifier"):
    super(CrfSuiteRE, self).__init__()
    self.data = []
    self.labels = []
    self.scores = []
    self.predicted = []
    self.entities = []
    self.pairtype = ptype
    self.modelname = ptype + "_" + modelname
    self.gold_relations = set()
    self.tair_pairs = load_tair_relations()
    self.vecmodel = word2vec.load("corpora/Thaliana/documents-processed" + '.bin')
    with codecs.open("seedev_relation.txt", 'r', 'utf-8') as relfile:
        for r in relfile:
            self.gold_relations.add(r.strip())
    self.clusters = word2vec.load_clusters("corpora/Thaliana/documents-processed-clusters.txt")
    #with codecs.open("corpora/Thaliana/documents-clusters.txt", "r", "utf-8") as clusterfile:
    #    for l in clusterfile:
    #        values = l.strip().split(" ")
    #        self.clusters[values[0]] = values[1]
    self.generate_data(corpus, self.modelname, ptype, test)
from nltk.corpus import stopwords
import pandas as pd
from bs4 import BeautifulSoup
import re
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import word2vec

LABELED_PATH = r'/home/apple/data/kaggle/senti_analysis/labeledTrainData.tsv'
TEST_PATH = r'/home/apple/data/kaggle/senti_analysis/testData.tsv'
BIN_PATH = '/home/apple/data/kaggle/senti_analysis/senti.bin'
CLUSTER_PATH = '/home/apple/data/kaggle/senti_analysis/senti-clusters.txt'
cluster_num = 100
clusters = word2vec.load_clusters(CLUSTER_PATH)

def clean_text(text, remove_stopwords=False):
    # 1. Remove HTML
    review_text = BeautifulSoup(text, "html.parser").get_text()
    # 2. Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()
    # 4. In Python, searching a set is much faster than searching a list,
    #    so convert the stop words to a set
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    return words
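# A hypothetical continuation (not part of the original snippet) showing how
# the loaded clusters could feed the RandomForestClassifier imported above:
# each review becomes a "bag of centroids" count vector, one slot per cluster.
def bag_of_centroids(words, num_clusters=cluster_num):
    bag = np.zeros(num_clusters, dtype="float32")
    for word in words:
        try:
            bag[clusters[word]] += 1  # clusters[word] is the word's cluster id
        except (KeyError, IndexError):
            pass  # word was not in the clustering vocabulary
    return bag

# Sketch of training; train_reviews/train_labels are placeholders for data read from LABELED_PATH.
# forest = RandomForestClassifier(n_estimators=100)
# forest.fit([bag_of_centroids(clean_text(r)) for r in train_reviews], train_labels)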
#word2vec.word2clusters('../data/text8', '../data/test8-cluster.txt', classes=10, size=100, verbose=True)

## predictions
model = word2vec.load('../data/text8.bin')
model.vocab      ## vocabulary of the training corpus
model.vectors    ## the model's vector representations; see model.vectors.shape
model.clusters   ## the cluster model attached to this model

## 1) model.analogy(pos, neg, n=10)
##    similar words after word arithmetic, e.g. pos = ['king', 'woman']; neg = ['man']
##    indexes, metrics = model.analogy(pos, neg, n=10)
##    model.generate_response(indexes, metrics).tolist()
## 2) model.cosine(word, n=10)
##    similar words for a single word
##    indexes, metrics = model.cosine('dog')
##    model.generate_response(indexes, metrics).tolist()
## 3) model['dog'], equivalent to model.get_vector('dog')
##    get the vector representation of a word

## clusters
cluster = word2vec.load_clusters('../data/test8-cluster.txt')
## 1) cluster.get_cluster(word), equivalent to cluster['word']
##    find the cluster a word belongs to
##    cluster_num = cluster.get_cluster('dog')
## 2) cluster.get_words_on_cluster(num)
##    find the words in a cluster
##    print(cluster_num)
##    print(cluster.get_words_on_cluster(cluster_num))
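# A runnable distillation of the cheat sheet above; it assumes ../data/text8.bin
# and ../data/test8-cluster.txt were produced by the commented-out training call.
import word2vec

model = word2vec.load('../data/text8.bin')
cluster = word2vec.load_clusters('../data/test8-cluster.txt')

indexes, metrics = model.cosine('dog', n=10)            # 10 nearest neighbours of 'dog'
print(model.generate_response(indexes, metrics).tolist())

cluster_num = cluster.get_cluster('dog')                # the cluster 'dog' belongs to
print(cluster.get_words_on_cluster(cluster_num)[:10])   # first 10 words in that cluster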
import sys

if len(sys.argv) > 1 and sys.argv[1] in ['-t', '-train']:
    # Add new articles to file
    clean_articles.clean()
    # Train new model on the phrases file produced by word2phrase
    w2v.word2phrase('combined', './text-phrases', verbose=True)
    w2v.word2vec('./text-phrases', 'text.bin', size=100, verbose=True)
    w2v.word2clusters('combined', 'text-clusters.txt', 100, verbose=True)

# Initialize pre-trained model
model_old = w2v.load('text8.bin')
model = w2v.load('text.bin')
clusters = w2v.load_clusters('text-clusters.txt')
model.clusters = clusters

#ind = clusters['Trump']
#print(clusters.get_words_on_cluster(ind))

print(len(model_old.vocab))
print(len(model.vocab))

# King - man + woman : "Man is to King as Woman is to ..."
# Trump - America + Germany
pos = ['Putin', 'America']
neg = ['Russia']
indexes, metrics = model.analogy(pos, neg)
print()
print("{0} is to {1} as {2} is to ...".format(neg[0], pos[0], pos[1]))
print(model.generate_response(indexes, metrics).tolist())
indexes, metrics = model.cosine('highest')
print(model.generate_response(indexes, metrics).tolist())  # lowest, ranking, higher

# ********************************************
# Analogies
# It's possible to do more complex queries like analogies, such as: king - man + woman = queen.
# Like cosine, this method returns the indexes of the words in the vocab and the metric.
indexes, metrics = model.analogy(pos=['king', 'woman'], neg=['man'], n=10)

# **************************************************************************
# ****************************** now look at the clusters
# ********* In the cluster file, each word has a number: its cluster number
clusters = word2vec.load_clusters('text8-clusters.txt')
print(clusters.get_words_on_cluster(90).shape)  # e.g. (255,): 255 words in this cluster
print(clusters.get_words_on_cluster(90))        # print all words of the cluster
print(clusters[b'the'])                         # e.g. 49: 'the' belongs to cluster 49
print(clusters.get_words_on_cluster(49))
print(clusters.get_words_on_cluster(49)[:10])   # show the first 10 words

# We can add the clusters to the word2vec model and generate a response that includes the clusters
model.clusters = clusters
indexes, metrics = model.analogy(pos=['paris', 'germany'], neg=['france'], n=10)
print(model.generate_response(indexes, metrics).tolist())
def get_model(filename, clusters=False):
    if clusters:
        return word2vec.load_clusters(filename)
    else:
        return word2vec.load(filename)
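# A brief usage sketch for the helper above; the file names are assumptions.
model = get_model('text8.bin')                             # loads a trained model
clusters = get_model('text8-clusters.txt', clusters=True)  # loads the cluster file
model.clusters = clusters                                  # attach clusters to the model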
#2. Predictions:
model = word2vec.load('text8.bin')
#print(model.vocab)
#print(model.vectors.shape)
#print(model.vectors)
#print(model['dog'].shape)
#print(model['dog'][:10])
#indexes, metrics = model.cosine('socks')
#print(indexes, metrics)
#print(model.vocab[indexes])
#print(model.generate_response(indexes, metrics))
#print(model.generate_response(indexes, metrics).tolist())

#3. Phrases:
#indexes, metrics = model.cosine('los_angeles')
#print(model.generate_response(indexes, metrics).tolist())

#4. Analogies:
#indexes, metrics = model.analogy(pos=['king', 'woman'], neg=['man'], n=10)
#print(indexes, metrics)
#print(model.generate_response(indexes, metrics).tolist())

#Clusters:
clusters = word2vec.load_clusters('text8-clusters.txt')
#print(clusters['dog'])
#print(clusters.get_words_on_cluster(90).shape)
#print(clusters.get_words_on_cluster(90)[:10])
model.clusters = clusters
indexes, metrics = model.analogy(pos=['paris', 'germany'], neg=['france'], n=10)
print(model.generate_response(indexes, metrics).tolist())
from db_manager import *
import word2vec

SQL_DB = 'sqlite:///clean_chronicling_america.sqlite'
WORDS_FILE = 'docs'
PHRASES_FILE = 'docs-phrases'
BIN_FILE = 'docs.bin'
CLUSTER_FILE = 'docs-clusters.txt'

def dump_text():
    session = connect(SQL_DB)
    r = session.execute('select ocr_eng from articles')
    with open(WORDS_FILE, 'w') as f:
        for i, doc in enumerate(r):
            f.write(doc[0] + '\n')

if __name__ == '__main__':
    # word2vec.word2phrase(WORDS_FILE, PHRASES_FILE, verbose=True)
    # word2vec.word2vec(PHRASES_FILE, BIN_FILE, size=100, verbose=True)
    # word2vec.word2clusters(WORDS_FILE, CLUSTER_FILE, 100, verbose=True)
    # model = word2vec.load(BIN_FILE)
    # print(model.vocab)
    # print(model.vectors.shape)
    clusters = word2vec.load_clusters(CLUSTER_FILE)
    print(clusters)
def test_clusters():
    clusters = word2vec.load_clusters(output_clusters)
    assert clusters.vocab.shape == clusters.clusters.shape
    assert clusters.get_words_on_cluster(1).shape[0] > 10  # sanity check
    try:
        model_1 = word2vec.Word2Vec.load(model_file)
        # compute the list of words most related to a given word
        y2 = model_1.most_similar(u"吸烟", topn=19)  # the 19 most related words
        print(u"The words most related to 吸烟 (smoking) are:\n")
        for item in y2:
            print("%s: %g" % (item[0], item[1]))
        print("-------------------------------\n")
    except Exception:
        print("Exception")
    print('word2vec_test end.')

# word2vec_test()
model = word2vec.load('cnword2vec.bin')
word2vec.word2clusters('cuttedwords.txt', 'cncluster.txt', 100, verbose=True)
clusters = word2vec.load_clusters('cncluster.txt')
print(clusters)
# clusters.vocab
# print(clusters.get_words_on_cluster(90)[:10])
# model.clusters = clusters
# indexes, metrics = model.analogy(pos=["吸", "戒烟"], neg=["抽"])
# print(model.generate_response(indexes, metrics).tolist())
word2vec.word2clusters('./txt_file/text8', './word2vectors/text8-clusters.txt', 100, verbose=True)

"""
4. Using the model
"""
model = word2vec.load('./word2vectors/text8.bin')
print(model.vocab.size)
print(model.vectors[0])
print(model['dog'][:10])
print(model.distance("dog", "cat", "fish"))
indexes, metrics = model.similar("dog")
print(model.vocab[indexes])
print(model.generate_response(indexes, metrics).tolist())
indexes, metrics = model.analogy(pos=['king', 'woman'], neg=['man'])
print(model.generate_response(indexes, metrics).tolist())

"""
5. Using the cluster model
"""
clusters = word2vec.load_clusters('./word2vectors/text8-clusters.txt')
print(clusters.get_words_on_cluster(90)[:10])

"""
6. Combining the clusters with the model
"""
model.clusters = clusters
indexes, metrics = model.analogy(pos=["paris", "germany"], neg=["france"])
print(model.generate_response(indexes, metrics).tolist())
def preprocess(w2v_model, input_data, ind):
    sent_arr = []
    train_set = []
    train_set_x = []
    train_set_y = []
    fopen = open(input_data + ind + "_df")
    lines = fopen.readlines()
    fopen.close()
    # cluster model
    cluster_model = word2vec.load_clusters(cluster_txt)
    unlogin_arr = []
    word_arr = []
    # load the traditional-to-simplified Chinese character dict
    fj_dic = pickle.load(open('fj_dic.pkl', 'rb'))
    # load w2v
    #w2v_arr = pickle.load(open(input_data + ind + "_w2v.pkl", 'rb'))
    w2v_arr = pickle.load(open(input_data + ind + '_x1_400.pkl', 'rb'))
    # load pos tags
    pos_arr = pickle.load(open(input_data + "pos_" + ind + '.pkl', 'rb'))
    if len(lines) != len(w2v_arr) or len(lines) != len(pos_arr):
        print('wrong num of samples! word: %d w2v_arr: %d pos_arr: %d' %
              (len(lines), len(w2v_arr), len(pos_arr)))
        return None
    for i in range(len(lines)):
        if len(lines[i].split(' ')) == 3:
            arr = lines[i].split(" ")
            f = [0] * 15  # label of classes based on word2vec
            c_vec = [0] * 100
            if arr[0] not in pu_arr:
                word_arr.append(arr[0])
                f[1] = 1 if is_enum(arr[0]) + is_cnum(arr[0]) > 0 else 0
                f[2], f[3] = (prefix(arr[0]), suffix(arr[0]))
                try:
                    f[4] = 1 if lines[i - 1].split(' ')[0] in pu_arr else 0
                except Exception:
                    f[4] = 0
                try:
                    f[6] = 1 if lines[i + 1].split(' ')[0] in pu_arr else 0
                except Exception:
                    f[6] = 0
                f[5] = 1 if f[4] + f[6] == 0 else 0
                f[7 + word_count(arr[0])] = 1
                f[-1] = 1 if int(arr[1]) == 1 else -1
                #f[1], f[2], f[3], f[4] = (is_enum(arr[0]), is_cnum(arr[0]), prefix(arr[0]), suffix(arr[0]))
                pre_fvec, pre_fvec2, fvec, suf_fvec, suf_fvec2 = (None, None, None, None, None)
                pos_pre_vec, pos_suf_vec = (None, None)
                pre_c_vec, c_vec, suf_c_vec = ([0] * 100, [0] * 100, [0] * 100)
                try:
                    pre_fvec = w2v_arr[i - 1]
                    #pre_fvec = w2v_model[lines[i-1].split(' ')[0]].tolist()
                    pos_pre_vec = [0] * 34
                    pos_pre_vec[int(pos_arr[i - 1])] = 1
                    f[-2] = 1 if int(lines[i - 1].split(' ')[1]) == 1 else -1
                    #pre_c_vec[cluster_model[lines[i-1].split(' ')]] = 1
                except Exception:
                    pre_fvec = [0] * 200
                    pos_pre_vec = [0] * 34
                try:
                    suf_fvec = w2v_arr[i + 1]
                    #suf_fvec = w2v_model[lines[i+1].split(' ')[0]].tolist()
                    pos_suf_vec = [0] * 34
                    pos_suf_vec[int(pos_arr[i + 1])] = 1
                    f[-3] = 1 if int(lines[i + 1].split(' ')[1]) == 1 else -1
                    #suf_c_vec[cluster_model[lines[i+1].split(' ')]] = 1
                except Exception:
                    suf_fvec = [0] * 400
                    pos_suf_vec = [0] * 34
import word2vec
import os

path_dataset = os.path.abspath('dataset/text8')
path_clusters = os.path.abspath('dataset/text8.clusters')

word2vec.word2clusters(path_dataset, path_clusters, 100)
clusters = word2vec.load_clusters(path_clusters)
print(clusters)
#                    verbose=True)  # tail of a call truncated in the source

# In[ ]:

word2vec.word2clusters('data/names.txt', 'data/names-clusters.txt', 100, verbose=True)

# In[ ]:

model = word2vec.load('data/names-model.bin')

# In[ ]:

clusters = word2vec.load_clusters('data/names-clusters.txt')

# In[ ]:

model.vocab

# In[ ]:

model.vectors.shape

# In[ ]:

indexes, metrics = model.analogy(pos=['snack'], neg=[], n=10)
model.generate_response(indexes, metrics).tolist()

# In[ ]:
print(indexes, metrics)

# to get the words retrieved
print(model.vocab[indexes])

# There is a helper function to create a combined response as a numpy record array
model.generate_response(indexes, metrics)

# to turn that numpy array into a pure Python response
model.generate_response(indexes, metrics).tolist()

# It's possible to do more complex queries like analogies, such as: king - man + woman = queen.
# Like cosine, this method returns the indexes of the words in the vocab and the metric.
indexes, metrics = model.analogy(pos=['king', 'woman'], neg=['man'])
print(indexes, metrics)

clusters = word2vec.load_clusters('/home/guohf/AI_tutorial/ch8/data/text8-clusters.txt')

# get the cluster number for individual words
print(clusters.vocab)

# we can get all the words grouped in a specific cluster
print(clusters.get_words_on_cluster(90).shape)
print(clusters.get_words_on_cluster(90)[:10])

# add the clusters to the word2vec model and generate a response that includes the clusters
model.clusters = clusters
indexes, metrics = model.analogy(pos=["paris", "germany"], neg=["france"])
model.generate_response(indexes, metrics).tolist()