Example #1
 def __init__(self, corpus, relationtype, modelname="scikit_classifier"):
     super(ScikitRE, self).__init__()
     self.modelname = relationtype + "_" + modelname
     self.relationtype = relationtype
     self.pairtype = relationtype
     self.corpus = corpus
     self.pairs = []
     self.features = []
     self.labels = []
     self.pred = []
     self.clusters = word2vec.load_clusters(
         "corpora/Thaliana/documents-processed-clusters.txt")
     self.posfmeasure = make_scorer(f1_score,
                                    average='binary',
                                    pos_label=True)
     self.generate_data(corpus, modelname, relationtype)
     self.text_clf = Pipeline([
         ('vect',
          CountVectorizer(analyzer='char_wb',
                          ngram_range=(3, 20),
                          min_df=0.0,
                          max_df=0.7)),
         #('vect', CountVectorizer(ngram_range=(1,3), binary=False, max_features=None)),
         #('tfidf', TfidfTransformer(use_idf=True, norm="l2")),
         #('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)),
         #('clf', SGDClassifier())
         #('clf', svm.NuSVC(nu=0.01 ))
         #('clf', RandomForestClassifier(class_weight={False:1, True:2}, n_jobs=-1))
         ('clf', MultinomialNB(alpha=0.01, fit_prior=False))
         #('clf', DummyClassifier(strategy="constant", constant=True))
     ])
Example #2
def main():

    print('== PREDICTIONS ==')

    model = word2vec.load('./text8.bin')
    print("model.vocab")
    print(model.vocab)
    print("model.vectors.shape")
    print(model.vectors.shape)
    print("model.vectors")
    print(model.vectors)
    print("model['dog'].shape")
    print(model['dog'].shape)
    print("model['dog'][:10]")
    print(model['dog'][:10])
    print("model.distance('dog', 'cat', 'fish')")
    print(model.distance('dog', 'cat', 'fish'))

    print('== SIMILARITY ==')

    indexes, metrics = model.similar('dog')
    print('indexes')
    print(indexes)
    print('metrics')
    print(metrics)
    print("model.vocab[indexes]")
    print(model.vocab[indexes])
    print("model.generate_response(indexes, metrics)")
    print(model.generate_response(indexes, metrics))
    print("model.generate_response(indexes, metrics).tolist()")
    print(model.generate_response(indexes, metrics).tolist())

    print('== PHRASES ==')
    indexes, metrics = model.similar('los_angeles')
    print('model.generate_response(indexes, metrics).tolist()')
    print(model.generate_response(indexes, metrics).tolist())

    print('== ANALOGIES ==')

    indexes, metrics = model.analogy(pos=['king', 'woman'], neg=['man'])
    print('indexes')
    print(indexes)
    print('metrics')
    print(metrics)
    print('model.generate_response(indexes, metrics).tolist()')
    print(model.generate_response(indexes, metrics).tolist())

    print('== CLUSTERS ==')
    clusters = word2vec.load_clusters('./text8-clusters.txt')
    print('clusters.vocab')
    print(clusters.vocab)
    print('clusters.get_words_on_cluster(90).shape')
    print(clusters.get_words_on_cluster(90).shape)
    print('clusters.get_words_on_cluster(90)[:10]')
    print(clusters.get_words_on_cluster(90)[:10])
    model.clusters = clusters
    indexes, metrics = model.analogy(pos=['paris', 'germany'], neg=['france'])
    print('model.generate_response(indexes, metrics).tolist()')
    print(model.generate_response(indexes, metrics).tolist())
Example #3
def testWord2Cluster():
    """Cluster"""
    # Do the clustering of the vectors based on the trained model.
    # This creates a text8-clusters file with the cluster for every word in the vocabulary.
    word2vec.word2clusters('/D/test/text8/text8', '/D/test/text8/text8-clusters', 100, verbose=True)
    clusters = word2vec.load_clusters('/Users/drodriguez/Downloads/text8-clusters')
    print(clusters['dog'])
    print(clusters.get_words_on_cluster(90).shape)
    print(clusters.get_words_on_cluster(90)[:10])
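The cluster file written by word2clusters above is plain text with one word and its cluster id per line; Example #11 further down parses it the same way in a commented-out loop. A minimal sketch of reading such a file without the library, assuming that space-separated format:

def read_clusters_txt(path):
    # Map each word to its integer cluster id, assuming "word cluster_id" lines.
    word_to_cluster = {}
    with open(path) as f:
        for line in f:
            parts = line.strip().split(" ")
            if len(parts) == 2:
                word_to_cluster[parts[0]] = int(parts[1])
    return word_to_cluster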
Example #4
def create_model():
	in_file = open(sys.argv[1])
	out_file = open(sys.argv[2],"w")
	json_data = json.load(in_file)
	final_hash = {}
	model = word2vec.load(sys.argv[3])
	clusters = word2vec.load_clusters(sys.argv[4])

	for loc in json_data:
		count = 0
		keywords = []
		final_hash[loc] = {}
		final_hash[loc]["doc_length"] = json_data[loc]["len"]	
		final_hash[loc]["keywords"] = []
		final_hash[loc]["centroids"] = []
		word_vectors = {}	#"word" => [vector]
		word_clusters = {}	#"cluster_no" => [words]
		cluster_centroids = {}
		for word in json_data[loc]["keywords"]:
			if len(word.split()) > 1:
				continue
			count += 1
			try:
				vec = model[word]
				cluster_no = clusters[word]
			except KeyError:
				#print("No entry in word2vec for " + word)
				continue
			word_vectors[word] = vec
			
			if cluster_no not in word_clusters:
				word_clusters[cluster_no] = []
				cluster_centroids[cluster_no] = len(vec)*[0.0]
			word_clusters[cluster_no].append(word)
			for i in range(len(vec)):
				cluster_centroids[cluster_no][i] += word_vectors[word][i]
		for cluster_no in word_clusters:
			cluster_len = len(word_clusters[cluster_no])
			for i in range(len(cluster_centroids[cluster_no])):
				cluster_centroids[cluster_no][i] = cluster_centroids[cluster_no][i] / cluster_len
	
		for cluster_no in word_clusters:
			keys = []
			for word in word_clusters[cluster_no]:
				keys.append((word,json_data[loc]["keywords"][word]))
			final_hash[loc]["keywords"].append(keys)
			final_hash[loc]["centroids"].append(cluster_centroids[cluster_no])		
		#print(" Total keywords in " + loc + " : " + str(count))
		#print(" Total word vectors in " + loc + " : " + str(len(word_vectors)))	
		

	
	json.dump(final_hash,out_file)
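create_model() above builds each cluster centroid by summing the member word vectors element-wise and dividing by the cluster size. The same averaging as a small numpy sketch, with made-up two-dimensional vectors standing in for model[word] lookups:

import numpy as np

# Toy stand-ins for model[word], grouped by cluster id (values are invented).
vectors_by_cluster = {7: [np.array([1.0, 2.0]), np.array([3.0, 4.0])]}
centroids = {c: np.mean(np.stack(vecs), axis=0) for c, vecs in vectors_by_cluster.items()}
print(centroids[7])  # [2. 3.]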
Example #5
def test_model_with_clusters():
    clusters = word2vec.load_clusters(output_clusters)
    model = word2vec.load(output_txt)
    assert clusters.vocab.shape == model.vocab.shape

    model.clusters = clusters
    indexes, metrics = model.analogy(pos=["the", "the"], neg=["the"], n=30)
    assert indexes.shape == (30, )
    assert indexes.shape == metrics.shape

    py_response = model.generate_response(indexes, metrics).tolist()
    assert len(py_response) == 30
    assert len(py_response[0]) == 3
Example #6
def test_model_with_clusters():
    clusters = word2vec.load_clusters(output_clusters)
    model = word2vec.load(output_bin)
    assert clusters.vocab.shape == model.vocab.shape

    model.clusters = clusters
    indexes, metrics = model.analogy(pos=["the", "the"], neg=["the"], n=30)
    assert indexes.shape == (30,)
    assert indexes.shape == metrics.shape

    py_response = model.generate_response(indexes, metrics).tolist()
    assert len(py_response) == 30
    assert len(py_response[0]) == 3
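Both tests above only check shapes and lengths: once model.clusters is attached, each row of generate_response(...).tolist() has three fields. A hedged sketch of consuming that response, continuing from the test's variables and assuming the field order is word, similarity metric, cluster id:

py_response = model.generate_response(indexes, metrics).tolist()
for word, metric, cluster_id in py_response:
    # Assumed order (word, metric, cluster); the tests above only assert the row length.
    print(word, metric, cluster_id)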
Example #7
def main():
    """Main method."""
    k = 35

    # write ground truth vocabulary to gt_input.txt and get ground truth
    # dictionary
    ldict = aggregate_input_and_ground_truths()
    logging.info("Done generating ldict and ground truth text file.")

    # if file containing clusters hasn't already been created, create it
    if not os.path.isfile("./clusters.txt"):

        preprocess()

        # train word2vec and cluster output from the full vocab
        word2vec.word2clusters("./text8-phrases-extra", "./clusters.txt", k, verbose=True, min_count=1)

        logging.info("Done training.")
        logging.info("Done creating clusters.")

    # load clusters
    clusters = word2vec.load_clusters("./clusters.txt")

    # build cluster dictionary from full vocabulary
    cdict = {}
    for i in range(0, k):
        for word in clusters.get_words_on_cluster(i):
            cdict[word] = set([i])

    logging.info("Done generating cdict.")

    # trim cluster dictionary down to only keys included in ground truths
    trimmed_cdict = {}
    for key in ldict.keys():
        try:
            trimmed_cdict[key] = cdict[key]
        except KeyError:
            pass

    logging.info("done trimming cdict; begining scoring\n")

    # compute bcubed score
    precision = bcubed.precision(trimmed_cdict, ldict)
    recall = bcubed.recall(trimmed_cdict, ldict)
    fscore = bcubed.fscore(precision, recall)

    print "precision: {p}, \t recall: {r}, \t fscore: {f}".format(p=precision, r=recall, f=fscore)

    logging.info("done scoring\n")
Example #8
def pre_process(model, input_data):
    sent_arr = []
    train_set = []
    train_set_x = []
    train_set_y = []
    fopen = open(input_data)
    lines = fopen.readlines()
    fopen.close()
    unlogin_arr = []
    word_arr = []
    #cluster model
    cluster_model = word2vec.load_clusters(cluster_txt)

    for i in xrange(len(lines)):
        if lines[i] != '\n':
            arr = lines[i].split(" ")
            f = [0] * 15
            #label of classes based on word2vec
            c_vec = [0] * 100
            if arr[0] not in pu_arr:
                word_arr.append(arr[0])
                f[1] = 1 if is_enum(arr[0]) + is_cnum(arr[0]) > 0 else 0
                f[2], f[3] = (prefix(arr[0]), suffix(arr[0]))
                try:
                    f[4] = 1 if lines[i - 1].split(' ')[0] in pu_arr else 0
                except Exception, e:
                    f[4] = 0
                try:
                    f[6] = 1 if lines[i + 1].split(' ')[0] in pu_arr else 0
                except Exception, e:
                    f[6] = 0
                f[5] = 1 if f[4] + f[6] == 0 else 0
                f[7 + word_count(arr[0])] = 1
                #f[1],f[2],f[3],f[4] = (is_enum(arr[0]), is_cnum(arr[0]),prefix(arr[0]), suffix(arr[0]))
                pre_fvec, pre_fvec2, fvec, suf_fvec, suf_fvec2 = (None, None,
                                                                  None, None,
                                                                  None)
                try:
                    pre_fvec = model[lines[i - 1].split(" ")[0]].tolist()
                except Exception, e:
                    pre_fvec = [0] * 400
                try:
                    fvec = model[arr[0]].tolist()
                except Exception, e:
                    fvec = [0] * 400
                    unlogin_arr.append(arr[0])
Example #10
def active_learn_main(engine, initial_term, user_id, concept_id=False):
    '''
        engine is 
        initial_term is
        user_id is 
        concept_id is 
    '''
    
    #user will select a term and then the term will be run through the word2vec model to come up with similar terms
    #if it is an existing concept pull the existing data from db else start from scratch
    if concept_id:
        term_list = engine.execute(select([ConceptTerms.c.term]).where(ConceptTerms.c.concept_id
                                                            == concept_id))
        term_exc = engine.execute(select([ConceptTerms_reject.c.term]).where(ConceptTerms_reject.c.concept_id
                                                            == concept_id))
        pred_list = engine.execute(select([ConceptPredictors.c.predictor]).where(ConceptPredictors.c.concept_id
                                                            == concept_id))
        pred_exc = engine.execute(select([ConceptPredictorsReject.c.predictor]).where(ConceptPredictorsReject.c.concept_id
                                                            == concept_id))
    else:
        term_list = set([initial_term])
        term_exc = set()
        pred_list = set()
        pred_exc = set()


    #load in model
    #model = word2vec.load('/groups/clinicaltrials/clinicaltrials/data/criteria.bin')
    #clusters = word2vec.load_clusters('/groups/clinicaltrials/clinicaltrials/data/criteria-clusters.txt')
    model = word2vec.load('../data/criteria.bin')
    clusters = word2vec.load_clusters('../data/criteria-clusters.txt')

    # add clusters to model
    model.clusters = clusters
    
    #add skip terms to term_exc and pred_exc
    skip_term, skip_pred = skip_terms()
    term_exc.update(skip_term)
    pred_exc.update(skip_pred)

    term_list, pred_list = run_active_learning(term_list, term_exc, pred_list, pred_exc, engine, concept_id, user_id, model)
Example #11
 def __init__(self, corpus, ptype, test=False, modelname="crfre_classifier"):
     super(CrfSuiteRE, self).__init__()
     self.data = []
     self.labels = []
     self.scores = []
     self.predicted = []
     self.entities = []
     self.pairtype = ptype
     self.modelname = ptype + "_" + modelname
     self.gold_relations = set()
     self.tair_pairs = load_tair_relations()
     self.vecmodel = word2vec.load("corpora/Thaliana/documents-processed" + '.bin')
     with codecs.open("seedev_relation.txt", 'r', 'utf-8') as relfile:
         for r in relfile:
             self.gold_relations.add(r.strip())
     self.clusters = word2vec.load_clusters("corpora/Thaliana/documents-processed-clusters.txt")
     #with codecs.open("corpora/Thaliana/documents-clusters.txt", "r", "utf-8") as clusterfile:
     #    for l in clusterfile:
     #        values = l.strip().split(" ")
     #        self.clusters[values[0]] = values[1]
     self.generate_data(corpus, self.modelname, ptype, test)
Example #13
from nltk.corpus import stopwords
import pandas as pd
from bs4 import BeautifulSoup
import re
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import word2vec

LABELED_PATH = r'/home/apple/data/kaggle/senti_analysis/labeledTrainData.tsv'
TEST_PATH = r'/home/apple/data/kaggle/senti_analysis/testData.tsv'
BIN_PATH = '/home/apple/data/kaggle/senti_analysis/senti.bin'
CLUSTER_PATH = '/home/apple/data/kaggle/senti_analysis/senti-clusters.txt'

cluster_num = 100
clusters = word2vec.load_clusters(CLUSTER_PATH)

def clean_text(text, remove_stopwords=False):
    # 1. Remove HTML
    review_text = BeautifulSoup(text).get_text()
    #
    # 2. Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()
    #
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
Example #14
#word2vec.word2clusters('../data/text8', '../data/test8-cluster.txt', classes=10, size=100, verbose=True)

## predictions
model = word2vec.load('../data/text8.bin')
model.vocab  ## vocabulary of the training corpus
model.vectors  ## the model's vector representations; see model.vectors.shape
model.clusters  ## the cluster model attached to the word2vec model

## 1) model.analogy(pos, neg, n=10)
##    words similar to the result of adding/subtracting word vectors, e.g. pos = ['king', 'woman']; neg = ['man']
##    indexes, metrics = model.analogy(pos, neg, n=10)
##    model.generate_response(indexes, metrics).tolist()

## 2) model.cosine(word, n=10)
##    words similar to a single word
##    indexes, metrics = model.cosine('dog')
##    model.generate_response(indexes, metrics).tolist()

## 3) model['dog'], equivalent to model.get_vector('dog')
##    gets the vector representation of a single word

## clusters
cluster = word2vec.load_clusters('../data/test8-cluster.txt')
## 1) cluster.get_cluster(word), equivalent to cluster['word']
##    finds the cluster a given word belongs to
##    cluster_num = cluster.get_cluster('dog')
## 2) cluster.get_words_on_cluster(num)
##    finds the words in a given cluster
##    print cluster_num
#     print cluster.get_words_on_cluster(cluster_num)
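Turning the commented walkthrough above into runnable lines, continuing from the cluster object just loaded; 'dog' is only an illustrative word:

dog_cluster = cluster['dog']                        # cluster id for one word
members = cluster.get_words_on_cluster(dog_cluster)
print(dog_cluster)
print(members[:10])                                 # first ten words in the same cluster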
Example #15
import sys

if len(sys.argv) > 1 and sys.argv[1] in ['-t', '-train']:

    # Add new articles to file
    clean_articles.clean()

    # Train new model
    w2v.word2phrase('combined', './text-phrases', verbose=True)
    w2v.word2vec('text8-phrases', 'text.bin', size=100, verbose=True)
    w2v.word2clusters('combined', 'text-clusters.txt', 100, verbose=True)

# Initialize pre-trained model
model_old = w2v.load('text8.bin')
model = w2v.load('text.bin')
clusters = w2v.load_clusters('text-clusters.txt')
model.clusters = clusters

#ind = clusters['Trump']
#print(clusters.get_words_on_cluster(ind))
print(len(model_old.vocab))
print(len(model.vocab))

# King - man + woman : "Man is to King as Woman is to
# Trump - America + Germany
pos = ['Putin', 'America']
neg = ['Russia']
leader = model.analogy(pos, neg)

print()
print("{0} is to {1} as {2} is to ...".format(neg[0], pos[0], pos[1]))
Example #16
indexes, metrics = model.cosine('highest')
print(model.generate_response(indexes,
                              metrics).tolist())  # lowest, ranking, higher

# ********************************************
# Analogies
# It's possible to do more complex queries, like analogies such as: king - man + woman = queen.
# Like cosine, this method returns the indexes of the words in the vocab and the metric.

indexes, metrics = model.analogy(pos=['king', 'woman'], neg=['man'], n=10)

# **************************************************************************

# ****************************** now look at the cluster
# ********* In the cluster file each word has a number: the id of the cluster it belongs to
clusters = word2vec.load_clusters('text8-clusters.txt')
print(clusters.get_words_on_cluster(
    90).shape)  # e.g. (255,): 255 words in this cluster
print(clusters.get_words_on_cluster(90))  # It prints all words of the cluster

print(clusters[b'the'])  # e.g. 49: b'the' belongs to cluster 49
print(clusters.get_words_on_cluster(49))
print(clusters.get_words_on_cluster(49)[:10])  # showing first 10 words

# We can add the clusters to the word2vec model and generate a response that includes the clusters

model.clusters = clusters
indexes, metrics = model.analogy(pos=['paris', 'germany'],
                                 neg=['france'],
                                 n=10)
print(model.generate_response(indexes, metrics).tolist())
Example #17
def get_model(filename, clusters=False):
    if clusters:
        return word2vec.load_clusters(filename)
    else:
        return word2vec.load(filename)
Example #18
#2. Predictions:
model = word2vec.load('text8.bin')
#print model.vocab
#print model.vectors.shape
#print model.vectors
#print model['dog'].shape
#print model['dog'][:10]
#indexes, metrics = model.cosine('socks')
#print indexes, metrics
#print model.vocab[indexes]
#print model.generate_response(indexes, metrics)
#print model.generate_response(indexes, metrics).tolist()

#3. Phrases:
#indexes, metrics = model.cosine('los_angeles')
#print model.generate_response(indexes, metrics).tolist()

#4. Analogies:
#indexes, metrics = model.analogy(pos=['king', 'woman'], neg=['man'], n=10)
#print indexes, metrics
#print model.generate_response(indexes, metrics).tolist()

#Clusters:
clusters = word2vec.load_clusters('text8-clusters.txt')
#print clusters['dog']
#print clusters.get_words_on_cluster(90).shape
#print clusters.get_words_on_cluster(90)[:10]
model.clusters = clusters
indexes, metrics = model.analogy(pos=['paris', 'germany'], neg=['france'], n=10)
print(model.generate_response(indexes, metrics).tolist())
Example #19
from db_manager import *
import word2vec

SQL_DB = 'sqlite:///clean_chronicling_america.sqlite'
WORDS_FILE = 'docs'
PHRASES_FILE = 'docs-phrases'
BIN_FILE = 'docs.bin'
CLUSTER_FILE = 'docs-clusters.txt'


def dump_text():
    session = connect(SQL_DB)
    r = session.execute('select ocr_eng from articles')

    with open(WORDS_FILE, 'w') as f:
        for i, doc in enumerate(r):
            f.write(doc[0] + '\n')

if __name__=='__main__':
    # word2vec.word2phrase(WORDS_FILE, PHRASES_FILE, verbose=True)
    # word2vec.word2vec(PHRASES_FILE, BIN_FILE, size=100, verbose=True)
    # word2vec.word2clusters(WORDS_FILE, CLUSTER_FILE, 100, verbose=True)
    # model = word2vec.load(BIN_FILE)
    # print(model.vocab)
    # print(model.vectors.shape)
    clusters = word2vec.load_clusters(CLUSTER_FILE)
    print(clusters)
Example #20
def test_clusters():
    clusters = word2vec.load_clusters(output_clusters)
    assert clusters.vocab.shape == clusters.clusters.shape
    assert clusters.get_words_on_cluster(1).shape[0] > 10  # sanity check
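test_clusters() above checks that clusters.vocab and clusters.clusters have the same shape, i.e. they are parallel arrays. A small sketch that zips them into a plain word-to-cluster dict, under that assumption (output_clusters is the same fixture path as in the test):

clusters = word2vec.load_clusters(output_clusters)
# Assumes vocab[i] and clusters.clusters[i] describe the same word (parallel arrays).
word_to_cluster = dict(zip(clusters.vocab.tolist(), clusters.clusters.tolist()))
print(len(word_to_cluster))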
Example #21
        model_1 = word2vec.Word2Vec.load(model_file)

        # compute the list of words most related to a given word
        y2 = model_1.most_similar(u"吸烟", topn=19)  # the most related words
        print(u"和吸烟最相关的词有:\n")
        for item in y2:
            print ("%s: %g" % (item[0], item[1]))
        print("-------------------------------\n")
    except Exception:
        print ("Exception")
    print ('word2vec_test end.')

# word2vec_test()


model = word2vec.load('cnword2vec.bin')


word2vec.word2clusters('cuttedwords.txt','cncluster.txt', 100, verbose=True)
clusters = word2vec.load_clusters('cncluster.txt')
print(clusters)
# clusters.vocab
# print(clusters.get_words_on_cluster(90)[:10])

# model.clusters = clusters
# indexes, metrics = model.analogy(pos=["吸","戒烟"], neg=["抽"])
# print(model.generate_response(indexes, metrics).tolist())



Example #22
def preprocess(w2v_model,input_data, ind):
    sent_arr = []
    train_set = []
    train_set_x = []
    train_set_y = []
    fopen = open(input_data+ind+"_df")
    lines = fopen.readlines()
    fopen.close()
    cluster_model = word2vec.load_clusters(cluster_txt)
    unlogin_arr = []
    word_arr = []
    #load fan2jan dict
    fj_dic = pickle.load(open('fj_dic.pkl'))
    #load w2v
    #w2v_arr = pickle.load(open(input_data+ind+"_w2v.pkl"))
    w2v_arr = pickle.load(open(input_data+ind+'_x1_400.pkl'))
    #load pos tag
    pos_arr = pickle.load(open(input_data+"pos_"+ind+'.pkl'))
    if len(lines)!=len(w2v_arr) or len(lines)!=len(pos_arr):
        print 'wrong num of sample! word: %d w2v_arr: %d pos_arr: %d '%(len(lines),len(w2v_arr),len(pos_arr))
        return None
    #cluster model
    #cluster_model = word2vec.load_clusters(cluster_txt)

    for i in xrange(len(lines)):
        if len(lines[i].split(' '))==3:
            arr = lines[i].split(" ")
            f = [0]*15
            #label of classes based on word2vec
            c_vec = [0]*100
            if arr[0] not in pu_arr : 
                word_arr.append(arr[0])
                f[1] = 1 if is_enum(arr[0])+is_cnum(arr[0])>0 else 0
                f[2],f[3] =(prefix(arr[0]), suffix(arr[0]))
                try:
                    f[4] = 1 if lines[i-1].split(' ')[0] in pu_arr else 0
                except Exception, e:
                    f[4] = 0
                try:
                    f[6] = 1 if lines[i+1].split(' ')[0] in pu_arr else 0
                except Exception, e:
                    f[6] = 0
                f[5] = 1 if f[4]+f[6]==0 else 0
                f[7+word_count(arr[0])] = 1
                
                f[-1] = 1 if int(arr[1])==1 else -1
                #f[1],f[2],f[3],f[4] = (is_enum(arr[0]), is_cnum(arr[0]),prefix(arr[0]), suffix(arr[0]))
                pre_fvec,pre_fvec2,fvec,suf_fvec ,suf_fvec2= (None,None,None,None,None)
                pos_pre_vec ,pos_suf_vec= (None,None)
                pre_c_vec,c_vec,suf_c_vec = ([0]*100,[0]*100,[0]*100)
                try:
                    pre_fvec = w2v_arr[i-1]
                    #pre_fvec = w2v_model[lines[i-1].split(' ')[0]].tolist()
                    pos_pre_vec = [0]*34
                    pos_pre_vec[int(pos_arr[i-1])] = 1
                    f[-2] = 1 if int(lines[i-1].split(' ')[1])==1 else -1
                    #pre_c_vec[cluster_model[lines[i-1].split(' ')]] = 1
                except Exception, e:
                    pre_fvec = [0]*200
                    pos_pre_vec = [0]*34
                
                try:
                    suf_fvec = w2v_arr[i+1]
                    #suf_fvec = w2v_model[lines[i+1].split(' ')[0]].tolist()
                    pos_suf_vec = [0]*34
                    pos_suf_vec[int(pos_arr[i+1])] = 1
                    f[-3] = 1 if int(lines[i+1].split(' ')[1]) == 1 else -1
                    #suf_c_vec[cluster_model[lines[i+1].split(' ')]] = 1
                except Exception, e:
                    suf_fvec = [0]*400
                    pos_suf_vec = [0]*34
Example #23
word2vec.word2clusters('./txt_file/text8',
                       './word2vectors/text8-clusters.txt',
                       100,
                       verbose=True)
"""
4. Using the model
"""

model = word2vec.load('./word2vectors/text8.bin')
print(model.vocab.size)
print(model.vectors[0])
print(model['dog'][:10])
print(model.distance("dog", "cat", "fish"))

indexes, metrics = model.similar("dog")
print(model.vocab[indexes])
print(model.generate_response(indexes, metrics).tolist())
indexes, metrics = model.analogy(pos=['king', 'woman'], neg=['man'])
print(model.generate_response(indexes, metrics).tolist())
"""
5. Using the cluster model
"""
clusters = word2vec.load_clusters('./word2vectors/text8-clusters.txt')
print(clusters.get_words_on_cluster(90)[:10])
"""
6. Using clusters together with the model
"""

model.clusters = clusters
indexes, metrics = model.analogy(pos=["paris", "germany"], neg=["france"])
print(model.generate_response(indexes, metrics).tolist())
Example #26
import word2vec
import os

path_dataset = os.path.abspath('dataset/text8')
path_clusters = os.path.abspath('dataset/text8.clusters')

word2vec.word2clusters(path_dataset, path_clusters, 100)
clusters = word2vec.load_clusters(path_clusters)

print(clusters)
Example #27
                  verbose=True)

# In[ ]:

word2vec.word2clusters('data/names.txt',
                       'data/names-clusters.txt',
                       100,
                       verbose=True)

# In[ ]:

model = word2vec.load('data/names-model.bin')

# In[ ]:

clusters = word2vec.load_clusters('data/names-clusters.txt')

# In[ ]:

model.vocab

# In[ ]:

model.vectors.shape

# In[ ]:

indexes, metrics = model.analogy(pos=['snack'], neg=[], n=10)
model.generate_response(indexes, metrics).tolist()

# In[ ]:
print(indexes, metrics)

# to get those retrieved words
print(model.vocab[indexes])

# There is a helper function to create a combined response as a numpy record array
model.generate_response(indexes, metrics)

# to make that numpy array a pure python response
model.generate_response(indexes, metrics).tolist()

# It's possible to do more complex queries, like analogies such as: king - man + woman = queen. Like cosine,
# this method returns the indexes of the words in the vocab and the metric
indexes, metrics = model.analogy(pos=['king', 'woman'], neg=['man'])
print(indexes, metrics)

clusters = word2vec.load_clusters(
    '/home/guohf/AI_tutorial/ch8/data/text8-clusters.txt')

# get the cluster number for individual words
print(clusters.vocab)

# We can get all the words grouped in a specific cluster
print(clusters.get_words_on_cluster(90).shape)
print(clusters.get_words_on_cluster(90)[:10])

# add the clusters to the word2vec model and generate a response that includes the clusters
model.clusters = clusters
indexes, metrics = model.analogy(pos=["paris", "germany"], neg=["france"])
model.generate_response(indexes, metrics).tolist()