def __init__(self, lp, PROGRAM_Halt, wv_fileaddress, max_rank):
    self.NOT_FOUND = 1
    self.lp = lp
    self.PROGRAM_Halt = PROGRAM_Halt
    import numpy
    try:
        from wvlib import wvlib
    except ImportError:
        import wvlib
    try:
        self.lp("Loading word-embeddings matrix from file: " + wv_fileaddress + "\nmax_rank: " + str(max_rank))
        self.__w2v_model = wvlib.load(wv_fileaddress, max_rank=max_rank)
    except Exception as E:
        self.PROGRAM_Halt("Error loading word2vec model.\nError: " + E.message)
    self.lp("Normalizing word-embeddings matrix.")
    self.__w2v_model.normalize()
    # Prepend the mask and OOV tokens to the vocabulary.
    self.__words = [u'_<_MASK__>_', u'_<_OOV_>_'] + [w.decode('utf-8') for w in self.__w2v_model.words()]
    # Use this variant if decoding errors occur:
    # self.__words = [u'_<_MASK__>_', u'_<_OOV_>_'] + [w.decode('utf-8', 'ignore') for w in self.__w2v_model.words()]
    self.__word_index_dict = {w: i for i, w in enumerate(self.__words)}
    latent_dim = self.__w2v_model._vectors.vectors[0].shape[0]
    # Row 0: all-zero mask vector, row 1: random OOV vector, then the real embeddings.
    self.__weights = numpy.concatenate([numpy.zeros((1, latent_dim)),
                                        numpy.random.randn(1, latent_dim),
                                        numpy.asarray(self.__w2v_model.vectors())])
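# A minimal usage sketch (not part of the original class). It assumes a word-to-index
# dict laid out like self.__word_index_dict above, i.e. row 0 of the weight matrix is
# the all-zero mask vector and row 1 is the random OOV vector. The function name and
# the max_len parameter are illustrative only.
import numpy

def sentence_to_indices(tokens, word_index_dict, max_len, oov_index=1, mask_index=0):
    # Map tokens to embedding-matrix rows, fall back to the OOV row for unknown words,
    # and pad with the mask row up to max_len.
    indices = [word_index_dict.get(w, oov_index) for w in tokens][:max_len]
    indices += [mask_index] * (max_len - len(indices))
    return numpy.asarray(indices)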
evafilePath = [os.path.join(os.path.dirname(__file__), 'wvlib/word-similarities/RW-STANFORD', 'EN-RW-STANFORD.txt'),
               os.path.join(os.path.dirname(__file__), 'wvlib/word-similarities/VERB-143', 'EN-VERB-143.txt'),
               os.path.join(os.path.dirname(__file__), 'wvlib/word-similarities/WS-353-REL', 'EN-WS-353-REL.txt'),
               os.path.join(os.path.dirname(__file__), 'wvlib/word-similarities/WS-353-SIM', 'EN-WS-353-SIM.txt'),
               os.path.join(os.path.dirname(__file__), 'wvlib/word-similarities/WS-353-ALL', 'EN-WS-353-ALL.txt'),
               os.path.join(os.path.dirname(__file__), 'wvlib/word-similarities/YP-130', 'EN-YP-130.txt'),
               os.path.join(os.path.dirname(__file__), 'wvlib/word-similarities/MayoSRS', 'MayoSRS.txt'),
               os.path.join(os.path.dirname(__file__), 'wvlib/word-similarities/UMNSRS', 'UMNSRS-sim.txt'),
               os.path.join(os.path.dirname(__file__), 'wvlib/word-similarities/UMNSRS', 'UMNSRS-rel.txt')]

from tools import utilities as util
config = CommandLine()
#from word2Vec import tools as util
if os.path.isfile(config.inputFile):
    try:
        wv = wvlib.load(config.inputFile).normalize()
        #references = [(r, eva.read_referenceSingleWords(r)) for r in evafilePath]
        references = [(r, eva.read_reference(r)) for r in evafilePath]
        print '%20s\trho\tmissed\ttotal\tratio' % 'dataset'
        for name, ref in references:
            #rho, count = eva.evaluateTest(newWordVecs, ref, wordList)
            rho, count = eva.evaluate(wv, ref)
            total, miss = len(ref), len(ref) - count
            print '%20s\t%.4f\t%d\t%d\t(%.2f%%)' % \
                (eva.baseroot(name), rho, miss, total, 100. * miss / total)
    except FormatError:
        print "skip", config.inputFile
else:
    folderList = util.get_filepaths(config.inputFile)
    for i, item in enumerate(folderList):
        filename, file_extension = os.path.splitext(item)
if __name__ == '__main__':
    #filePath = os.path.join(os.path.dirname(__file__), 'w2vData', 'PubMed15_Dependancy1.txt') #PubMed-w2v.bin #PubMed-and-PMC-w2v.bin
    evafilePath = [os.path.join(os.path.dirname(__file__), 'wvlib/word-similarities/MayoSRS', 'MayoSRS.txt'),
                   os.path.join(os.path.dirname(__file__), 'wvlib/word-similarities/UMNSRS', 'UMNSRS-sim.txt'),
                   os.path.join(os.path.dirname(__file__), 'wvlib/word-similarities/UMNSRS', 'UMNSRS-rel.txt')]
    from tools import utilities as util
    config = CommandLine()
    #from word2Vec import tools as util
    if os.path.isfile(config.inputFile):
        try:
            wv = wvlib.load(config.inputFile).normalize()
            #references = [(r, eva.read_referenceSingleWords(r)) for r in evafilePath]
            references = [(r, eva.read_reference(r)) for r in evafilePath]
            print '%20s\trho\tmissed\ttotal\tratio' % 'dataset'
            for name, ref in references:
                #rho, count = eva.evaluateTest(newWordVecs, ref, wordList)
                rho, count = eva.evaluate(wv, ref)
                total, miss = len(ref), len(ref) - count
                print '%20s\t%.4f\t%d\t%d\t(%.2f%%)' % \
                    (eva.baseroot(name), rho, miss, total, 100. * miss / total)
        except FormatError:
            print "skip", config.inputFile
    else:
        folderList = util.get_filepaths(config.inputFile)
        for i, item in enumerate(folderList):
            filename, file_extension = os.path.splitext(item)
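# Standalone illustration (not from the original script) of what the evaluation above
# computes via eva.evaluate(): the Spearman correlation between gold similarity scores
# and model cosine similarities, plus the number of pairs missing from the vocabulary.
# The reference-file format ("word1 word2 score" per line) and the function name are
# assumptions made for this sketch only.
import numpy
from scipy.stats import spearmanr

def evaluate_similarity(word_vecs, reference_path):
    gold, predicted, missed = [], [], 0
    for line in open(reference_path):
        parts = line.strip().split()
        if len(parts) < 3:
            continue
        w1, w2, score = parts[0], parts[1], float(parts[2])
        if w1 in word_vecs and w2 in word_vecs:
            v1, v2 = word_vecs[w1], word_vecs[w2]
            predicted.append(numpy.dot(v1, v2) / (numpy.linalg.norm(v1) * numpy.linalg.norm(v2)))
            gold.append(score)
        else:
            missed += 1
    rho = spearmanr(gold, predicted)[0]
    return rho, missed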
        if '-f' in opts:
            self.fname = True

    def printHelp(self):
        help = __doc__.replace('<PROGNAME>', sys.argv[0], 1)
        print >> sys.stderr, help
        exit()

if __name__ == "__main__":
    config = CommandLine()
    #from gensim.models import word2vec
    w2v1 = wv.load(config.inputFile).normalize()    # original w2v
    #print w2v1['his']
    w2v1 = dict(w2v1)
    print len(w2v1.keys())
    w2v2 = wv.load(config.retroVector).normalize()  # symmetric w2v
    w2v2 = dict(w2v2)
    #print "size of w2v 2:", len(w2v2)
    seta = set(w2v1.keys())
    setb = set(w2v2.keys())
    intersection = seta.intersection(setb)
    newWordvector = w2v1.copy()
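# The script stops after copying w2v1, and the rest of the merge is not shown in this
# excerpt. Purely as an illustration of one plausible continuation (not necessarily what
# the original code does), the sketch below averages the original and retrofitted vectors
# for the words that occur in both spaces.
def average_overlapping_vectors(w2v1, w2v2):
    merged = w2v1.copy()
    for word in set(w2v1).intersection(w2v2):
        merged[word] = (w2v1[word] + w2v2[word]) / 2.0
    return merged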
def get_data(limit=100000):
    #The point of this little program is to read the data, fetch the vectors, and then
    #create both the embedding matrix and the data as nice indexes.
    #wv = wvlib.load("/usr/share/ParseBank/vector-space-models/FIN/w2v_pbv3_lm.rev01.bin",max_rank=1000000)
    wv = wvlib.load("/home/ginter/w2v/pb34_lemma_200_v2.bin").normalize()  #,max_rank=10000000000).normalize()
    #wv.normalize() #remember to normalize!
    lines2 = open('./example_harvest/the_res', 'rt').readlines()[:50]
    lines = open('test_sent_2.txt', 'rt').readlines()
    lines += lines2
    #Such a small vocab, I can ignore this stuff:
    vocab_set = set()
    vocab_list = []
    vecs = []
    examples = []
    corrupt_examples = []
    labels = []
    incomplete = []
    triplets = set()
    for line in lines[1:]:
        if len(line) > 4:
            exp = line.strip()
            indexes = []
            for w in exp.split()[:-1]:
                success = True
                if w not in vocab_list:
                    vocab_list.append(w)
                    try:
                        vecs.append(wv.word_to_vector(w.decode('utf8')))
                    except:
                        # Unknown word: use a zero vector and remember it as incomplete.
                        vecs.append(numpy.zeros(200,))
                        #print w
                        incomplete.append(w)
                indexes.append(vocab_list.index(w))
            destination_ok = True
            for w in exp.split()[:-1]:
                if w in incomplete:
                    #print '!', w
                    destination_ok = False
            if destination_ok:
                try:
                    if '.'.join([str(indexes[0]), str(indexes[1]), str(indexes[2])]) not in triplets:
                        examples.append((indexes, exp.split()[-1]))
                        print len(examples)
                        triplets.add('.'.join([str(indexes[0]), str(indexes[1]), str(indexes[2])]))
                        if len(examples) > limit:
                            break
                except:
                    print indexes
            else:
                corrupt_examples.append((indexes, exp.split()[-1]))
    return examples, vocab_list, vecs
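# A brief usage sketch (not part of the original module). It assumes the hard-coded
# input files read by get_data() exist. It shows how the three parallel return values
# fit together: row i of the stacked vectors belongs to vocab_list[i], and each example
# is (list of word indexes, label string). The helper name is illustrative only.
def _inspect_first_example(limit=1000):
    import numpy
    examples, vocab_list, vecs = get_data(limit=limit)
    embedding_matrix = numpy.asarray(vecs)    # shape: (len(vocab_list), 200)
    indexes, label = examples[0]
    return [vocab_list[i] for i in indexes], label, embedding_matrix.shape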
def train_vectors(data, vec_size=50, window_size=2):
    wv = wvlib.load("/usr/share/ParseBank/vector-space-models/FIN/w2v_pbv3_lm.rev01.bin", max_rank=400000)
    #Let's not normalize, just for kicks
    #wv = wv.normalize()
    rng = np.random.RandomState(1234)
    vt = VectorThing(rng, vec_size, data)
    minibatch_size = 10
    parameters = []
    input_vectors = []
    input = T.dmatrix()
    ref = T.dmatrix()
    cls = Classifier(rng, window_size, vec_size, input, ref)

    #functions
    #cost, updates = cls.get_cost_and_updates(learning_rate=0.1)
    #train = theano.function([input, ref], cost, updates=updates)
    train = cls.get_training_function()

    #Make batches for each sentence: cut it into parts of 10 or so and train on at most
    #that many windows at once.
    for epoch in range(0, 50):
        print epoch
        epoch_cost = []
        for i, sentence in enumerate(data):
            #Create training material for this sentence
            sentence_refs = []
            sentence_inputs = []
            #print i
            if i % 100 == 0 and i > 0:
                print np.mean(epoch_cost), i
            for win in window(sentence, n=window_size):
                try:
                    ref_vector = wv.word_to_vector(win[-1])
                    w_vectors = []
                    for w in win[:-1]:
                        w_vectors.append(wv.word_to_vector(w))
                    sentence_refs.append(ref_vector)
                    sentence_inputs.append(np.concatenate(w_vectors))
                except:
                    pass  #print ':('
            batches_ref = []
            for b_ref in chunks(sentence_refs, minibatch_size):
                batches_ref.append(b_ref)
            batches_input = []
            for b_inp in chunks(sentence_inputs, minibatch_size):
                batches_input.append(b_inp)
            #insert paragraph vector
            cls.paragraph.set_value(vt.sentence_vecs[i].eval())
            #before = vt.sentence_vecs[i].eval()
            #Train them
            for rf, inpt in zip(batches_ref, batches_input):
                #import pdb;pdb.set_trace()
                batch_cost = train(inpt, rf)
                epoch_cost.append(batch_cost)
                #print batch_cost, len(sentence_refs)
            #recover the new vector
            vt.sentence_vecs[i].set_value(cls.paragraph.eval())
            #after = vt.sentence_vecs[i].eval()
        print 'mean_cost', np.mean(epoch_cost)
        save_model(vt, 'model_epoch_' + str(epoch))

    ####AGAIN! Now with only paragraph vectors#####
    cls.learning_rate = 0.1
    train_p = cls.get_training_function_only_paragraph()
    for epoch in range(0, 100):
        print epoch
        epoch_cost = []
        for i, sentence in enumerate(data):
            #Create training material for this sentence
            sentence_refs = []
            sentence_inputs = []
            #print i
            if i % 100 == 0 and i > 0:
                print np.mean(epoch_cost), i
            for win in window(sentence, n=window_size):
                try:
                    ref_vector = wv.word_to_vector(win[-1])
                    w_vectors = []
                    for w in win[:-1]:
                        w_vectors.append(wv.word_to_vector(w))
                    sentence_refs.append(ref_vector)
                    sentence_inputs.append(np.concatenate(w_vectors))
                except:
                    pass  #print ':('
            batches_ref = []
            for b_ref in chunks(sentence_refs, minibatch_size):
                batches_ref.append(b_ref)
            batches_input = []
            for b_inp in chunks(sentence_inputs, minibatch_size):
                batches_input.append(b_inp)
            #insert paragraph vector
            cls.paragraph.set_value(vt.sentence_vecs[i].eval())
            #before = vt.sentence_vecs[i].eval()
            #Train them
            for rf, inpt in zip(batches_ref, batches_input):
                #import pdb;pdb.set_trace()
                batch_cost = train_p(inpt, rf)  # use the paragraph-only training function (train_p was previously created but unused)
                epoch_cost.append(batch_cost)
                #print batch_cost, len(sentence_refs)
            #recover the new vector
            vt.sentence_vecs[i].set_value(cls.paragraph.eval())
            #after = vt.sentence_vecs[i].eval()
        print 'mean_cost', np.mean(epoch_cost)
        save_model(vt, 'model_epoch_' + str(epoch))
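# train_vectors() relies on window() and chunks() helpers that are not included in this
# excerpt. The sketch below shows one minimal, plausible implementation of each (a
# sliding n-gram window and fixed-size chunking); the originals may differ.
from itertools import islice

def window(seq, n=2):
    # Yield successive overlapping tuples of length n from seq.
    it = iter(seq)
    result = tuple(islice(it, n))
    if len(result) == n:
        yield result
    for elem in it:
        result = result[1:] + (elem,)
        yield result

def chunks(lst, size):
    # Yield successive non-overlapping slices of at most `size` items.
    for i in range(0, len(lst), size):
        yield lst[i:i + size]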
import gzip
import math
import sys
import numpy
from tools import utilities

''' Read all the word vectors and normalize them '''
def read_word_vectors(filename):
    word_vecs = {}
    if filename.endswith('.gz'):
        file_object = gzip.open(filename, 'r')
    else:
        file_object = open(filename, 'r')
    for line_num, line in enumerate(file_object):
        line = line.strip().lower()
        word = line.split()[0]
        word_vecs[word] = numpy.zeros(len(line.split()) - 1, dtype=float)
        for index, vec_val in enumerate(line.split()[1:]):
            word_vecs[word][index] = float(vec_val)
        ''' normalize weight vector '''
        word_vecs[word] /= math.sqrt((word_vecs[word]**2).sum() + 1e-6)
    sys.stderr.write("Vectors read from: " + filename + " \n")
    return word_vecs

if __name__ == "__main__":
    inputw2vFile = sys.argv[1]
    outputFile = sys.argv[2]
    util = utilities()
    w2v1 = wv.load(inputw2vFile)  # original w2v (wv is the wvlib module, imported elsewhere in the original file)
    w2v1 = dict(w2v1)
    util.write_word_vecs(w2v1, outputFile)  # output file
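# A small usage sketch (not part of the original script; the file path and words are
# hypothetical). Because read_word_vectors() L2-normalizes each vector, the dot product
# of two entries is (approximately) their cosine similarity.
def _similarity_demo(path='embeddings.txt', w1='protein', w2='gene'):
    vecs = read_word_vectors(path)
    if w1 in vecs and w2 in vecs:
        return (vecs[w1] * vecs[w2]).sum()
    return None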
def main():
    '''
    made_up_data = []
    for i in range(100):
        made_up_data.append([random.randint(1,6) / 10.0 for y in range(3)] + [random.randint(6,9) / 10.0 for y in range(3)])
    #Let's make a simple ae
    split_data = numpy.array(made_up_data)[:,:3], numpy.array(made_up_data)[:,3:]
    #Let's test the concatenation layer thing!
    '''
    #Config stuff
    n_in_vecs = 2
    vec_size = 200
    hidden_size = 200
    minibatch_size = 1000

    #Let us load the data
    inf = open('dataset_xm.list', 'rb')
    examples, vocab, vecs = pickle.load(inf)  #vector_grabber.get_data()
    inf.close()
    #examples, vocab, vecs = vector_grabber_mini.get_data()
    print len(examples)
    #import pdb;pdb.set_trace()
    #out = open('the_clean_data_mini','wb')
    #pickle.dump((examples, vocab, vecs), out)
    #out.close()
    #import pdb;pdb.set_trace()
    #vec table made!
    #import pdb;pdb.set_trace()
    n_vecs = []
    for v in vecs:
        if len(v) < 300:
            n_vecs.append(v)
        else:
            n_vecs.append(numpy.zeros(200))
    vec_table = theano.shared(value=numpy.array(n_vecs), name='W', borrow=True)

    n_examples = []
    for e in examples:
        if e[-1].startswith('X'):
            n_examples.append(e[:-1])
    eval_cong = []
    eval_incong = []
    for e in examples:
        if e[-1].startswith('c'):
            eval_cong.append(e[:-1])
        elif e[-1].startswith('i'):
            eval_incong.append(e[:-1])

    #minibatches
    minibatches = []
    for i in range(0, len(n_examples), minibatch_size):
        minibatches.append(numpy.array(n_examples[i:i + minibatch_size]))

    #Let's create input variables:
    input_variables = [T.lvector() for i in range(n_in_vecs)]
    inputs = [vec_table[input_variables[0]], vec_table[input_variables[1]]]
    conc_layer = concatenation_layer(inputs)
    o_input = T.lvector()
    rng = numpy.random.RandomState(1234)

    #SV_ae
    sv_ae = simple_ae_layer(conc_layer.output, vec_size * n_in_vecs, vec_size * n_in_vecs, hidden_size, rng)
    sv_sp_layer = split_layer(sv_ae.output, n_in_vecs, vec_size)
    sv_res_f = theano.function(input_variables, sv_sp_layer.output)
    sv_cost = T.mean((conc_layer.output - sv_ae.output) ** 2)
    sv_cost_f = theano.function(input_variables, [sv_cost])
    learning_rate = theano.shared(0.8)
    sv_gparams = [T.grad(sv_cost, param) for param in sv_ae.params]
    updates = [(param, param - learning_rate * gparam) for param, gparam in zip(sv_ae.params, sv_gparams)]
    sv_train_f = theano.function(input_variables, sv_cost, updates=updates)

    #Mapping
    mapl = linear_mapping_layer(sv_ae.hidden_output, vec_size, hidden_size, rng)
    m_cost = T.mean((vec_table[o_input] - mapl.output) ** 2)
    #learning_rate = theano.shared(0.5)
    gparams = [T.grad(m_cost, param) for param in mapl.params]
    das_parameters = [mapl.W, sv_ae.W]
    updates = [(param, param - learning_rate * gparam) for param, gparam in zip(das_parameters, gparams)]
    m_train_f = theano.function([input_variables[0], input_variables[1], o_input], m_cost, updates=updates)
    m_cost_f = theano.function([input_variables[0], input_variables[1], o_input], m_cost)
    # res_f = theano.function(inputs, mapl.output)
    vec_f = theano.function([o_input], vec_table[o_input])
    gres_f = theano.function(input_variables, mapl.output)
    vres_f = theano.function(inputs, sv_ae.hidden_output)

    #Now let us do evaluation related stuffs!
    #Let us load up a model
    #model__latest_200_ae_map680
    inf = open('./models_ae/model__latest_200_ae_map680', 'rb')  #model_200_ae_map7880  #'./models_ae/model_200_ae_map620'
    params = pickle.load(inf)
    #pickle.dump([sv_ae.params, mapl.params], outf)
    for p, a in zip(params[0], sv_ae.params):
        a.set_value(p.get_value())
    for p, a in zip(params[1], mapl.params):
        a.set_value(p.get_value())
    inf.close()

    xc = len(eval_cong)
    c_results = gres_f(numpy.array(eval_cong).reshape(xc, 3)[:, 0], numpy.array(eval_cong).reshape(xc, 3)[:, 1])
    xi = len(eval_incong)
    ic_results = gres_f(numpy.array(eval_incong).reshape(xi, 3)[:, 0], numpy.array(eval_incong).reshape(xi, 3)[:, 1])
    c_objs = vec_f(numpy.array(eval_cong).reshape(xc, 3)[:, 2])
    ic_objs = vec_f(numpy.array(eval_incong).reshape(xi, 3)[:, 2])

    #Find the pairs
    the_pairs_c = []
    the_pairs_ic = []
    for trp in eval_cong:
        the_pairs_c.append(str(trp[0][0]) + '+' + str(trp[0][1]))
    for trp in eval_incong:
        the_pairs_ic.append(str(trp[0][0]) + '+' + str(trp[0][1]))
    pair_map = []
    for trp in eval_cong:
        if str(trp[0][0]) + '+' + str(trp[0][1]) in the_pairs_c and str(trp[0][0]) + '+' + str(trp[0][1]) in the_pairs_ic:
            pair_map.append((the_pairs_c.index(str(trp[0][0]) + '+' + str(trp[0][1])),
                             the_pairs_ic.index(str(trp[0][0]) + '+' + str(trp[0][1]))))

    tp = 0
    fp = 0
    tn = 0
    fn = 0
    count = 0
    for tpl in pair_map:
        cd = cosine(c_objs[tpl[0]], c_results[tpl[0]])
        icd = cosine(ic_objs[tpl[1]], ic_results[tpl[1]])
        print cd, icd
        if cd < icd:
            count += 1
    print count / float(len(pair_map))

    #The fun loop
    #wv = wvlib.load("/usr/share/ParseBank/vector-space-models/FIN/w2v_pbv3_lm.rev01.bin",max_rank=100000)
    #import pdb;pdb.set_trace()
    wv = wvlib.load("/home/ginter/w2v/pb34_lemma_200_v2.bin", max_rank=50000).normalize()
    import pdb;pdb.set_trace()

    #The loop
    while True:
        try:
            xs = raw_input('Subject ')
            xv = raw_input('Verb ')
            xsv = wv.word_to_vector(xs.decode('utf8'))
            xvv = wv.word_to_vector(xv.decode('utf8'))
            if xs.startswith('exit') or xv.startswith('exit'):
                break
            for t in wv.approximate_nearest(res_f([xsv], [xvv])[0]):  #wv.nearest(vres_f(numpy.array([xsv,]),numpy.array([xvv]))[0])
                print t
            print
        except:
            pass
    '''
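# cosine() above is not defined in this excerpt; from the way `cd < icd` is counted it
# behaves like a cosine *distance* (smaller means more similar), e.g.
# scipy.spatial.distance.cosine. A minimal equivalent is sketched below for clarity only.
# The layer classes used in main() (concatenation_layer, split_layer, simple_ae_layer,
# linear_mapping_layer) come from elsewhere in the original project and are not
# reproduced here.
import numpy

def cosine_distance(u, v):
    # 1 - cosine similarity: 0 for identical directions, 2 for opposite ones.
    return 1.0 - numpy.dot(u, v) / (numpy.linalg.norm(u) * numpy.linalg.norm(v))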