# apply the aggregated gradient to the model parameters (master-side descent update)
scale = -model.alpha
# update the dense stack parameters
model.model.stack[1:] = [P + scale * dP
                         for P, dP in zip(model.model.stack[1:], grad[1:])]
# model.model.updateParams(scale, grad, log=False)
# handle the word-vector dictionary update sparsely
dL = grad[0]
for j in dL.iterkeys():
    model.model.L[:, j] += scale * dL[j]
model.costt.append(cost)
if model.it % 1 == 0:
    print "Iter %d : Cost=%.4f, ExpCost=%.4f." % (model.it, cost, model.expcost[-1])

start = time.time()
with DeepDist(sgd, masterurl) as dd:
    print 'wait for server to come up'
    time.sleep(10)
    # epoch loop
    for e in range(opts.epochs):
        startepoch = time.time()
        print "Running epoch %d" % e
        m = len(trees)
        random.shuffle(trees)
        for i in xrange(0, 240, sgd.minibatch):
            sgd.it += 1
            mb_data = sc.parallelize(trees[i:i + sgd.minibatch])
            dd.train(mb_data, gradient, descent)
        endepoch = time.time()
        print '******** time of iteration %f' % (endepoch - startepoch)
from deepdist import DeepDist
from gensim.models.word2vec import Word2Vec
from pyspark import SparkContext

sc = SparkContext()
corpus = sc.textFile('./data/text8').map(lambda s: s.split())

def gradient(model, sentences):  # executes on workers
    syn0, syn1 = model.syn0.copy(), model.syn1.copy()  # previous weights
    model.train(sentences)
    return {'syn0': model.syn0 - syn0, 'syn1': model.syn1 - syn1}

def descent(model, update):  # executes on master
    model.syn0 += update['syn0']
    model.syn1 += update['syn1']

with DeepDist(Word2Vec(corpus.collect())) as dd:
    dd.train(corpus, gradient, descent)
    print dd.model.most_similar(positive=['woman', 'king'], negative=['man'])
from deepdist import DeepDist
from gensim.models.word2vec import Word2Vec
from pyspark import SparkContext

# note: we need to repartition before the map
corpus = sc.textFile('t8').repartition(40).map(lambda s: s.split())
# _jrdd's partitions are computed lazily, so an action (e.g. count) is needed before they materialize
print "\n\npartitioning into %s partitions" % corpus._jrdd.splits().size()
c1, c2 = corpus.randomSplit((0.03, 0.97))  # split the corpus in two for the moment

def gradient(model, sentences):
    syn0, syn1 = model.syn0.copy(), model.syn1.copy()
    model.train(sentences)
    return {'syn0': model.syn0 - syn0, 'syn1': model.syn1 - syn1}

def descent(model, update):
    print "on master: we just updated the weights"
    model.syn0 += update['syn0']
    model.syn1 += update['syn1']

model = Word2Vec(c1.collect())
print model['night']  # the result with 3% of the training set

dd = DeepDist(model, '52.32.19.84:5000')  # replace with the cluster master's IP
dd.start_server()
dd.train(c2, gradient, descent)
print dd.model['night']  # not most_similar, because gensim seems to cache that result
from deepdist import DeepDist
import numpy as NP
RNG = NP.random  # the snippet assumes NP / RNG are aliases for numpy and numpy.random

sc, sqlContext = init_spark(verbose_logging='INFO', show_progress=False)
sc.addPyFile('deepdist.py')
sc.addPyFile('rwlock.py')

# synthetic 2-D data: 50 negative and 50 positive points around (-1.5, -1.5) and (1.5, 1.5)
xneg = RNG.multivariate_normal([-1.5, -1.5], NP.eye(2), size=50)
xpos = RNG.multivariate_normal([1.5, 1.5], NP.eye(2), size=50)
x = NP.concatenate([xneg, xpos])
y = NP.array([-1] * 50 + [1] * 50)
dataset = sc.parallelize(zip(x, y))

# linear model with randomly initialized weights and bias
w = RNG.uniform(-1, 1, 2)
b = RNG.uniform(-1, 1)
model = {'w': w, 'b': b}

def grad(model, data):
    # squared-error gradient averaged over the 100-point dataset
    dataX = NP.array(data.map(lambda r: r[0]).collect())
    dataY = NP.array(data.map(lambda r: r[1]).collect())
    pred = NP.dot(dataX, model['w']) + model['b']
    gw = NP.dot((pred - dataY), dataX) / 100
    gb = (pred - dataY).sum() / 100
    return {'w': gw, 'b': gb}

def desc(model, update):
    # descent step with a fixed learning rate of 0.01
    model['w'] -= 0.01 * update['w']
    model['b'] -= 0.01 * update['b']
    print model

with DeepDist(model, master=None) as dd:
    dd.train(dataset, grad, desc)

print model
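# A minimal sketch (not part of the original snippet) of how the trained linear
# model above could be sanity-checked: classify each synthetic point by the sign
# of its linear score and report in-sample accuracy. `model`, `x`, and `y` are
# the names defined above; everything else here is illustrative.
pred = NP.sign(NP.dot(x, model['w']) + model['b'])
print 'training accuracy: %.2f' % (pred == y).mean()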
del model.syn0norm
tt = time.time()
for row in model.accuracy('questions-words.txt'):
    if row['section'] != 'total':
        continue
    print >> log, (' %i %.1f%% v%i %.1f %.1f' % (
        row['correct'],
        100.0 * row['correct'] / (row['incorrect'] + row['correct']),
        model.version,
        1.0 * row['correct'] / model.version,
        time.time() - t))
t += (time.time() - tt)
last_time = time.time()

print 'Train model...'
with DeepDist(model, min_updates=8) as dd:
    while True:
        dd.train(corpus, gradient, descent)

        print 'Saving model to "model.bin"...'
        model.save_word2vec_format('model.bin', binary=True)

        print 'Evaluate model...'
        del model.syn0norm
        for row in model.accuracy('questions-words.txt'):
            if row['section'] != 'total':
                continue
            print(' %i %.1f%% v%i %.1f' % (
                row['correct'],
                100.0 * row['correct'] / (row['incorrect'] + row['correct']),
                model.version,
                1.0 * row['correct'] / model.version))
from deepdist import DeepDist
from gensim.models.word2vec import Word2Vec
from pyspark import SparkContext

sc = SparkContext()
corpus = sc.textFile('enwiki').map(lambda s: s.split())

def gradient(model, sentences):  # executes on workers
    syn0, syn1 = model.syn0.copy(), model.syn1.copy()
    model.train(sentences)
    return {'syn0': model.syn0 - syn0, 'syn1': model.syn1 - syn1}

def descent(model, update):  # executes on master
    model.syn0 += update['syn0']
    model.syn1 += update['syn1']

model = Word2Vec(corpus.collect())
dd = DeepDist(model, '52.32.19.84')
dd.start_server()
dd.train(corpus, gradient, descent)
# print dd.model.most_similar(positive=['woman', 'king'], negative=['man'])
import numpy
from pyspark import SparkContext
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from deepdist import DeepDist

sc = SparkContext()
# each line holds an 18-character tag, a separator, then the document text
corpus = sc.textFile('hdfs:///user/hadoop/data/tagged_docs_shuffled.txt').map(
    lambda s: TaggedDocument(s[19:].split(), [s[:18]]))

def gradient(model, taggedData):
    syn0 = model.wv.syn0.copy()
    syn1 = model.syn1neg.copy()
    doctag = model.docvecs.vectors_docs.copy()
    model.train(taggedData, total_examples=model.corpus_count, epochs=3)
    return {'syn0': model.wv.syn0 - syn0,
            'syn1': model.syn1neg - syn1,
            'doctag': model.docvecs.vectors_docs - doctag}

def descent(model, update):
    model.wv.syn0 += update['syn0']
    model.syn1neg += update['syn1']
    model.docvecs.vectors_docs += update['doctag']

collectedCorpus = corpus.collect()
doc2VecModel = Doc2Vec(collectedCorpus)

with DeepDist(doc2VecModel, master='192.168.91.1:5000') as dd:
    for i in range(3):
        dd.train(corpus, gradient, descent)

# collect the trained document vectors for the 25,000 labeled training reviews
train_arrays = numpy.zeros((25000, 100))
train_labels = numpy.zeros(25000)
for i in range(12500):
    prefix_train_pos = 'TRAIN_POS' + "%09d" % (i + 1,)
    prefix_train_neg = 'TRAIN_NEG' + "%09d" % (i + 1,)
    train_arrays[i] = dd.model.docvecs[prefix_train_pos]
    train_arrays[12500 + i] = dd.model.docvecs[prefix_train_neg]
    train_labels[i] = 1
    train_labels[12500 + i] = 0

test_arrays = numpy.zeros((25000, 100))
test_labels = numpy.zeros(25000)
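# A minimal sketch (assumption, not in the original snippet): once train_arrays /
# train_labels (and the corresponding test arrays) are filled with document
# vectors, a simple scikit-learn classifier can be fit on them to judge how well
# the distributed Doc2Vec training worked.
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
classifier.fit(train_arrays, train_labels)
print classifier.score(train_arrays, train_labels)  # in-sample accuracy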