Example #1
from deepdist import DeepDist
from gensim.models.word2vec import Word2Vec
from pyspark import SparkContext

sc = SparkContext()
corpus = sc.textFile('./data/text8').map(lambda s: s.split())


def gradient(model, sentences):  # executes on workers
    syn0, syn1 = model.syn0.copy(), model.syn1.copy()  # previous weights
    model.train(sentences)
    return {'syn0': model.syn0 - syn0, 'syn1': model.syn1 - syn1}


def descent(model, update):  # executes on master
    model.syn0 += update['syn0']
    model.syn1 += update['syn1']


with DeepDist(Word2Vec(corpus.collect())) as dd:
    dd.train(corpus, gradient, descent)
    print dd.model.most_similar(positive=['woman', 'king'], negative=['man'])
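The gradient/descent pair is the entire contract DeepDist asks for: each worker trains a throwaway copy of the model and returns the weight deltas, and the master folds those deltas into its own copy (asynchronous, Downpour-style SGD). A minimal, Spark-free sketch of one such round trip, with plain numpy stand-ins for the Word2Vec matrices (every name here is illustrative, not DeepDist API):

import numpy as np

# stand-in for a model with two weight matrices, mirroring Word2Vec's syn0/syn1
model = {'syn0': np.zeros((4, 2)), 'syn1': np.zeros((4, 2))}

def gradient(model, batch):  # would run on a worker
    before = {k: v.copy() for k, v in model.items()}
    for k in model:          # stand-in for model.train(batch)
        model[k] += 0.1 * batch
    return {k: model[k] - before[k] for k in model}

def descent(model, update):  # would run on the master
    for k in update:
        model[k] += update[k]

# one simulated round trip: the worker trains a local copy,
# then the master adds the resulting delta to its own weights
worker_copy = {k: v.copy() for k, v in model.items()}
descent(model, gradient(worker_copy, batch=1.0))
print(model['syn0'])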
Example #2
        # (fragment of the master-side descent routine: 'cost' and 'grad'
        # come from the worker update; 'scale' is the negative learning rate)
                scale = -model.alpha

        # update the dense parameters of every layer above the embedding
        model.model.stack[1:] = [P + scale * dP for P, dP in zip(model.model.stack[1:], grad[1:])]
        # model.model.updateParams(scale, grad, log=False)
        # handle the dictionary (word-embedding) update sparsely: only the
        # columns that appeared in the minibatch are touched
        dL = grad[0]
        for j in dL.iterkeys():
            model.model.L[:, j] += scale * dL[j]

        model.costt.append(cost)
        if model.it % 1 == 0:  # log every iteration
            print "Iter %d : Cost=%.4f, ExpCost=%.4f." % (model.it, cost, model.expcost[-1])

start = time.time()
with DeepDist(sgd, masterurl) as dd:
    print 'waiting for the parameter server to come up'
    time.sleep(10)
    #epoch loop
    for e in range(opts.epochs):
        startepoch = time.time()
        print "Running epoch %d"%e
        m = len(trees)
        random.shuffle(trees)
        for i in xrange(0, m, sgd.minibatch):
            sgd.it += 1
            mb_data = sc.parallelize(trees[i:i+sgd.minibatch])
            dd.train(mb_data, gradient, descent)
        endepoch = time.time()
        print '******** time of iteration %f'%(endepoch-startepoch)
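The interesting detail above is the sparse dictionary update: only the embedding columns for words that actually occurred in the minibatch are touched, which keeps the master-side update cheap for a large vocabulary. A self-contained sketch of the same idea (illustrative names, numpy only):

import numpy as np

dim, vocab_size = 8, 1000
L = np.zeros((dim, vocab_size))  # embedding matrix, one column per word
scale = -0.05                    # negative learning rate, as in the descent code

# sparse gradient: only words seen in the minibatch have entries
dL = {3: np.ones(dim), 42: 2 * np.ones(dim)}

for j in dL:                     # update only the affected columns
    L[:, j] += scale * dL[j]

print(L[:, 3])                   # changed
print(L[:, 0])                   # untouched, still zero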
Example #3
import numpy as NP
from deepdist import DeepDist

RNG = NP.random.RandomState(1234)  # seeded RNG (assumed; any numpy RandomState works)

sc, sqlContext = init_spark(verbose_logging='INFO', show_progress=False)  # local helper, not shown here
sc.addPyFile('deepdist.py')
sc.addPyFile('rwlock.py')

xneg = RNG.multivariate_normal([-1.5, -1.5], NP.eye(2), size=50)
xpos = RNG.multivariate_normal([1.5, 1.5], NP.eye(2), size=50)
x = NP.concatenate([xneg, xpos])
y = NP.array([-1] * 50 + [1] * 50)
dataset = sc.parallelize(zip(x, y))

w = RNG.uniform(-1, 1, 2)
b = RNG.uniform(-1, 1)
model = {'w': w, 'b': b}

def grad(model, data):  # executes on workers
    # pull the features and labels back as numpy arrays
    dataX = NP.array(data.map(lambda r: r[0]).collect())
    dataY = NP.array(data.map(lambda r: r[1]).collect())
    pred = NP.dot(dataX, model['w']) + model['b']
    gw = NP.dot((pred - dataY), dataX) / 100  # least-squares gradient; 100 = dataset size
    gb = (pred - dataY).sum() / 100
    return {'w': gw, 'b': gb}

def desc(model, update):
    model['w'] -= 0.01 * update['w']
    model['b'] -= 0.01 * update['b']

print model
with DeepDist(model, master=None) as dd:
    dd.train(dataset, grad, desc)
    print model
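The grad/desc pair here is ordinary least-squares gradient descent, so it is easy to sanity-check locally before involving Spark. A minimal sketch with the same data distribution and the same 0.01 step size (the seed is arbitrary):

import numpy as np

rng = np.random.RandomState(0)
xneg = rng.multivariate_normal([-1.5, -1.5], np.eye(2), size=50)
xpos = rng.multivariate_normal([1.5, 1.5], np.eye(2), size=50)
x = np.concatenate([xneg, xpos])
y = np.array([-1] * 50 + [1] * 50)

w, b = rng.uniform(-1, 1, 2), rng.uniform(-1, 1)
for step in range(200):
    pred = x.dot(w) + b
    gw = (pred - y).dot(x) / 100         # same gradient as grad() above
    gb = (pred - y).sum() / 100
    w, b = w - 0.01 * gw, b - 0.01 * gb  # same step as desc() above

print(((x.dot(w) + b - y) ** 2).mean())  # mean squared error after training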
Example #4
        # (fragment of the master-side descent routine: after applying an
        # update, drop the cached normalized vectors and log analogy accuracy)
        del model.syn0norm
        tt = time.time()
        for row in model.accuracy('questions-words.txt'):
            if row['section'] != 'total':
                continue
            print >> log, (
                ' %i %.1f%% v%i %.1f %.1f' %
                (row['correct'], 100.0 * row['correct'] /
                 (row['incorrect'] + row['correct']), model.version,
                 1.0 * row['correct'] / model.version, time.time() - t))
        t += (time.time() - tt)  # exclude evaluation time from the training clock
        last_time = time.time()


print 'Train model...'
with DeepDist(model, min_updates=8) as dd:

    while True:  # train indefinitely, checkpointing after every pass
        dd.train(corpus, gradient, descent)

        print 'Saving model to "model.bin"...'
        model.save_word2vec_format('model.bin', binary=True)

print 'Evaluate model...'
del model.syn0norm
for row in model.accuracy('questions-words.txt'):
    if row['section'] != 'total':
        continue
    print(' %i %.1f%% v%i %.1f' %
          (row['correct'], 100.0 * row['correct'] /
           (row['incorrect'] + row['correct']), model.version,
           1.0 * row['correct'] / model.version))
Example #5

from deepdist import DeepDist
from gensim.models.word2vec import Word2Vec
from pyspark import SparkContext

sc = SparkContext()
corpus = sc.textFile('t8').repartition(40).map(
    lambda s: s.split())  # note: repartition must happen before the map
print "\n\npartitioning into %s partitions" % corpus._jrdd.splits().size()
# _jrdd partitions are computed lazily: an action (e.g. count) is needed to materialize them

c1, c2 = corpus.randomSplit(
    (0.03, 0.97))  # 3% to seed the initial model, 97% for distributed training


def gradient(model, sentences):
    syn0, syn1 = model.syn0.copy(), model.syn1.copy()
    model.train(sentences)
    return {'syn0': model.syn0 - syn0, 'syn1': model.syn1 - syn1}


def descent(model, update):
    print "on master: we just updated the weights"
    model.syn0 += update['syn0']
    model.syn1 += update['syn1']


model = Word2Vec(c1.collect())
print model['night']  # this is the result w/ 3% of the training set
dd = DeepDist(model, '52.32.19.84:5000')  # replace w/ cluster ip
dd.start_server()
dd.train(c2, gradient, descent)
print dd.model['night']  # most_similar is avoided here since gensim caches its normalized vectors
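The cache in question is the syn0norm attribute: most_similar normalizes all word vectors once and reuses them, so after further training the cached copy goes stale. Example #4 deletes it before each evaluation, and the same trick applies here (old, pre-KeyedVectors gensim attribute names, as used throughout these examples):

# drop the stale normalized-vector cache before querying similarities again
if hasattr(dd.model, 'syn0norm'):
    del dd.model.syn0norm
print dd.model.most_similar(positive=['woman', 'king'], negative=['man'])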
Example #6

from deepdist import DeepDist
from gensim.models.word2vec import Word2Vec
from pyspark import SparkContext

sc = SparkContext()
corpus = sc.textFile('enwiki').map(lambda s: s.split())


def gradient(model, sentences):  # executes on workers
    syn0, syn1 = model.syn0.copy(), model.syn1.copy()
    model.train(sentences)
    return {'syn0': model.syn0 - syn0, 'syn1': model.syn1 - syn1}


def descent(model, update):  # executes on master
    model.syn0 += update['syn0']
    model.syn1 += update['syn1']


model = Word2Vec(corpus.collect())
dd = DeepDist(model, '52.32.19.84')  # replace with your master's address
dd.start_server()
dd.train(corpus, gradient, descent)
# print dd.model.most_similar(positive=['woman', 'king'], negative=['man'])
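Note that Word2Vec(corpus.collect()) runs a full single-machine training pass on the master just to obtain an initialized model. If the distributed gradient/descent loop is meant to do the real work, a lighter way to seed the model is to build only the vocabulary first; a sketch using gensim's build_vocab, which also allocates the weight matrices:

model = Word2Vec()                   # no sentences: don't train yet
model.build_vocab(corpus.collect())  # build the vocabulary and allocate syn0/syn1

dd = DeepDist(model, '52.32.19.84')
dd.start_server()
dd.train(corpus, gradient, descent)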
Example #7
from deepdist import DeepDist
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from pyspark import SparkContext
import numpy

sc = SparkContext()
corpus = sc.textFile('hdfs:///user/hadoop/data/tagged_docs_shuffled.txt').map(
    lambda s: TaggedDocument(s[19:].split(), [s[:18]]))  # first 18 chars are the document tag

def gradient(model, taggedData):  # executes on workers
    syn0, syn1, doctag = model.wv.syn0.copy(), model.syn1neg.copy(), model.docvecs.vectors_docs.copy()
    model.train(taggedData, total_examples=model.corpus_count, epochs=3)
    return {'syn0': model.wv.syn0 - syn0,
            'syn1': model.syn1neg - syn1,
            'doctag': model.docvecs.vectors_docs - doctag}

def descent(model, update):  # executes on master
    model.wv.syn0 += update['syn0']
    model.syn1neg += update['syn1']
    model.docvecs.vectors_docs += update['doctag']

collectedCorpus = corpus.collect()
doc2VecModel = Doc2Vec(collectedCorpus)
with DeepDist(doc2VecModel, master='192.168.91.1:5000') as dd:
    for i in range(3):
        dd.train(corpus, gradient, descent)


# build training features from the learned document vectors
train_arrays = numpy.zeros((25000, 100))
train_labels = numpy.zeros(25000)
for i in range(12500):
    prefix_train_pos = 'TRAIN_POS' + "%09d" % (i + 1,)
    prefix_train_neg = 'TRAIN_NEG' + "%09d" % (i + 1,)
    train_arrays[i] = dd.model[prefix_train_pos]
    train_arrays[12500 + i] = dd.model[prefix_train_neg]
    train_labels[i] = 1
    train_labels[12500 + i] = 0
test_arrays = numpy.zeros((25000, 100))
test_labels = numpy.zeros(25000)
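These arrays follow a common doc2vec sentiment setup: 100-dimensional document vectors looked up by tag, labeled 1/0 for positive/negative reviews. The listing ends before the classifier, but a plausible continuation (assuming scikit-learn is available, and that test_arrays/test_labels get filled from TEST_* tags the same way) would be:

from sklearn.linear_model import LogisticRegression

# linear classifier over the learned 100-dimensional document vectors
classifier = LogisticRegression()
classifier.fit(train_arrays, train_labels)
print classifier.score(test_arrays, test_labels)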