with open(f + 'en') as ens, open(f + 'de') as des: for i, (en, de) in enumerate(islice(izip(ens, des), self.size)): en = ['%s_en'%w for w in preprocess(en).split()] de = ['%s_de'%w for w in preprocess(de).split()] langs = [en, de] shuffle(langs) l1, l2 = langs yield LabeledSentence(words=l1, labels=l2) print 'Learning to predict all l2 words from every l1 word' start = timeit.default_timer() f = sys.argv[1]+'/europarl-v7.de-en.' n = 50000 sentences = BitextWordLabelsSentence(f, n) print '%s sentences' % n model = Doc2Vec(dm=0, alpha=0.025, min_alpha=0.025, size=256) model.build_vocab(sentences) print '%s words in vocab' % len(model.vocab) print 'epochs' for epoch in range(10): model.train(sentences) print epoch model.alpha -= 0.002 # decrease the learning rate stop = timeit.default_timer() print 'Running time %ss' % (stop - start) inspect_words(model)
# Rescale every German word vector by 1 / (count + 1) and, when `model` also
# contains the word, overwrite its vector there with the rescaled one.
for w in model_de.vocab:
    de_entry = model_de.vocab[w]
    de_row = de_entry.index
    model_de.syn0[de_row] /= (de_entry.count + 1.0)
    if w in model:
        model.syn0[model.vocab[w].index] = model_de.syn0[de_row]

# Recompute the unit-length (L2-normalised) copy of each model's vectors.
for mdl in (model, model_en, model_de):
    row_norms = sqrt((mdl.syn0 ** 2).sum(-1))
    mdl.syn0norm = (mdl.syn0 / row_norms[..., newaxis]).astype(REAL)

print('Running time %ss' % (timeit.default_timer() - start))
for mdl in (model, model_en, model_de):
    inspect_words(mdl)

# Train two vocabularies
# NOTE(review): in this chunk the training loop follows the inspection above;
# confirm that ordering is intended.
print('epochs')
for epoch in range(10):
    # Both streams re-read the bitext; the first triple element is the label.
    sentences_en = (
        LabeledSentence(words=en_words, labels=[label])
        for label, en_words, _ in BitextTriples(f, n)
    )
    sentences_de = (
        LabeledSentence(words=de_words, labels=[label])
        for label, _, de_words in BitextTriples(f, n)
    )
    model_en.train(sentences_en)
    model_de.train(sentences_de)
    print(epoch)
    # NOTE(review): if these models were built with min_alpha equal to the
    # initial alpha, the decrements below have no effect unless min_alpha is
    # lowered too - confirm against the constructors (not visible here).
    model_en.alpha -= 0.001  # decrease the learning rate
    model_de.alpha -= 0.001  # decrease the learning rate