def train(self, scored_word_sentences,
          learn_doctags=True, learn_words=True, learn_hidden=True,  # kept for API compatibility; unused here
          iter=None,
          batch_size=128,
          sub_batch_size=128,
          ):
    if iter is not None:
        self.iter = iter
    # Shared setup: context-index bookkeeping and the stacked hidden-vector table.
    train_prepossess(self)
    vocab_size = len(self.vocab)
    samples_per_epoch = int(self.window * 2 * sum(map(len, scored_word_sentences)))

    if self.sg:
        # Skip-gram variant.
        self.kerasmodel = build_keras_model_score_word_sg(
            index_size=vocab_size,
            vector_size=self.vector_size,
            context_size=self.keras_context_index_size,
            score_vector_size=self.score_vector_size,
            sub_batch_size=sub_batch_size,
            model=self,
            word_vectors=self.syn0,
            hidden_vectors=self.keras_syn1,
            )
        gen = train_batch_score_sg(self, scored_word_sentences,
                                   score_vector_size=self.score_vector_size,
                                   sub_batch_size=sub_batch_size,
                                   batch_size=batch_size)
        self.kerasmodel.fit_generator(gen, samples_per_epoch=samples_per_epoch,
                                      nb_epoch=self.iter, verbose=0)
    else:
        # CBOW variant.
        self.kerasmodel = build_keras_model_score_word_cbow(
            index_size=vocab_size,
            vector_size=self.vector_size,
            context_size=self.keras_context_index_size,
            score_vector_size=self.score_vector_size,
            sub_batch_size=1,
            model=self,
            cbow_mean=self.cbow_mean,
            word_vectors=self.syn0,
            hidden_vectors=self.keras_syn1,
            )
        gen = train_batch_score_cbow(self, scored_word_sentences, self.alpha,
                                     work=None, batch_size=batch_size)
        self.kerasmodel.fit_generator(gen, samples_per_epoch=samples_per_epoch,
                                      nb_epoch=self.iter, verbose=0)

    # Copy the trained Keras weights back into the gensim-style arrays.
    self.syn0 = self.kerasmodel.nodes['embedding'].get_weights()[0]
    if self.negative > 0 and self.hs:
        # Hidden vectors were trained as one stacked table: [syn1; syn1neg].
        syn1tmp = self.kerasmodel.nodes['embedpoint'].get_weights()[0]
        self.syn1 = syn1tmp[0:len(self.vocab)]
        self.syn1neg = syn1tmp[len(self.vocab):2 * len(self.vocab)]
    elif self.hs:
        self.syn1 = self.kerasmodel.nodes['embedpoint'].get_weights()[0]
    else:
        self.syn1neg = self.kerasmodel.nodes['embedpoint'].get_weights()[0]
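
# Sketch (illustrative only, not part of the model): the weight copy-back above
# slices the 'embedpoint' table into syn1 and syn1neg. During setup the two
# tables are stacked with np.vstack and the negative-sampling context indices
# are offset by len(vocab), so a single Keras embedding layer serves both the
# hierarchical-softmax and negative-sampling outputs. A minimal NumPy round
# trip with made-up sizes:
def _demo_hidden_vector_split(vocab_size=5, vector_size=4):
    import numpy as np
    syn1 = np.zeros((vocab_size, vector_size))      # hierarchical-softmax weights
    syn1neg = np.ones((vocab_size, vector_size))    # negative-sampling weights
    keras_syn1 = np.vstack((syn1, syn1neg))         # stacked table used during training
    # After training, the first half maps back to syn1, the second to syn1neg.
    syn1_out = keras_syn1[0:vocab_size]
    syn1neg_out = keras_syn1[vocab_size:2 * vocab_size]
    assert (syn1_out == syn1).all() and (syn1neg_out == syn1neg).all()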
def train(self, docs=None,
          learn_doctags=True, learn_words=True, learn_hidden=True,
          iter=None,
          batch_size=128,
          sub_batch_size=128,
          ):
    if iter is not None:
        self.iter = iter
    if docs is None:
        docs = self.docvecs
    # Shared setup (context-index offsets for negative sampling / hierarchical
    # softmax, stacked keras_syn1, word_context_size_max, ...) is handled here.
    train_prepossess(self)

    vocab_size = len(self.vocab)
    index_size = len(self.docvecs)
    self.batch_size = batch_size

    if self.sg:
        # PV-DBOW.
        samples_per_epoch = max(1, int((self.word_context_size_max * self.window * 2 * sum(map(len, docs))) / sub_batch_size))
        self.kerasmodel = build_keras_model_dbow(index_size=index_size, vector_size=self.vector_size,
                                                 context_size=self.keras_context_index_size,
                                                 model=self,
                                                 learn_doctags=learn_doctags,
                                                 learn_hidden=learn_hidden,
                                                 hidden_vectors=self.keras_syn1,
                                                 doctag_vectors=self.docvecs.doctag_syn0,
                                                 sub_batch_size=sub_batch_size
                                                 )
        gen = train_batch_dbow(self, docs, sub_batch_size=sub_batch_size, batch_size=batch_size)
        self.kerasmodel.fit_generator(gen, samples_per_epoch=samples_per_epoch,
                                      nb_epoch=self.iter, verbose=0)
    else:
        if self.dm_concat:
            # PV-DM with concatenated context vectors.
            samples_per_epoch = int(self.word_context_size_max * sum(map(len, docs)))
            self.kerasmodel = build_keras_model_dm_concat(index_size, self.vector_size, vocab_size,
                                                          context_size=self.keras_context_index_size,
                                                          window_size=self.window,
                                                          model=self,
                                                          learn_doctags=learn_doctags,
                                                          learn_words=learn_words,
                                                          learn_hidden=learn_hidden,
                                                          word_vectors=self.syn0,
                                                          hidden_vectors=self.keras_syn1,
                                                          doctag_vectors=self.docvecs.doctag_syn0
                                                          )
            gen = train_document_dm_concat(self, docs, batch_size=batch_size)
            self.kerasmodel.fit_generator(gen, samples_per_epoch=samples_per_epoch,
                                          nb_epoch=self.iter, verbose=0)
            self.syn0 = self.kerasmodel.nodes['embedword'].get_weights()[0]
        else:
            # PV-DM with averaged (or summed) context vectors.
            samples_per_epoch = int(self.word_context_size_max * sum(map(len, docs)))
            self.kerasmodel = build_keras_model_dm(index_size, self.vector_size, vocab_size,
                                                   self.keras_context_index_size,
                                                   maxwords=self.window * 2 + 1,
                                                   model=self,
                                                   learn_doctags=learn_doctags,
                                                   learn_words=learn_words,
                                                   learn_hidden=learn_hidden,
                                                   word_vectors=self.syn0,
                                                   doctag_vectors=self.docvecs.doctag_syn0,
                                                   hidden_vectors=self.keras_syn1,
                                                   cbow_mean=self.cbow_mean
                                                   )
            gen = train_batch_dm(self, docs, batch_size=batch_size)
            self.kerasmodel.fit_generator(gen, samples_per_epoch=samples_per_epoch,
                                          nb_epoch=self.iter, verbose=0)
            self.syn0 = self.kerasmodel.nodes['embedword'].get_weights()[0]

    # Copy the trained Keras weights back into the gensim-style arrays.
    self.docvecs.doctag_syn0 = self.kerasmodel.nodes['embedindex'].get_weights()[0]
    if self.negative > 0 and self.hs:
        # Hidden vectors were trained as one stacked table: [syn1; syn1neg].
        syn1tmp = self.kerasmodel.nodes['embedpoint'].get_weights()[0]
        self.syn1 = syn1tmp[0:len(self.vocab)]
        self.syn1neg = syn1tmp[len(self.vocab):2 * len(self.vocab)]
    elif self.hs:
        self.syn1 = self.kerasmodel.nodes['embedpoint'].get_weights()[0]
    else:
        self.syn1neg = self.kerasmodel.nodes['embedpoint'].get_weights()[0]
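
# Hedged usage sketch for the document-vector train() above. The wrapper class
# name and constructor arguments are assumptions (a Doc2VecKeras-style class
# exposing this train() method), not confirmed API; only gensim's
# TaggedDocument import is a known dependency.
def _demo_doc2vec_keras_usage():
    from gensim.models.doc2vec import TaggedDocument

    docs = [
        TaggedDocument(words=['human', 'interface', 'computer'], tags=['doc_0']),
        TaggedDocument(words=['graph', 'trees', 'minors'], tags=['doc_1']),
    ]
    # Hypothetical wrapper class and parameters:
    # model = Doc2VecKeras(docs, size=50, window=5, dm=1, negative=5, iter=3)
    # model.train(docs, batch_size=128, sub_batch_size=16)
    # doc_vectors = model.docvecs.doctag_syn0   # updated from the 'embedindex' node
    return docs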