    def train(self, scored_word_sentences,
              learn_doctags=True, learn_words=True, learn_hidden=True,
              iter=None, batch_size=128, sub_batch_size=128):
        train_prepossess(self)
        if iter is not None:
            self.iter = iter
        vocab_size = len(self.vocab)
        # Each word contributes up to window*2 skip-gram pairs per epoch.
        samples_per_epoch = int(self.window * 2 * sum(map(len, scored_word_sentences)))
        if self.sg:
            self.kerasmodel = build_keras_model_score_word_sg(
                index_size=vocab_size,
                vector_size=self.vector_size,
                context_size=self.keras_context_index_size,
                score_vector_size=self.score_vector_size,
                sub_batch_size=sub_batch_size,
                model=self,
                word_vectors=self.syn0,
                hidden_vectors=self.keras_syn1,
            )

            gen = train_batch_score_sg(self, scored_word_sentences,
                                       score_vector_size=self.score_vector_size,
                                       sub_batch_size=sub_batch_size,
                                       batch_size=batch_size)
            self.kerasmodel.fit_generator(gen, samples_per_epoch=samples_per_epoch,
                                          nb_epoch=self.iter, verbose=0)
        else:
            self.kerasmodel = build_keras_model_score_word_cbow(
                index_size=vocab_size,
                vector_size=self.vector_size,
                context_size=self.keras_context_index_size,
                score_vector_size=self.score_vector_size,
                sub_batch_size=1,  # the CBOW path trains one sub-batch at a time
                model=self,
                cbow_mean=self.cbow_mean,
                word_vectors=self.syn0,
                hidden_vectors=self.keras_syn1,
            )

            gen = train_batch_score_cbow(self, scored_word_sentences, self.alpha,
                                         work=None, batch_size=batch_size)
            self.kerasmodel.fit_generator(gen, samples_per_epoch=samples_per_epoch,
                                          nb_epoch=self.iter, verbose=0)

        # Copy the trained weights back into the gensim-style arrays.
        self.syn0 = self.kerasmodel.nodes['embedding'].get_weights()[0]
        if self.negative > 0 and self.hs:
            # Hierarchical-softmax and negative-sampling weights were stacked
            # into a single 'embedpoint' embedding; split them back apart.
            syn1tmp = self.kerasmodel.nodes['embedpoint'].get_weights()[0]
            self.syn1 = syn1tmp[0:len(self.vocab)]
            self.syn1neg = syn1tmp[len(self.vocab):2 * len(self.vocab)]
        elif self.hs:
            self.syn1 = self.kerasmodel.nodes['embedpoint'].get_weights()[0]
        else:
            self.syn1neg = self.kerasmodel.nodes['embedpoint'].get_weights()[0]
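
A minimal usage sketch for the method above, assuming a gensim-style wrapper class. The class name ScoreWord2VecKeras, its constructor arguments, and the (word, score-vector) pair layout are illustrative assumptions, not part of the listing.

# Hypothetical usage; class name and input layout are assumptions.
sentences = [['human', 'interface', 'computer'],
             ['survey', 'user', 'opinion']]
scores = [[[0.1], [0.5], [0.2]],   # one score vector per word
          [[0.3], [0.0], [0.9]]]   # (score_vector_size == 1 here)
scored = [list(zip(s, sc)) for s, sc in zip(sentences, scores)]

model = ScoreWord2VecKeras(scored, size=100, window=5, sg=1)  # assumed gensim-style constructor
model.train(scored, iter=3, batch_size=128, sub_batch_size=128)  # the method defined above
print(model.syn0[0])  # first learned word vector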
Example 2
    def train(self, docs=None,
              learn_doctags=True, learn_words=True, learn_hidden=True,
              iter=None, batch_size=128, sub_batch_size=128):
        if iter is not None:
            self.iter = iter
        if docs is None:
            docs = self.docvecs

        train_prepossess(self)

        vocab_size = len(self.vocab)
        index_size = len(self.docvecs)

        self.batch_size = batch_size
        if self.sg:
            samples_per_epoch = max(1, int(
                (self.word_context_size_max * self.window * 2 * sum(map(len, docs)))
                / sub_batch_size))
            self.kerasmodel = build_keras_model_dbow(
                index_size=index_size,
                vector_size=self.vector_size,
                context_size=self.keras_context_index_size,
                model=self,
                learn_doctags=learn_doctags,
                learn_hidden=learn_hidden,
                hidden_vectors=self.keras_syn1,
                doctag_vectors=self.docvecs.doctag_syn0,
                sub_batch_size=sub_batch_size)
            gen = train_batch_dbow(self, docs, sub_batch_size=sub_batch_size,
                                   batch_size=batch_size)
            self.kerasmodel.fit_generator(gen, samples_per_epoch=samples_per_epoch,
                                          nb_epoch=self.iter, verbose=0)
        else:
            if self.dm_concat:
                samples_per_epoch = int(self.word_context_size_max * sum(map(len, docs)))
                self.kerasmodel = build_keras_model_dm_concat(
                    index_size, self.vector_size, vocab_size,
                    context_size=self.keras_context_index_size,
                    window_size=self.window,
                    model=self,
                    learn_doctags=learn_doctags, learn_words=learn_words,
                    learn_hidden=learn_hidden,
                    word_vectors=self.syn0,
                    hidden_vectors=self.keras_syn1,
                    doctag_vectors=self.docvecs.doctag_syn0)
                gen = train_document_dm_concat(self, docs, batch_size=batch_size)
                self.kerasmodel.fit_generator(gen, samples_per_epoch=samples_per_epoch,
                                              nb_epoch=self.iter, verbose=0)
                self.syn0 = self.kerasmodel.nodes['embedword'].get_weights()[0]
            else:
                samples_per_epoch = int(self.word_context_size_max * sum(map(len, docs)))
                self.kerasmodel = build_keras_model_dm(
                    index_size, self.vector_size, vocab_size,
                    self.keras_context_index_size,
                    maxwords=self.window * 2 + 1,
                    model=self,
                    learn_doctags=learn_doctags, learn_words=learn_words,
                    learn_hidden=learn_hidden,
                    word_vectors=self.syn0,
                    doctag_vectors=self.docvecs.doctag_syn0,
                    hidden_vectors=self.keras_syn1,
                    cbow_mean=self.cbow_mean)
                gen = train_batch_dm(self, docs, batch_size=batch_size)
                self.kerasmodel.fit_generator(gen, samples_per_epoch=samples_per_epoch,
                                              nb_epoch=self.iter, verbose=0)
                self.syn0 = self.kerasmodel.nodes['embedword'].get_weights()[0]
        # Copy trained doctag and context weights back into gensim-style arrays.
        self.docvecs.doctag_syn0 = self.kerasmodel.nodes['embedindex'].get_weights()[0]
        if self.negative > 0 and self.hs:
            # Hierarchical-softmax and negative-sampling weights share one
            # stacked 'embedpoint' embedding; split them back apart.
            syn1tmp = self.kerasmodel.nodes['embedpoint'].get_weights()[0]
            self.syn1 = syn1tmp[0:len(self.vocab)]
            self.syn1neg = syn1tmp[len(self.vocab):2 * len(self.vocab)]
        elif self.hs:
            self.syn1 = self.kerasmodel.nodes['embedpoint'].get_weights()[0]
        else:
            self.syn1neg = self.kerasmodel.nodes['embedpoint'].get_weights()[0]
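
A corresponding usage sketch for this Doc2Vec-style trainer. The wrapper class name Doc2VecKeras is an illustrative assumption; TaggedDocument is the standard gensim input type that docvecs-based code of this kind expects.

# Hypothetical usage; the wrapper class name is an assumption.
from gensim.models.doc2vec import TaggedDocument

docs = [TaggedDocument(words=['human', 'interface', 'computer'], tags=[0]),
        TaggedDocument(words=['survey', 'user', 'opinion'], tags=[1])]

model = Doc2VecKeras(docs, size=100, window=5, dm=0)  # dm=0 selects the DBOW (self.sg) branch
model.train(docs, iter=5, batch_size=128, sub_batch_size=128)  # the method defined above
print(model.docvecs.doctag_syn0[0])  # learned vector for the document tagged 0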