Beispiel #1
0
def train_worker(vec_size, window_size, k,  alpha, queue, results_queue, sent_dic, sent_vecs, vocab_dic, vocab_vecs, table, win_count_dic, lock ):
    '''
    Worker loop: pull sentences from `queue` and train on each one until a
    `None` task is received.

    For every window of a sentence the positive window vector and `k`
    sampled noise windows are used to update the word vectors and the
    sentence vector; the accumulated loss of the sentence is pushed to
    `results_queue`.

    :parameters:

        @vec_size: int
            forwarded to Arr, Sent and Vocab

        @window_size: int
            forwarded to WindowTable and gen_windows_from_sentence

        @k: int
            number of noise samples requested per window

        @alpha: float
            forwarded to update_window / update_sent_vec

        @queue:
            task queue; yields sentences, CURRENT_TURN_END_TOKEN markers,
            and finally None to stop the worker

        @results_queue:
            receives one loss value per trained sentence and a None for
            every CURRENT_TURN_END_TOKEN seen

        @sent_vecs, @vocab_vecs:
            shared buffers exposing get_obj(); wrapped as numpy arrays

        @sent_dic, @vocab_dic, @table, @win_count_dic:
            forwarded unchanged to Sent / Vocab / WindowTable

        @lock:
            forwarded to update_window / update_sent_vec
    '''
    # Bind the process handle before the loop: the exit messages below need
    # it even when the very first task is None and no sentence was trained.
    current = mp.current_process()

    # change shared Array to numpy array
    sent_vecs = Arr(np.frombuffer(sent_vecs.get_obj()), vec_size)
    vocab_vecs = Arr(np.frombuffer(vocab_vecs.get_obj()), vec_size)
    # init objects
    sent = Sent(vec_size, sent_dic, sent_vecs)
    vocab = Vocab(vec_size, vocab_dic, vocab_vecs)
    window_table = WindowTable(vocab=vocab,
            size=window_size,
            table=table,
            win_count_dic=win_count_dic)

    # get a task
    sentence = queue.get()
    while sentence is not None:
        if sentence == CURRENT_TURN_END_TOKEN:
            # end-of-turn marker: acknowledge it and fetch the next task
            results_queue.put(None)
            sentence = queue.get()
            continue

        Jn = 0
        windows = gen_windows_from_sentence(sentence, window_size)
        v = sent[sentence]
        for window in windows:
            window_key = "-".join([str(vocab.vocab[hash(w)]) for w in window])
            h = vocab.get_window_vec(word_index=window_key)
            # noises
            noises = window_table.get_samples(k)

            # positive window: e^(v.h); v.T is kept as in the original call
            e_vT_h = np.exp(np.dot(v.T, h))
            update_v = h / (1. + e_vT_h)
            update_h = v / (1. + e_vT_h)
            # add positive window's loss
            Jn += math.log(1. / (1. + 1. / e_vT_h))
            update_window(vocab, window_key, update_h, lock, alpha)

            for _, key in noises:
                n_h = vocab.get_window_vec(word_index=key)
                e_vT_h = np.exp(np.dot(v, n_h))
                frac_e_v_h = 1. - 1. / (1. + e_vT_h)
                # accumulate the gradient
                update_v += - n_h * frac_e_v_h
                update_n_h = - v * frac_e_v_h
                update_window(vocab, key, update_n_h, lock, alpha)
                # add noise's loss
                Jn += math.log(1. / (1. + e_vT_h))

            # average over the positive window plus the k noise windows
            update_v /= (1 + k)
            update_sent_vec(v, update_v, lock, alpha)

        results_queue.put(Jn)
        sentence = queue.get()
        show_status(results_queue)

    # Parenthesised single-argument form prints the same text on Python 2
    # and Python 3.
    print("process %s exit!" % current.name)
    logging.warning("process %s exit!", current.name)
Beispiel #2
0
class Sent2Vec(object):
    '''
    Sentence-vector model trained from windows of each sentence against
    sampled noise windows.

    Every window of a sentence acts as a positive sample; `k` windows drawn
    from the window table act as noise. Word vectors and sentence vectors
    are updated in place and re-normalised after every step.
    '''

    def __init__(self, path="", vec_size=50, k=20, alpha=0.1, n_workers=1):
        '''
        :parameters:

            @path: string
                path to dataset, should be a single file;
                when empty, no vocabulary / sentence / window tables
                are built

            @vec_size: int
                size of sentence vector and word vector

            @k: int
                number of negative samples for a window

            @alpha: float
                learning rate

            @n_workers: int
                number of threads used by multi_thread_train
        '''
        self.k = k
        self.vec_size = vec_size
        self.n_workers = n_workers
        self.alpha = alpha
        # NOTE(review): vec_size is stored but not passed to Vocab()/Sent()
        # here - confirm their defaults agree with it.
        self.vocab = Vocab()
        self.sent = Sent()
        self.window_table = WindowTable(self.vocab, SIZE)
        self.dataset = Dataset(path)

        if path:
            self.create_vocab()
            self.create_sent()
            self.create_window_table()

    def create_vocab(self):
        '''
        add the words of every dataset sentence to the vocabulary,
        then initialise the word vectors
        '''
        for sent in self.dataset.sents:
            self.vocab.add_from_sent(sent.split())
        self.vocab.init_vecs()

    def create_sent(self):
        '''
        register every dataset sentence, then initialise the sentence vectors
        '''
        for sent in self.dataset.sents:
            self.sent.add(sent)
        self.sent.init_vecs()

    def create_window_table(self):
        '''
        for negative sampling
        '''
        self.window_table(self.dataset.sents)

    def multi_thread_train(self):
        '''
        train one pass over the dataset with `n_workers` threads

        Sentences are handed to the workers through a bounded queue; each
        worker trains one sentence at a time and appends its loss to
        `self.Js`. The mean loss and the elapsed time are printed.
        '''
        jobs = Queue(maxsize=9 * self.n_workers)
        lock = threading.Lock()
        start = time.time()

        self.Js = []

        def worker_train():
            while True:
                # get sentence
                sent = jobs.get()
                if sent is None:
                    break
                self.Js.append(self.train_sent(sent, lock))

        workers = [threading.Thread(target=worker_train)
                   for _ in range(self.n_workers)]

        for thread in workers:
            thread.daemon = True  # make interrupting the process with ctrl+c easier
            thread.start()
        # put dataset to Queue
        for sent in self.dataset.sents:
            jobs.put(sent)
        # put None to tell all threads to exit
        for _ in range(self.n_workers):
            jobs.put(None)

        for thread in workers:
            thread.join()

        # Single-argument print calls produce the same text as the former
        # Python 2 print statements and also parse on Python 3.
        print('Js:  %s' % (np.mean(self.Js),))
        elapsed = time.time() - start
        print('used time %s' % (elapsed,))

    def train(self):
        '''
        train one pass over the dataset in the calling thread

        :returns:
            mean of the per-sentence losses
        '''
        Js = [self.train_sent(sent) for sent in self.dataset.sents]
        mean_Js = np.mean(np.array(Js))
        print('total J %s' % (mean_Js,))
        return mean_Js

    def train_sent(self, sent, lock=None):
        '''
        train on a single sentence and return its accumulated loss

        :parameters:
            @sent:
                the sentence; used as key into self.sent and split into
                windows by gen_windows_from_sentence
            @lock:
                optional lock forwarded to the update methods

        :returns:
            sum over all windows of the log-loss of the positive window
            and of its noise windows
        '''
        # the loss
        Jn = 0
        # get windows from the sent
        windows = gen_windows_from_sentence(sent, SIZE)
        # get sentence vector; update_sent_vec mutates it in place
        v = self.sent[sent]

        for window in windows:
            window_key = "-".join(
                str(self.vocab.vocab[hash(w)]) for w in window)
            h = self.vocab.get_window_vec(word_index=window_key)
            # noises
            noises = self.window_table.get_samples(self.k)

            # for a positive sample
            e_vT_h = np.exp(np.dot(v.T, h))
            update_v = h / (1. + e_vT_h)
            update_h = v / (1. + e_vT_h)
            # add positive window's loss
            Jn += math.log(1. / (1. + 1. / e_vT_h))
            self.update_window(window_key, update_h, lock)

            # for each negative window sample
            for _, key in noises:
                n_h = self.vocab.get_window_vec(word_index=key)
                e_vT_h = np.exp(np.dot(v, n_h))
                frac_e_v_h = 1. - 1. / (1. + e_vT_h)
                # accumulate the gradient
                update_v += - n_h * frac_e_v_h
                update_n_h = - v * frac_e_v_h
                self.update_window(key, update_n_h, lock)
                # add noise's loss
                Jn += math.log(1. / (1. + e_vT_h))

            # average over the positive window plus the k noise windows
            update_v /= (1 + self.k)
            # update sentence vector for each window
            # TODO change to a single turn?
            self.update_sent_vec(v, update_v, lock)
        return Jn

    def _step(self, vec, grad):
        '''
        add `alpha * grad` to `vec` in place and rescale it to unit norm

        A vector whose norm is not positive is left unscaled, so that a
        zero vector does not turn into NaNs.
        '''
        vec += self.alpha * grad
        norm = LA.norm(vec)
        if norm > 0:
            vec /= norm

    def update_sent_vec(self, sent_vec, grad, lock=None):
        '''
        apply one gradient step to a sentence vector, in place

        :parameters:
            @sent_vec: numpy.array
                the vector to update
            @grad: numpy.array
                the gradient
            @lock:
                when given, held for the duration of the update
        '''
        if lock:
            with lock:
                self._step(sent_vec, grad)
        else:
            self._step(sent_vec, grad)

    def update_window(self, key, grad, lock=None):
        '''
        update each word's vector in a window
            and norm the vectors

        :parameters:
            @key: string
                like '19-32-2'
            @grad: numpy.array
                the gradient
            @lock:
                when given, held separately for each word's update
        '''
        for word_id in (int(part) for part in key.split('-')):
            word_vec = self.vocab.vecs[word_id]
            if lock:
                with lock:
                    self._step(word_vec, grad)
            else:
                self._step(word_vec, grad)

    def tofile(self, path):
        '''
        save model to file
        '''
        mod2file(self, path)

    @staticmethod
    def fromfile(path):
        '''
        load a model from file
        '''
        return mod_from_file(path)