Example no. 1
    def get(self):
        # dates arrive as "dd-mm-yyyy"; reversing the parts yields
        # (yyyy, mm, dd), the argument order datetime.date expects
        start_date = self.request.get('start_date').split("-")
        end_date = self.request.get('end_date').split("-")
        start_date.reverse()
        end_date.reverse()
        start = dt.date(*map(int, start_date))
        end = dt.date(*map(int, end_date))
        graph = facebook.GraphAPI(self.current_user['access_token'])
        extended_at = graph.extend_access_token(FACEBOOK_APP_ID, FACEBOOK_APP_SECRET)
        # utils.generateBdayTasks(start, end, {'at': extended_at})  # the Graph API doesn't allow posting to friends' walls
        utils.generateTasks(start, end, {'at': extended_at})
        self.response.out.write("Done!")
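
The handler above assumes a surrounding module that is not shown. A minimal sketch of that context, assuming the facebook-sdk package and a request-handler base class that supplies self.current_user; the credential constants are hypothetical placeholders, not values from the source:

import datetime as dt

import facebook  # the facebook-sdk package, assumed
import utils     # project-local helper module, not shown here

# hypothetical placeholders; real values would come from the app's config
FACEBOOK_APP_ID = 'your-app-id'
FACEBOOK_APP_SECRET = 'your-app-secret'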
    def train(self,
              sentences,
              alpha=0.001,
              min_alpha=0.001,
              batches=1000,
              workers=4):
        print('Start training...')
        self.alpha = alpha
        self.min_alpha = min_alpha
        count = 0
        # barrier is used to sync parent and all workers
        barrier = utils.getBarrier(workers + 1)
        lock1 = Lock()
        lock2 = Lock()
        queue = Queue(workers)
        # delta_c_raw contains context weights for each position, they are shared, so each child process can
        # add their delta on them. delta_c is a numpy wrapper which makes the parent process handle it easily
        delta_c_raw = [
            utils.getSharedArray('f', self.dim * self.dim)
            for i in range(self.context)
        ]
        delta_c = [
            utils.toNumpyArray(delta_c_raw[i], np.float32,
                               (self.dim, self.dim))
            for i in range(self.context)
        ]
        delta_r_raw = utils.getSharedArray('f', len(self.vocab) * self.dim)
        delta_r = utils.toNumpyArray(delta_r_raw, np.float32,
                                     (len(self.vocab), self.dim))
        '''
        vocab: a dictionary mapping each word to its index; it is copied
        from the parent process into each child.
        self_wordEm, self_contextW, self_biases, self_delta_c and
        self_delta_r point to data shared between the parent and the child
        processes.
        '''
        def worker(model, self_delta_c, self_delta_r, barrier, lock1, lock2,
                   queue):
            self_delta_r = utils.toNumpyArray(self_delta_r, np.float32,
                                              (len(model.vocab), model.dim))
            self_delta_c = [
                utils.toNumpyArray(self_delta_c[i], np.float32,
                                   (model.dim, model.dim))
                for i in range(model.context)
            ]
            # delta_c and delta_r are local to a child process, deltas will be stored in them.
            # after finishing its task, a child process will add them to their counterparts in
            # the parent process via self_delta_r and self_delta_c
            delta_c = [
                np.zeros((model.dim, model.dim), np.float32)
                for i in range(model.context)
            ]
            delta_r = np.zeros((len(model.vocab), model.dim), np.float32)

            # the index of a rare word
            RARE = model.vocab['<>']
            # work_d and work_v are reused in train_sentence_fast
            work_d = np.empty(model.dim, np.float32)
            work_v = np.empty(len(model.vocab), np.float32)
            while True:
                task = queue.get()
                if task is None:
                    break
                for sentence in task:
                    # null padding has a special index of -1; words missing
                    # from the vocabulary fall back to the rare-word index
                    indices = [
                        -1 if w == '<_>' else model.vocab.get(w, RARE)
                        for w in sentence
                    ]
                    indices = np.asarray(indices, np.int32)
                    train_sentence(model, indices, delta_c, delta_r, work_d,
                                   work_v)

                lock1.acquire()
                for i in range(model.context):
                    self_delta_c[i] += delta_c[i]
                lock1.release()
                lock2.acquire()
                self_delta_r += delta_r
                lock2.release()
                barrier.sync()

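                # zero the local deltas for the next round; the parent only
                # reads the shared copies, so resetting the local ones here
                # is safe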
                for i in range(model.context):
                    delta_c[i].fill(0)
                delta_r.fill(0)

        args = (self, delta_c_raw, delta_r_raw, barrier, lock1, lock2, queue)
        pool = [Process(target=worker, args=args) for i in range(workers)]
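        # daemonize the workers so they do not outlive the parent process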
        for p in pool:
            p.daemon = True
            p.start()

        distributor = utils.generateTasks(iter(sentences), self.l_pad,
                                          self.r_pad, workers, batches)
        start = time.time()
        for tasks in distributor:
            for i in range(workers):
                queue.put(tasks[i], block=False)
            count += batches
            # linearly anneal the learning rate from alpha down to min_alpha
            # over the corpus (self.total words)
            alpha = self.min_alpha + (self.alpha - self.min_alpha) * (
                1 - 1.0 * count / self.total)
            barrier.sync()
            # at this point all child processes have finished their task, so
            # the parent can update safely; the 1e-5 and 1e-4 terms act as
            # L2 regularization (weight decay)
            for i in range(self.context):
                self.contextW[i] -= (delta_c[i] +
                                     1e-5 * self.contextW[i]) * alpha
            self.wordEm -= (delta_r + 1e-4 * self.wordEm) * alpha
            for i in range(self.context):
                delta_c[i].fill(0)
            delta_r.fill(0)
            elapsed = time.time() - start
            print('visited {0} words, with {1:.2f} Ws/s, alpha: {2}.'.format(
                count, count / elapsed, alpha))
        # notify processes to exit
        for i in range(workers):
            queue.put(None)
        for p in pool:
            p.join()
        print('Training is finished!')
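
The train() method above relies on several utils helpers that are not shown (Lock, Queue and Process are presumably imported from multiprocessing, and train_sentence is a model-specific routine that is not sketched here). Below is a minimal sketch of what the shared-memory helpers plausibly look like; the real implementations may differ:

import multiprocessing as mp

import numpy as np


class _Barrier(object):
    # thin wrapper so callers can use the sync() name seen in train();
    # multiprocessing.Barrier requires Python 3.3+
    def __init__(self, parties):
        self._barrier = mp.Barrier(parties)

    def sync(self):
        self._barrier.wait()


def getBarrier(parties):
    return _Barrier(parties)


def getSharedArray(typecode, size):
    # a raw, lock-free block of shared memory; train() does its own locking
    return mp.RawArray(typecode, size)


def toNumpyArray(raw, dtype, shape):
    # a zero-copy numpy view over the shared buffer, writable by every
    # process that inherits it
    return np.frombuffer(raw, dtype=dtype).reshape(shape)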
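
The task distributor utils.generateTasks is likewise not shown. A rough sketch, under the assumptions that l_pad and r_pad are lists of '<_>' padding tokens and that each round yields exactly one task of `batches` sentences per worker; the real helper may handle a final partial round differently:

from itertools import islice


def generateTasks(sentences, l_pad, r_pad, workers, batches):
    # yield rounds of `workers` tasks, each a list of `batches` sentences
    # padded on both sides for the context window
    while True:
        round_tasks = []
        for _ in range(workers):
            task = [l_pad + s + r_pad for s in islice(sentences, batches)]
            if len(task) < batches:
                return  # this sketch simply drops a final partial round
            round_tasks.append(task)
        yield round_tasks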