Example #1
    def train(self, nodes, model, beta, chunksize=150, iter=1):
        """Apply the community (GMM) gradient to the node embeddings."""
        for _ in range(iter):
            grad_input = np.zeros(model.node_embedding.shape).astype(np.float32)
            # keep only in-vocabulary nodes, randomly dropping frequent ones
            # according to their sample_probability
            kept = filter(
                lambda node: node in model.vocab and
                (model.vocab[node].sample_probability >= 1.0 or
                 model.vocab[node].sample_probability >= np.random.random_sample()),
                nodes)
            for node_index in chunkize_serial(
                    map(lambda node: model.vocab[node].index, kept), chunksize):
                input = model.node_embedding[node_index]
                batch_grad_input = np.zeros(input.shape).astype(np.float32)

                # accumulate pi_ik * inv(Sigma_k) @ (x_i - mu_k) over the k communities
                for com in range(model.k):
                    diff = np.expand_dims(input - model.centroid[com], axis=-1)
                    m = model.pi[node_index, com].reshape(
                        len(node_index), 1, 1) * model.inv_covariance_mat[com]
                    batch_grad_input += np.squeeze(np.matmul(m, diff), axis=-1)
                grad_input[node_index] += batch_grad_input

            grad_input *= (beta / model.k)

            # clipped SGD step on the node embeddings
            model.node_embedding -= grad_input.clip(min=-0.25, max=0.25) * self.lr

    def loss(self, nodes, model, beta, chunksize=150):
        """
        Forward function used to compute the o3 loss
        :param nodes: nodes present in the batch
        :param model: model containing all the shared data
        :param beta: trade off param
        :param chunksize: size of the batch
        """
        ret_loss = 0
        for node_index in chunkize_serial(
                map(lambda x: model.vocab[x].index, nodes), chunksize):
            input = model.node_embedding[node_index]

            batch_loss = np.zeros(len(node_index), dtype=np.float32)
            for com in range(model.k):
                rd = multivariate_normal(model.centroid[com],
                                         model.covariance_mat[com])
                # TODO: check if this can be done as a matrix operation
                batch_loss += rd.logpdf(input).astype(
                    np.float32) * model.pi[node_index, com]

            # accumulate over chunks; plain assignment would keep only the last chunk
            ret_loss += -batch_loss.sum()

        return ret_loss * (beta / model.k)
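
For reference, the objective these two methods implement can be reconstructed from the code (an inferred formula, not one quoted from the source). With memberships \pi_{ik}, centroids \mu_k and covariances \Sigma_k:

    L_{O3} = -\frac{\beta}{K} \sum_i \sum_{k=1}^{K} \pi_{ik} \log \mathcal{N}(x_i \mid \mu_k, \Sigma_k),
    \qquad
    \frac{\partial L_{O3}}{\partial x_i} = \frac{\beta}{K} \sum_{k=1}^{K} \pi_{ik} \, \Sigma_k^{-1} (x_i - \mu_k).

loss() evaluates the first expression via logpdf; train() accumulates the second per community before clipping and stepping.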
Example #3
    def loss(self, model, paths, total_paths, alpha=1.0):
        """Compute the O2 loss over a stream of paths via loss_o2."""
        start, next_report, loss = time.time(), 5.0, 0.0

        num_nodes = 0

        for job_no, job in enumerate(
                chunkize_serial(prepare_sentences(model, paths), 250)):
            batch_loss = np.zeros(1, dtype=np.float32)
            batch_work = np.zeros(model.layer1_size, dtype=np.float32)

            batch_node = sum([
                loss_o2(model.node_embedding,
                        model.context_embedding,
                        path,
                        self.negative,
                        self.window_size,
                        model.table,
                        alpha,
                        model.layer1_size,
                        batch_work,
                        py_loss=batch_loss) for path in job if path is not None
            ])
            num_nodes += batch_node
            loss += batch_loss[0]
            elapsed = time.time() - start
            if elapsed >= next_report:
                log.debug("PROGRESS: at %.2f%% path, %.0f paths/s" %
                          (100.0 * num_nodes / total_paths,
                           num_nodes / elapsed if elapsed else 0.0))
                # log.debug("loss: {}".format(loss))
                next_report = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports

        log.info("total nodes processed: %d" % num_nodes)
        log.info("loss: %f" % loss)
        return loss
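
loss_o2 above is an external (likely compiled) routine that these examples never define. As rough orientation only, here is a pure-NumPy sketch of a skip-gram negative-sampling loss over one path; the name sgns_path_loss, the uniform negative sampling (the real routine presumably draws from model.table), and all shapes are assumptions rather than the actual loss_o2 contract:

import numpy as np

def sgns_path_loss(node_emb, ctx_emb, path, negative, window, rng=None):
    """Hypothetical stand-in for loss_o2: skip-gram with negative sampling.

    node_emb, ctx_emb: (vocab_size, dim) arrays; path: integer node indices.
    Returns (negative log-likelihood, number of center nodes processed).
    """
    rng = rng or np.random.default_rng()
    loss, count = 0.0, 0
    for pos, center in enumerate(path):
        lo, hi = max(0, pos - window), min(len(path), pos + window + 1)
        for ctx_pos in range(lo, hi):
            if ctx_pos == pos:
                continue
            # positive pair: -log sigmoid(u_center . v_ctx)
            score = node_emb[center] @ ctx_emb[path[ctx_pos]]
            loss += np.logaddexp(0.0, -score)
            # negative samples: -log sigmoid(-u_center . v_neg)
            for neg in rng.integers(0, len(ctx_emb), size=negative):
                loss += np.logaddexp(0.0, node_emb[center] @ ctx_emb[neg])
        count += 1
    return loss, count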
Example #4
    def train(self, nodes, model, beta, chunksize=150, iter=1):
        for _ in range(iter):
            grad_input = np.zeros(model.node_embedding.shape).astype(np.float32)
            for node_index in chunkize_serial(map(lambda x: model.vocab[x].index, nodes), chunksize):
                input = model.node_embedding[node_index]
                batch_grad_input = np.zeros(input.shape).astype(np.float32)

                for com in range(model.k):
                    diff = np.expand_dims(input - model.centroid[com], axis=-1)
                    m = model.pi[node_index, com].reshape(len(node_index), 1, 1) * model.inv_covariance_mat[com]

                    batch_grad_input += np.squeeze(np.matmul(m, diff), axis=-1)
                grad_input[node_index] += batch_grad_input

            grad_input *= (beta / model.k)

            model.node_embedding -= (grad_input.clip(min=-5, max=5)) * self.lr
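
Every example here leans on a chunkize_serial helper that is never shown. Judging from its use, it groups an iterable into fixed-size batches of vocabulary indices; below is a minimal sketch under that assumption (gensim's utils.chunkize_serial behaves similarly, and yielding NumPy arrays keeps the fancy indexing above working):

import numpy as np

def chunkize_serial(iterable, chunksize):
    """Yield successive batches of up to `chunksize` items as index arrays."""
    chunk = []
    for item in iterable:
        chunk.append(item)
        if len(chunk) == chunksize:
            yield np.array(chunk)
            chunk = []
    if chunk:  # emit the final, possibly smaller, batch
        yield np.array(chunk)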
Example #5
    def train(self, nodes, model, beta, chunksize=150, iter=1):
        """
        :param nodes:
        :param model: model containing all the shared data
        :param beta: trade off param
        :param chunksize:
        :param iter:
        """

        log.info(f"O2 COMMUNITY training model with beta={beta}, chunksize={chunksize}, and iter={iter}")

        start = time.time()

        for i in range(iter):
            grad_input = np.zeros(model.node_embedding.shape).astype(np.float32)
            kept = filter(lambda node: node in model.vocab and
                          (model.vocab[node].sample_probability >= 1.0 or
                           model.vocab[node].sample_probability >= np.random.random_sample()),
                          nodes)
            for node_index in chunkize_serial(map(lambda node: model.vocab[node].index, kept),
                                              chunksize):
                input = model.node_embedding[node_index]
                batch_grad_input = np.zeros(input.shape).astype(np.float32)

                for com in range(model.k):
                    diff = np.expand_dims(input - model.centroid[com], axis=-1)
                    m = model.pi[node_index, com].reshape(len(node_index), 1, 1) * (model.inv_covariance_mat[com])

                    batch_grad_input += np.squeeze(np.matmul(m, diff), axis=-1)
                grad_input[node_index] += batch_grad_input

            grad_input *= (beta / model.k)

            model.node_embedding -= (grad_input.clip(min=-0.25, max=0.25)) * self.lr

            log.info(f"PROGRESS: at {i/iter*100:.2f}%")

        elapsed = time.time() - start
        log.info(f"training on took {elapsed:.1f}s")
Example #6
    def train(self, nodes, model, beta, chunksize=150, iter=1):
        for _ in range(iter):
            grad_input = np.zeros(model.node_embedding.shape).astype(
                np.float32)
            for node_index in chunkize_serial(
                    map(lambda node: model.vocab[node].index, nodes),
                    chunksize):
                input = model.node_embedding[node_index]
                batch_grad_input = np.zeros(input.shape).astype(np.float32)

                # pairwise differences: diff[k, b, :] = x_b - mu_k after the transpose
                diff = np.expand_dims(input, axis=1) - np.expand_dims(
                    model.centroid, axis=0)
                diff = np.transpose(diff, (1, 0, 2))
                for k, (d, inv_cov) in enumerate(
                        zip(diff, model.inv_covariance_mat)):
                    # weight by the batch's membership in community k; the axis-1
                    # sum equals inv_cov @ d because inv_cov is symmetric
                    batch_grad_input += model.pi[node_index, k][:, None] * np.sum(
                        inv_cov * np.expand_dims(d, 1), 1)
                grad_input[node_index] += batch_grad_input

            grad_input *= (beta / model.k)

            model.node_embedding -= (grad_input.clip(min=-0.25,
                                                     max=0.25)) * self.lr
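
The per-community loop in the variants above can also be collapsed into a single einsum. Here is a sketch under the shape assumptions used throughout (pi: (n_nodes, k), centroid: (k, dim), inv_covariance_mat: (k, dim, dim)); it is an illustration, not code from the source:

import numpy as np

def community_grad(node_embedding, node_index, pi, centroid, inv_cov):
    # diff[b, k, :] = x_b - mu_k, shape (batch, k, dim)
    diff = node_embedding[node_index][:, None, :] - centroid[None, :, :]
    # grad[b, i] = sum_k pi[b, k] * sum_j inv_cov[k, i, j] * diff[b, k, j]
    return np.einsum('bk,kij,bkj->bi', pi[node_index], inv_cov, diff)

The result still needs the (beta / model.k) scaling, clipping, and learning-rate step applied by the callers.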
Example #7
    def train(self, model, edges, chunksize=150, iter=1):
        """
        Update the model's neural weights from a sequence of paths (can be a once-only generator stream).
        """
        assert model.node_embedding.dtype == np.float32

        log.info("O1 training model with %i workers on %i vocabulary and %i features and 'negative sampling'=%s" %
                    (self.workers, len(model.vocab), model.layer1_size, self.negative))

        if not model.vocab:
            raise RuntimeError("you must first build vocabulary before training the model")

        edges = RepeatCorpusNTimes(edges, iter)
        total_node = edges.corpus.shape[0] * edges.corpus.shape[1] * edges.n
        log.debug('total node occurrences: %d' % total_node)
        start, next_report, word_count = time.time(), [5.0], [0]


        jobs = Queue(maxsize=2*self.workers)  # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
        lock = threading.Lock()


        def worker_train():
            """Train the model, lifting lists of paths from the jobs queue."""
            while True:
                job = jobs.get(block=True)
                if job is None:  # data finished, exit
                    jobs.task_done()
                    # print('thread %s break' % threading.current_thread().name)
                    break


                py_work = np.zeros(model.layer1_size, dtype=np.float32)

                job_words = sum(train_o1(model.node_embedding, edge, self.lr, self.negative, model.table,
                                         py_size=model.layer1_size, py_work=py_work) for edge in job if edge is not None)
                jobs.task_done()
                lock.acquire(timeout=30)
                try:
                    word_count[0] += job_words

                    elapsed = time.time() - start
                    if elapsed >= next_report[0]:
                        log.info("PROGRESS: at %.2f%% words\tword_computed %d\talpha %.05f\t %.0f words/s" %
                                    (100.0 * word_count[0] / total_node, word_count[0], self.lr, word_count[0] / elapsed if elapsed else 0.0))
                        next_report[0] = elapsed + 5.0  # don't flood the log; wait at least five seconds between progress reports
                finally:
                    lock.release()

        workers = [threading.Thread(target=worker_train, name='thread_' + str(i)) for i in range(self.workers)]
        for thread in workers:
            thread.daemon = True  # make interrupting the process with ctrl+c easier
            thread.start()


        # convert input strings to Vocab objects (eliding OOV/downsampled words), and start filling the jobs queue
        for job_no, job in enumerate(chunkize_serial(prepare_sentences(model, edges), chunksize)):
            jobs.put(job)


        for _ in range(self.workers):
            jobs.put(None)  # give the workers heads up that they can finish -- no more work!

        for thread in workers:
            thread.join()

        elapsed = time.time() - start
        log.info("training on %i words took %.1fs, %.0f words/s" %
                    (word_count[0], elapsed, word_count[0]/ elapsed if elapsed else 0.0))
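
RepeatCorpusNTimes is another helper the example does not define; the attribute accesses above (edges.corpus.shape, edges.n) suggest a thin wrapper that replays a corpus array n times, much like gensim's utility of the same name. A minimal sketch under that assumption:

class RepeatCorpusNTimes:
    """Replay `corpus` (assumed here to be a NumPy array of edges) `n` times."""

    def __init__(self, corpus, n):
        self.corpus = corpus
        self.n = n

    def __iter__(self):
        for _ in range(self.n):
            for item in self.corpus:
                yield item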
Example #8
    def train(self,
              model,
              paths,
              total_nodes,
              alpha=1.0,
              node_count=0,
              chunksize=150):
        """
        Update the model's neural weights from a sequence of paths (can be a once-only generator stream).

        :param model: model containing the shared data
        :param paths: generator of the paths
        :param total_nodes: total number of nodes in the path
        :param alpha: trade-off parameter
        :param node_count: init of the number of nodes
        :param chunksize: size of the batch
        :return:
        """
        assert model.node_embedding.dtype == np.float32
        assert model.context_embedding.dtype == np.float32
        log.info(
            "O3 CONTEXT training model with %i workers on %i vocabulary and %i features, using \t'negative sampling'=%s\t'windows'=%s"
            % (self.workers, len(model.vocab), model.layer1_size,
               self.negative, self.window_size))

        if alpha <= 0.:
            return

        if not model.vocab:
            raise RuntimeError(
                "you must first build vocabulary before training the model")

        start, next_report = time.time(), [1.0]
        if total_nodes is None:
            raise AttributeError('need the total number of nodes')

        node_count = [0]

        jobs = Queue(maxsize=2 * self.workers)  # buffer ahead only a limited number of jobs; this is the reason we can't simply use ThreadPool
        lock = threading.Lock()  # for shared state (=number of nodes trained so far, log reports...)

        def worker_train():
            """Train the model, lifting lists of paths from the jobs queue."""
            py_work = np.zeros(model.layer1_size, dtype=np.float32)

            while True:
                job = jobs.get()
                if job is None:  # data finished, exit
                    break

                lr = max(self.min_lr,
                         self.lr * (1 - 1.0 * node_count[0] / total_nodes))
                job_nodes = sum(
                    train_o2(model.node_embedding,
                             model.context_embedding,
                             path,
                             lr,
                             self.negative,
                             self.window_size,
                             model.table,
                             py_alpha=alpha,
                             py_size=model.layer1_size,
                             py_work=py_work)
                    for path in job)  # execute the sgd

                with lock:
                    node_count[0] += job_nodes

                    elapsed = time.time() - start
                    if elapsed >= next_report[0]:
                        log.info(
                            "PROGRESS: at %.2f%% nodes, lr %.05f, %.0f nodes/s"
                            % (100.0 * node_count[0] / total_nodes, lr,
                               node_count[0] / elapsed if elapsed else 0.0))
                        next_report[0] = elapsed + 1.0  # don't flood the log; wait at least a second between progress reports

        workers = [
            threading.Thread(target=worker_train) for _ in range(self.workers)
        ]
        for thread in workers:
            thread.daemon = True  # make interrupting the process with ctrl+c easier
            thread.start()

        # convert input strings to Vocab objects (eliding OOV/downsampled nodes), and start filling the jobs queue
        for job_no, job in enumerate(
                chunkize_serial(prepare_sentences(model, paths), chunksize)):
            jobs.put(job)

        log.debug(
            "reached the end of input; waiting to finish %i outstanding jobs" %
            jobs.qsize())
        for _ in range(self.workers):
            jobs.put(None)  # give the workers a heads-up that they can finish -- no more work!

        for thread in workers:
            thread.join()

        elapsed = time.time() - start
        log.info("training on %i nodes took %.1fs, %.0f nodes/s" %
                 (node_count[0], elapsed,
                  node_count[0] / elapsed if elapsed else 0.0))
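
Finally, prepare_sentences is referenced in Examples #3, #7 and #8 but never shown. The inline comments describe it as converting input strings to Vocab objects while eliding OOV/downsampled nodes, so a sketch consistent with that description would be (the sampling test mirrors the filter in Example #1; the real function may differ):

def prepare_sentences(model, paths):
    """Map each path of raw node ids to a list of Vocab entries,
    dropping out-of-vocabulary nodes and down-sampling frequent ones."""
    for path in paths:
        yield [model.vocab[node] for node in path
               if node in model.vocab and
               (model.vocab[node].sample_probability >= 1.0 or
                model.vocab[node].sample_probability >= np.random.random_sample())]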