Example #1
    def loss(self, model, paths, total_paths, alpha=1.0):
        """Compute the O2 loss of the model over the given paths."""
        start, next_report, loss = time.time(), 5.0, 0.0

        num_nodes = 0

        for job_no, job in enumerate(
                chunkize_serial(prepare_sentences(model, paths), 250)):
            batch_loss = np.zeros(1, dtype=np.float32)
            batch_work = np.zeros(model.layer1_size, dtype=np.float32)

            batch_node = sum([
                loss_o2(model.node_embedding,
                        model.context_embedding,
                        path,
                        self.negative,
                        self.window_size,
                        model.table,
                        alpha,
                        model.layer1_size,
                        batch_work,
                        py_loss=batch_loss) for path in job if path is not None
            ])
            num_nodes += batch_node
            loss += batch_loss[0]
            elapsed = time.time() - start
            if elapsed >= next_report:
                log.debug("PROGRESS: at %.2f%% path, %.0f paths/s" %
                          (100.0 * num_nodes / total_paths,
                           num_nodes / elapsed if elapsed else 0.0))
                # log.debug("loss: {}".format(loss))
                next_report = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports

        log.info("total nodes: %d" % num_nodes)
        log.info("loss: %f" % loss)
        return loss
Example #2
    def loss(self, model, edges):
        """Compute the edge loss, -log(sigmoid(u_j . u_i)), summed over the edges."""
        ret_loss = 0
        for edge in prepare_sentences(model, edges):
            assert len(edge) == 2, "an edge must connect exactly 2 nodes: {}".format(edge)
            edge_loss = np.log(
                sigmoid(
                    np.dot(model.node_embedding[edge[1].index],
                           model.node_embedding[edge[0].index].T)))
            assert edge_loss <= 0, "malformed loss"
            ret_loss -= edge_loss
        return ret_loss
Example #3
def learn_second(network,
                 lr,
                 model,
                 examples_files,
                 total_example,
                 alpha=1.0,
                 batch_size=20):
    """
    Helper function used to optimize O2
    :param network: network model to optimize
    :param lr: learning rate
    :param model: model containing the shared data
    :param examples_files: list of files containing the examples
    :param total_example: total number of examples used for training
    :param alpha: trade-off parameter
    :param batch_size: size of the batch
    :return: loss value
    """

    num_batch = 0

    log.info("compute o2")
    optimizer = SGD(network.parameters(), lr)
    log.debug("read example file: {}".format("\t".join(examples_files)))
    loss_val = 0

    if alpha <= 0:
        return loss_val

    for batch in emb_utils.batch_generator(emb_utils.prepare_sentences(
            model, graph_utils.combine_example_files_iter(examples_files),
            network.transfer_fn(model.vocab)),
                                           batch_size,
                                           long_tensor=LongTensor):
        input, output = batch
        loss = (alpha * network.forward(
            input, output, negative_sampling_fn=model.negative_sample))
        loss_val += loss.data[0]

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        num_batch += 1

        if num_batch % 10000 == 0:
            log.info("O2 training progress: {:.2%}".format(
                num_batch / (total_example / batch_size)))

    log.debug("O2 loss: {}".format(loss_val))
    return loss_val
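
The learn_* helpers in Examples #3 through #7 all share the same inner optimization loop: compute a scalar loss for a batch, clear the previous gradients, back-propagate, and take one SGD step. A minimal, self-contained sketch of that pattern with a toy PyTorch module (the layer and the loss below are placeholders, not the networks used above):

import torch
from torch.optim import SGD

net = torch.nn.Linear(8, 1)              # toy stand-in for the embedding network
optimizer = SGD(net.parameters(), lr=0.01)

for step in range(100):
    x = torch.randn(20, 8)               # a batch of 20 synthetic examples
    loss = net(x).pow(2).mean()          # any scalar loss works here
    optimizer.zero_grad()                # clear gradients from the previous step
    loss.backward()                      # back-propagate through the network
    optimizer.step()                     # apply one SGD update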
Example #4
def learn_community(network, lr, model, nodes, beta=1.0, batch_size=20):
    """
    Helper function used to optimize O3
    :param network: model to optimize
    :param lr: learning rate
    :param model: model containing the shared data
    :param nodes: nodes on which to run the learning
    :param beta: trade-off value
    :param batch_size: size of the batch
    :return: loss value
    """

    num_batch = 0

    log.info("compute o3")
    optimizer = SGD(network.parameters(), lr)
    loss_val = 0

    if beta <= 0.:
        return loss_val

    for batch in emb_utils.batch_generator(emb_utils.prepare_sentences(
            model, nodes, network.transfer_fn()),
                                           batch_size,
                                           long_tensor=LongTensor):

        input, output = batch
        loss = network.forward(input, model)
        loss.data *= (beta / model.k)
        loss_val += loss.data[0]

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        num_batch += 1

        if num_batch % 10000 == 0:
            # total_example is not available in this scope, so report the raw batch count
            log.info("community embedding batches completed: {}".format(num_batch))

    log.debug("O3 loss: {}".format(loss_val))
    return loss_val
Example #5
def learn_first(network, lr, model, edges, num_iter=1, batch_size=20):
    """
    Helper function used to optimize O1
    :param network: neural network to train
    :param lr: learning rate
    :param model: model containing the shared data
    :param edges: numpy list of edges used for training
    :param num_iter: number of iterations over the edges
    :param batch_size: size of the batch
    :return: loss value
    """
    log.info("computing o1")
    optimizer = SGD(network.parameters(), lr)

    num_batch = 0
    total_batch = (edges.shape[0] * num_iter) / batch_size
    loss_val = 0
    for batch in emb_utils.batch_generator(emb_utils.prepare_sentences(
            model, edges, network.transfer_fn(model.vocab)),
                                           batch_size,
                                           long_tensor=LongTensor):

        input, output = batch
        loss = network.forward(input,
                               output,
                               negative_sampling_fn=model.negative_sample)

        loss_val += loss.data[0]
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        num_batch += 1

        if num_batch % 10000 == 0:
            log.info("O1 training progress: {:.2%}".format(num_batch / total_batch))

    log.debug("O1 loss: {}".format(loss_val))
    return loss_val
Example #6
def learn_second(network, lr, model, examples_files, alpha=1.0):
    """
    Helper function used to optimize O2
    :param network: network model to optimize
    :param lr: learning rate
    :param model: deprecated_model used to compute the batches and the negative sampling
    :param examples_files: list of files containing the examples
    :param alpha: trade-off parameter
    :return: None
    """
    log.info("compute o2")
    optimizer = SGD(network.parameters(), lr)
    log.debug("read example file: {}".format("\t".join(examples_files)))
    for batch in emb_utils.batch_generator(
            emb_utils.prepare_sentences(
                model, graph_utils.combine_example_files_iter(examples_files),
                network.transfer_fn(model.vocab)), 20):
        input, output = batch
        loss = (alpha * network.forward(
            input, output, negative_sampling_fn=model.negative_sample))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
Example #7
def learn_first(network, lr, model, edges, num_iter=1):
    """
    Helper function used to optimize O1
    :param network: neural network to train
    :param lr: learning rate
    :param model: deprecated_model used to compute the batches and the negative sampling
    :param edges: numpy list of edges used for training
    :param num_iter: number of iterations over the edges
    :return: None
    """
    log.info("computing o1")
    optimizer = SGD(network.parameters(), lr)
    for batch in emb_utils.batch_generator(
            emb_utils.prepare_sentences(
                model, emb_utils.RepeatCorpusNTimes(edges, n=num_iter),
                network.transfer_fn(model.vocab)), 20):
        input, output = batch
        loss = network.forward(input,
                               output,
                               negative_sampling_fn=model.negative_sample)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
Example #8
    def train(self, model, edges, chunksize=150, iter=1):
        """
        Update the model's neural weights from a sequence of paths (can be a once-only generator stream).
        """
        assert model.node_embedding.dtype == np.float32

        log.info("O1 training model with %i workers on %i vocabulary and %i features and 'negative sampling'=%s" %
                    (self.workers, len(model.vocab), model.layer1_size, self.negative))

        if not model.vocab:
            raise RuntimeError("you must first build vocabulary before training the model")

        edges = RepeatCorpusNTimes(edges, iter)
        total_node = edges.corpus.shape[0] * edges.corpus.shape[1] * edges.n
        log.debug('total edges: %d' % total_node)
        start, next_report, word_count = time.time(), [5.0], [0]


        jobs = Queue(maxsize=2*self.workers)  # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
        lock = threading.Lock()


        def worker_train():
            """Train the model, lifting lists of paths from the jobs queue."""
            while True:
                job = jobs.get(block=True)
                if job is None:  # data finished, exit
                    jobs.task_done()
                    # print('thread %s break' % threading.current_thread().name)
                    break


                py_work = np.zeros(model.layer1_size, dtype=np.float32)

                job_words = sum(train_o1(model.node_embedding, edge, self.lr, self.negative, model.table,
                                         py_size=model.layer1_size, py_work=py_work) for edge in job if edge is not None)
                jobs.task_done()
                lock.acquire(timeout=30)
                try:
                    word_count[0] += job_words

                    elapsed = time.time() - start
                    if elapsed >= next_report[0]:
                        log.info("PROGRESS: at %.2f%% words\tword_computed %d\talpha %.05f\t %.0f words/s" %
                                    (100.0 * word_count[0] / total_node, word_count[0], self.lr, word_count[0] / elapsed if elapsed else 0.0))
                        next_report[0] = elapsed + 5.0  # don't flood the log, wait at least five seconds between progress reports
                finally:
                    lock.release()



        workers = [threading.Thread(target=worker_train, name='thread_'+str(i)) for i in range(self.workers)]
        for thread in workers:
            thread.daemon = True  # make interrupting the process with ctrl+c easier
            thread.start()


        # convert input strings to Vocab objects (eliding OOV/downsampled words), and start filling the jobs queue
        for job_no, job in enumerate(chunkize_serial(prepare_sentences(model, edges), chunksize)):
            jobs.put(job)


        for _ in range(self.workers):
            jobs.put(None)  # give the workers heads up that they can finish -- no more work!

        for thread in workers:
            thread.join()

        elapsed = time.time() - start
        log.info("training on %i words took %.1fs, %.0f words/s" %
                    (word_count[0], elapsed, word_count[0]/ elapsed if elapsed else 0.0))
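
Examples #8 and #9 (below) use the same producer/consumer layout: the main thread chunks the input and puts jobs on a bounded Queue, worker threads pull jobs until they receive a None sentinel, and a lock guards the shared progress counters. A minimal, self-contained sketch of that pattern (the per-job computation below is a placeholder):

import threading
from queue import Queue

jobs = Queue(maxsize=4)          # bounded buffer: the producer can only run a little ahead
lock = threading.Lock()
processed = [0]                  # shared counter, guarded by the lock

def worker():
    while True:
        job = jobs.get()
        if job is None:          # sentinel: no more work
            break
        result = sum(job)        # placeholder for the real per-job computation
        with lock:
            processed[0] += len(job)

workers = [threading.Thread(target=worker, daemon=True) for _ in range(2)]
for t in workers:
    t.start()

for chunk in ([1, 2, 3], [4, 5], [6]):   # producer: enqueue chunks of work
    jobs.put(chunk)
for _ in workers:
    jobs.put(None)               # one sentinel per worker
for t in workers:
    t.join()

print(processed[0])              # 6 items processed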
Example #9
    def train(self,
              model,
              paths,
              total_nodes,
              alpha=1.0,
              node_count=0,
              chunksize=150):
        """
        Update the model's neural weights from a sequence of paths (can be a once-only generator stream).

        :param model: model containing the shared data
        :param paths: generator of the paths
        :param total_nodes: total number of nodes in the paths
        :param alpha: trade-off parameter
        :param node_count: init of the number of nodes
        :param chunksize: size of the batch
        :return:
        """
        assert model.node_embedding.dtype == np.float32
        assert model.context_embedding.dtype == np.float32
        log.info(
            "O3 CONTEXT training model with %i workers on %i vocabulary and %i features, using \t'negative sampling'=%s\t'windows'=%s"
            % (self.workers, len(model.vocab), model.layer1_size,
               self.negative, self.window_size))

        if alpha <= 0.:
            return

        if not model.vocab:
            raise RuntimeError(
                "you must first build vocabulary before training the model")

        start, next_report = time.time(), [1.0]
        if total_nodes is None:
            raise AttributeError('the total number of nodes is required')

        node_count = [0]

        # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
        jobs = Queue(maxsize=2 * self.workers)
        # for shared state (=number of nodes trained so far, log reports...)
        lock = threading.Lock()

        def worker_train():
            """Train the model, lifting lists of paths from the jobs queue."""
            py_work = np.zeros(model.layer1_size, dtype=np.float32)

            while True:
                job = jobs.get()
                if job is None:  # data finished, exit
                    break

                lr = max(self.min_lr,
                         self.lr * (1 - 1.0 * node_count[0] / total_nodes))
                job_nodes = sum(
                    train_o2(model.node_embedding,
                             model.context_embedding,
                             path,
                             lr,
                             self.negative,
                             self.window_size,
                             model.table,
                             py_alpha=alpha,
                             py_size=model.layer1_size,
                             py_work=py_work)
                    for path in job)  #execute the sgd

                with lock:
                    node_count[0] += job_nodes

                    elapsed = time.time() - start
                    if elapsed >= next_report[0]:
                        log.info(
                            "PROGRESS: at %.2f%% nodes, lr %.05f, %.0f nodes/s"
                            % (100.0 * node_count[0] / total_nodes, lr,
                               node_count[0] / elapsed if elapsed else 0.0))
                        next_report[0] = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports

        workers = [
            threading.Thread(target=worker_train) for _ in range(self.workers)
        ]
        for thread in workers:
            thread.daemon = True  # make interrupting the process with ctrl+c easier
            thread.start()

        # convert input strings to Vocab objects (eliding OOV/downsampled nodes), and start filling the jobs queue
        for job_no, job in enumerate(
                chunkize_serial(prepare_sentences(model, paths), chunksize)):
            jobs.put(job)

        log.debug(
            "reached the end of input; waiting to finish %i outstanding jobs" %
            jobs.qsize())
        for _ in range(self.workers):
            jobs.put(None)  # give the workers heads up that they can finish -- no more work!

        for thread in workers:
            thread.join()

        elapsed = time.time() - start
        log.info("training on %i nodes took %.1fs, %.0f nodes/s" %
                 (node_count[0], elapsed,
                  node_count[0] / elapsed if elapsed else 0.0))
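
The learning rate in Example #9 decays linearly with the number of nodes already processed, bounded below by min_lr. A small sketch of that schedule, using made-up values for the starting rate, the floor, and the node total:

# linear learning-rate decay with a lower bound, as computed in the worker above
start_lr, min_lr, total_nodes = 0.025, 0.0001, 1000000

def current_lr(nodes_seen):
    return max(min_lr, start_lr * (1 - 1.0 * nodes_seen / total_nodes))

for seen in (0, 250000, 500000, 999999, 1000000):
    print(seen, round(current_lr(seen), 6))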