Example #1
    def event_context_pairs(
        sequence: TYPE_TENSOR,
        window_size: int,
        event_size: int,
    ) -> TYPE_TENSOR:
        """Create (event, context) pairs from a sequence
        For a sequence (a, b, c, d, e) with window_size 3 and event_size 1:
        (event, context) = [
            event=b, context=(a, c),
            event=c, context=(b, d),
            event=d, context=(c, e)
        ]

        Args:
            sequence: 1 dimensional tensor
            window_size: size of the context including the event
            event_size: size of the event which can be > 1

        Returns: (event, context) pairs of shape:(N, E+C) where N=num_windows.
        [
            [b, a, c],
            [c, b, d],
            [d, c, e]
        ] where the first column(s) is event and the rest are context
        """
        length: TYPE_INT = TYPE_INT(len(sequence))
        stride: TYPE_INT = TYPE_INT((window_size - event_size) // 2)

        assert \
            super(Function, Function).is_tensor(sequence) and \
            super(Function, Function).tensor_rank(sequence) == 1 and \
            length >= window_size > event_size > 0, \
            f"Expected a 1D sequence of length >= {window_size} but got {sequence}"
        assert (window_size - event_size) % 2 == 0, \
            "(window_size - event_size) must be even so that stride is an integer > 0"

        # --------------------------------------------------------------------------------
        # The result is a matrix in which windows are stacked up.
        # The result shape is (length-window_size+1, window_size).
        # --------------------------------------------------------------------------------
        num_windows = length - window_size + 1
        context_windows = np.array(
            [sequence[index:index + window_size] for index in range(num_windows)],
            dtype=TYPE_INT
        )
        assert context_windows.shape == (num_windows, window_size)
        event_context_pairs = np.c_[
            context_windows[::, stride:(stride + event_size)],  # Event (label) column(s)
            context_windows[::, 0:stride],                      # Left context
            context_windows[::, (event_size + stride):]         # Right context
        ]
        assert event_context_pairs.shape == (num_windows, window_size)
        return event_context_pairs
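
A minimal standalone sketch of the same windowing logic, using plain numpy in place of the TYPE_TENSOR/TYPE_INT aliases; event_context_pairs_sketch is a hypothetical name for illustration only:

import numpy as np

def event_context_pairs_sketch(sequence, window_size, event_size):
    # Stride is the number of context events on each side of the event.
    stride = (window_size - event_size) // 2
    num_windows = len(sequence) - window_size + 1
    windows = np.array([sequence[i:i + window_size] for i in range(num_windows)])
    # Re-order each window as [event, left context, right context].
    return np.c_[
        windows[:, stride:stride + event_size],  # event column(s)
        windows[:, :stride],                     # left context
        windows[:, stride + event_size:]         # right context
    ]

# (0, 1, 2, 3, 4) with window_size=3, event_size=1 -> [[1, 0, 2], [2, 1, 3], [3, 2, 4]]
print(event_context_pairs_sketch(np.arange(5), 3, 1))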
def test_020_event_context_instance_properties(caplog):
    """
    Objective:
        Verify the initialized properties of the layer can be accessed.
    Expected:
        Access to the already-initialized properties (name, logger, window_size, event_size) succeeds.
    """
    caplog.set_level(logging.DEBUG)
    name = "test_020_event_context_instance_properties"
    msg = "Accessing uninitialized property of the layer must fail."

    profiler = cProfile.Profile()
    profiler.enable()
    for _ in range(NUM_MAX_TEST_TIMES):
        stride: TYPE_INT = TYPE_INT(np.random.randint(1, 100))
        event_size: TYPE_INT = TYPE_INT(np.random.randint(1, 100))
        window_size: TYPE_INT = 2 * stride + event_size

        name = random_string(np.random.randint(1, 10))
        event_context = _must_succeed(name=name,
                                      num_nodes=TYPE_INT(1),
                                      window_size=window_size,
                                      event_size=event_size,
                                      msg=msg)

        # --------------------------------------------------------------------------------
        # To pass
        # --------------------------------------------------------------------------------
        try:
            if not event_context.name == name:
                raise RuntimeError("event_context.name == name should be true")
        except AssertionError as e:
            raise RuntimeError(
                "Access to name should be allowed as already initialized."
            ) from e

        try:
            if not isinstance(event_context.logger, logging.Logger):
                raise RuntimeError(
                    "isinstance(event_context.logger, logging.Logger) should be true"
                )
        except AssertionError as e:
            raise RuntimeError(
                "Access to logger should be allowed as already initialized."
            ) from e

        assert event_context.window_size == window_size
        assert event_context.event_size == event_size

    profiler.disable()
    profiler.print_stats(sort="cumtime")
def _instantiate(
        name: str = __name__,
        num_nodes: TYPE_INT = 1,
        target_size: TYPE_INT = 1,
        context_size: TYPE_INT = 4,
        negative_sample_size: TYPE_INT = 10,
        event_vector_size: TYPE_INT = 20,
        dictionary: EventIndexing = None,
        W: TYPE_TENSOR = None,
        log_level: int = logging.ERROR
) -> Tuple[Embedding, EventContext]:

    event_context: EventContext = _instantiate_event_context(
        name=name,
        num_nodes=TYPE_INT(1),
        window_size=(context_size+target_size),
        event_size=target_size
    )

    embedding: Embedding = Embedding(
        name=name,
        num_nodes=num_nodes,
        target_size=target_size,
        context_size=context_size,
        negative_sample_size=negative_sample_size,
        event_vector_size=event_vector_size,
        dictionary=dictionary,
        W=W,
        log_level=log_level
    )

    return embedding, event_context
def test_020_embedding_function_multi_lines(caplog):
    """
    Objective:
        Verify the Embedding function can handle multi-line sentences.
    Expected:
        The dictionary -> event_context -> embedding pipeline runs without error.
    """
    caplog.set_level(logging.DEBUG)
    name = "test_020_embedding_function_multi_lines"

    sentences = """
    Verify the EventIndexing function can handle multi line sentences
    the asbestos fiber <unk> is unusually <unk> once it enters the <unk> 
    with even brief exposures to it causing symptoms that show up decades later researchers said
    """

    dictionary: EventIndexing = _instantiate_event_indexing()

    profiler = cProfile.Profile()
    profiler.enable()

    for _ in range(NUM_MAX_TEST_TIMES):
        # First validate the correct configuration, then change parameter one by one.
        target_size = TYPE_INT(np.random.randint(1, 3))
        context_size = TYPE_INT(2 * np.random.randint(1, 5))
        negative_sample_size = TYPE_INT(np.random.randint(1, 5))
        event_vector_size: TYPE_INT = TYPE_INT(np.random.randint(5, 20))
        W: TYPE_TENSOR = np.random.randn(dictionary.vocabulary_size, event_vector_size)

        embedding, event_context = _must_succeed(
            name=name,
            num_nodes=(1+negative_sample_size),
            target_size=target_size,
            context_size=context_size,
            negative_sample_size=negative_sample_size,
            event_vector_size=event_vector_size,
            dictionary=dictionary,
            W=W,
            log_level=logging.DEBUG,
            msg="must succeed"
        )

        sequences = dictionary.function(sentences)
        target_context_pairs = event_context.function(sequences)
        embedding.function(target_context_pairs)

    profiler.disable()
    profiler.print_stats(sort="cumtime")
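
For reference, a sketch of the shapes flowing through the three calls in the loop above, inferred from docstrings elsewhere in this listing ((event, context) pairs are (N, E+C); the adapter tests below report the Embedding output as (N, 1+SL)); treat the exact shapes as assumptions:

# sequences            = dictionary.function(sentences)            # event (word) ids per sentence
# target_context_pairs = event_context.function(sequences)         # (N, E+C): [targets | context]
# Y                    = embedding.function(target_context_pairs)  # (N, 1+SL): [target score | sample scores]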
def test_010_standardize_sd_is_zero_eps():
    """
    Objective:
        Verify the standardize() function when SD is zero.
    Expected:
        standardized == (X-mean)/sqrt(eps)
    """
    name = "test_010_standardize"
    keepdims = True
    u = TYPE_FLOAT(1e-6)
    eps = TYPE_FLOAT(1e-8)

    for _ in range(NUM_MAX_TEST_TIMES):
        N: int = np.random.randint(1, NUM_MAX_BATCH_SIZE)
        M: int = np.random.randint(2, NUM_MAX_NODES)
        row = np.random.uniform(-MAX_ACTIVATION_VALUE, MAX_ACTIVATION_VALUE,
                                M).astype(TYPE_FLOAT)
        X = np.ones(shape=(N, M), dtype=TYPE_FLOAT) * row
        Logger.debug("%s: X \n%s\n", name, X)

        # Constraint: standardize(X) == (X - np.mean(X)) / np.std(X)
        ddof = TYPE_INT(1) if N > 1 else TYPE_INT(0)
        sd = np.std(X, axis=0, keepdims=keepdims, ddof=ddof)
        assert np.allclose(sd, TYPE_FLOAT(0), atol=u, rtol=0)

        # Expected
        mean = np.mean(X, axis=0, dtype=TYPE_FLOAT)
        E = (X - mean) / np.sqrt(eps, dtype=TYPE_FLOAT)

        # Actual
        A, __mean, __sd, _ = standardize(X, keepdims=keepdims, eps=eps)

        # Constraint. mean/sd should be same
        assert np.allclose(mean, __mean, atol=u, rtol=TYPE_FLOAT(0))
        assert np.allclose(__sd, np.sqrt(eps), atol=u, rtol=TYPE_FLOAT(0))
        assert np.all(np.abs(E-A) < u), \
            f"X\n{X}\nstandardized\n{E}\nneeds\n{A}\n"
def test_020_event_context_function_event_size_2(caplog):
    """
    Objective:
        Verify the layer function with event_size = 2
    Expected:
        function generates expected event_context pairs
    """
    caplog.set_level(logging.DEBUG)
    name = "test_020_event_context_instance_properties"
    msg = "Accessing uninitialized property of the layer must fail."

    profiler = cProfile.Profile()
    profiler.enable()

    stride: TYPE_INT = 2
    event_size: TYPE_INT = 2
    window_size: TYPE_INT = 2 * stride + event_size
    num_nodes = TYPE_INT(1)

    event_context = _must_succeed(name=name,
                                  num_nodes=num_nodes,
                                  window_size=window_size,
                                  event_size=event_size,
                                  msg=msg)

    # sequence: 0, 1, 2, 3, 4, 5
    # expected event_context pairs: [[2, 3, 0, 1, 4, 5]]
    X1 = np.arange(6)
    expected1 = np.array([[2, 3, 0, 1, 4, 5]])
    Y1 = event_context.function(X1)
    assert \
        np.array_equal(Y1, expected1), \
        "Expected\n%s\nActual\n%s\n" % (expected1, Y1)

    # sequence: 0, 1, 2, 3, 4, 5, 6
    # expected event_context pairs: [[2, 3, 0, 1, 4, 5], [3, 4, 1, 2, 5, 6]]
    X2 = np.arange(7)
    expected2 = np.array([[2, 3, 0, 1, 4, 5], [3, 4, 1, 2, 5, 6]])
    Y2 = event_context.function(X2)
    assert np.array_equal(Y2, expected2), \
        "Expected\n%s\nActual\n%s\n" % (expected2, Y2)

    profiler.disable()
    profiler.print_stats(sort="cumtime")
def test_020_std_method_function_to_succeed():
    """
    Objective:
        Verify the _layer class instance function method
    Expected:
        Layer method calculate expected values.
    """
    def objective(x: TYPE_TENSOR):
        """Dummy objective function"""
        return np.sum(x, dtype=TYPE_FLOAT)

    profiler = cProfile.Profile()
    profiler.enable()
    for _ in range(NUM_MAX_TEST_TIMES):
        name = random_string(np.random.randint(1, 10))
        numexpr_enabled = bool(np.random.randint(0, 2))
        numba_enabled = bool(np.random.randint(0, 2))

        # The layer computes statistics on a per-feature basis,
        # so the test makes no sense when M == 1 or N == 1.
        N: int = np.random.randint(1, NUM_MAX_BATCH_SIZE)
        M: int = np.random.randint(2, NUM_MAX_NODES)

        X = np.random.randn(N, M).astype(TYPE_FLOAT)
        momentum = TYPE_FLOAT(np.random.uniform(0.7, 0.99))
        eps = TYPE_FLOAT(np.random.uniform(1e-12, 1e-10)) \
            if np.random.uniform() < 0.5 else TYPE_FLOAT(0)
        _layer: Standardization = \
            _instance(name=name, num_nodes=M, momentum=momentum, eps=eps)
        _layer.objective = objective

        # ********************************************************************************
        # Constraint: total_rows_processed = times_of_invocations * N
        # ********************************************************************************
        assert _layer.total_rows_processed == 0
        ru = _layer.RU
        rsd = _layer.RSD
        _layer.function(
            X,
            numexpr_enabled=numexpr_enabled,
        )
        _validate_layer_values(_layer, X, eps=eps)
        _validate_layer_running_statistics(
            _layer=_layer, previous_ru=ru, previous_rsd=rsd, X=X, eps=eps
        )

        # ********************************************************************************
        # Constraint:
        #   _layer.N provides the latest X.shape[0].
        #   X-related arrays should have their storage allocated in X.shape:
        #   * dX
        #   * dXmd01
        #   * dXmd02
        assert _layer.N == X.shape[0]
        assert \
            _layer.dX.dtype == TYPE_FLOAT and \
            _layer.dX.shape == (N, M)

        assert \
            _layer.dXmd01.dtype == TYPE_FLOAT and \
            _layer.dXmd01.shape == (N, M)

        assert \
            _layer.dXmd02.dtype == TYPE_FLOAT and \
            _layer.dXmd02.shape == (N, M)
        assert _layer.total_rows_processed == N

        # ********************************************************************************
        # Constraint: total_rows_processed = times_of_invocations * N
        # ********************************************************************************
        for i in range(np.random.randint(1, 100)):
            _layer.function(
                X,
                numexpr_enabled=numexpr_enabled,
            )
            assert _layer.total_rows_processed == TYPE_INT(N * (i + 2))

    profiler.disable()
    profiler.print_stats(sort="cumtime")
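
The running statistics asserted above (RU, RSD) are presumably momentum-weighted moving averages; a minimal sketch of that common update rule, stated as an assumption since the Standardization internals are not shown in this listing:

import numpy as np

def update_running_stats_sketch(ru, rsd, X, momentum=0.9):
    # Assumed exponential-moving-average update; the actual layer may differ.
    ru = momentum * ru + (1.0 - momentum) * X.mean(axis=0)
    rsd = momentum * rsd + (1.0 - momentum) * X.std(axis=0)
    return ru, rsd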
Example #8
def test_010_base_instantiation():
    """Test case for layer base class
    """

    # --------------------------------------------------------------------------------
    # name, num_nodes, log_level __init__ properties.
    # Logging debug outputs.
    # X setter/getter
    # T setter/getter
    # objective function setter/getter
    # function(x) repeats x.
    # gradient(dL/dY) repeats dL/dY,
    # gradient_numerical() returns 1
    # --------------------------------------------------------------------------------
    def objective(X: np.ndarray) -> Union[TYPE_FLOAT, np.ndarray]:
        """Dummy objective function"""
        return np.sum(X)

    N: int = np.random.randint(1, NUM_MAX_BATCH_SIZE)
    M: int = np.random.randint(1, NUM_MAX_NODES)
    name = "test_010_base"
    _layer: Layer = Layer(name=name, num_nodes=M, log_level=logging.DEBUG)

    # --------------------------------------------------------------------------------
    # Properties
    # --------------------------------------------------------------------------------
    assert _layer.name == name
    assert _layer.num_nodes == _layer.M == M

    _layer._D = 1
    assert _layer.D == 1

    X = np.random.randn(N, M).astype(TYPE_FLOAT)
    _layer.X = X
    assert np.array_equal(_layer.X, X), \
        "Expected:\n%s\nDiff\n%s\n" % (X, (_layer.X-X))
    assert _layer.N == N

    _layer._dX = X
    assert np.array_equal(_layer.dX, X)

    T = np.random.randint(0, M, N).astype(TYPE_LABEL)
    _layer.T = T
    assert np.array_equal(_layer.T, T)

    _layer._Y = np.dot(X, X.T)
    assert np.array_equal(_layer.Y, np.dot(X, X.T))

    _layer._dY = np.array(0.9, dtype=TYPE_FLOAT)
    assert _layer._dY == np.array(0.9, dtype=TYPE_FLOAT)

    _layer.logger.debug("This is a pytest")

    # --------------------------------------------------------------------------------
    # Methods
    # --------------------------------------------------------------------------------
    try:
        # pylint: disable=not-callable
        _layer.function(TYPE_INT(1))
        raise RuntimeError("Invoke layer.function(int(1)) must fail.")
    except AssertionError:
        pass

    x = np.array(1.0, dtype=TYPE_FLOAT)
    assert np.array_equal(_layer.function(x), x)

    try:
        Y = _layer.function(X)
        assert Y.ndim > 0
        _layer.gradient(int(1))
        raise RuntimeError("Invoke layer.gradient(int(1)) must fail.")
    except AssertionError:
        pass

    Y = _layer.function(X)
    assert np.array_equal(_layer.gradient(Y), Y)

    _layer.objective = objective
    # pylint: disable=not-callable
    assert np.array_equal(_layer.objective(X), objective(X))
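
The behaviors exercised above (function(x) repeats x, gradient(dL/dY) repeats dL/dY) amount to an identity pass-through; a minimal sketch of such a stub under those assumptions, independent of the real Layer base class:

import numpy as np

class IdentityLayerSketch:
    """Pass-through stub: function(X) returns X, gradient(dY) returns dY."""
    def __init__(self, name: str, num_nodes: int):
        self.name, self.num_nodes = name, num_nodes
        self.objective = None  # objective function, to be set by the caller

    def function(self, X):
        assert isinstance(X, np.ndarray), "tensor input expected"
        self.X = X
        return X

    def gradient(self, dY):
        assert isinstance(dY, np.ndarray), "tensor input expected"
        return dY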
Example #9
 def stride(self) -> TYPE_INT:
     """Length of preceding and succeeding context"""
     return TYPE_INT((self.window_size - self.event_size) / 2)
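
For instance, window_size=5 with event_size=1 gives stride = (5 - 1) / 2 = 2: in a window (a, b, c, d, e) the event is c, preceded by (a, b) and followed by (d, e).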
Example #10
 def vocabulary_size(self) -> TYPE_INT:
     """Number of unique events in the vocabulary"""
     return TYPE_INT(len(self.vocabulary))
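
Note that the meta entities count toward the size: as exercised in test_020_embedding_instantiation_to_fail below, a corpus of k unique words yields vocabulary_size == k + len(EVENT_META_ENTITIES).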
Example #11
def test_word2vec():
    USE_TEXT8 = False
    USE_PTB = not USE_TEXT8

    # --------------------------------------------------------------------------------
    # text8 is one gigantic line of millions of words, which cannot fit in a vector.
    # Split it into N words per line:
    # cat text8 | xargs -n $N > text8_$N
    # --------------------------------------------------------------------------------
    CORPUS_FILE = "text8_512" if USE_TEXT8 else "ptb"
    CORPUS_URL = "https://data.deepai.org/text8.zip" \
        if USE_TEXT8 else 'https://raw.githubusercontent.com/tomsercu/lstm/master/data/ptb.test.txt'

    TARGET_SIZE = TYPE_INT(1)    # Size of the target event (word)
    CONTEXT_SIZE = TYPE_INT(6)   # Size of the context in which the target event occurs
    WINDOW_SIZE = TARGET_SIZE + CONTEXT_SIZE
    SAMPLE_SIZE = TYPE_INT(6)    # Size of the negative samples
    VECTOR_SIZE = TYPE_INT(100)  # Number of features in the event vector

    WEIGHT_SCHEME = "normal"
    WEIGHT_PARAMS = {"std": 0.01}
    LR = TYPE_FLOAT(20.0)

    # For text8_512, where one line has 512 words, the number of sentences to
    # take in at once is small. For ptb, where one line may have only a few
    # words, take in more lines; otherwise padding can produce an
    # (event, context) pair whose event is the invalid padding word 0, e.g.
    # [[562  58 117   0   0   0   0   0   0   0   0]]
    NUM_SENTENCES = 1 if USE_TEXT8 else 10

    STATE_FILE = \
        "/home/oonisim/home/repository/git/oonisim/python_programs/nlp/models/" \
        "word2vec_sgram_%s_E%s_C%s_S%s_W%s_%s_%s_V%s_LR%s_N%s.pkl" % (
            CORPUS_FILE,
            TARGET_SIZE,
            CONTEXT_SIZE,
            SAMPLE_SIZE,
            WEIGHT_SCHEME,
            "std",
            WEIGHT_PARAMS["std"],
            VECTOR_SIZE,
            LR,
            NUM_SENTENCES,
        )

    MAX_ITERATIONS = 30

    # --------------------------------------------------------------------------------
    # Corpus text
    # --------------------------------------------------------------------------------
    path_to_corpus = f"{str(Path.home())}/.keras/datasets/{CORPUS_FILE}"
    if not fileio.Function.is_file(path_to_corpus):
        path_to_corpus = tf.keras.utils.get_file(fname=CORPUS_FILE,
                                                 origin=CORPUS_URL,
                                                 extract=True)
    corpus = fileio.Function.read_file(path_to_corpus)

    # --------------------------------------------------------------------------------
    # Logistic Log Loss
    # --------------------------------------------------------------------------------
    loss = CrossEntropyLogLoss(
        name="loss",
        num_nodes=1,  # Logistic log loss
        log_loss_function=sigmoid_cross_entropy_log_loss)

    # --------------------------------------------------------------------------------
    # Event indexing
    # --------------------------------------------------------------------------------
    word_indexing = EventIndexing(name="word_indexing",
                                  corpus=corpus,
                                  min_sequence_length=WINDOW_SIZE)
    del corpus

    # --------------------------------------------------------------------------------
    # Event Context
    # --------------------------------------------------------------------------------
    event_context = EventContext(name="ev",
                                 window_size=WINDOW_SIZE,
                                 event_size=TARGET_SIZE)

    # --------------------------------------------------------------------------------
    # Event Embedding
    # --------------------------------------------------------------------------------
    embedding: Embedding = Embedding(
        name="embedding",
        num_nodes=WINDOW_SIZE,
        target_size=TARGET_SIZE,
        context_size=CONTEXT_SIZE,
        negative_sample_size=SAMPLE_SIZE,
        event_vector_size=VECTOR_SIZE,
        optimizer=SGD(lr=LR),
        dictionary=word_indexing,
        weight_initialization_scheme=WEIGHT_SCHEME,
        weight_initialization_parameters=WEIGHT_PARAMS)

    # --------------------------------------------------------------------------------
    # Adapter between Embedding and Log Loss
    # --------------------------------------------------------------------------------
    adapter_function = embedding.adapt_function_to_logistic_log_loss(loss=loss)
    adapter_gradient = embedding.adapt_gradient_to_logistic_log_loss()
    adapter: Adapter = Adapter(
        name="adapter",
        num_nodes=TYPE_INT(1),  # Number of output M=1
        function=adapter_function,
        gradient=adapter_gradient)

    # --------------------------------------------------------------------------------
    # Network
    # --------------------------------------------------------------------------------
    network = SequentialNetwork(
        name="word2vec",
        num_nodes=1,
        inference_layers=[word_indexing, event_context, embedding, adapter],
        objective_layers=[loss])

    def sentences_generator(path_to_file, num_sentences):
        stream = fileio.Function.file_line_stream(path_to_file)
        try:
            while True:
                yield np.array(fileio.Function.take(num_sentences, stream))
        finally:
            stream.close()

    # Restore the state if it exists.
    if fileio.Function.is_file(STATE_FILE):
        state = embedding.load(STATE_FILE)

        fmt = """Model loaded.
        event_size %s
        context_size: %s
        event_vector_size: %s
        """
        print(fmt % (state["target_size"], state["context_size"],
                     state["event_vector_size"]))
    else:
        print("State file does not exist. Saving the initial model %s." %
              STATE_FILE)
        embedding.save(STATE_FILE)

    # Continue training
    profiler = cProfile.Profile()
    profiler.enable()

    total_sentences = 0
    epochs = 0
    source = sentences_generator(path_to_file=path_to_corpus,
                                 num_sentences=NUM_SENTENCES)

    for i in range(MAX_ITERATIONS):
        try:
            sentences = next(source)
            total_sentences += len(sentences)

            start = time.time()
            network.train(X=sentences, T=np.array([0]))

            if i % 100 == 0:
                print(f"Batch {i:05d} of {NUM_SENTENCES} sentences: "
                      f"Average Loss: {np.mean(network.history):10f} "
                      f"Duration {time.time() - start:3f}")
            if i % 10000 == 0:
                embedding.save(STATE_FILE)

        except fileio.Function.GenearatorHasNoMore as e:
            source.close()
            embedding.save(STATE_FILE)

            # Next epoch
            print(f"epoch {epochs} batches {i:05d} done")
            epochs += 1
            source = sentences_generator(path_to_file=path_to_corpus,
                                         num_sentences=NUM_SENTENCES)

        except Exception as e:
            print("Unexpected error:", sys.exc_info()[0])
            source.close()
            raise e

    embedding.save(STATE_FILE)

    profiler.disable()
    profiler.print_stats(sort="cumtime")
def test_020_embedding_save_load(caplog):
    """
    Objective:
        Verify the save/load function of Embedding.
    Expected:
        After load, W is restored and the target (true label) scores match the pre-save results.
    """
    caplog.set_level(logging.DEBUG)
    name = "test_020_embedding_save_load"
    sentences = """
    Verify the gradient descent, especially np.ufunc.at, is working as expected 
    """

    dictionary: EventIndexing = _instantiate_event_indexing()

    profiler = cProfile.Profile()
    profiler.enable()

    for _ in range(10):
        # First validate the correct configuration, then change parameter one by one.
        target_size = TYPE_INT(np.random.randint(1, 2))
        context_size = TYPE_INT(2 * np.random.randint(1, 2))
        negative_sample_size = TYPE_INT(np.random.randint(1, 2))
        event_vector_size: TYPE_INT = TYPE_INT(np.random.randint(2, 3))
        W: TYPE_TENSOR = np.random.randn(dictionary.vocabulary_size, event_vector_size)

        embedding, event_context = _must_succeed(
            name=name,
            num_nodes=(1+negative_sample_size),
            target_size=target_size,
            context_size=context_size,
            negative_sample_size=negative_sample_size,
            event_vector_size=event_vector_size,
            dictionary=dictionary,
            W=W,
            log_level=logging.DEBUG,
            msg="must succeed"
        )

        # --------------------------------------------------------------------------------
        # Run methods and save the results to compare later
        # --------------------------------------------------------------------------------
        sequences = dictionary.function(sentences)
        target_context_pairs = event_context.function(sequences)
        Y1 = embedding.function(target_context_pairs)

        # --------------------------------------------------------------------------------
        # Save the layer state and invalidate the state variables
        # --------------------------------------------------------------------------------
        tester = testing.layer.Function(instance=embedding)
        tester.test_save()
        backup_W = copy.deepcopy(embedding.W)
        embedding._W = np.empty(0)

        # --------------------------------------------------------------------------------
        # Confirm the layer does not function anymore
        # --------------------------------------------------------------------------------
        try:
            embedding.function(target_context_pairs)
            raise RuntimeError("Must fail with state deleted")
        except Exception as e:
            pass

        # --------------------------------------------------------------------------------
        # Restore the state and confirm the layer functions as expected.
        # Because of random negative sampling, the results for the sampled context
        # part differ every time; hence only the true label (target) part can match.
        # --------------------------------------------------------------------------------
        try:
            # Constraint:
            #   Layer works after state reloaded
            tester.test_load()
            assert np.allclose(backup_W, embedding.W)
            Y2 = embedding.function(target_context_pairs)
            assert \
                np.array_equal(Y1[::, 0], Y2[::,0]), \
                "Expected Y\n%s\nActual\n%s\ndiff\n%s\n" \
                % (Y1[::, 0], Y2[::,0], (Y1[::,0]-Y2[::,0]))

        except Exception as e:
            raise e
        finally:
            tester.clean()

    profiler.disable()
    profiler.print_stats(sort="cumtime")
def test_020_adapt_embedding_loss_adapter_function_to_fail(caplog):
    """
    Objective:
        Verify the Adapter function rejects shapes other than:
        - Y:(N, 1+SL)
        - ys:(N, SL)
        - ye:(N, 1)
    Expected:
        Adapter.function(Y) to fail when Y shape is not (N, 1+SL)
    """
    caplog.set_level(logging.DEBUG)
    name = "test_020_adapt_embedding_logistic_loss_function_multi_lines"

    sentences = """
    Verify the EventIndexing function can handle multi line sentences
    the asbestos fiber <unk> is unusually <unk> once it enters the <unk> 
    with even brief exposures to it causing symptoms that show up decades later researchers said
    """

    dictionary: EventIndexing = _instantiate_event_indexing()

    profiler = cProfile.Profile()
    profiler.enable()

    for _ in range(NUM_MAX_TEST_TIMES):
        # First validate the correct configuration, then change parameter one by one.
        E = target_size = TYPE_INT(np.random.randint(1, 3))
        C = context_size = TYPE_INT(2 * np.random.randint(1, 5))
        SL = negative_sample_size = TYPE_INT(np.random.randint(1, 5))
        event_vector_size: TYPE_INT = TYPE_INT(np.random.randint(5, 20))
        W: TYPE_TENSOR = np.random.randn(dictionary.vocabulary_size,
                                         event_vector_size)

        loss, adapter, embedding, event_context = _instantiate(
            name=name,
            num_nodes=TYPE_INT(1),
            target_size=target_size,
            context_size=context_size,
            negative_sample_size=negative_sample_size,
            event_vector_size=event_vector_size,
            dictionary=dictionary,
            W=W,
            log_level=logging.DEBUG,
        )

        sequences = dictionary.function(sentences)
        target_context_pairs = event_context.function(sequences)
        Y = embedding.function(target_context_pairs)
        N, _ = embedding.tensor_shape(Y)

        # ********************************************************************************
        # Constraint: Y in shape:(N,M) where M > SL+1 must fail
        # ********************************************************************************
        shape = (N, 1 + SL + np.random.randint(1, 100))
        msg = "Y shape %s which is not the expected shape %s must fail" % \
              (shape, (N, SL+1))
        dummy_Y = np.random.uniform(size=shape).astype(TYPE_FLOAT)
        _function_must_fail(adapter=adapter, Y=dummy_Y, msg=msg)

        # ********************************************************************************
        # Constraint: Y in shape (N+,) must fail.
        # Adapter function can accept (N,) but not (N+,)
        # ********************************************************************************
        shape = (N + np.random.randint(1, 100), )
        msg = "Y shape %s which is not the expected shape %s must fail" % \
              (shape, (N,))
        dummy_Y = np.random.uniform(size=shape).astype(TYPE_FLOAT)
        _function_must_fail(adapter=adapter, Y=dummy_Y, msg=msg)

    profiler.disable()
    profiler.print_stats(sort="cumtime")
def test_020_embedding_gradient_vs_autodiff(caplog):
    """
    Objective:
        Verify the Embedding analytical gradient against the TF autodiff gradient
        implemented in the gradient_numerical() method of the layer.
    Expected:
        Gradients [dWe, dWs, dWc] calculated in the gradient() method match
        those calculated by gradient_numerical().
    """
    caplog.set_level(logging.DEBUG)
    name = "test_020_embedding_gradient_multi_lines"
    dictionary: EventIndexing = _instantiate_event_indexing()

    from function import text
    from . test_020_embedding_sample_sentences import (
        bbc_world_us_canada_56988381 as sentences
    )
    max_sentence_length = TYPE_INT(text.Function.max_sentence_length(sentences))
    assert max_sentence_length >= 3

    profiler = cProfile.Profile()
    profiler.enable()

    def L(x):
        loss = Function.sum(
            x, axis=None, keepdims=False
        )
        return loss

    # --------------------------------------------------------------------------------
    # Ye = einsum("nd,ncd->n",  Bc:(N,D), We:(N,E,D))
    # dL/dWe:(N,E,D) = dL/dYe * dYe/dWe = dL/dYe * Bc
    #
    # Ys = einsum("nd,nsd->ns", Bc:(N,D), Ws:(N,SL,D))
    # dL/dWs:(N,SL,D) = dL/dYs * dYs/dWs = dL/dYs * Bc
    #
    # By setting
    # 1. dL/dY = np.c_[dL/dYe,dL/dYs] = I and
    # 2. context_size C == negative_sample_size SL
    # The constraint is E * dL/dWe == dL/dWs == Bc because dL/dYe and dL/dYs are I.
    # dL/dWe is normalized by E to be independent of the event (target) size.
    # --------------------------------------------------------------------------------
    for _ in range(NUM_MAX_TEST_TIMES):
        # C must be an even number
        C = TYPE_INT(np.random.randint(1, max_sentence_length / 2) * 2)
        assert C < max_sentence_length

        # E=SL so that (N,E,D) and (N,SL,D) have the same shape
        E = SL = TYPE_INT(
            np.random.randint(
                1,
                min(
                    Embedding.MAX_TARGET_SIZE,
                    Embedding.MAX_NEGATIVE_SAMPLE_SIZE,
                    (max_sentence_length - C)
                )+1
            )
        )

        target_size = negative_sample_size = TYPE_INT(E)
        context_size = TYPE_INT(C)
        event_vector_size: TYPE_INT = TYPE_INT(np.random.randint(1, 100))
        W: TYPE_TENSOR = np.random.randn(dictionary.vocabulary_size, event_vector_size)

        embedding, event_context = _must_succeed(
            name=name,
            num_nodes=(1+negative_sample_size),
            target_size=target_size,
            context_size=context_size,
            negative_sample_size=negative_sample_size,
            event_vector_size=event_vector_size,
            dictionary=dictionary,
            W=W,
            log_level=logging.DEBUG,
            msg="must succeed"
        )
        embedding.objective = L
        sequences = dictionary.function(sentences)
        target_context_pairs = event_context.function(sequences)

        # --------------------------------------------------------------------------------
        # Forward path
        # --------------------------------------------------------------------------------
        Y = embedding.function(target_context_pairs)
        EDWe, EDWs, EDWc = embedding.gradient_numerical()

        # --------------------------------------------------------------------------------
        # Backward path
        # --------------------------------------------------------------------------------
        dY = Function.ones(shape=Function.tensor_shape(Y))
        embedding.gradient(dY)

        # --------------------------------------------------------------------------------
        # Weight updates
        # --------------------------------------------------------------------------------
        dWe, dWs, dWc = embedding.update()

        # ********************************************************************************
        # Constraint:
        # - dW is close to EDW
        # - E * dL/dWe = dL/dWs = Bc when dL/dY = I
        # ********************************************************************************
        assert Function.all_close(
            EDWe, dWe
        ), "Expected (EDWe==dWe)\n%s\ndifference\n%s\n" % (EDWe, EDWe-dWe)
        assert Function.all_close(
            EDWs, dWs
        ), "Expected (EDWs==dWs)\n%s\ndifference\n%s\n" % (EDWs, EDWs-dWs)
        assert Function.all_close(
            EDWc, dWc
        ), "Expected (EWc5==W[5])\n%s\ndifference\n%s\n" % (EDWc, EDWc-dWc)
        assert Function.all_close(
            dWe * E, dWs
        ), "Expected (dWe==dWs) but dWe:\n%s\ndifference\n%s\n" % (dWe, dWe-dWs)

    profiler.disable()
    profiler.print_stats(sort="cumtime")
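
A standalone numpy check of the einsum/gradient identity described in the comment block before the loop; N, SL, D, Bc, Ws and the finite-difference step h are arbitrary local names for this sketch:

import numpy as np

N, SL, D = 2, 3, 4
Bc = np.random.randn(N, D)
Ws = np.random.randn(N, SL, D)
Ys = np.einsum("nd,nsd->ns", Bc, Ws)

# With dL/dYs = I (all ones), dL/dWs = dYs/dWs = Bc broadcast to (N,SL,D).
dWs = np.broadcast_to(Bc[:, None, :], (N, SL, D))

# Finite-difference check on one element (einsum is linear in Ws).
h = 1e-6
Wp = Ws.copy()
Wp[0, 0, 0] += h
numerical = (np.einsum("nd,nsd->ns", Bc, Wp).sum() - Ys.sum()) / h
assert abs(numerical - dWs[0, 0, 0]) < 1e-4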
def test_020_embedding_instance_properties_access_to_succeed(caplog):
    """
    Objective:
        Verify that properties of the layer can be accessed once initialized.
    Expected:
        Access to the initialized parameters succeeds.
    """
    caplog.set_level(logging.DEBUG)
    name = "test_020_embedding_instance_properties_access_to_succeed"
    dictionary: EventIndexing = _instantiate_event_indexing()

    profiler = cProfile.Profile()
    profiler.enable()
    for _ in range(NUM_MAX_TEST_TIMES):
        # First validate the correct configuration, then change parameter one by one.
        target_size = TYPE_INT(np.random.randint(1, 10))
        context_size = TYPE_INT(2 * np.random.randint(1, 10))
        negative_sample_size = TYPE_INT(np.random.randint(5, 20))
        event_vector_size: TYPE_INT = TYPE_INT(np.random.randint(5, 100))
        W: TYPE_TENSOR = np.random.rand(dictionary.vocabulary_size, event_vector_size).astype(TYPE_FLOAT)

        embedding, event_context = _must_succeed(
            name=name,
            num_nodes=(1+negative_sample_size),
            target_size=target_size,
            context_size=context_size,
            negative_sample_size=negative_sample_size,
            event_vector_size=event_vector_size,
            dictionary=dictionary,
            W=W,
            log_level=logging.DEBUG,
            msg="must succeed"
        )

        # --------------------------------------------------------------------------------
        # To pass
        # --------------------------------------------------------------------------------
        try:
            if not embedding.name == name:
                raise RuntimeError("embedding.name == name should be true")
        except AssertionError:
            raise RuntimeError("Access to name should be allowed as already initialized.")

        try:
            if not embedding.V == dictionary.vocabulary_size:
                raise RuntimeError("embedding.V == vocabulary_size should be true")
        except AssertionError:
            raise RuntimeError("Access to V should be allowed as already initialized.")

        try:
            if not embedding.E == target_size:
                raise RuntimeError("embedding.V == target_size should be true")
        except AssertionError:
            raise RuntimeError("Access to E should be allowed as already initialized.")

        try:
            if not embedding.C == context_size:
                raise RuntimeError("embedding.C == context_size should be true")
        except AssertionError:
            raise RuntimeError("Access to C should be allowed as already initialized.")

        try:
            if not embedding.window_size == target_size+context_size:
                raise RuntimeError("embedding.window_size == target_size+context_size should be true")
        except AssertionError:
            raise RuntimeError("Access to window_size should be allowed as already initialized.")

        try:
            if not embedding.SL == negative_sample_size:
                raise RuntimeError("embedding.negative_sample_size == negative_sample_size should be true")
        except AssertionError:
            raise RuntimeError("Access to negative_sample_size should be allowed as already initialized.")

        try:
            if embedding.dictionary is not dictionary:
                raise RuntimeError("embedding.dictionary is dictionary should be true")
        except AssertionError:
            raise RuntimeError("Access to dictionary should be allowed as already initialized.")

        try:
            # Embedding internally deepcopy W to avoid unexpected change and
            # event vector for UNK and NIL are zero cleared.
            if not np.array_equal(
                embedding.W[len(EVENT_META_ENTITIES):],
                W[len(EVENT_META_ENTITIES):]
            ):
                raise RuntimeError("np.array_equal(embedding.W, W) should be true")
        except AssertionError:
            raise RuntimeError("Access to W should be allowed as already initialized.")

        try:
            if not isinstance(embedding.optimizer, SGD):
                raise RuntimeError("isinstance(embedding.optimizer, SGD) should be true")
        except AssertionError:
            raise RuntimeError("Access to optimizer should be allowed as already initialized.")

        try:
            opt = SGD()
            if not embedding.lr == opt.lr:
                raise RuntimeError("embedding.lr == lr should be true")
        except AssertionError:
            raise RuntimeError("Access to lr should be allowed as already initialized.")

        try:
            opt = SGD()
            if not embedding.l2 == opt.l2:
                raise RuntimeError("embedding.l2 == context_size should be true")
        except AssertionError:
            raise RuntimeError("Access to l2 should be allowed as already initialized.")

        try:
            if not np.array_equal(embedding.S["target_size"], embedding.E):
                raise RuntimeError("embedding.E == E should be true")
        except AssertionError:
            raise RuntimeError("Access to S['target_size'] should be allowed as already initialized.")

        try:
            if not np.array_equal(embedding.S["context_size"], embedding.C):
                raise RuntimeError("embedding.C == C should be true")
        except AssertionError:
            raise RuntimeError("Access to S['context_size'] should be allowed as already initialized.")

        try:
            if not np.array_equal(embedding.S["negative_sample_size"], embedding.SL):
                raise RuntimeError("embedding.SL == SL should be true")
        except AssertionError:
            raise RuntimeError("Access to S['negative_sample_size'] should be allowed as already initialized.")

        try:
            if not np.array_equal(embedding.S["W"], embedding.W):
                raise RuntimeError("embedding.W == W should be true")
        except AssertionError:
            raise RuntimeError("Access to S['W'] should be allowed as already initialized.")

        try:
            if not isinstance(embedding.logger, logging.Logger):
                raise RuntimeError("isinstance(embedding.logger, logging.Logger) should be true")
        except AssertionError:
            raise RuntimeError("Access to logger should be allowed as already initialized.")

    profiler.disable()
    profiler.print_stats(sort="cumtime")
def test_020_adapt_embedding_logistic_loss_instantiation_to_fail():
    """
    Objective:
        Verify the layer class validates the initialization parameter constraints.
    Expected:
        Initialization detects parameter constraints not met and fails.
    """
    name = "test_020_adapt_embedding_logistic_loss_instantiation_to_fail"

    # First validate the correct configuration, then change parameter one by one.
    dictionary: EventIndexing = _instantiate_event_indexing()
    target_size = TYPE_INT(np.random.randint(1, 10))
    context_size = TYPE_INT(2 * np.random.randint(1, 10))
    negative_sample_size = TYPE_INT(np.random.randint(5, 20))
    event_vector_size: TYPE_INT = TYPE_INT(np.random.randint(5, 100))
    W: TYPE_TENSOR = np.random.randn(dictionary.vocabulary_size,
                                     event_vector_size)

    _instantiate(
        name=name,
        num_nodes=TYPE_INT(1),
        target_size=target_size,
        context_size=context_size,
        negative_sample_size=negative_sample_size,
        event_vector_size=event_vector_size,
        dictionary=dictionary,
        W=W,
        log_level=logging.DEBUG,
    )

    profiler = cProfile.Profile()
    profiler.enable()
    for _ in range(NUM_MAX_TEST_TIMES):
        target_size = TYPE_INT(np.random.randint(1, 10))
        context_size = TYPE_INT(2 * np.random.randint(1, 10))
        negative_sample_size = TYPE_INT(np.random.randint(5, 20))
        event_vector_size: TYPE_INT = TYPE_INT(np.random.randint(5, 100))

        msg = "Name is string with length > 0."
        _instantiate_must_fail(name="",
                               num_nodes=(1 + negative_sample_size),
                               target_size=target_size,
                               context_size=context_size,
                               negative_sample_size=negative_sample_size,
                               event_vector_size=event_vector_size,
                               dictionary=dictionary,
                               log_level=logging.DEBUG,
                               msg=msg)

        msg = "num_nodes must > 0."
        _instantiate_must_fail(name=name,
                               num_nodes=TYPE_INT(0),
                               target_size=target_size,
                               context_size=context_size,
                               negative_sample_size=negative_sample_size,
                               event_vector_size=event_vector_size,
                               dictionary=dictionary,
                               log_level=logging.DEBUG,
                               msg=msg)

    profiler.disable()
    profiler.print_stats(sort="cumtime")
def test_020_embedding_instance_properties_access_to_fail(caplog):
    """
    Objective:
        Verify the layer class validates that parameters are initialized before being accessed.
    Expected:
        Access to a non-initialized parameter fails (AssertionError).
    """
    caplog.set_level(logging.DEBUG)
    name = "test_020_embedding_instance_properties_access_to_fail"
    dictionary: EventIndexing = _instantiate_event_indexing()

    profiler = cProfile.Profile()
    profiler.enable()

    for _ in range(NUM_MAX_TEST_TIMES):
        # First validate the correct configuration, then change parameter one by one.
        target_size = TYPE_INT(np.random.randint(1, 10))
        context_size = TYPE_INT(2 * np.random.randint(1, 10))
        negative_sample_size = TYPE_INT(np.random.randint(5, 20))
        event_vector_size: TYPE_INT = TYPE_INT(np.random.randint(5, 100))
        W: TYPE_TENSOR = np.random.randn(dictionary.vocabulary_size, event_vector_size)

        embedding, event_context = _must_succeed(
            name=name,
            num_nodes=(1+negative_sample_size),
            target_size=target_size,
            context_size=context_size,
            negative_sample_size=negative_sample_size,
            event_vector_size=event_vector_size,
            dictionary=dictionary,
            W=W,
            log_level=logging.DEBUG,
            msg="must succeed"
        )

        # --------------------------------------------------------------------------------
        # To fail
        # --------------------------------------------------------------------------------
        msg = "Accessing uninitialized property of the layer must fail."
        try:
            print(embedding.X)
            raise RuntimeError(msg)
        except AssertionError:
            pass

        try:
            print(embedding.N)
            raise RuntimeError(msg)
        except AssertionError:
            pass

        try:
            print(embedding.dX)
            raise RuntimeError(msg)
        except AssertionError:
            pass

        try:
            print(embedding.dW)
            raise RuntimeError(msg)
        except AssertionError:
            pass

        try:
            print(embedding.Y)
            raise RuntimeError(msg)
        except AssertionError:
            pass

        try:
            print(embedding.dY)
            raise RuntimeError(msg)
        except AssertionError:
            pass

    profiler.disable()
    profiler.print_stats(sort="cumtime")
def test_020_embedding_instantiation_to_fail():
    """
    Objective:
        Verify the layer class validates the initialization parameter constraints.
    Expected:
        Initialization detects parameter constraints not met and fails.
    """
    name = "test_020_embedding_instantiation_to_fail"

    # First validate the correct configuration, then change parameter one by one.
    dictionary: EventIndexing = _instantiate_event_indexing()
    target_size = TYPE_INT(np.random.randint(1, 10))
    context_size = TYPE_INT(2 * np.random.randint(1, 10))
    negative_sample_size = TYPE_INT(np.random.randint(5, 20))
    event_vector_size: TYPE_INT = TYPE_INT(np.random.randint(5, 100))
    W: TYPE_TENSOR = np.random.randn(dictionary.vocabulary_size, event_vector_size)

    _must_succeed(
        name=name,
        num_nodes=TYPE_INT(1),
        target_size=target_size,
        context_size=context_size,
        negative_sample_size=negative_sample_size,
        event_vector_size=event_vector_size,
        dictionary=dictionary,
        W=W,
        log_level=logging.DEBUG,
        msg="must succeed"
    )

    _must_succeed(
        name=name,
        num_nodes=(1+negative_sample_size),
        target_size=target_size,
        context_size=context_size,
        negative_sample_size=negative_sample_size,
        event_vector_size=event_vector_size,
        dictionary=dictionary,
        log_level=logging.DEBUG,
        msg="must succeed without W"
    )

    profiler = cProfile.Profile()
    profiler.enable()
    for _ in range(NUM_MAX_TEST_TIMES):
        target_size = TYPE_INT(np.random.randint(1, 10))
        context_size = TYPE_INT(2 * np.random.randint(1, 10))
        negative_sample_size = TYPE_INT(np.random.randint(5, 20))
        event_vector_size: TYPE_INT = TYPE_INT(np.random.randint(5, 100))

        msg = "Name is string with length > 0."
        _must_fail(
            name="",
            num_nodes=(1 + negative_sample_size),
            target_size=target_size,
            context_size=context_size,
            negative_sample_size=negative_sample_size,
            event_vector_size=event_vector_size,
            dictionary=dictionary,
            log_level=logging.DEBUG,
            msg=msg
        )

        msg = "num_nodes must > 0."
        _must_fail(
            name=name,
            num_nodes=TYPE_INT(0),
            target_size=target_size,
            context_size=context_size,
            negative_sample_size=negative_sample_size,
            event_vector_size=event_vector_size,
            dictionary=dictionary,
            log_level=logging.DEBUG,
            msg=msg
        )

        # msg = "num_nodes is 1+negative_sample_size but does not has to be enforced"
        # _must_fail(
        #     name=name,
        #     num_nodes=TYPE_INT(1+negative_sample_size),
        #     target_size=target_size,
        #     context_size=context_size,
        #     negative_sample_size=negative_sample_size,
        #     event_vector_size=event_vector_size,
        #     dictionary=dictionary,
        #     log_level=logging.DEBUG,
        #     msg=msg
        # )

        msg = "target size must be >0."
        _must_fail(
            name=name,
            num_nodes=(1 + negative_sample_size),
            target_size=TYPE_INT(0),
            context_size=context_size,
            negative_sample_size=negative_sample_size,
            event_vector_size=event_vector_size,
            dictionary=dictionary,
            log_level=logging.DEBUG,
            msg=msg
        )

        msg = "context_size must be >0."
        _must_fail(
            name=name,
            num_nodes=(1+negative_sample_size),
            target_size=target_size,
            context_size=TYPE_INT(0),
            negative_sample_size=negative_sample_size,
            event_vector_size=event_vector_size,
            dictionary=dictionary,
            log_level=logging.DEBUG,
            msg=msg
        )

        msg = "negative_sample_size must be >0."
        _must_fail(
            name=name,
            num_nodes=(1+negative_sample_size),
            target_size=target_size,
            context_size=context_size,
            negative_sample_size=TYPE_INT(0),
            event_vector_size=event_vector_size,
            dictionary=dictionary,
            log_level=logging.DEBUG,
            msg=msg
        )

        msg = "event_vector_size must be >0."
        _must_fail(
            name=name,
            num_nodes=(1+negative_sample_size),
            target_size=target_size,
            context_size=context_size,
            negative_sample_size=negative_sample_size,
            event_vector_size=TYPE_INT(0),
            dictionary=dictionary,
            log_level=logging.DEBUG,
            msg=msg
        )

        msg = "dictionary must be of type EventIndexing"
        _must_fail(
            name=name,
            num_nodes=(1+negative_sample_size),
            target_size=target_size,
            context_size=context_size,
            negative_sample_size=negative_sample_size,
            event_vector_size=event_vector_size,
            dictionary=["hoge", None][np.random.randint(0, 2)],
            log_level=logging.DEBUG,
            msg=msg
        )

        msg = "vocabulary_size must >= " \
              "(context_size + target_size) + " \
              "negative_sample_size + " \
              "len(EVENT_META_ENTITIES)"
        length = (context_size + target_size) + negative_sample_size
        corpus = " ".join(str(i) for i in range(length))
        _indexing_dummy = EventIndexing(
            name=__name__,
            num_nodes=1,
            corpus=corpus
        )
        assert _indexing_dummy.vocabulary_size == length + len(EVENT_META_ENTITIES)
        _must_succeed(
            name=name,
            num_nodes=(1+negative_sample_size),
            target_size=TYPE_INT(target_size),
            context_size=TYPE_INT(context_size),
            negative_sample_size=TYPE_INT(negative_sample_size),
            event_vector_size=TYPE_INT(event_vector_size),
            dictionary=_indexing_dummy,
            log_level=logging.DEBUG,
            msg=msg
        )
        _must_fail(
            name=name,
            num_nodes=(1+negative_sample_size),
            target_size=TYPE_INT(target_size) + 1,
            context_size=TYPE_INT(context_size),
            negative_sample_size=TYPE_INT(negative_sample_size),
            event_vector_size=TYPE_INT(event_vector_size),
            dictionary=_indexing_dummy,
            log_level=logging.DEBUG,
            msg=msg
        )

        msg = "vocabulary_size must be > W.shape[0]"
        w = np.random.randn(
            dictionary.vocabulary_size - np.random.randint(1, dictionary.vocabulary_size),
            event_vector_size
        )
        _must_fail(
            name=name,
            num_nodes=(1+negative_sample_size),
            target_size=TYPE_INT(target_size),
            context_size=TYPE_INT(context_size),
            negative_sample_size=TYPE_INT(negative_sample_size),
            event_vector_size=TYPE_INT(event_vector_size),
            dictionary=dictionary,
            W=w,
            log_level=logging.DEBUG,
            msg=msg
        )

        msg = "event_vector_size must be W.shape[1]"
        offset = np.random.randint(1, event_vector_size) * (1 if random.random() < 0.5 else -1)
        assert (event_vector_size + offset) > 0, \
            "%s %s %s" % (event_vector_size, offset, (event_vector_size + offset))
        w = np.random.randn(
            dictionary.vocabulary_size,
            (event_vector_size + offset)
        )
        _must_fail(
            name=name,
            num_nodes=(1+negative_sample_size),
            target_size=TYPE_INT(target_size),
            context_size=TYPE_INT(context_size),
            negative_sample_size=TYPE_INT(negative_sample_size),
            event_vector_size=TYPE_INT(event_vector_size),
            dictionary=dictionary,
            W=w,
            log_level=logging.DEBUG,
            msg=msg
        )

    profiler.disable()
    profiler.print_stats(sort="cumtime")
def test_020_adapt_embedding_loss_adapter_function_ye_to_succeed(caplog):
    """
    Objective:
        Verify the Adapter function handles Y in shape
        - Y:(N, 1+SL)
        - ys:(N,SL)
        - ye:(N,1)
    Expected:
        Adapter.function(Y) returns
        - For Y:(N, 1+SL), the return is in shape (N*(1+SL),1).
          Log loss T is set to the same shape

        - For Y:(N, SL), the return is in shape (N*SL,1).
          Log loss T is set to the same shape

        - For Y:(N,), the return is in shape (N,1).
          Log loss T is set to the same shape
    """
    caplog.set_level(logging.DEBUG)
    name = "test_020_adapt_embedding_logistic_loss_function_multi_lines"

    sentences = """
    Verify the EventIndexing function can handle multi line sentences
    the asbestos fiber <unk> is unusually <unk> once it enters the <unk> 
    with even brief exposures to it causing symptoms that show up decades later researchers said
    """

    dictionary: EventIndexing = _instantiate_event_indexing()

    profiler = cProfile.Profile()
    profiler.enable()

    for _ in range(NUM_MAX_TEST_TIMES):
        # First validate the correct configuration, then change parameter one by one.
        E = target_size = TYPE_INT(np.random.randint(1, 3))
        C = context_size = TYPE_INT(2 * np.random.randint(1, 5))
        SL = negative_sample_size = TYPE_INT(np.random.randint(1, 5))
        event_vector_size: TYPE_INT = TYPE_INT(np.random.randint(5, 20))
        W: TYPE_TENSOR = np.random.randn(dictionary.vocabulary_size,
                                         event_vector_size)

        loss, adapter, embedding, event_context = _instantiate(
            name=name,
            num_nodes=TYPE_INT(1),
            target_size=target_size,
            context_size=context_size,
            negative_sample_size=negative_sample_size,
            event_vector_size=event_vector_size,
            dictionary=dictionary,
            W=W,
            log_level=logging.DEBUG,
        )

        sequences = dictionary.function(sentences)
        target_context_pairs = event_context.function(sequences)
        Y = embedding.function(target_context_pairs)
        N, _ = embedding.tensor_shape(Y)

        # ********************************************************************************
        # Constraint:
        # - Adapter function returns (N,1) with the same values of ye
        # - Adapter function has set T:(N,1) with value 1 in the loss layer
        # ********************************************************************************
        msg = "ye must succeed"
        ye = Y[::, 0]
        EZ = expected_Z = embedding.reshape(ye, shape=(N, 1))
        Z = _function_must_succeed(adapter=adapter, Y=ye, msg=msg)
        assert embedding.all_close(
            Z, EZ,
            "Z must close to EZ. Z:\n%s\nEZ\n%s\nDiff\n%s\n" % (Z, EZ,
                                                                (EZ - Z)))
        T = np.ones(shape=(N, 1), dtype=TYPE_LABEL)
        assert embedding.all_equal(T, loss.T), \
            "Expected T must equals loss.T. Expected\n%s\nLoss.T\n%s\n" % (T, loss.T)

    profiler.disable()
    profiler.print_stats(sort="cumtime")
Example #20
def test_020_cross_entropy_log_loss_1d(caplog):
    """
    Objective:
        Test the categorical log loss values for P in 1 dimension.

    Constraints:
        1. The numerical gradient gn = (-t * logarithm(p+h) + t * logarithm(p-h)) / 2h.
        2. The numerical gradient gn is within +/- u of the analytical g = -T/P.

    P: Probabilities from softmax of shape (M,)
    M: Number of nodes in the cross_entropy_log_loss layer.
    T: Labels

    Note:
        log(P=1) -> 0
        dlog(x)/dx = 1/x
    """
    def f(P: np.ndarray, T: np.ndarray):
        return np.sum(cross_entropy_log_loss(P, T))

    # caplog.set_level(logging.DEBUG, logger=Logger.name)

    h: TYPE_FLOAT = OFFSET_DELTA
    u: TYPE_FLOAT = GRADIENT_DIFF_ACCEPTANCE_VALUE

    # --------------------------------------------------------------------------------
    # For (P, T): P[index] = True/1, OHE label T[index] = 1 where
    # P=[0,0,0,...,1,...0], T = [0,0,0,...1,...0]. T[i] == 1
    #
    # Do not forget the Jacobian shape is (N,) and calculate each element.
    # 1. For T=1, loss L = -log(Pi) = 0 and dL/dP = -(1/Pi) = -1 is expected.
    # 2. For T=0, loss L = 0, hence the numerical gradient
    #    (-0 * log(p+h) + 0 * log(p-h)) / 2h = 0 is expected.
    # --------------------------------------------------------------------------------
    M: TYPE_INT = np.random.randint(2, NUM_MAX_NODES)
    index: TYPE_INT = TYPE_INT(np.random.randint(
        0, M))  # Position of the true label in P
    P1 = np.zeros(M, dtype=TYPE_FLOAT)
    P1[index] = TYPE_FLOAT(1.0)
    T1 = np.zeros(M, dtype=TYPE_LABEL)
    T1[index] = TYPE_LABEL(1)

    # Analytically correct gradient for P=1, T=1
    AG = np.zeros_like(P1, dtype=TYPE_FLOAT)
    AG[index] = TYPE_FLOAT(-1)  # dL/dP = -1

    EGN1 = np.zeros_like(P1, dtype=TYPE_FLOAT)  # Expected numerical gradient
    EGN1[index] = (-1 * logarithm(TYPE_FLOAT(1.0 + h)) + TYPE_FLOAT(1) *
                   logarithm(TYPE_FLOAT(1.0 - h))) / TYPE_FLOAT(2 * h)
    assert np.all(np.abs(EGN1-AG) < u), \
        "Expected |EGN1-AG| < %s but %s\nEGN=\n%s" % (u, (EGN1-AG), EGN1)

    GN1 = numerical_jacobian(partial(f, T=T1), P1)
    assert np.all(np.abs(GN1-AG) < u), \
        "Expected |GN1-AG| < %s but %s\nGN=\n%s" % (u, (GN1-AG), GN1)

    # The numerical gradient gn = (-t * logarithm(p+h) + t * logarithm(p-h)) / 2h
    assert GN1.shape == EGN1.shape
    assert np.all(np.abs(EGN1-GN1) < u), \
        "Expected GN1==EGN1 but GN1-EGN1=\n%sP=\n%s\nT=%s\nEGN=\n%s\nGN=\n%s\n" \
        % (np.abs(GN1-EGN1), P1, T1, EGN1, GN1)

    # The numerical gradient gn is within +/- u of the analytical g = -T/P
    G1 = np.zeros_like(P1, dtype=TYPE_FLOAT)
    G1[T1 == 1] = -1 * (T1[index] / P1[index])
    # G1[T1 != 0] = 0
    check.equal(np.all(np.abs(G1 - GN1) < u), True,
                "G1-GN1 %s\n" % np.abs(G1 - GN1))

    # --------------------------------------------------------------------------------
    # For (P, T): P[index] = uniform random probability, integer label T = index
    # --------------------------------------------------------------------------------
    for _ in range(NUM_MAX_TEST_TIMES):
        M = np.random.randint(2, NUM_MAX_NODES)  # M > 1
        T2 = TYPE_LABEL(np.random.randint(0, M))  # location of the truth
        P2 = np.zeros(M, dtype=TYPE_FLOAT)
        while not (x := TYPE_FLOAT(
                np.random.uniform(low=-BOUNDARY_SIGMOID,
                                  high=BOUNDARY_SIGMOID))):
            pass
        p = softmax(x)
        P2[T2] = p

        # --------------------------------------------------------------------------------
        # The Jacobian G shape is the same with P.shape.
        # G:[0, 0, ...,g, 0, ...] where g is the numerical gradient close to -1/p.
        # --------------------------------------------------------------------------------
        # Expected numerical gradient EGN2 and the actual numerical Jacobian GN2
        # must be kept distinct so the assertions below compare the two.
        EGN2 = np.zeros_like(P2, dtype=TYPE_FLOAT)
        EGN2[T2] = TYPE_FLOAT(-1) * (logarithm(p + h) -
                                     logarithm(p - h)) / TYPE_FLOAT(2 * h)
        GN2 = numerical_jacobian(partial(f, T=T2), P2)

        # The numerical gradient gn = (-t * logarithm(p+h) + t * logarithm(p-h)) / 2h
        assert GN2.shape == EGN2.shape
        assert np.all(np.abs(EGN2-GN2) < u), \
            f"Delta expected to be < {u} but \n{np.abs(EGN2-GN2)}"

        G2 = np.zeros_like(P2, dtype=TYPE_FLOAT)
        G2[T2] = -1 / p

        # The numerical gradient gn is within +/- u of the analytical g = -T/P
        check.equal(np.all(np.abs(G2 - GN2) < u), True,
                    "G2-GN2 %s\n" % np.abs(G2 - GN2))
def test_020_embedding_gradient_descent(caplog):
    """
    Objective:
        Verify the gradient descent, especially np.ufunc.at, is working as expected.

        W:(V, D=3) where all elements are initialized to 1.0.
        dL/dY:(1,E+SL) = I and E=1,SL=1

        X=(target,context)=[3,4,5,6,5] where target_index=3.

    Expected:
        The context index 5 occurs twice so that W[5] should be updated twice
        at the gradient descent as W[5] = W[5] - [lr * (1 + l2) * dWc] * 2.
        For the target index 3 and the other context indices 4, 6:
        W[3] = W[3] - [lr * (1 + l2) * dWe].
        W[4] = W[4] - [lr * (1 + l2) * dWc].
        W[6] = W[6] - [lr * (1 + l2) * dWc].
    """
    caplog.set_level(logging.DEBUG)
    name = "test_020_embedding_gradient_multi_lines"
    dictionary: EventIndexing = _instantiate_event_indexing()

    def L(x):
        loss = Function.sum(
            x, axis=None, keepdims=False
        )
        return loss

    target_size = negative_sample_size = TYPE_INT(1)
    context_size = TYPE_INT(4)
    event_vector_size = TYPE_INT(3)
    W: TYPE_TENSOR = np.ones(
        shape=(dictionary.vocabulary_size, event_vector_size),
        dtype=TYPE_FLOAT
    )

    embedding, event_context = _must_succeed(
        name=name,
        num_nodes=(1 + negative_sample_size),
        target_size=target_size,
        context_size=context_size,
        negative_sample_size=negative_sample_size,
        event_vector_size=event_vector_size,
        dictionary=dictionary,
        W=W,
        log_level=logging.DEBUG,
        msg="must succeed"
    )
    del W   # embedding deep-copies W; drop the local reference to avoid unexpected changes
    embedding.objective = L

    target_context_pairs = np.array([[3, 4, 5, 6, 5]], dtype=TYPE_INT)

    # --------------------------------------------------------------------------------
    # Forward path
    # --------------------------------------------------------------------------------
    Y = embedding.function(target_context_pairs)
    EDWe, EDWs, EDWc = embedding.gradient_numerical()
    print(f"Loss {L(Y)}\n")

    # --------------------------------------------------------------------------------
    # Backward path
    # --------------------------------------------------------------------------------
    dY = Function.ones(shape=Function.tensor_shape(Y))
    embedding.gradient(dY)

    # --------------------------------------------------------------------------------
    # Expected We, Wc (we do not know Ws as negative sampling is stochastic).
    # This is for SGD as the optimizer.
    # --------------------------------------------------------------------------------
    lr = embedding.lr
    l2 = embedding.l2

    expected_dWe = lr * (1+l2) * embedding.dWe
    diff_We = embedding.optimizer.differential(dW=embedding.dWe)
    msg_We = "dWe: expected\n%s\n but actual diff=:\n%s\n" % \
             (expected_dWe, (expected_dWe-diff_We))
    embedding.all_close(
        expected_dWe, diff_We, msg=msg_We
    )
    EWe = embedding.W[3] - expected_dWe

    expected_dWc = lr * (1+l2) * embedding.dWc
    diff_Wc = embedding.optimizer.differential(dW=embedding.dWc)
    msg_Wc = "dWc: expected\n%s\n but actual diff=:\n%s\n" % \
             (expected_dWc, (expected_dWc-diff_Wc))
    embedding.all_close(
        expected_dWc, diff_Wc, msg=msg_Wc
    )
    EWc4 = np.subtract(embedding.W[4], expected_dWc)
    EWc5 = np.subtract(embedding.W[5], expected_dWc * 2)
    EWc6 = np.subtract(embedding.W[6], expected_dWc)

    # --------------------------------------------------------------------------------
    # Backward path: Gradient descent
    # --------------------------------------------------------------------------------
    assert np.array_equal(embedding.target_indices, np.array([3], dtype=TYPE_INT))
    assert np.array_equal(embedding.context_indices, np.array([4, 5, 6, 5], dtype=TYPE_INT))

    dWe, dWs, dWc = embedding.update()

    # ********************************************************************************
    # Constraint:
    # - dW is close to EDW
    # - dL/dWe = dL/dWs = Bc when dL/dY = I
    # ********************************************************************************
    assert Function.all_close(
        EDWe, dWe, msg="Expected (EDWe==dWe)\n%s\ndifference\n%s\n" % (EDWe, EDWe - dWe)
    )
    assert Function.all_close(
        EDWs, dWs, msg="Expected (EDWs==dWs)\n%s\ndifference\n%s\n" % (EDWs, EDWs - dWs)
    )
    assert Function.all_close(
        EDWc, dWc, msg="Expected (EWc5==W[5])\n%s\ndifference\n%s\n" % (EDWc, EDWc - dWc)
    )
    assert Function.all_close(
        dWe, dWs, msg="Expected (dWe==dWs) but dWe:\n%s\ndifference\n%s\n" % (dWe, dWe - dWs)
    )

    # ********************************************************************************
    # Constraint:
    # ********************************************************************************
    assert np.array_equal(expected_dWe, lr * (1+l2) * dWe)
    assert np.array_equal(expected_dWc, lr * (1+l2) * dWc)

    # - W[3] = W[3] - [lr * (1 + l2) * dWe].
    assert Function.all_close(
        EWe, embedding.WO[3], msg="Expected (EWe==WO[3])\n%s\ndifference\n%s\n" % (EWe, EWe - embedding.WO[3])
    )
    # W[4] = W[4] - [lr * (1 + l2) * dWc]
    assert Function.all_close(
        EWc4, embedding.W[4], msg="Expected (EWc4==W[4])\n%s\ndifference\n%s\n" % (EWc4, EWc4 - embedding.W[4])
    )
    # W[5] = W[5] - [lr * (1 + l2) * 2 * dWc]
    assert Function.all_close(
        EWc5, embedding.W[5], msg="Expected (EWc5==W[5])\n%s\ndifference\n%s\n" % (EWc5, EWc5 - embedding.W[5])
    )
    # W[6] = W[6] - [lr * (1 + l2) * dWc]
    assert Function.all_close(
        EWc6, embedding.W[6], msg="Expected (EWc6==W[6])\n%s\ndifference\n%s\n" % (EWc6, EWc6 - embedding.W[6])
    )
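
# The duplicate-index behavior this test verifies comes from np.ufunc.at:
# unlike fancy-index assignment, it accumulates every occurrence of an index.
# A minimal illustration with toy values (lr * (1 + l2) * dWc collapsed to 0.1):
import numpy as np

W = np.ones((7, 3))
dW = np.full((4, 3), 0.1)            # one update row per context slot
indices = np.array([4, 5, 6, 5])     # context index 5 occurs twice
np.subtract.at(W, indices, dW)       # unbuffered: W[5] is decremented twice
assert np.allclose(W[5], 0.8) and np.allclose(W[4], 0.9) and np.allclose(W[6], 0.9)
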
Example #22
    def sentence_to_sequence(
        sentences: str,
        event_to_index: Dict[str, TYPE_INT],
        minimum_length: TYPE_INT = TYPE_INT(0)
    ) -> List[List[TYPE_INT]]:
        """Generate sequence of event indices from a text sentences
        1. Skip an empty line or a line only contain non-word e.g punctuation.
        2. Return [[]] if there is no sequence generated.
        3. Pad each sequence to the length of max(sequence_len, or min_length).

        Sentence length varies and a vectorization framework e.g. numpy may
        require same length rows, e.g. numpy array cannot handle ragged numeric
        rows. Hence pad a sequence to align, when minimum_length > 0.

        A sentence can be short e.g. "I am". To create (event, context) pair,
        minimum (event_length+context_length) is required. If a generated
        sequence length < minimum_length, then pad it to meet the min length.

        Args:
            sentences:
                A string including one or more sentences to process.
                A sentence is delimited by EOL('\n').
            event_to_index: event to integer index mapping dictionary
            minimum_length: minimum length of a generated sequence.

        Returns: List of integer sequence per sentence
        """
        assert isinstance(sentences, str)

        sequences = []
        max_sequence_length = 0
        for line in sentences.split(EOL):
            if len(line.strip()) > 0:  # Skip empty line
                sequence = [
                    event_to_index.get(
                        w, EVENT_META_ENTITY_TO_INDEX[EVENT_UNK.lower()])
                    for w in Function.standardize(line).split()
                ]
                # A line may contain only punctuation, resulting in an empty sequence
                if len(sequence) > 0:
                    max_sequence_length = max(max_sequence_length,
                                              len(sequence))
                    sequences.append(sequence)
            else:
                Logger.warning("Sentence is empty. Skipping...")

        if len(sequences) > 0:
            if minimum_length > 0:  # padding required
                max_sequence_length = max(max_sequence_length, minimum_length)
                padded: List[List[TYPE_INT]] = [
                    np.pad(array=seq,
                           pad_width=(0, max_sequence_length - len(seq)),
                           constant_values=EVENT_META_ENTITY_TO_INDEX[
                               EVENT_NIL.lower()]).astype(TYPE_INT).tolist()
                    for seq in sequences
                ]
                del sequences
            else:
                padded = sequences
        else:
            Logger.warning(
                "Return [[]] as no valid sentences in the input \n[%s]\n",
                sentences)
            padded = [[]]

        Logger.debug("Sequences generated for \n%s\n%s", sentences, padded)
        return padded
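
# The padding branch above relies on np.pad with constant_values. A standalone
# illustration of aligning ragged sequences, assuming the NIL index is 0
# (toy indices, not the real EVENT_META_ENTITY_TO_INDEX mapping):
import numpy as np

sequences = [[3, 4, 5], [7]]
minimum_length = 4
max_len = max(max(len(s) for s in sequences), minimum_length)
padded = [
    np.pad(array=seq, pad_width=(0, max_len - len(seq)),
           constant_values=0).astype(int).tolist()
    for seq in sequences
]
assert padded == [[3, 4, 5, 0], [7, 0, 0, 0]]
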
def test_020_adapt_embedding_loss_adapter_gradient_to_succeed(caplog):
    """
    Objective:
        Verify the Adapter gradient method handles dY in shape (N, 1+SL).

        Adapter.function(Y) returns
        - For Y:(N, 1+SL), the return is in shape (N*(1+SL),1).
          Log loss T is set to the same shape.

    Expected:
        Adapter.gradient(loss.gradient(1)) restores the shape (N, 1+SL) and
        matches the analytical dL/dY = (sigmoid(Y) - T) / (N*(1+SL)).
    """
    caplog.set_level(logging.DEBUG)
    name = "test_020_adapt_embedding_logistic_loss_function_multi_lines"

    sentences = """
    Verify the EventIndexing function can handle multi line sentences
    the asbestos fiber <unk> is unusually <unk> once it enters the <unk> 
    with even brief exposures to it causing symptoms that show up decades later researchers said
    """

    dictionary: EventIndexing = _instantiate_event_indexing()

    profiler = cProfile.Profile()
    profiler.enable()

    for _ in range(NUM_MAX_TEST_TIMES):
        # First validate the correct configuration, then change parameter one by one.
        E = target_size = TYPE_INT(np.random.randint(1, 3))
        C = context_size = TYPE_INT(2 * np.random.randint(1, 5))
        SL = negative_sample_size = TYPE_INT(np.random.randint(1, 5))
        event_vector_size: TYPE_INT = TYPE_INT(np.random.randint(5, 20))
        W: TYPE_TENSOR = np.random.rand(dictionary.vocabulary_size,
                                        event_vector_size)

        loss, adapter, embedding, event_context = _instantiate(
            name=name,
            num_nodes=TYPE_INT(1),
            target_size=target_size,
            context_size=context_size,
            negative_sample_size=negative_sample_size,
            event_vector_size=event_vector_size,
            dictionary=dictionary,
            W=W,
            log_level=logging.DEBUG,
        )

        # ================================================================================
        # Forward path
        # ================================================================================
        # --------------------------------------------------------------------------------
        # Event indexing
        # --------------------------------------------------------------------------------
        sequences = dictionary.function(sentences)

        # --------------------------------------------------------------------------------
        # Event context pairs
        # --------------------------------------------------------------------------------
        target_context_pairs = event_context.function(sequences)

        # --------------------------------------------------------------------------------
        # Embedding
        # --------------------------------------------------------------------------------
        Y = embedding.function(target_context_pairs)
        N, _ = embedding.tensor_shape(Y)
        batch_size = TYPE_FLOAT(N * (1 + SL))

        # --------------------------------------------------------------------------------
        # Adapter
        # --------------------------------------------------------------------------------
        Z = adapter.function(Y)

        # --------------------------------------------------------------------------------
        # Loss
        # --------------------------------------------------------------------------------
        L = loss.function(Z)

        # ********************************************************************************
        # Constraint:
        #   loss.T is set to the T by adapter.function()
        # ********************************************************************************
        T = np.zeros(shape=(N, (1 + SL)), dtype=TYPE_LABEL)
        T[::, 0] = TYPE_LABEL(1)
        assert embedding.all_equal(T.reshape(-1, 1), loss.T), \
            "Expected T must equals loss.T. Expected\n%s\nLoss.T\n%s\n" % (T, loss.T)

        # ********************************************************************************
        # Constraint:
        #   Expected loss is sum(sigmoid_cross_entropy_log_loss(Y, T)) / (N*(1+SL))
        #   The batch size for the Log Loss is (N*(1+SL))
        # ********************************************************************************
        EJ, EP = sigmoid_cross_entropy_log_loss(X=Z, T=T.reshape(-1, 1))
        EL = np.sum(EJ, dtype=TYPE_FLOAT) / batch_size

        assert embedding.all_close(EL, L), \
            "Expected EL=L but EL=\n%s\nL=\n%s\nDiff=\n%s\n" % (EL, L, (EL-L))

        # ================================================================================
        # Backward path
        # ================================================================================
        # ********************************************************************************
        # Constraint:
        #   Expected dL/dY from the Log Loss is (P-T) / (N*(1+SL))
        # ********************************************************************************
        EDY = (sigmoid(Y) - T.astype(TYPE_FLOAT)) / batch_size
        assert EDY.shape == Y.shape

        dY = adapter.gradient(loss.gradient(TYPE_FLOAT(1)))
        assert dY.shape == Y.shape
        assert embedding.all_close(EDY, dY), \
            "Expected EDY==dY. EDY=\n%s\nDiff\n%s\n" % (EDY, (EDY-dY))

    profiler.disable()
    profiler.print_stats(sort="cumtime")