Beispiel #1
0
def single_test_propagate():
    """Check the output of propagate() against the module-level test data.

    Three properties are asserted: the output holds the expected number of
    terms in the same order as the combined index, the vectors of the input
    embedding are unchanged, and every new non-English term whose neighbors
    of lesser rank include no new English term is assigned the mean of
    those neighbors' propagated vectors.
    """
    # Call the code under test.
    propagated = propagate(
        COMBINED_INDEX, FRAME, ADJACENCY_MATRIX, len(NEW_ENGLISH_TERMS)
    )

    # The propagated terms should be the terms from the combined index,
    # starting with the terms of the frame and going up to the last new
    # term from the graph that is not in English.
    expected_length = len(FRAME) + len(NEW_NON_ENGLISH_TERMS)
    assert (
        len(propagated) == expected_length
    ), 'Incorrect number {} (should be {}) of propagated terms.'.format(
        len(propagated), expected_length
    )
    for i_term, propagated_term in enumerate(propagated.index):
        assert (
            propagated_term == COMBINED_INDEX[i_term]
        ), 'Propagated output terms do not agree with the input terms.'

    # The original embedding should not be altered.
    assert_allclose(
        propagated.values[: len(FRAME), :],
        FRAME.values,
        err_msg='Propagation changed an input embedding vector.',
    )

    # Terms not from the original embedding should be assigned the
    # average of the vectors of their neighbors of lesser rank, if all
    # of those neighbors are either from the original embedding or non-
    # English.
    for term in NEW_NON_ENGLISH_TERMS:
        neighbor_count = 0
        # Named neighbor_sum (not sum) so the builtin is not shadowed.
        neighbor_sum = np.zeros((EMBEDDING_DIM,), dtype=np.float32)
        for other_term in COMBINED_INDEX:
            if (term, other_term) in EDGE_SET and RANKS[other_term] < RANKS[term]:
                if other_term in NEW_ENGLISH_TERMS:
                    # A new English neighbor of lesser rank means no
                    # expectation is checked for this term.
                    break
                neighbor_count += 1
                neighbor_sum = np.add(neighbor_sum, propagated.loc[other_term])
        else:
            # Reached only when the inner loop finished without a break.
            # NOTE(review): neighbor_count is 0 here for a term with no
            # neighbors of lesser rank, making the expected value a division
            # by zero; presumably the test data never produces that case --
            # confirm.
            assert_allclose(
                propagated.loc[term],
                neighbor_sum / neighbor_count,
                err_msg='Incorrect propagated vector for term {}'.format(term),
            )
def single_test_propagate():
    """Run propagate() on the module-level test data and verify its output.

    The test asserts that the result contains the expected number of terms,
    ordered as in the combined index; that the rows belonging to the input
    embedding are left unchanged; and that each new non-English term with no
    new English neighbor of lesser rank receives the average of the
    propagated vectors of its neighbors of lesser rank.
    """
    # Call the code under test.
    propagated = propagate(
        COMBINED_INDEX, FRAME, ADJACENCY_MATRIX, len(NEW_ENGLISH_TERMS)
    )

    # The propagated terms should be the terms from the combined index,
    # starting with the terms of the frame and going up to the last new
    # term from the graph that is not in English.
    n_expected = len(FRAME) + len(NEW_NON_ENGLISH_TERMS)
    assert len(propagated) == n_expected, \
        'Incorrect number {} (should be {}) of propagated terms.'.format(
            len(propagated), n_expected)
    for position, output_term in enumerate(propagated.index):
        assert output_term == COMBINED_INDEX[position], \
            'Propagated output terms do not agree with the input terms.'

    # The original embedding should not be altered.
    assert_allclose(propagated.values[:len(FRAME), :], FRAME.values,
                    err_msg='Propagation changed an input embedding vector.')

    # Terms not from the original embedding should be assigned the
    # average of the vectors of their neighbors of lesser rank, if all
    # of those neighbors are either from the original embedding or non-
    # English.
    for term in NEW_NON_ENGLISH_TERMS:
        n_neighbors = 0
        # Running total of neighbor vectors; deliberately not called "sum",
        # which would hide the builtin of that name.
        total = np.zeros((EMBEDDING_DIM,), dtype=np.float32)
        for other_term in COMBINED_INDEX:
            if (
                (term, other_term) in EDGE_SET
                and RANKS[other_term] < RANKS[term]
            ):
                if other_term in NEW_ENGLISH_TERMS:
                    # Breaking skips the else clause below, so terms with a
                    # new English neighbor of lesser rank are not checked.
                    break
                n_neighbors += 1
                total = np.add(total, propagated.loc[other_term])
        else:
            # Runs only if the loop above was not left via break.
            # NOTE(review): if a term had no neighbors of lesser rank,
            # n_neighbors would be 0 and the division below would be by
            # zero; presumably the test data excludes this -- confirm.
            assert_allclose(
                propagated.loc[term], total / n_neighbors,
                err_msg='Incorrect propagated vector for term {}'.format(term)
            )
Beispiel #3
0
def single_test_sharded_propagate():
    """Check sharded_propagate() against the unsharded propagate() output.

    sharded_propagate is run over the test data in two shards with its I/O
    and adjacency-matrix construction replaced by mocks; the shards handed
    to the mocked save_hdf are then compared, file name by file name and
    column slice by column slice, with the result of propagate().
    """
    nshards = 2

    # sharded_propagate writes one shard file per shard through save_hdf;
    # this mock (returning None, as save_hdf does) records those calls so
    # the shards can be retrieved afterwards.
    shard_collector = Mock(return_value=None)

    # Replacement inputs: the test data graph stands in for the assoc edge
    # file, and the known good adjacency matrix, combined index, and number
    # of new English terms stand in for the result of make_adjacency_matrix,
    # making this test independent of any failures of that function.
    assoc_stream = io.StringIO(ASSOC_FILE_CONTENTS)
    adjacency_result = (
        ADJACENCY_MATRIX, COMBINED_INDEX, len(NEW_ENGLISH_TERMS)
    )

    with patch('builtins.open', return_value=assoc_stream):
        with patch(
            'conceptnet5.vectors.propagate.make_adjacency_matrix',
            return_value=adjacency_result,
        ):
            # load_hdf supplies the embedding; give it the test data frame.
            with patch(
                'conceptnet5.vectors.propagate.load_hdf', return_value=FRAME
            ):
                with patch(
                    'conceptnet5.vectors.propagate.save_hdf', shard_collector
                ):
                    sharded_propagate(
                        'ignored_assoc_file',
                        'ignored_embedding_file',
                        'shard_filename_root',
                        nshards=nshards,
                    )

    # Unsharded propagation provides the reference output.
    propagated = propagate(
        COMBINED_INDEX, FRAME, ADJACENCY_MATRIX, len(NEW_ENGLISH_TERMS)
    )

    # Positions of the arguments of interest in each save_hdf call.
    shard_arg = 0  # the shard dataframe comes first
    fname_arg = 1  # the file name comes second

    # One shard file per shard must have been written, each to the
    # correct file name.
    n_written = len(shard_collector.call_args_list)
    assert n_written == nshards, (
        'Incorrect number {} (should be {}) of shards written.'.format(
            n_written, nshards
        )
    )
    for i_shard in range(nshards):
        # File name passed in the (i_shard)-th call to the mocked save_hdf.
        filename = extract_positional_arg(shard_collector, i_shard, fname_arg)
        expected_filename = 'shard_filename_root.shard{}'.format(i_shard)
        assert filename == expected_filename, (
            'Shard {} written to incorrect file name {}.'.format(
                i_shard, filename
            )
        )

    # Each shard must agree with the matching piece of the unsharded output.
    shard_width = EMBEDDING_DIM // nshards
    for i_shard in range(nshards):
        # Dataframe passed in the (i_shard)-th call to the mocked save_hdf.
        shard = extract_positional_arg(shard_collector, i_shard, shard_arg)
        first_dim = i_shard * EMBEDDING_DIM // nshards
        last_dim = first_dim + shard_width

        n_shard_terms = len(shard.index)
        n_ref_terms = len(propagated.index)
        assert n_shard_terms == n_ref_terms, (
            'Shard {} has incorrect length {} (should be {}).'.format(
                i_shard, n_shard_terms, n_ref_terms
            )
        )
        for shard_term, ref_term in zip(shard.index, propagated.index):
            assert shard_term == ref_term, (
                'Shard {} has term {} where reference has {}.'.format(
                    i_shard, shard_term, ref_term
                )
            )

        reference_slice = propagated.values[:, first_dim:last_dim]
        assert_allclose(
            shard.values,
            reference_slice,
            err_msg='Shard {} has incorrect propagated vectors.'.format(
                i_shard
            ),
        )
def single_test_sharded_propagate():
    """Verify that sharded propagation reproduces unsharded propagation.

    Runs sharded_propagate over the test data in 2 shards, with file
    reading, embedding loading, shard saving, and adjacency-matrix
    construction all patched, then checks the file names and contents of
    the saved shards against the output of propagate().
    """
    nshards = 2
    # Mock for save_hdf (save_hdf returns None); queried later to recover
    # the shards that sharded_propagate tried to write.
    shard_collector = Mock(return_value=None)

    # Values served by the patched functions.  builtins.open yields the
    # test data graph in place of the assoc edge file; make_adjacency_matrix
    # yields the known good adjacency matrix, combined index, and number of
    # new terms in English, so failures of that function cannot affect this
    # test.
    fake_assoc_file = io.StringIO(ASSOC_FILE_CONTENTS)
    known_good_matrix_output = (
        ADJACENCY_MATRIX,
        COMBINED_INDEX,
        len(NEW_ENGLISH_TERMS),
    )
    module = 'conceptnet5.vectors.propagate'

    with patch('builtins.open', return_value=fake_assoc_file):
        with patch(
            module + '.make_adjacency_matrix',
            return_value=known_good_matrix_output,
        ):
            # The embedding is read with load_hdf; serve the test frame.
            with patch(module + '.load_hdf', return_value=FRAME):
                with patch(module + '.save_hdf', shard_collector):
                    sharded_propagate(
                        'ignored_assoc_file',
                        'ignored_embedding_file',
                        'shard_filename_root',
                        nshards=nshards,
                    )

    # Run unsharded propagation for comparison.
    propagated = propagate(
        COMBINED_INDEX, FRAME, ADJACENCY_MATRIX, len(NEW_ENGLISH_TERMS)
    )

    shard_arg = 0  # shard is 1st arg to save_hdf
    fname_arg = 1  # filename is 2nd arg to save_hdf.

    # Two shard files should have been written, to the correct file names.
    save_calls = len(shard_collector.call_args_list)
    assert save_calls == nshards, (
        'Incorrect number {} (should be {}) of shards written.'.format(
            save_calls, nshards
        )
    )
    for shard_number in range(nshards):
        # Positional argument in the filename position of this call to
        # the shard_collector mock.
        filename = extract_positional_arg(
            shard_collector, shard_number, fname_arg
        )
        assert (
            filename == 'shard_filename_root.shard{}'.format(shard_number)
        ), 'Shard {} written to incorrect file name {}.'.format(
            shard_number, filename
        )

    # The shards should agree with the appropriate pieces of the unsharded
    # output.
    for shard_number in range(nshards):
        # Positional argument in the shard dataframe position of this call
        # to the shard_collector mock.
        shard = extract_positional_arg(
            shard_collector, shard_number, shard_arg
        )
        start = shard_number * EMBEDDING_DIM // nshards
        stop = start + EMBEDDING_DIM // nshards

        shard_length = len(shard.index)
        reference_length = len(propagated.index)
        assert (
            shard_length == reference_length
        ), 'Shard {} has incorrect length {} (should be {}).'.format(
            shard_number, shard_length, reference_length
        )

        for term_in_shard, term_in_reference in zip(
            shard.index, propagated.index
        ):
            assert (
                term_in_shard == term_in_reference
            ), 'Shard {} has term {} where reference has {}.'.format(
                shard_number, term_in_shard, term_in_reference
            )

        assert_allclose(
            shard.values,
            propagated.values[:, start:stop],
            err_msg='Shard {} has incorrect propagated vectors.'.format(
                shard_number
            ),
        )