Beispiel #1
0
def prep_tree_srm_arg(relation_list, arg_pos, wbm, max_length, all_left_branching=False):
    """Build the padded arrays describing the parse trees of one argument.

    For every relation a tree is obtained from ``tree_util``: a left-branching
    tree when ``all_left_branching`` is set or when the found parse has no
    leaves, otherwise the found parse. Its leaves are turned into word indices
    with ``wbm.index_tokens`` and its node ordering comes from
    ``tree_util.reverse_toposort``.

    With T = ``max_length``, N = ``len(relation_list)`` and
    d = ``wbm.num_units`` the function returns a 4-tuple:

    embedding_series : (2T, N, d) rows of ``wbm.wm`` selected by the leaf
        word indices; slots not filled by a leaf select row 0
    children : (T, N, 3) int64, ordering-matrix rows of the inner nodes
    c_mask : (T, N) mask, 1.0 where ``children`` holds an inner node
    node_mask : (2T, N) mask, 1.0 at the inner-node positions

    At most ``max_length`` leaves and ``2 * max_length`` nodes are kept per
    sample. ``arg_pos`` must be 1 or 2 (checked with ``assert``).
    """
    assert arg_pos == 1 or arg_pos == 2
    n_samples = len(relation_list)
    w_indices = np.zeros((2 * max_length, n_samples), dtype="int64")
    c_mask = np.zeros((max_length, n_samples), dtype=config.floatX)
    node_mask = np.zeros((2 * max_length, n_samples), dtype=config.floatX)
    # Filled sample-major here; swapped to time-major right before returning.
    children = np.zeros((n_samples, max_length, 3), dtype="int64")
    for i, relation in enumerate(relation_list):
        if all_left_branching:
            parse_tree = tree_util.left_branching_tree(relation, arg_pos)
        else:
            parse_tree = tree_util.find_parse_tree(relation, arg_pos)
            if len(parse_tree.leaves()) == 0:
                # Empty parse: fall back to the left-branching tree.
                parse_tree = tree_util.left_branching_tree(relation, arg_pos)
        indices = wbm.index_tokens(parse_tree.leaves(), ignore_OOV=False)

        # Leaves beyond max_length are dropped.
        sequence_length = min(max_length, len(indices))
        w_indices[:sequence_length, i] = indices[:sequence_length]

        ordering_matrix, num_leaves = tree_util.reverse_toposort(parse_tree)
        num_nodes = min(2 * max_length, ordering_matrix.shape[0])
        # NOTE(review): looks like leftover debugging output; kept so stdout is
        # unchanged. Written as a single-argument print(...) so the text is
        # identical on Python 2 and Python 3.
        print("%s %s" % (num_leaves, num_nodes))
        if num_nodes > num_leaves:
            # NOTE(review): presumably the trees never have more than
            # max_length inner nodes inside the 2T cap; otherwise the
            # assignment into `children` would not fit -- confirm.
            num_inner_nodes = num_nodes - num_leaves
            children[i, :num_inner_nodes, :] = ordering_matrix[num_leaves:num_nodes, :]
            c_mask[:num_inner_nodes, i] = 1.0
            node_mask[num_leaves:num_nodes, i] = 1.0
    children = np.swapaxes(children, 0, 1)
    embedding_series = (
        wbm.wm[w_indices.flatten()].reshape([max_length * 2, n_samples, wbm.num_units]).astype(config.floatX)
    )
    return embedding_series, children, c_mask, node_mask
Beispiel #2
0
def prep_tree_srm_arg(relation_list, arg_pos, wbm, max_length,
        all_left_branching=False, node_label_alphabet=None):
    """Make the matrices from the data required for the tree model

    T = number of time steps (max_length)
    N = number of samples (len(relation_list))
    d = dimensionality of the embedding (wbm.num_units)
    k = number of entries in node_label_alphabet

    Returns a 5-tuple:

    embedding_series : 2T x N x d serrated matrix, word embedding for the
            leaves; slots not filled by a leaf select row 0 of wbm.wm
    children : T x N x 3 children serrated matrix
    c_mask : T x N masking matrix for children matrix
    node_mask : 2T x N masking matrix for the internal nodes
            (for embedding_series) nice for computing mean h or sum h
    node_label_tensor : 2T x N x k one-hot node labels. A label that is not
            in node_label_alphabet is mapped to its 'OTHERS' entry. Labels
            are only written for samples that keep at least one inner node.

    node_label_alphabet maps a node label to its index along the k axis.
    None (the default) or an empty mapping disables labelling and yields a
    2T x N x 0 tensor. arg_pos must be 1 or 2 (checked with assert).
    """
    assert arg_pos == 1 or arg_pos == 2
    if node_label_alphabet is None:
        # A fresh dict per call instead of a shared mutable default.
        node_label_alphabet = {}
    num_labels = len(node_label_alphabet)
    n_samples = len(relation_list)
    w_indices = np.zeros((2 * max_length, n_samples), dtype='int64')
    c_mask = np.zeros((max_length, n_samples), dtype=config.floatX)
    node_mask = np.zeros((2 * max_length, n_samples), dtype=config.floatX)
    # Filled sample-major here; swapped to time-major right before returning.
    children = np.zeros((n_samples, max_length, 3), dtype='int64')
    node_label_tensor = np.zeros((2 * max_length, n_samples, num_labels), dtype=config.floatX)
    for i, relation in enumerate(relation_list):
        if all_left_branching:
            parse_tree = tree_util.left_branching_tree(relation, arg_pos)
        else:
            parse_tree = tree_util.find_parse_tree(relation, arg_pos)
            if len(parse_tree.leaves()) == 0:
                # Empty parse: fall back to the left-branching tree.
                parse_tree = tree_util.left_branching_tree(relation, arg_pos)
        indices = wbm.index_tokens(parse_tree.leaves(), ignore_OOV=False)

        # Leaves beyond max_length are dropped.
        sequence_length = min(max_length, len(indices))
        w_indices[:sequence_length, i] = indices[:sequence_length]

        ordering_matrix, node_label_list, num_leaves = \
                tree_util.reverse_toposort(parse_tree)
        num_nodes = min(2 * max_length, ordering_matrix.shape[0])
        if num_nodes <= num_leaves:
            # No inner node survives the 2T cap; nothing more to record.
            continue

        # NOTE(review): presumably the trees never have more than max_length
        # inner nodes inside the 2T cap; otherwise the assignment into
        # `children` would not fit -- confirm.
        num_inner_nodes = num_nodes - num_leaves
        children[i, :num_inner_nodes, :] = ordering_matrix[num_leaves:num_nodes, :]
        c_mask[:num_inner_nodes, i] = 1.
        node_mask[num_leaves:num_nodes, i] = 1.
        if num_labels > 0:
            for t, node_label in enumerate(node_label_list):
                if node_label is None or t >= 2 * max_length:
                    continue
                if node_label in node_label_alphabet:
                    label_index = node_label_alphabet[node_label]
                else:
                    label_index = node_label_alphabet['OTHERS']
                node_label_tensor[t, i, label_index] = 1.

    children = np.swapaxes(children, 0, 1)
    embedding_series = \
        wbm.wm[w_indices.flatten()].\
            reshape([max_length * 2, n_samples, wbm.num_units]).\
            astype(config.floatX)
    return embedding_series, children, c_mask, node_mask, node_label_tensor
Beispiel #3
0
def prep_tree_arg(relation_list, arg_pos, all_left_branching=False):
    """Return one parse tree per relation for the given argument position.

    When ``all_left_branching`` is true every tree comes from
    ``tree_util.left_branching_tree``. Otherwise the tree found by
    ``tree_util.find_parse_tree`` is used: if it has no leaves it is replaced
    by the left-branching tree, else it is passed through
    ``tree_util.binarize_tree``. In that second mode the tree is printed to
    stdout before and after this step.

    The result is a list in the same order as ``relation_list``; an empty
    ``relation_list`` yields an empty list.
    """
    parse_trees = []
    for relation in relation_list:
        if all_left_branching:
            parse_tree = tree_util.left_branching_tree(relation, arg_pos)
        else:
            parse_tree = tree_util.find_parse_tree(relation, arg_pos)
            # Single-argument print(...) emits the same text on Python 2
            # and Python 3, unlike the print statement.
            print(parse_tree)
            if len(parse_tree.leaves()) == 0:
                print("use left branching tree because parse is empty")
                parse_tree = tree_util.left_branching_tree(relation, arg_pos)
            else:
                parse_tree = tree_util.binarize_tree(parse_tree)
            print(parse_tree)
        parse_trees.append(parse_tree)
    return parse_trees
Beispiel #4
0
def prep_tree_arg(relation_list, arg_pos, all_left_branching=False):
    """Collect the parse tree of argument ``arg_pos`` for each relation.

    With ``all_left_branching`` set, each tree is built by
    ``tree_util.left_branching_tree``. Without it, the parse returned by
    ``tree_util.find_parse_tree`` is binarized with
    ``tree_util.binarize_tree``, unless that parse has no leaves, in which
    case the left-branching tree is used instead. In this mode the tree is
    printed to stdout before and after the step.

    Returns a list of trees ordered like ``relation_list`` (empty input gives
    an empty list).
    """
    parse_trees = []
    for relation in relation_list:
        if all_left_branching:
            parse_tree = tree_util.left_branching_tree(relation, arg_pos)
        else:
            parse_tree = tree_util.find_parse_tree(relation, arg_pos)
            # print(...) with one argument behaves the same on Python 2 and
            # Python 3; the bare print statement only parses on Python 2.
            print(parse_tree)
            if len(parse_tree.leaves()) == 0:
                print('use left branching tree because parse is empty')
                parse_tree = tree_util.left_branching_tree(relation, arg_pos)
            else:
                parse_tree = tree_util.binarize_tree(parse_tree)
            print(parse_tree)
        parse_trees.append(parse_tree)
    return parse_trees
Beispiel #5
0
def prep_tree_srm_arg(relation_list,
                      arg_pos,
                      wbm,
                      max_length,
                      all_left_branching=False):
    """Turn the parse trees of one argument into padded arrays.

    Each relation contributes one tree taken from ``tree_util``: the
    left-branching tree when ``all_left_branching`` is set or when the found
    parse has no leaves, otherwise the found parse. Leaves are converted to
    word indices by ``wbm.index_tokens``; the node ordering comes from
    ``tree_util.reverse_toposort``.

    Let T = ``max_length``, N = ``len(relation_list)``,
    d = ``wbm.num_units``. The returned 4-tuple is:

    embedding_series : (2T, N, d) rows of ``wbm.wm`` selected by the leaf
        word indices; slots not filled by a leaf select row 0
    children : (T, N, 3) int64, ordering-matrix rows of the inner nodes
    c_mask : (T, N) mask, 1. where ``children`` holds an inner node
    node_mask : (2T, N) mask, 1. at the inner-node positions

    No more than ``max_length`` leaves and ``2 * max_length`` nodes are kept
    per sample. ``arg_pos`` must be 1 or 2 (checked with ``assert``).
    """
    assert arg_pos == 1 or arg_pos == 2
    n_samples = len(relation_list)
    w_indices = np.zeros((2 * max_length, n_samples), dtype='int64')
    c_mask = np.zeros((max_length, n_samples), dtype=config.floatX)
    node_mask = np.zeros((2 * max_length, n_samples), dtype=config.floatX)
    # Filled sample-major here; swapped to time-major right before returning.
    children = np.zeros((n_samples, max_length, 3), dtype='int64')
    for i, relation in enumerate(relation_list):
        if all_left_branching:
            parse_tree = tree_util.left_branching_tree(relation, arg_pos)
        else:
            parse_tree = tree_util.find_parse_tree(relation, arg_pos)
            if len(parse_tree.leaves()) == 0:
                # Empty parse: fall back to the left-branching tree.
                parse_tree = tree_util.left_branching_tree(relation, arg_pos)
        indices = wbm.index_tokens(parse_tree.leaves(), ignore_OOV=False)

        # Leaves beyond max_length are dropped.
        sequence_length = min(max_length, len(indices))
        w_indices[:sequence_length, i] = indices[:sequence_length]

        ordering_matrix, num_leaves = tree_util.reverse_toposort(parse_tree)
        num_nodes = min(2 * max_length, ordering_matrix.shape[0])
        # NOTE(review): looks like leftover debugging output; kept so stdout is
        # unchanged. A single-argument print(...) produces the same text on
        # Python 2 and Python 3.
        print('%s %s' % (num_leaves, num_nodes))
        if num_nodes > num_leaves:
            # NOTE(review): presumably the trees never have more than
            # max_length inner nodes inside the 2T cap; otherwise the
            # assignment into `children` would not fit -- confirm.
            num_inner_nodes = num_nodes - num_leaves
            children[i, :num_inner_nodes, :] = ordering_matrix[
                num_leaves:num_nodes, :]
            c_mask[:num_inner_nodes, i] = 1.
            node_mask[num_leaves:num_nodes, i] = 1.
    children = np.swapaxes(children, 0, 1)
    embedding_series = \
        wbm.wm[w_indices.flatten()].\
            reshape([max_length * 2, n_samples, wbm.num_units]).\
            astype(config.floatX)
    return embedding_series, children, c_mask, node_mask