Ejemplo n.º 1
0
def arg_l_arg_r_pairs_vector(args_file, file_contains_context=False, has_header=True):
    """
    Build the class-label vector for the argument pairs stored in ``args_file``.

    Rows are read with ``tio.read_args_w_ctx`` when ``file_contains_context``
    is true, otherwise with ``tio.read_args_wo_ctx``; in the latter case an
    empty string stands in for the missing context. Every
    (context, left arg, right arg) triple is added to a fresh
    ``td.TripleDict``. A row counts as a duplicate when the id returned by
    ``add`` is smaller than the number of labels collected so far (this
    presumably relies on ``TripleDict.add`` returning the existing id for an
    already known triple -- TODO confirm). Duplicates are reported on stderr
    and do not contribute a label.

    :param args_file: path handed to the ``tio`` reader.
    :param file_contains_context: selects the reader with or without context.
    :param has_header: forwarded unchanged to the reader.
    :return: ``(vec, d_triples)`` where ``vec`` is a float64 array holding 1
        for rows whose entailment field equals 'true' (stripped,
        case-insensitive) and 0 otherwise, and ``d_triples`` is the filled
        ``TripleDict``.
    """
    logging.info("creating arg pairs class vector '{}'".format(args_file))
    labels = []
    if file_contains_context:
        rows = tio.read_args_w_ctx(args_file, has_header=has_header)
    else:
        pairs = tio.read_args_wo_ctx(args_file, has_header=has_header)
        # Pad with an empty context so both formats feed the same loop.
        rows = (('', left, right, flag) for left, right, flag in pairs)

    d_triples = td.TripleDict() # rows
    n_duplicates = 0
    n_contradicting = 0
    for ctx, arg_l, arg_r, entailing in rows:
        row_id = d_triples.add((ctx, arg_l, arg_r))
        is_duplicate = row_id < len(labels)
        label = 1 if entailing.strip().lower() == 'true' else 0

        if not is_duplicate:
            labels.append(label)
            continue

        print("omitting duplicate example: '{} {} {} {}' ".format(ctx, arg_l, arg_r, entailing), file=sys.stderr)
        n_duplicates += 1
        previous = labels[row_id]
        if previous != label:
            # Same triple seen earlier with the opposite label.
            print("duplicate example has different label: '{}' vs. '{}'".format(previous, label), file=sys.stderr)
            n_contradicting += 1

    vec = np.array(labels, dtype=np.float64)
    logging.info("finished creating arg pairs class vector '{}'".format(args_file))
    logging.info("found {} duplicate examples with {} having contradicting labels.".format(n_duplicates, n_contradicting))
    return vec, d_triples
Ejemplo n.º 2
0
def get_train_test_indexes_presplit(d_triples,
                                    train_file='../data/updates/4/args_v2_am.tsv',
                                    test_file='../data/updates/4/args_v2_nz.tsv'):
    """
    Translate a pre-split pair of argument files into train/test id lists.

    Both files are read with ``tio.read_args_w_ctx(..., has_header=False)``.
    Every row is unpacked into four fields; the first three (context, left
    argument, right argument) are looked up with
    ``d_triples.get_triple_id`` and the fourth is ignored.

    :param d_triples: object exposing ``get_triple_id(triple)`` (presumably
        the ``TripleDict`` built while loading the examples -- TODO confirm
        against the caller). What is returned for an unknown triple depends
        on that object and is not checked here.
    :param train_file: path of the file listing the training examples;
        defaults to the location that used to be hard-coded.
    :param test_file: path of the file listing the test examples; defaults
        to the location that used to be hard-coded.
    :return: ``(train_ids, test_ids)``, two lists with one id per row, in
        the order the rows are yielded by the reader.
    """
    def triple_ids(path):
        # One id per row; the trailing field of each row is not needed.
        rows = tio.read_args_w_ctx(path, has_header=False)
        return [d_triples.get_triple_id((ctx, arg_l, arg_r))
                for ctx, arg_l, arg_r, __ in rows]

    # The train file is fully consumed before the test file is opened,
    # as before.
    train_ids = triple_ids(train_file)
    test_ids = triple_ids(test_file)
    return train_ids, test_ids
Ejemplo n.º 3
0
def get_train_test_indexes_presplit(d_triples):
    """
    Return the ids of the pre-split training and test examples.

    The training rows come from ``args_v2_am.tsv`` and the test rows from
    ``args_v2_nz.tsv``; both are read without a header via
    ``tio.read_args_w_ctx``. For each row the (context, left arg, right arg)
    triple is resolved with ``d_triples.get_triple_id``; the fourth field of
    the row is discarded.

    :param d_triples: object exposing ``get_triple_id(triple)``.
    :return: ``(train_ids, test_ids)`` as two lists, one id per row.
    """
    train_rows = tio.read_args_w_ctx('../data/updates/4/args_v2_am.tsv',
                                     has_header=False)
    train_ids = [
        d_triples.get_triple_id((context, left, right))
        for context, left, right, _unused in train_rows
    ]

    test_rows = tio.read_args_w_ctx('../data/updates/4/args_v2_nz.tsv',
                                    has_header=False)
    test_ids = [
        d_triples.get_triple_id((context, left, right))
        for context, left, right, _unused in test_rows
    ]

    return train_ids, test_ids
Ejemplo n.º 4
0
def arg_l_arg_r_pairs_vector(args_file,
                             file_contains_context=False,
                             has_header=True):
    """
    Read argument pairs from ``args_file`` and return their labels as a vector.

    Depending on ``file_contains_context`` the rows are read with
    ``tio.read_args_w_ctx`` or ``tio.read_args_wo_ctx``; rows without a
    context get an empty string in its place. Each
    (context, left arg, right arg) triple is added to a new
    ``td.TripleDict``. When the id returned by ``add`` is below the number of
    labels stored so far, the row is treated as a duplicate (presumably
    because ``add`` hands back the id of an existing triple -- TODO confirm):
    it is reported on stderr, counted, and skipped; a differing label is
    reported and counted separately.

    :param args_file: path passed to the ``tio`` reader.
    :param file_contains_context: whether rows carry a leading context field.
    :param has_header: forwarded unchanged to the reader.
    :return: ``(vec, d_triples)``: a float64 array with 1 where the
        entailment field is 'true' (stripped, case-insensitive) and 0
        elsewhere, plus the populated ``TripleDict``.
    """

    def blank_context(pairs):
        # Give context-free rows the same four-field shape as the others.
        for left, right, verdict in pairs:
            yield '', left, right, verdict

    def as_label(raw):
        # Only the literal 'true' maps to 1; anything else maps to 0.
        return int(raw.strip().lower() == 'true')

    logging.info("creating arg pairs class vector '{}'".format(args_file))
    if file_contains_context:
        examples = tio.read_args_w_ctx(args_file, has_header=has_header)
    else:
        examples = blank_context(
            tio.read_args_wo_ctx(args_file, has_header=has_header))

    d_triples = td.TripleDict()  # rows
    seen_labels = []
    dup_count = 0
    clash_count = 0
    for ctx, arg_l, arg_r, entailing in examples:
        idx = d_triples.add((ctx, arg_l, arg_r))
        if idx < len(seen_labels):
            new_label = as_label(entailing)
            dup_msg = "omitting duplicate example: '{} {} {} {}' ".format(
                ctx, arg_l, arg_r, entailing)
            print(dup_msg, file=sys.stderr)
            dup_count += 1
            if seen_labels[idx] != new_label:
                clash_msg = (
                    "duplicate example has different label: '{}' vs. '{}'"
                    .format(seen_labels[idx], new_label))
                print(clash_msg, file=sys.stderr)
                clash_count += 1
        else:
            seen_labels.append(as_label(entailing))

    vec = np.array(seen_labels, dtype=np.float64)
    done_msg = "finished creating arg pairs class vector '{}'".format(
        args_file)
    logging.info(done_msg)
    summary_msg = (
        "found {} duplicate examples with {} having contradicting labels."
        .format(dup_count, clash_count))
    logging.info(summary_msg)
    return vec, d_triples