Esempio n. 1
0
def reorder_numbered_placeholders(input1, input2, by_group=True):
    """Renumber the numbered placeholders of a sentence pair consistently.

    Both inputs are tokenized with ``process_sentence``.  Each distinct token
    of ``input1`` for which ``is_numbered_placeholder`` returns a non-None
    value is given a new name, built by calling ``.format(index)`` on that
    returned value (presumably a format template for the placeholder group;
    verify against ``is_numbered_placeholder``).

    Args:
        input1: Sentence whose placeholders drive the renumbering.
        input2: Paired sentence; each of its tokens is passed through
            ``subsfirst`` together with the old -> new mapping.
        by_group: When true, indices are counted separately per value
            returned by ``is_numbered_placeholder``; otherwise a single
            running counter is shared by all placeholders.

    Returns:
        A 3-tuple ``(new_input1, new_input2, rlookup)`` where the first two
        items are the space-joined rewritten token lists and ``rlookup`` maps
        every new placeholder name back to the original token.
    """
    source_tokens = process_sentence(input1)
    target_tokens = process_sentence(input2)

    renamed = {}          # original placeholder token -> new placeholder
    per_group_seen = {}   # template -> number of placeholders already issued
    total_seen = 0        # number of distinct placeholders renamed so far

    # Only the first sentence decides which placeholders exist and their order.
    for token in source_tokens:
        template = is_numbered_placeholder(token)
        if template is None or token in renamed:
            continue
        if by_group:
            index = per_group_seen.get(template, 0)
            per_group_seen[template] = index + 1
        else:
            index = total_seen
        renamed[token] = template.format(index)
        total_seen += 1

    # Apply the mapping to both sides of the pair.
    rewritten_source = [renamed.get(token, token) for token in source_tokens]
    rewritten_target = [subsfirst(token, renamed) for token in target_tokens]

    # Reverse table so callers can restore the original tokens.
    rlookup = {new_name: old_name for old_name, new_name in renamed.items()}
    return " ".join(rewritten_source), " ".join(rewritten_target), rlookup
Esempio n. 2
0
def normalize_sal_entities(input1, input2, ent_prefix="id__"):
    """Replace entities and numbers with sequentially numbered placeholders.

    ``input1`` is scanned in token order.  Every distinct token accepted by
    ``is_entity`` becomes ``ent_prefix`` followed by a running index, and
    every distinct token accepted by ``is_number`` (and not by ``is_entity``)
    becomes ``"number__"`` followed by that same running index.  The resulting
    mapping is then applied to the tokens of both inputs.

    Args:
        input1: Sentence that determines which tokens are replaced.
        input2: Paired sentence; tokens equal to a replaced token of
            ``input1`` are substituted as well.
        ent_prefix: Prefix used for entity placeholders.

    Returns:
        A 3-tuple ``(new_input1, new_input2, rlookup)`` where the first two
        items are the space-joined rewritten token lists and ``rlookup`` maps
        every placeholder back to the token it replaced.
    """
    source_tokens = process_sentence(input1)
    target_tokens = process_sentence(input2)

    replacements = {}
    counter = 0  # shared by entity and number placeholders

    for token in source_tokens:
        # Pick the placeholder family; anything else is left untouched.
        if is_entity(token):
            prefix = ent_prefix
        elif is_number(token):
            prefix = "number__"
        else:
            continue
        if token in replacements:
            continue
        replacements[token] = "{}{}".format(prefix, counter)
        counter += 1

    def substitute(tokens):
        # Swap in the placeholder where one exists, keep the token otherwise.
        return [replacements.get(token, token) for token in tokens]

    rewritten_source = substitute(source_tokens)
    rewritten_target = substitute(target_tokens)

    rlookup = {placeholder: original
               for original, placeholder in replacements.items()}
    return " ".join(rewritten_source), " ".join(rewritten_target), rlookup
Esempio n. 3
0
def indexes_from_sentence(lang, sentence):
    """Convert a sentence into a list of vocabulary indices.

    The sentence is tokenized with ``process_sentence``; each token is looked
    up in ``lang.word2index`` and tokens that are missing map to
    ``UNK_token``.  ``EOS_token`` is appended unless the ``match_parens``
    name from the enclosing scope is truthy.
    """
    indices = []
    for word in process_sentence(sentence):
        indices.append(lang.word2index.get(word, UNK_token))

    if not match_parens:
        indices = indices + [EOS_token]
    return indices
Esempio n. 4
0
def evaluate_pairs(evaluate_fn, eval_pairs):
    """Evaluate every pair and report the exact-match accuracy.

    For each ``(lang1_str, lang2_str)`` pair, ``evaluate_fn(lang1_str)`` is
    called and its result is unpacked into ``(guessed_words, _)``.  The guess
    counts as correct when ``exact_match`` accepts the tokenized gold
    sentence together with ``guessed_words``.

    Args:
        evaluate_fn: Callable taking the source sentence and returning a
            2-item result whose first item is the sequence of guessed words.
        eval_pairs: Sized sequence of ``(lang1_str, lang2_str)`` pairs.

    Returns:
        A tuple ``(acc, res_str)``.  ``acc`` is the fraction of pairs guessed
        exactly, or ``0.0`` when ``eval_pairs`` is empty.  ``res_str`` is a
        newline-terminated report with, per pair, an index header, the
        source, the gold target and the guess (prefixed with ``"* "`` when
        wrong), followed by a final ``"Exact acc = ..."`` line.
    """
    num_pairs = len(eval_pairs)
    num_correct = 0
    # Collect report lines and join once at the end instead of growing a
    # string with repeated concatenation.
    report_lines = []

    progress = tqdm(enumerate(eval_pairs), total=num_pairs)
    print("Evaluating pairs")
    for i, (lang1_str, lang2_str) in progress:
        report_lines.append("# {}".format(i))
        lang1_toks = process_sentence(lang1_str)
        lang2_toks = process_sentence(lang2_str)
        guessed_words, _ = evaluate_fn(lang1_str)
        guessed_sentence = ' '.join(guessed_words)

        is_correct = exact_match(lang2_toks, guessed_words)
        if is_correct:
            num_correct += 1

        report_lines.append(
            'Lang (#={}):\t{}'.format(len(lang1_toks), lang1_str))
        report_lines.append(
            'Gold (#={}):\t{}'.format(len(lang2_toks), lang2_str))
        # Wrong guesses are flagged with a leading "* " so they are easy to
        # find in the report.
        guess_label = 'Guess' if is_correct else '* Guess'
        report_lines.append('{} (#={}):\t{}'.format(
            guess_label, len(guessed_words), guessed_sentence))
        report_lines.append('')

    # Guard the division: an empty evaluation set has accuracy 0.0 rather
    # than raising ZeroDivisionError.
    acc = num_correct / num_pairs if num_pairs else 0.0
    report_lines.append("Exact acc = {:.5f}".format(acc))

    # Every report line is terminated by a newline, including the last one.
    res_str = "\n".join(report_lines) + "\n"
    return acc, res_str
Esempio n. 5
0
def show_attention(input_sentence, output_words, attentions):
    """Display an attention matrix as a heat map with token tick labels.

    ``attentions.numpy()`` is drawn with ``matshow``.  The x axis is labelled
    with the tokens of ``input_sentence`` (via ``process_sentence``) followed
    by ``'<EOS>'``, and the y axis with ``output_words``.  The figure is
    shown with ``plt.show()``; nothing is returned.
    """
    figure = plt.figure()
    axes = figure.add_subplot(111)
    heatmap = axes.matshow(attentions.numpy(), cmap='bone')
    figure.colorbar(heatmap)

    # Tick labels: a leading empty label precedes the tokens on both axes.
    x_labels = [''] + process_sentence(input_sentence) + ['<EOS>']
    axes.set_xticklabels(x_labels, rotation=90)
    y_labels = [''] + output_words
    axes.set_yticklabels(y_labels)

    # One major tick per unit on both axes so each token gets a label.
    for axis in (axes.xaxis, axes.yaxis):
        axis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()
Esempio n. 6
0
def indexes_from_sentence(lang, sentence):
    """Convert a sentence into a list of vocabulary indices.

    The sentence is tokenized with ``process_sentence``; each token is looked
    up in ``lang.word2index`` and tokens that are missing map to
    ``UNK_token``.
    """
    indices = []
    for word in process_sentence(sentence):
        indices.append(lang.word2index.get(word, UNK_token))
    return indices