Beispiel #1
0
    def __init__(self, n_word_vocab, n_role_vocab, n_factors_emb, n_hidden,
                 word_vocabulary, role_vocabulary, unk_word_id, unk_role_id,
                 missing_word_id, using_dropout, dropout_rate, optimizer, loss,
                 metrics):
        """Store the model configuration and vocabularies on the instance.

        The arguments handled below are saved unchanged under attributes of
        the same name. Two extra attributes, ``word_decoder`` and
        ``role_decoder``, are built by passing the word and role vocabularies
        through ``get_reverse_map``.

        NOTE(review): ``metrics`` is accepted but not stored in the lines
        visible here; the listing appears to end after ``self.loss`` --
        confirm against the complete source.
        """

        # Size-related settings.
        self.n_word_vocab = n_word_vocab
        self.n_role_vocab = n_role_vocab
        self.n_factors_emb = n_factors_emb
        self.n_hidden = n_hidden

        # Vocabularies plus their reversed counterparts
        # (presumably id -> token lookups; verify in get_reverse_map).
        self.word_vocabulary = word_vocabulary
        self.role_vocabulary = role_vocabulary
        self.word_decoder = get_reverse_map(word_vocabulary)
        self.role_decoder = get_reverse_map(role_vocabulary)

        # Special ids and training options.
        self.unk_role_id = unk_role_id
        self.unk_word_id = unk_word_id
        self.missing_word_id = missing_word_id
        self.using_dropout = using_dropout
        self.dropout_rate = dropout_rate
        self.optimizer = optimizer
        self.loss = loss
Beispiel #2
0
def get_top_predictions(inputs, target, model, raw_word_list, n=5):
    """Return the top ``n`` predicted fillers for a target role.

    Keyword arguments:
    inputs -- A dictionary of inputs with the role as the key and the filler as the value.
    target -- A singleton dictionary containing the target role as the key and target filler as the value.
              Only the role (the key) is used here.
    model -- The loaded model with which to make predictions
    raw_word_list -- A dictionary with one entry per role of the model's role vocabulary
                     (role -> default filler). It is read but never modified.
    n -- The number of top predictions that should be retrieved

    Returns:
    A list of ``(lemma, p)`` tuples, one per index returned by
    ``model.top_words`` and in that order, where ``p`` is the first element
    of what ``model.p_words`` returns for that candidate.

    Raises:
    AssertionError -- if merging ``inputs`` into ``raw_word_list`` does not
                      yield exactly one entry per role in the vocabulary.
    KeyError -- if the (encoded) target role is not among the encoded roles.
    """
    # Merge into a private copy: callers reuse the same default dictionary
    # for many queries, and updating it in place would leak the fillers of
    # one query into the next.
    role_fillers = dict(raw_word_list)
    role_fillers.update(inputs)

    assert len(role_fillers) == len(model.role_vocabulary)

    t_r = [model.role_vocabulary.get(r, model.unk_role_id) for r in target]
    # The target word is what we want to predict, so feed the unknown id.
    t_w = [model.unk_word_id]

    input_roles_words = {}
    for r, w in role_fillers.items():
        input_roles_words[model.role_vocabulary[r]] = utils.input_word_index(
            model.word_vocabulary, w, model.unk_word_id, warn_unk=False)

    # The target role must not also appear among the inputs.
    input_roles_words.pop(t_r[0])

    # Dict views are materialised explicitly; numpy cannot convert a
    # dict_values/dict_keys object (Python 3) into an integer array.
    x_w_i = numpy.asarray([list(input_roles_words.values())],
                          dtype=numpy.int64)
    x_r_i = numpy.asarray([list(input_roles_words.keys())],
                          dtype=numpy.int64)
    y_w_i = numpy.asarray(t_w, dtype=numpy.int64)
    y_r_i = numpy.asarray(t_r, dtype=numpy.int64)

    predicted_word_indices = model.top_words(x_w_i, x_r_i, y_w_i, y_r_i, n)
    reverse_vocabulary = utils.get_reverse_map(model.word_vocabulary)

    results = []
    for t_w_i in predicted_word_indices:
        # Score each candidate individually as the target word.
        y_w_i = numpy.asarray([t_w_i], dtype=numpy.int64)
        p = model.p_words(x_w_i, x_r_i, y_w_i, y_r_i, batch_size=1,
                          verbose=0)[0]
        results.append((reverse_vocabulary[int(t_w_i)], p))

    return results
def punctuator(input_file, vocab_file, punct_vocab_file, output_file,
               get_post):
    """Run punctuation inference on ``input_file`` and write the result.

    The word vocabulary and punctuation vocabulary are loaded from
    ``vocab_file`` and ``punct_vocab_file``, the input is converted to ids,
    and ``get_predicts`` is called on them. When ``get_post`` is truthy the
    values returned by ``get_predicts`` are written with ``write_posteriors``;
    otherwise they are written with ``write_punctuations``. Nothing is
    returned.
    """
    vocab = load_vocabulary(vocab_file)
    punct_vocab = get_punctuations(punct_vocab_file)
    punct_decoder = utils.get_reverse_map(punct_vocab)

    # Text -> ids. (NOTE: the outputs produced here are fake placeholders.)
    inputs, outputs, lens = inference_sentences_to_ids(
        input_file, vocab, punct_vocab)

    if not get_post:
        # Plain predictions: write the punctuated text.
        predicts = get_predicts(inputs, outputs, lens)
        write_punctuations(input_file, predicts, punct_decoder, output_file)
        return

    # Posterior mode: write the posteriors instead of punctuated text.
    posteriors = get_predicts(inputs, outputs, lens, get_post)
    write_posteriors(input_file, posteriors, punct_decoder, output_file)
Beispiel #4
0
                        output_file.write("%s %s" % (punctuation[:1], word))
                    else:
                        output_file.write(" %s %s" % (punctuation, word))

            else:
                word = token


if __name__ == "__main__":

    # Arguments: MODEL READABLE_FLAG OUTPUT [INPUT]. With fewer than three
    # arguments the script does nothing.
    if len(sys.argv) > 3:
        model_name = sys.argv[1]
        net = utils.load_model(model_name)
        net.batch_size = 1
        net.reset_state()
        punctuation_reverse_map = utils.get_reverse_map(net.out_vocabulary)

        # The flag is given as an integer: 0 -> False, anything else -> True.
        write_readable_text = bool(int(sys.argv[2]))

        # "-" selects standard output instead of a path.
        output_file_path = sys.stdout if sys.argv[3] == "-" else sys.argv[3]

        # Text comes from the optional fourth argument, otherwise from stdin.
        if len(sys.argv) <= 4:
            unpunctuated_text = " ".join(sys.stdin.readlines())
        else:
            with open(sys.argv[4], 'r') as unpunctuated_file:
                unpunctuated_text = " ".join(unpunctuated_file.readlines())

        write_punctuations(net, unpunctuated_text, output_file_path,
                           punctuation_reverse_map, write_readable_text)
def evaluate(model_name, experiment_name, batch_size):
    """Evaluate a saved model on the ``NN_test`` data and report role results.

    Steps performed:
    1. Load the model description and weights for ``experiment_name`` from
       ``MODEL_PATH`` and print vocabulary / model information.
    2. Run ``net.model.evaluate_generator`` over the test generator and print
       its result.
    3. Iterate the test data again to build a role confusion matrix and to
       collect, per target role, the negative log-likelihood of the target
       word; from these a per-role perplexity is derived.
    4. Print the results, call ``stats(net, confusionM)`` and write
       ``confusionM_<experiment_name>.csv`` and
       ``result_list_<experiment_name>.csv`` (current directory) plus a
       bracketed text dump of the matrix to
       ``MODEL_PATH/confusionM_<experiment_name>``.

    Parameters:
    model_name -- model type passed to ``model_builder.build_model`` and to
                  the data generator.
    experiment_name -- name under which description and weights were saved.
    batch_size -- batch size for the generators and predictions.

    Returns nothing.
    """
    MODEL_NAME = experiment_name
    repr_file = os.path.join(MODEL_PATH, 'confusionM_' + MODEL_NAME)

    description = model_builder.load_description(MODEL_PATH, MODEL_NAME)
    net = model_builder.build_model(model_name, description)
    net.load(MODEL_PATH, MODEL_NAME, description)

    n_roles = len(net.role_vocabulary)
    print(net.role_vocabulary)
    print("unk_word_id", net.unk_word_id)
    print("missing_word_id", net.missing_word_id)

    net.model.summary()
    print(net.model.metrics_names)
    reverse_role_vocabulary = utils.get_reverse_map(net.role_vocabulary)

    test_sample_size = config.OCT_TEST_SIZE
    # NOTE(review): true division, so this may be fractional; the loop below
    # then stops after the first batch count that reaches or exceeds it.
    test_steps = test_sample_size / batch_size

    def new_test_generator():
        # Fresh pass over the test data with the standard batcher.
        return generator(DATA_PATH + "NN_test",
                         model_name,
                         net.unk_word_id,
                         net.unk_role_id,
                         net.missing_word_id,
                         n_roles,
                         random=False,
                         batch_size=batch_size)

    print('Testing...')
    test_start = time.process_time()

    # Always use generator in Keras
    if re.search('NAME_WHICH_YOU_NEED_OLD_BATCHER', experiment_name):
        test_gen = get_minibatch(DATA_PATH + "NN_test",
                                 net.unk_word_id,
                                 net.unk_role_id,
                                 net.missing_word_id,
                                 n_roles,
                                 random=False,
                                 batch_size=batch_size)
    else:
        test_gen = new_test_generator()

    # Test the model
    test_result = net.model.evaluate_generator(generator=test_gen,
                                               steps=test_steps,
                                               max_q_size=1,
                                               workers=1,
                                               pickle_safe=False)
    print('test_result', test_result)

    # Compute confusion matrix
    batch_n = 0
    confusionM = np.zeros((n_roles, n_roles), dtype='int32')
    ppl_role_list = dict()  # target role -> list of neg. log-likelihoods
    ppl_role = dict()       # target role -> perplexity

    # NOTE(review): only correct predictions are recorded (as 1); wrong ones
    # add nothing, so the saved file is a column of ones -- confirm intent.
    result_list = []
    for ([i_w, i_r, t_w, t_r], _) in new_test_generator():
        result_role = net.predict_role(i_w, i_r, t_w, t_r, batch_size)
        result_word_likelihood = net.predict(i_w, i_r, t_w, t_r, batch_size)[0]
        neg_log_likelihoods = -np.log(result_word_likelihood)

        for i, row in enumerate(neg_log_likelihoods, start=0):
            target_word = t_w[i][0]
            target_role = t_r[i][0]
            neg_log_likelihood = row[target_word]
            ppl_role_list.setdefault(target_role,
                                     []).append(neg_log_likelihood)

        for i, true_r in enumerate(t_r, start=0):
            confusionM[true_r, result_role[i]] += 1
            if true_r == result_role[i]:
                result_list.append(1)
        batch_n += 1
        print(batch_n)
        if batch_n >= test_steps:
            break

    # Perplexity per role = exp(mean negative log-likelihood).
    for k, v in ppl_role_list.items():
        neg_log_likelihood_role = np.mean(np.array(v))
        ppl_role[k] = np.exp(neg_log_likelihood_role)

    print("Confusion Matrix: ")
    print("    A0,  A1, LOC, TMP, MNR,   V, <UNKNOWN>")
    print(confusionM)
    np.savetxt('confusionM_' + experiment_name + '.csv',
               confusionM,
               delimiter=',')
    np.savetxt('result_list_' + experiment_name + '.csv',
               result_list,
               delimiter=',')

    stats(net, confusionM)

    print("Loss(neg_log_likelihood) by role: ")
    for r in ppl_role.keys():
        print(reverse_role_vocabulary[r], np.log(ppl_role[r]))

    print("PPL by role: ")
    for r in ppl_role.keys():
        print(reverse_role_vocabulary[r], ppl_role[r])

    # Text dump of the matrix in a nested-bracket layout.
    with open(repr_file, 'w') as f_out:
        f_out.write('[')
        for i in range(n_roles):
            f_out.write('[')
            for j in range(n_roles):
                f_out.write(str(confusionM[i][j]) + ", ")
            f_out.write('] \n')
        f_out.write(']')

    test_end = time.process_time()
    print('test time: %f, sps: %f' %
          (test_end - test_start, test_steps * batch_size /
           (test_end - test_start)))
def evaluate(model_name,
             experiment_name,
             test_name,
             batch_size,
             VR_SP_SRL=True,
             bootstrapping=False,
             majority_baseline=False):
    """Evaluate role prediction of a saved model on one evaluation file.

    The model for ``experiment_name`` is loaded from ``MODEL_PATH``; batches
    are drawn from ``data_gen(EVAL_PATH + test_name, ...)`` and the roles
    predicted by ``net.predict_role`` are tallied in a confusion matrix
    (rows: true role, columns: predicted role). ``stats(net, confusionM)``
    supplies the overall and per-role precision / recall / F1, which are
    printed.

    Parameters:
    model_name -- model type passed to ``model_builder.build_model`` and
                  ``data_gen``.
    experiment_name -- name under which description and weights were saved.
    test_name -- file name below ``EVAL_PATH``; its line count is used as the
                 number of test samples.
    batch_size -- batch size for the generator and predictions.
    VR_SP_SRL -- forwarded unchanged to ``data_gen``.
    bootstrapping -- when truthy, ``bootstrap`` is run on the collected
                     (true, predicted) pairs and its result is returned.
    majority_baseline -- when truthy, every row of the confusion matrix is
                         collapsed onto column 1 before computing the
                         statistics (ZeroR baseline).

    Returns:
    ``(P_mean, P_std, R_mean, R_std, F1_mean, F1_std)`` when ``bootstrapping``
    is truthy, otherwise ``None``.
    """
    description = model_builder.load_description(MODEL_PATH, experiment_name)
    net = model_builder.build_model(model_name, description)
    net.load(MODEL_PATH, experiment_name, description)

    n_roles = len(net.role_vocabulary)
    reverse_role_vocabulary = utils.get_reverse_map(net.role_vocabulary)

    print(net.role_vocabulary)
    print("unk_word_id", net.unk_word_id)
    print("missing_word_id", net.missing_word_id)

    net.model.summary()

    # One sample per line of the evaluation file.
    with open(EVAL_PATH + test_name, 'r') as lines:
        test_sample_size = sum(1 for _ in lines)
    print(test_sample_size)

    # Deliberately fractional: the loop below stops at the first batch count
    # that reaches or exceeds this value.
    test_steps = test_sample_size / float(batch_size)

    print('Testing ' + test_name + ' ...')
    print('VR_SP_SRL: ' + str(VR_SP_SRL))
    test_start = time.process_time()

    # Compute confusion matrix
    batch_n = 0
    confusionM = np.zeros((n_roles, n_roles), dtype='int32')
    output_list = []  # (true role, predicted role) pairs for bootstrapping
    for ([i_w, i_r, t_w, t_r], _) in data_gen(EVAL_PATH + test_name,
                                              model_name,
                                              net,
                                              batch_size,
                                              VR_SP_SRL=VR_SP_SRL):
        result_role = net.predict_role(i_w, i_r, t_w, t_r, batch_size)

        for i, true_r in enumerate(t_r):
            confusionM[true_r, result_role[i]] += 1
            output_list.append((true_r, result_role[i]))
        batch_n += 1
        if batch_n % 100 == 0:
            print(batch_n)
        if batch_n >= test_steps:
            break

    # obtain ZeroR baseline
    print(confusionM)
    majority = 1  # column index treated as the majority role
    if majority_baseline:
        # Cover every role row (not a fixed count) so the baseline is valid
        # for any role vocabulary size.
        for i in range(n_roles):
            row_total = confusionM[i].sum()
            confusionM[i] = 0
            confusionM[i][majority] = row_total
    print(confusionM)

    dir_P, dir_R, dir_F1, precision, recall, F1 = stats(net, confusionM)
    print("Dir: %.2f \t %.2f \t %.2f" % (dir_P, dir_R, dir_F1))

    print("Result by role: ")
    for r in range(len(precision)):
        print('%s: \t %.2f \t %.2f \t %.2f' %
              (reverse_role_vocabulary[r], precision[r], recall[r], F1[r]))

    test_end = time.process_time()
    print('test time: %f, sps: %f' % (test_end - test_start, test_steps *
                                      batch_size / (test_end - test_start)))

    if bootstrapping:
        P_mean, P_std, R_mean, R_std, F1_mean, F1_std = bootstrap(
            experiment_name, test_name, net, n_roles, output_list=output_list)

        return P_mean, P_std, R_mean, R_std, F1_mean, F1_std
def query(model_name, experiment_name, inputs, target, top_n=20):
    """Print the model's top predicted fillers for a target role.

    The model for ``experiment_name`` is loaded from ``MODEL_PATH``. Every
    role of the role vocabulary starts out with the "missing word" filler;
    ``inputs`` overrides those defaults. The target role is removed from the
    inputs, ``net.top_words`` is asked for ``top_n`` candidates, and each
    candidate is printed with the value ``net.p_words`` returns for it and a
    bar whose length grows by one block per 0.005 of that value.

    Parameters:
    model_name -- model type passed to ``model_builder.build_model``.
    experiment_name -- name under which description and weights were saved.
    inputs -- dictionary role -> filler word used as context.
    target -- dictionary with the target role as key and the target word as
              value (expected to hold a single entry).
    top_n -- number of candidates requested from ``net.top_words``
             (default 20).

    Raises:
    AssertionError -- if ``inputs`` contains roles outside the vocabulary
                      (the merged dictionary then has too many entries).
    KeyError -- if the encoded target role is not among the encoded roles.

    Returns nothing; all results are printed.
    """
    description = model_builder.load_description(MODEL_PATH, experiment_name)

    net = model_builder.build_model(model_name, description)
    net.load(MODEL_PATH, experiment_name, description)

    print(net.role_vocabulary)
    print("unk_word_id", net.unk_word_id)
    print("missing_word_id", net.missing_word_id)

    net.model.summary()

    reverse_vocabulary = utils.get_reverse_map(net.word_vocabulary)
    reverse_role_vocabulary = utils.get_reverse_map(net.role_vocabulary)

    print(reverse_role_vocabulary)

    # Default: every role is filled with the "missing word" token.
    missing_word = reverse_vocabulary[net.missing_word_id]
    raw_words = dict((reverse_role_vocabulary[r], missing_word)
                     for r in net.role_vocabulary.values())
    raw_words.update(inputs)

    assert len(raw_words) == len(net.role_vocabulary)

    t_r = [net.role_vocabulary.get(r, net.unk_role_id) for r in target.keys()]
    t_w = [net.word_vocabulary.get(w, net.unk_word_id) for w in target.values()]

    input_roles_words = {}
    for r, w in raw_words.items():
        input_roles_words[net.role_vocabulary[r]] = utils.input_word_index(
            net.word_vocabulary, w, net.unk_word_id, warn_unk=True)

    print(input_roles_words, t_r)
    # The target role must not also appear among the inputs.
    input_roles_words.pop(t_r[0])

    # Dict views are materialised explicitly; numpy cannot convert a
    # dict_values/dict_keys object (Python 3) into an integer array.
    x_w_i = numpy.asarray([list(input_roles_words.values())],
                          dtype=numpy.int64)
    x_r_i = numpy.asarray([list(input_roles_words.keys())],
                          dtype=numpy.int64)
    y_w_i = numpy.asarray(t_w, dtype=numpy.int64)
    y_r_i = numpy.asarray(t_r, dtype=numpy.int64)

    predicted_word_indices = net.top_words(x_w_i, x_r_i, y_w_i, y_r_i, top_n)

    print(x_w_i, x_r_i, y_w_i, y_r_i)

    # Value for the target word that was actually given.
    p_w = net.p_words(x_w_i, x_r_i, y_w_i, y_r_i, batch_size=1, verbose=0)[0]
    print('p_t_w: ', p_w)

    for i, t_w_i in enumerate(predicted_word_indices):
        y_w_i = numpy.asarray([t_w_i], dtype=numpy.int64)
        p = net.p_words(x_w_i, x_r_i, y_w_i, y_r_i, batch_size=1, verbose=0)[0]
        # Bar: one full block per 0.005, plus a half block when that count
        # is odd.
        ticks = numpy.round(p / 0.005)
        full_blocks = int(numpy.floor(ticks))
        half_blocks = int(ticks % 2)
        print(u"{:<5} {:7.6f} {:<20} ".format(
            i + 1, float(p), reverse_vocabulary[int(t_w_i)]) +
            u"\u2588" * full_blocks + u"\u258C" * half_blocks)
Beispiel #8
0
            if punctuation.strip() == "":
                sys.stdout.write("%s%s%s" % (punctuation, tagstring, word))
            else:
                sys.stdout.write("%s %s%s" % (punctuation[:1], tagstring, word))

            first_word = False

        else:
            if is_word(token):
                word = token
            else:
                tags.append(token)

    sys.stdout.write("\n")
    sys.stdout.flush()

# Script entry point: load the model given as first argument, then punctuate
# standard input one line at a time.
if __name__ == "__main__":
    
    assert len(sys.argv) > 1, "Give model path as first argument"

    model_path = sys.argv[1]
    net = utils.load_model(model_path)
    net.batch_size = 1

    punctuation_reverse_map = utils.get_reverse_map(net.out_vocabulary)

    # readline is called repeatedly until it returns "" (end of input), so
    # each line is handled as soon as it has been read.
    for line in iter(sys.stdin.readline, ""):
        # The model state is reset before every line, so lines are
        # punctuated independently of each other.
        net.reset_state()
        write_punctuations(net, punctuation_reverse_map, line)
    
Beispiel #9
0
def pd_themfit(model_name,
               experiment_name,
               df,
               predict_role='V',
               input_roles='all_available_args',
               function='filler_prob',
               n=5,
               debug=False):
    """Apply ``process_row`` to every row of a pandas df using a loaded model.

    For each row in the pandas df, the fillers of the selected roles are
    collected from that row and handed to ``process_row`` together with the
    model, the target role and a role -> "missing word" default dictionary.

    Keyword arguments:
    model_name -- The name of the model
    experiment_name -- The name of the model plus the name of the experiment, separated by '_'
    df -- The pandas dataframe. Must include columns for all propbank labels in predict_role and input_roles
    predict_role -- the target role (in propbank labels) for which the filler will be predicted (default: 'V')
    input_roles -- an iterable of roles (in propbank labels) that should be used as inputs, or the
                   string 'all_available_args' (default) to use every known role that is a df column.
                   The iterable itself is not modified.
    function -- forwarded unchanged to ``process_row`` (default: 'filler_prob')
    n -- forwarded unchanged to ``process_row`` (default: 5)
    debug -- forwarded unchanged to ``process_row`` (default: False)

    Returns:
    The result of ``df.apply(..., axis=1)`` over ``process_row``.

    A warning is printed (and processing continues) when a requested role is
    not a df column or not one of the known roles.
    """
    possible_roles = {'A0', 'A1', 'AM-LOC', 'AM-TMP', 'AM-MNR', '<UNKNOWN>',
                      'V'}

    # Explicit check instead of assert/except: asserts vanish under -O and a
    # bare except would also hide unrelated errors.
    requested_roles = [predict_role]
    if input_roles != 'all_available_args':
        requested_roles.extend(input_roles)
    if any(r not in df.columns or r not in possible_roles
           for r in requested_roles):
        print("NOT ALL ROLES ARE AVAILABLE AS DF COLUMNS")

    MODEL_NAME = experiment_name

    description = model_builder.load_description(MODEL_PATH, MODEL_NAME)
    net = model_builder.build_model(model_name, description)
    net.load(MODEL_PATH, MODEL_NAME, description)

    # Called for its effect on the model; the return value is not needed.
    net.set_0_bias()

    # If no <UNKNOWN> role in the role vocabulary, add it.
    # NOTE(review): the id used is len(vocabulary) - 1, which coincides with
    # an existing id if ids run from 0 to len - 1 -- confirm this is intended.
    if net.role_vocabulary.get("<UNKNOWN>", -1) == -1:
        net.role_vocabulary["<UNKNOWN>"] = len(net.role_vocabulary) - 1

    print("Role vocabulary", net.role_vocabulary)
    print("unk_word_id", net.unk_word_id)
    print("missing_word_id", net.missing_word_id)

    reverse_vocabulary = utils.get_reverse_map(net.word_vocabulary)
    reverse_role_vocabulary = utils.get_reverse_map(net.role_vocabulary)

    print("Reverse role vocabulary", reverse_role_vocabulary)

    # Default filler for every role: the "missing word" token.
    raw_words = dict(
        (reverse_role_vocabulary[r], reverse_vocabulary[net.missing_word_id])
        for r in net.role_vocabulary.values())

    if input_roles == 'all_available_args':
        input_roles = (possible_roles - {predict_role}).intersection(
            df.columns)

    # Build a new set so that the caller's collection is left untouched and
    # lists/tuples are accepted as well.
    all_roles = set(input_roles) | {predict_role}

    def _process(row):
        # Each row gets its own copy of the defaults: a sibling helper is
        # known to update this dictionary in place, which would otherwise
        # carry fillers over from one row to the next.
        return process_row(predict_role=predict_role,
                           role_fillers={role: row[role]
                                         for role in all_roles},
                           model=net,
                           raw_word_list=dict(raw_words),
                           function=function,
                           n=n,
                           debug=debug)

    return df.apply(_process, axis=1)