def validate_predictions_pdp(valid_lex, _1, args, f_classify, valid_y, _4, _5):
    ''' Score the parses decoded from f_classify against the true parses.

    Every (sentence, true_parse) pair drawn from valid_lex and valid_y is
    converted with util_lstm_seqlabel.conv_x, scored by f_classify and
    decoded with dp_viterbi_parse. An element of the decoded parse counts
    as correct when it also occurs in true_parse.

    Params
    ------
    valid_lex  : Iterable of validation sentences.
    _1, _4, _5 : Unused.
    args       : Object providing the attributes `win` and `vocsize`.
    f_classify : Callable applied to the converted sentence; its result must
                 expose `ndim` and is handed to dp_viterbi_parse.
    valid_y    : Iterable of true parses, aligned with valid_lex.

    Returns
    -------
    A dictionary with the keys 'f1', 'p' and 'r'. All three hold the same
    value: 100 * (correct elements) / (total true elements). When there is
    nothing to score (no sentences, or only empty true parses) the value is
    0.0 instead of a ZeroDivisionError.
    '''
    def evaluate_predicted_parse(viterbi_parse, true_parse):
        ' Return (count of predicted elements found in true_parse, len(true_parse)) '
        return (len([e for e in viterbi_parse if e in true_parse]),
                len(true_parse))

    per_sentence = []
    for sentence, true_parse in zip(valid_lex, valid_y):
        arc_scores = f_classify(
            util_lstm_seqlabel.conv_x(sentence, args.win, args.vocsize))
        if arc_scores.ndim == 2:
            # Promote a 2-D score matrix to 3-D by appending a trailing axis.
            arc_scores = numpy.expand_dims(arc_scores, 2)
        # Only the decoded parse is needed; the score is discarded.
        _, viterbi_parse = dp_viterbi_parse(arc_scores)
        if not __debug__:
            # These diagnostics are printed only when python runs with -O.
            util_lstm_seqlabel.print_domination(arc_scores)
            print(' VALIDATION: parents', [e[0] for e in viterbi_parse])
            print(' TRUE: parents', [e[0] for e in true_parse])
        per_sentence.append(
            evaluate_predicted_parse(viterbi_parse, true_parse))
    correct_arcs = sum(e[0] for e in per_sentence)
    total_arcs = sum(e[1] for e in per_sentence)
    if total_arcs:
        accuracy = float(correct_arcs) / total_arcs * 100
    else:
        # Nothing to score: report zero rather than dividing by zero.
        accuracy = 0.0
    return dict((key, accuracy) for key in ['f1', 'p', 'r'])
Beispiel #2
0
def validate_predictions_seq(test_lex,
                             idx2label,
                             args,
                             f_classify,
                             groundtruth_valid,
                             words_valid,
                             fn='/current.valid.txt',
                             conv_x_to_batch=True):
    ''' Predict labels for test_lex with f_classify and score them.

    Sentences are processed in the order given by
    util_lstm_seqlabel.get_shuffling_index_sorted_by_length(shuffle=False),
    in batches of at most 100 indices, and the predictions are written back
    to the position of the sentence in test_lex.

    Params
    ------
    test_lex          : Indexable collection of sentences.
    idx2label         : Passed to util_lstm_seqlabel.convert_id_to_word;
                        must be None when args.folder is None.
    args              : Options object; reads win, vocsize, verbose, folder.
    f_classify        : Prediction function given to tolerant_y_predictor.
    groundtruth_valid : True labels, aligned with test_lex.
    words_valid       : Passed to conlleval; must be None when args.folder
                        is None.
    fn                : Appended to args.folder to name the evaluation file.
    conv_x_to_batch   : When true each sentence is passed through
                        util_lstm_seqlabel.conv_x before batching.

    Returns
    -------
    If args.folder is None: a float, the fraction (not a percentage) of
    positions at which the prediction equals the groundtruth.
    Otherwise: whatever the conlleval function returns -- according to the
    original documentation a dictionary with
    f1 : F1 or Accuracy
    p : Precision
    r : Recall
    '''
    batch_size = 100
    shuf_idx_lst = util_lstm_seqlabel.get_shuffling_index_sorted_by_length(
        test_lex, shuffle=False)
    total_tokens = sum(len(e) for e in test_lex)
    tokens_done = 0
    conlleval = get_conlleval_for_task(args)
    y_pred_list = [None] * len(test_lex)
    tic = time.time()
    for expected_len, idx_list in shuf_idx_lst:
        for i, idx_batch in enumerate(rasengan.batch_list(
                idx_list, batch_size)):
            x_batch = [test_lex[e] for e in idx_batch]
            if conv_x_to_batch:
                words = numpy.array([
                    util_lstm_seqlabel.conv_x(sentence, args.win, args.vocsize)
                    for sentence in x_batch
                ])
            else:
                words = numpy.array(x_batch)
            y_pred = tolerant_y_predictor(args, f_classify, words,
                                          expected_len)
            # Scatter the batch predictions back to their original positions.
            for i_y_pred, idx in enumerate(idx_batch):
                y_pred_list[idx] = y_pred[i_y_pred]
            # Count every batch, not only the ones that get reported, so
            # that the printed percentage reflects the real progress.
            tokens_done += expected_len * len(idx_batch)
            if args.verbose == 2 and i % 100 == 0:
                percentage_complete = float(tokens_done) / total_tokens * 100
                util_lstm_seqlabel.print_progress(percentage_complete, tic)
    if args.folder is None:
        assert idx2label is None
        assert words_valid is None
        # (number of matching positions, number of positions) per sentence.
        results_arr = [((p == g).sum(), p.shape[0])
                       for (p, g) in zip(y_pred_list, groundtruth_valid)]
        results = (float(sum(e[0] for e in results_arr)) /
                   sum(e[1] for e in results_arr))
    else:
        predictions_valid = util_lstm_seqlabel.convert_id_to_word(
            y_pred_list, idx2label)
        results = conlleval(predictions_valid, groundtruth_valid, words_valid,
                            args.folder + fn, args.folder)
    return results
def validate_predictions_pdp(valid_lex, _1, args, f_classify, valid_y, _4, _5):
    ''' Score the parses decoded from f_classify against the true parses.

    Every (sentence, true_parse) pair drawn from valid_lex and valid_y is
    converted with util_lstm_seqlabel.conv_x, scored by f_classify and
    decoded with dp_viterbi_parse. An element of the decoded parse counts
    as correct when it also occurs in true_parse.

    Params
    ------
    valid_lex  : Iterable of validation sentences.
    _1, _4, _5 : Unused.
    args       : Object providing the attributes `win` and `vocsize`.
    f_classify : Callable applied to the converted sentence; its result must
                 expose `ndim` and is handed to dp_viterbi_parse.
    valid_y    : Iterable of true parses, aligned with valid_lex.

    Returns
    -------
    A dictionary with the keys 'f1', 'p' and 'r'. All three hold the same
    value: 100 * (correct elements) / (total true elements). When there is
    nothing to score (no sentences, or only empty true parses) the value is
    0.0 instead of a ZeroDivisionError.
    '''
    def evaluate_predicted_parse(viterbi_parse, true_parse):
        ' Return (count of predicted elements found in true_parse, len(true_parse)) '
        return (len([e for e in viterbi_parse if e in true_parse]),
                len(true_parse))

    counts = []
    for sentence, true_parse in zip(valid_lex, valid_y):
        arc_scores = f_classify(
            util_lstm_seqlabel.conv_x(sentence, args.win, args.vocsize))
        if arc_scores.ndim == 2:
            # Promote a 2-D score matrix to 3-D by appending a trailing axis.
            arc_scores = numpy.expand_dims(arc_scores, 2)
        # Only the decoded parse is needed; the score is discarded.
        _, viterbi_parse = dp_viterbi_parse(arc_scores)
        if not __debug__:
            # These diagnostics are printed only when python runs with -O.
            util_lstm_seqlabel.print_domination(arc_scores)
            # A single pre-formatted argument prints the same text under
            # both the print statement and the print function.
            print(' VALIDATION: parents %s'
                  % ([e[0] for e in viterbi_parse],))
            print(' TRUE: parents %s' % ([e[0] for e in true_parse],))
        counts.append(evaluate_predicted_parse(viterbi_parse, true_parse))
    correct_arcs = sum(e[0] for e in counts)
    total_arcs = sum(e[1] for e in counts)
    if total_arcs:
        tmp = float(correct_arcs) / total_arcs * 100
    else:
        # Nothing to score: report zero rather than dividing by zero.
        tmp = 0.0
    results = {}
    for e in ['f1', 'p', 'r']:
        results[e] = tmp
    return results
def validate_predictions_seq(
        test_lex, idx2label, args, f_classify, groundtruth_valid,
        words_valid, fn='/current.valid.txt', conv_x_to_batch=True):
    ''' Predict labels for test_lex with f_classify and score them.

    Sentences are processed in the order given by
    util_lstm_seqlabel.get_shuffling_index_sorted_by_length(shuffle=False),
    in batches of at most 100 indices, and the predictions are written back
    to the position of the sentence in test_lex.

    Params
    ------
    test_lex          : Indexable collection of sentences.
    idx2label         : Passed to util_lstm_seqlabel.convert_id_to_word;
                        must be None when args.folder is None.
    args              : Options object; reads win, vocsize, verbose, folder.
    f_classify        : Prediction function given to tolerant_y_predictor.
    groundtruth_valid : True labels, aligned with test_lex.
    words_valid       : Passed to conlleval; must be None when args.folder
                        is None.
    fn                : Appended to args.folder to name the evaluation file.
    conv_x_to_batch   : When true each sentence is passed through
                        util_lstm_seqlabel.conv_x before batching.

    Returns
    -------
    If args.folder is None: a float, the fraction (not a percentage) of
    positions at which the prediction equals the groundtruth.
    Otherwise: whatever the conlleval function returns -- according to the
    original documentation a dictionary with
    f1 : F1 or Accuracy
    p : Precision
    r : Recall
    '''
    batch_size = 100
    shuf_idx_lst = util_lstm_seqlabel.get_shuffling_index_sorted_by_length(
        test_lex, shuffle=False)
    total_tokens = sum(len(e) for e in test_lex)
    tokens_done = 0
    conlleval = get_conlleval_for_task(args)
    y_pred_list = [None] * len(test_lex)
    tic = time.time()
    for expected_len, idx_list in shuf_idx_lst:
        for i, idx_batch in enumerate(rasengan.batch_list(idx_list, batch_size)):
            x_batch = [test_lex[e] for e in idx_batch]
            if conv_x_to_batch:
                words = numpy.array(
                    [util_lstm_seqlabel.conv_x(sentence, args.win, args.vocsize)
                     for sentence in x_batch])
            else:
                words = numpy.array(x_batch)
            y_pred = tolerant_y_predictor(
                args, f_classify, words, expected_len)
            # Scatter the batch predictions back to their original positions.
            for i_y_pred, idx in enumerate(idx_batch):
                y_pred_list[idx] = y_pred[i_y_pred]
            # Progress is accumulated for every batch; previously it was
            # only updated on the batches that were reported, which made
            # the printed percentage too small.
            tokens_done += expected_len * len(idx_batch)
            if args.verbose == 2 and i % 100 == 0:
                percentage_complete = float(tokens_done) / total_tokens * 100
                util_lstm_seqlabel.print_progress(
                    percentage_complete, tic)
    if args.folder is None:
        assert idx2label is None
        assert words_valid is None
        # (number of matching positions, number of positions) per sentence.
        results_arr = [((p == g).sum(), p.shape[0])
                       for (p, g)
                       in zip(y_pred_list, groundtruth_valid)]
        results = (float(sum(e[0] for e in results_arr))
                   / sum(e[1] for e in results_arr))
    else:
        predictions_valid = util_lstm_seqlabel.convert_id_to_word(
            y_pred_list, idx2label)
        results = conlleval(
            predictions_valid, groundtruth_valid, words_valid,
            args.folder + fn, args.folder)
    return results
def numerize(lst, Sigma, win):
    ''' Intern string-valued training pairs as int32 index arrays.

    Params
    ------
    lst   : Iterable of (one, two) pairs of symbol sequences.
    Sigma : Mapping from symbol to integer index.
    win   : Second argument given to util_lstm_seqlabel.conv_x.

    Returns
    -------
    A list with one (one_prime, two_prime) tuple per input pair. one_prime
    is the int32 array built from conv_x applied to the indices of `one`,
    with len(Sigma) as third argument (presumably the index used for
    positions outside the sentence -- confirm against conv_x). two_prime is
    the int32 array of the indices of `two`.
    '''
    boundary_idx = len(Sigma)

    def _lookup(symbols):
        ' Map every symbol to its index in Sigma. '
        return [Sigma[symbol] for symbol in symbols]

    return [
        (numpy.asarray(
            util_lstm_seqlabel.conv_x(_lookup(one), win, boundary_idx),
            dtype=numpy.int32),
         numpy.asarray(_lookup(two), dtype=numpy.int32))
        for one, two in lst
    ]
def numerize(lst, Sigma, win):
    ''' Turn string-valued (input, output) pairs into int32 index arrays.

    Each symbol is replaced by Sigma[symbol]. The input side is additionally
    passed through util_lstm_seqlabel.conv_x together with `win` and
    len(Sigma) (presumably the index used outside the sentence boundaries
    -- confirm against conv_x).

    Returns a list of (input_array, output_array) tuples, one per pair in
    lst and in the same order.
    '''
    n_symbols = len(Sigma)
    interned = []
    for source, target in lst:
        source_ids = [Sigma[sym] for sym in source]
        windowed = util_lstm_seqlabel.conv_x(source_ids, win, n_symbols)
        source_arr = numpy.asarray(windowed, dtype=numpy.int32)
        target_ids = [Sigma[sym] for sym in target]
        target_arr = numpy.asarray(target_ids, dtype=numpy.int32)
        interned.append((source_arr, target_arr))
    return interned
Beispiel #7
0
def train_seq(train_lex, train_y, args, ttns, training_stats):
    ''' Run one training epoch, updating the parameters batch by batch.

    This function is called from the main method and it is primarily
    responsible for updating the parameters. Because of the way that
    create_circuit works that creates f_cost, f_update etc. this function
    needs to be flexible and can't be put in a lib.
    Look at lstm_dependency_parsing_simplification.py for more pointers.

    Params
    ------
    train_lex      : Indexable collection of training sentences.
    train_y        : Indexable collection of label sequences, aligned with
                     train_lex.
    args           : Options object; reads batch_size, win, vocsize,
                     batch_input, describe_training and verbose.
    ttns           : Object providing train_f_update(rate, words, labels)
                     and train_stack_config.differentiable_parameters().
    training_stats : Dictionary; 'epoch_id' and 'clr' are read and the
                     summed cost of this epoch is appended to the list
                     stored under 'epoch_cost'.

    Returns
    -------
    None. The results are the parameter updates performed through ttns, the
    entry appended to training_stats['epoch_cost'] and the printed summary.

    Raises
    ------
    NotImplementedError or ValueError raised by a batched update are
    re-raised unchanged unless the batch holds sentences of length 1.
    '''
    batch_size = args.batch_size
    epoch_id = training_stats['epoch_id']
    shuf_idx_lst = util_lstm_seqlabel.get_shuffling_index_sorted_by_length(
        train_lex)
    total_tokens = sum(e * len(v) for (e, v) in shuf_idx_lst)
    tokens_done = 0
    tic = time.time()
    epoch_cost = 0
    for expected_len, idx_list in shuf_idx_lst:
        if expected_len == 0:
            continue
        for i, idx_batch in enumerate(rasengan.batch_list(
                idx_list, batch_size)):
            x_batch = [train_lex[e] for e in idx_batch]
            y_batch = [train_y[e] for e in idx_batch]
            words = numpy.array(
                [util_lstm_seqlabel.conv_x(sentence, args.win, args.vocsize)
                 for sentence in x_batch],
                dtype=numpy.int32)
            labels = numpy.array(
                [util_lstm_seqlabel.conv_y(sentence) for sentence in y_batch],
                dtype=numpy.int32)
            # Words should be a bunch of 3D tensors of shape
            # n_sentences, n_tokens, window
            if not args.batch_input:
                # One update per sentence.
                for (ww, ll) in zip(words, labels):
                    epoch_cost += ttns.train_f_update(training_stats['clr'],
                                                      ww, ll)
            else:
                try:
                    # One update for the whole batch; the rate is scaled by
                    # the number of sentences in the batch.
                    epoch_cost += ttns.train_f_update(
                        training_stats['clr'] * len(idx_batch), words, labels)
                    for p in (ttns.train_stack_config
                              .differentiable_parameters()):
                        rasengan.validate_np_array(
                            p.get_value(),
                            name=p.name,
                            describe=args.describe_training)
                except (NotImplementedError, ValueError):
                    if expected_len == 1:
                        # Desperate hack to overcome theano frustrations.
                        # Theano does not make it easy to handle
                        # corner cases. One such corner case is when
                        # the scan code receives zero length sequences
                        # which happens when the sentence has length 1
                        # because then the score of an order 1 model
                        # does not require a recursion. In any case to
                        # actually provide a prediction in this edge
                        # case we can just duplicate the sentence once in
                        # the right dimension and go on.
                        # NOTE(review): the cost of this fallback update is
                        # not added to epoch_cost -- confirm this is wanted.
                        ttns.train_f_update(
                            training_stats['clr'],
                            util_lstm_seqlabel.duplicate_middle_word(words),
                            util_lstm_seqlabel.duplicate_label(labels))
                    else:
                        # Re-raise the exception being handled, keeping its
                        # value and traceback. Raising a tuple built from
                        # sys.exc_info() is a TypeError on Python 3.
                        raise

            tokens_done += expected_len * len(idx_batch)
            percentage_complete = float(tokens_done) / total_tokens * 100
            if args.verbose >= 3 and i % 10 == 0:
                util_lstm_seqlabel.print_progress(percentage_complete,
                                                  tic,
                                                  epoch_id=epoch_id)
    training_stats['epoch_cost'].append(epoch_cost)
    # Blank separator line; a bare `print` expression prints nothing on
    # Python 3.
    print('')
    print('>> Epoch completed in %.2f (sec) <<' % (time.time() - tic))
    print('>> Epoch Cost, ', epoch_cost, '<<')
    return
def train_seq(train_lex, train_y, args, ttns, training_stats):
    ''' Run one training epoch, updating the parameters batch by batch.

    This function is called from the main method and it is primarily
    responsible for updating the parameters. Because of the way that
    create_circuit works that creates f_cost, f_update etc. this function
    needs to be flexible and can't be put in a lib.
    Look at lstm_dependency_parsing_simplification.py for more pointers.

    Params
    ------
    train_lex      : Indexable collection of training sentences.
    train_y        : Indexable collection of label sequences, aligned with
                     train_lex.
    args           : Options object; reads batch_size, win, vocsize,
                     batch_input, describe_training and verbose.
    ttns           : Object providing train_f_update(rate, words, labels)
                     and train_stack_config.differentiable_parameters().
    training_stats : Dictionary; 'epoch_id' and 'clr' are read and the
                     summed cost of this epoch is appended to the list
                     stored under 'epoch_cost'.

    Returns
    -------
    None. The results are the parameter updates performed through ttns, the
    entry appended to training_stats['epoch_cost'] and the printed summary.

    Raises
    ------
    NotImplementedError or ValueError raised by a batched update are
    re-raised unchanged unless the batch holds sentences of length 1.
    '''
    batch_size = args.batch_size
    epoch_id = training_stats['epoch_id']
    shuf_idx_lst = util_lstm_seqlabel.get_shuffling_index_sorted_by_length(
        train_lex)
    total_tokens = sum(e * len(v) for (e, v) in shuf_idx_lst)
    tokens_done = 0
    tic = time.time()
    epoch_cost = 0
    for expected_len, idx_list in shuf_idx_lst:
        if expected_len == 0:
            continue
        for i, idx_batch in enumerate(rasengan.batch_list(idx_list, batch_size)):
            x_batch = [train_lex[e] for e in idx_batch]
            y_batch = [train_y[e] for e in idx_batch]
            words = numpy.array(
                [util_lstm_seqlabel.conv_x(sentence, args.win, args.vocsize)
                 for sentence in x_batch],
                dtype=numpy.int32)
            labels = numpy.array(
                [util_lstm_seqlabel.conv_y(sentence)
                 for sentence in y_batch],
                dtype=numpy.int32)
            # Words should be a bunch of 3D tensors of shape
            # n_sentences, n_tokens, window
            if not args.batch_input:
                # One update per sentence.
                for (ww, ll) in zip(words, labels):
                    epoch_cost += ttns.train_f_update(
                        training_stats['clr'], ww, ll)
            else:
                try:
                    # One update for the whole batch; the rate is scaled by
                    # the number of sentences in the batch.
                    epoch_cost += ttns.train_f_update(
                        training_stats['clr'] * len(idx_batch), words, labels)
                    for p in ttns.train_stack_config.differentiable_parameters():
                        rasengan.validate_np_array(
                            p.get_value(), name=p.name,
                            describe=args.describe_training)
                except (NotImplementedError, ValueError):
                    if expected_len == 1:
                        # Desperate hack to overcome theano frustrations.
                        # Theano does not make it easy to handle
                        # corner cases. One such corner case is when
                        # the scan code receives zero length sequences
                        # which happens when the sentence has length 1
                        # because then the score of an order 1 model
                        # does not require a recursion. In any case to
                        # actually provide a prediction in this edge
                        # case we can just duplicate the sentence once in
                        # the right dimension and go on.
                        # NOTE(review): the cost of this fallback update is
                        # not added to epoch_cost -- confirm this is wanted.
                        ttns.train_f_update(
                            training_stats['clr'],
                            util_lstm_seqlabel.duplicate_middle_word(words),
                            util_lstm_seqlabel.duplicate_label(labels))
                    else:
                        # A bare raise re-raises the exception being handled
                        # with its original traceback; the three-expression
                        # raise statement does not exist on Python 3.
                        raise

            tokens_done += expected_len * len(idx_batch)
            percentage_complete = float(tokens_done) / total_tokens * 100
            if args.verbose >= 3 and i % 10 == 0:
                util_lstm_seqlabel.print_progress(
                    percentage_complete, tic, epoch_id=epoch_id)
    training_stats['epoch_cost'].append(epoch_cost)
    # Every print below receives exactly one pre-formatted string so that
    # the output is the same under the print statement and the print
    # function.
    print('')
    print('>> Epoch completed in %.2f (sec) <<' % (time.time() - tic))
    print('>> Epoch Cost,  %s <<' % (epoch_cost,))
    return