Beispiel #1
0
def predict(_args, lex_test, idxs_test, f_classify, groundtruth_test, batchsize=1, graph=False, dep=None, weighted=False, print_prediction=False, prediction_file=None):
    ''' Predict labels for the test set with f_classify and score them.

    _args            : unused by this function; kept for interface
                       compatibility with callers.
    lex_test         : per-sentence inputs, indexed in parallel with idxs_test.
    idxs_test        : per-sentence index information passed on to f_classify.
    f_classify       : callable producing the model output.
    groundtruth_test : reference labels handed to eval_logitReg_accuracy.
    batchsize        : > 1 selects the mini-batch path, anything else the
                       one-sentence-at-a-time path.
    graph            : when true, dep supplies a mask per sentence.
    dep              : per-sentence masks; must not be None when graph is set.
    weighted         : when false in graph mode, the mask is reduced with
                       .sum(axis=-1) before being passed to f_classify.
    print_prediction : when true, prediction_file is opened for writing.
    prediction_file  : output path; must not be None when print_prediction
                       is set.

    Returns (results, predictions_test) where results is whatever
    eval_logitReg_accuracy returns.

    NOTE(review): only the mini-batch path takes the argmax of the model
    output and writes lines to prediction_file; the one-sentence path appends
    the raw f_classify result and leaves the file empty. This mirrors the
    original behaviour -- confirm it is intended.
    '''
    predictions_test = []
    pred_file = None
    if print_prediction:
        assert prediction_file is not None
        pred_file = open(prediction_file, 'w')
    # try/finally guarantees the prediction file is closed even when the
    # model or the data preparation raises half-way through.
    try:
        if batchsize > 1:
            nb_idxs = get_minibatches_idx(len(lex_test), batchsize, shuffle=False)
            for tr_idxs in nb_idxs:
                words = [lex_test[ii] for ii in tr_idxs]
                eidxs = [idxs_test[ii] for ii in tr_idxs]
                if graph:
                    assert dep is not None
                    masks = [dep[ii] for ii in tr_idxs]
                else:
                    masks = None
                x, masks, eidxs = prepare_data(words, eidxs, masks, maxlen=200)
                # Unweighted graph mode collapses the last mask axis first.
                if not (weighted or not graph):
                    masks = masks.sum(axis=-1)
                pred_all = f_classify(x, masks, *eidxs)
                predictions_test.extend(list(numpy.argmax(pred_all, axis=1)))
                if pred_file is not None:
                    # One line per sample: index, tab, second column of the output.
                    for idx, p in zip(tr_idxs, pred_all):
                        pred_file.write(str(idx) + '\t' + str(p[1]) + '\n')
        else:
            for i, (word, idxs) in enumerate(zip(lex_test, idxs_test)):
                idxs = conv_idxs(idxs, len(word))
                if graph:
                    assert dep is not None
                    dep_mask = dep[i] if weighted else dep[i].sum(axis=-1)
                    predictions_test.append(f_classify(word, dep_mask, *idxs))
                else:
                    predictions_test.append(f_classify(word, *idxs))
        print('in predict, %d %d' % (len(predictions_test), len(groundtruth_test)))
    finally:
        if pred_file is not None:
            pred_file.close()
    results = eval_logitReg_accuracy(predictions_test, groundtruth_test)
    return results, predictions_test
Beispiel #2
0
def train_single(train_lex,
                 train_idxs,
                 train_y,
                 _args,
                 f_cost,
                 f_update,
                 epoch_id,
                 learning_rate,
                 nsentences,
                 batchsize=1,
                 dep=None,
                 weighted=False):
    ''' Run one training epoch, updating the parameters through f_cost / f_update.

    This function is called from the main method. Because of the way that
    create_relation_circuit creates f_cost, f_update etc. this function needs
    to be flexible and can't be put in a lib.
    Look at lstm_dependency_parsing_simplification.py for more pointers.

    train_lex, train_idxs, train_y : parallel per-sentence inputs, index
                    information and labels. They are passed to shuffle()
                    together with the generated sample weights (and dep).
    _args         : options object; seed, graph and verbose are read.
    f_cost        : callable evaluated on the assembled inputs; its result is
                    accumulated as the cost.
    f_update      : callable invoked with learning_rate after each f_cost call.
    epoch_id      : only used in the progress output.
    learning_rate : forwarded to f_update.
    nsentences    : at most this many sentences are used after shuffling.
    batchsize     : 1 selects the per-sentence path, anything else the
                    mini-batch path.
    dep           : optional per-sentence masks, required when _args.graph.
    weighted      : when false, masks are reduced with .sum(axis=-1) before
                    being passed to f_cost (see the branches below).

    Returns None; progress is printed when _args.verbose == 2.
    '''

    def run_step(words, idxs, tail):
        ' One cost evaluation plus one update; inputs with fewer than 2 rows are skipped.'
        if words.shape[0] < 2:
            return 0.0
        iter_cost = f_cost(*(idxs + [words] + tail))
        f_update(learning_rate)
        return iter_cost

    def report_progress(done, total, started, cost):
        ' Rewrite the progress line in place (the text ends with a carriage return).'
        # A single write emits the same text as the two comma-terminated
        # print statements used previously.
        sys.stdout.write(
            '[learning] epoch %i >> %2.2f%% completed in %.2f (sec). << avg loss: %.2f <<\r'
            % (epoch_id, done * 100. / total, time.time() - started, cost / done))
        sys.stdout.flush()

    ## main body of train

    # Balance the two classes: every sample gets 0.5 * N / (size of its class).
    # A label counts as positive when y[0] == 0 and y[1] == 1.
    is_positive = [bool(y[0] == 0 and y[1] == 1) for y in train_y]
    total_pos = sum(1 for flag in is_positive if flag)
    total_neg = len(is_positive) - total_pos

    print("total pos: %d neg:%d \n" % (total_pos, total_neg))

    total = total_neg + total_pos
    # Each weight is only used when its class is non-empty, so the guards
    # merely avoid dividing by zero while precomputing.
    pos_weight = 0.5 * total / total_pos if total_pos else 0.0
    neg_weight = 0.5 * total / total_neg if total_neg else 0.0
    sample_weights = [pos_weight if flag else neg_weight
                      for flag in is_positive]

    if dep:
        shuffle([train_lex, train_idxs, train_y, sample_weights, dep],
                _args.seed)
    else:
        shuffle([train_lex, train_idxs, train_y, sample_weights], _args.seed)

    # dep is not truncated: it is only ever indexed with positions that are
    # valid for the truncated lists.
    if nsentences < len(train_lex):
        train_lex = train_lex[:nsentences]
        train_idxs = train_idxs[:nsentences]
        train_y = train_y[:nsentences]
        sample_weights = sample_weights[:nsentences]

    tic = time.time()
    aggregate_cost = 0.0

    if batchsize == 1:
        # No mini-batch: one sentence per update.
        # Progress is measured against the sentences actually available, so
        # the percentage stays meaningful when nsentences exceeds the data.
        n_total = len(train_lex)
        for i, (words, idxs, label, weight) in enumerate(
                zip(train_lex, train_idxs, train_y, sample_weights)):
            if len(words) < 2:
                continue
            idxs = conv_idxs(idxs, len(words))
            if _args.graph:
                assert dep is not None
                dep_mask = dep[i] if weighted else dep[i].sum(axis=-1)
                aggregate_cost += run_step(words, idxs,
                                           [dep_mask, weight, label])
            else:
                aggregate_cost += run_step(words, idxs, [weight, label])
            if _args.verbose == 2 and i % 10 == 0:
                report_progress(i + 1, n_total, tic, aggregate_cost)
    else:
        # Mini-batch
        nb_idxs = get_minibatches_idx(len(train_lex), batchsize, shuffle=False)
        nbatches = len(nb_idxs)
        for i, tr_idxs in enumerate(nb_idxs):
            words = [train_lex[ii] for ii in tr_idxs]
            eidxs = [train_idxs[ii] for ii in tr_idxs]
            labels = [train_y[ii] for ii in tr_idxs]
            weights = [sample_weights[ii] for ii in tr_idxs]

            if _args.graph:
                assert dep is not None
                masks = [dep[ii] for ii in tr_idxs]
            else:
                masks = None
            x, masks, eidxs, weight = prepare_data(words,
                                                   eidxs,
                                                   masks,
                                                   weights,
                                                   maxlen=200)

            # NOTE(review): this tests `dep is None` whereas predict() tests
            # `not graph`; the two differ when dep is supplied but
            # _args.graph is false -- confirm which one is intended.
            if not (weighted or dep is None):
                masks = masks.sum(axis=-1)
            aggregate_cost += run_step(x, eidxs, [masks, weight, labels])
            if _args.verbose == 2:
                report_progress(i + 1, nbatches, tic, aggregate_cost)
    if _args.verbose == 2:
        print('\n>> Epoch completed in %.2f (sec) << training cost: %.2f' % (
            time.time() - tic, aggregate_cost))
def predict(_args, f_classify, *data, **kwargs): #batchsize=1, graph=False, dep=None, weighted=False, print_prediction=False, prediction_file=None):
    ''' Predict labels for the test set with f_classify and score them.

    _args      : options object; graph and idx2label are read.
    f_classify : callable producing the model output for one mini-batch.
    *data      : parallel per-sample sequences. data[0] and data[1] are handed
                 to prepare_data, data[-1] is used as the ground truth.
    **kwargs   : batchsize (default 1), dep (default None), weighted
                 (default False), print_prediction (default False) and
                 prediction_file (default None). Other keys are ignored.

    Only batchsize > 1 is supported; otherwise an error message is printed
    and no predictions are produced.

    Predicted label indices are mapped through _args.idx2label and cut at the
    first '('. The ground truth gets the same mapping only when its first
    entry equals 0 or 1.

    Returns (results, predictions_test) where results is whatever
    eval_logitReg_accuracy returns.
    '''
    batchsize = kwargs.pop('batchsize', 1)
    dep = kwargs.pop('dep', None)
    weighted = kwargs.pop('weighted', False)
    print_prediction = kwargs.pop('print_prediction', False)
    prediction_file = kwargs.pop('prediction_file', None)
    groundtruth_test = data[-1]

    predictions_test = []
    pred_file = None
    if print_prediction:
        assert prediction_file is not None
        pred_file = open(prediction_file, 'w')
    # try/finally guarantees the prediction file is closed even when the
    # model or the data preparation raises half-way through.
    try:
        if batchsize > 1:
            nb_idxs = get_minibatches_idx(len(data[0]), batchsize, shuffle=False)
            for tr_idxs in nb_idxs:
                # Only the first two data sequences are needed for prediction.
                batch_x = [data[0][ii] for ii in tr_idxs]
                batch_obj = [data[1][ii] for ii in tr_idxs]
                if _args.graph:
                    assert dep is not None
                    masks = [dep[ii] for ii in tr_idxs]
                else:
                    masks = None
                x, x_masks, obj, obj_masks = prepare_data(batch_x, batch_obj, masks, None, maxlen=200)
                if weighted or not _args.graph:
                    pred_all = f_classify(x, obj, x_masks, obj_masks)
                else:
                    # NOTE(review): masks is still the plain Python list built
                    # above (prepare_data's result is bound to other names),
                    # so .sum raises AttributeError here. The intended operand
                    # cannot be determined from this function -- confirm.
                    pred_all = f_classify(x, obj, masks.sum(axis=-1))
                predictions_test.extend(list(numpy.argmax(pred_all, axis=1)))
                if pred_file is not None:
                    # One line per sample: index, tab, second column of the output.
                    for idx, p in zip(tr_idxs, pred_all):
                        pred_file.write(str(idx) + '\t' + str(p[1]) + '\n')
        else:
            print("ERRRRRRRRRRRO")
        print('in predict, %d %d' % (len(predictions_test), len(groundtruth_test)))
    finally:
        if pred_file is not None:
            pred_file.close()
    predictions_test = [_args.idx2label[k].split('(')[0] for k in predictions_test]

    # The length guard avoids an IndexError on an empty ground truth.
    if len(groundtruth_test) > 0 and (groundtruth_test[0] == 0 or groundtruth_test[0] == 1):
        groundtruth_test = [_args.idx2label[k].split('(')[0] for k in groundtruth_test]

    results = eval_logitReg_accuracy(predictions_test, groundtruth_test)
    print('Results: %s' % (results,))
    return results, predictions_test
def train_single(_args, f_cost, f_update, epoch_id, learning_rate, nsentences, *data, **kwargs): #train_lex, train_idxs, train_y, batchsize=1, dep=None, weighted=False):
    ''' Run one training epoch, updating the parameters through f_cost / f_update.

    This function is called from the main method. Because of the way that
    create_relation_circuit creates f_cost, f_update etc. this function needs
    to be flexible and can't be put in a lib.
    Look at lstm_dependency_parsing_simplification.py for more pointers.

    _args         : options object; seed, graph and verbose are read.
    f_cost        : callable evaluated on the assembled batch; its result is
                    accumulated as the cost.
    f_update      : callable invoked with learning_rate after each f_cost call.
    epoch_id      : only used in the progress output.
    learning_rate : forwarded to f_update.
    nsentences    : at most this many samples are used after shuffling.
    *data         : parallel per-sample sequences. data[0] and data[1] are
                    handed to prepare_data, data[-1] is passed to f_cost as
                    the labels.
    **kwargs      : batchsize (default 1), dep (default None) and weighted
                    (default False). Other keys are ignored.

    Only mini-batch training is implemented; batchsize == 1 prints an error
    message and performs no update. Returns None.
    '''
    batchsize = kwargs.pop('batchsize', 1)
    dep = kwargs.pop('dep', None)
    weighted = kwargs.pop('weighted', False)

    def train_instance(*inputs):
        ' One cost evaluation plus one update; inputs with fewer than 2 rows are skipped.'
        if inputs[0].shape[0] < 2:
            return 0.0
        iter_cost = f_cost(*inputs)
        f_update(learning_rate)
        return iter_cost

    def report_progress(done, total, started, cost):
        ' Rewrite the progress line in place (the text ends with a carriage return).'
        # A single write emits the same text as the two comma-terminated
        # print statements used previously.
        sys.stdout.write(
            '[learning] epoch %i >> %2.2f%% completed in %.2f (sec). << avg loss: %.2f <<\r'
            % (epoch_id, done * 100. / total, time.time() - started, cost / done))
        sys.stdout.flush()

    ## main body of train
    data = list(data)
    if dep:
        shuffle(data + [dep], _args.seed)
    else:
        shuffle(data, _args.seed)
    # dep is not truncated: it is only ever indexed with positions that are
    # valid for the truncated sequences.
    if nsentences < len(data[0]):
        data = [elem[:nsentences] for elem in data]
    tic = time.time()
    aggregate_cost = 0.0

    if batchsize == 1:
        print("Error: batch size cannot be 1")
    else:
        nb_idxs = get_minibatches_idx(len(data[0]), batchsize, shuffle=False)
        nbatches = len(nb_idxs)
        for i, tr_idxs in enumerate(nb_idxs):
            batch_data = [[elem[ii] for ii in tr_idxs] for elem in data]
            if _args.graph:
                assert dep is not None
                masks = [dep[ii] for ii in tr_idxs]
            else:
                masks = None
            x, x_masks, obj, obj_masks = prepare_data(batch_data[0], batch_data[1], masks, None, maxlen=200)
            if weighted or dep is None:
                aggregate_cost += train_instance(x, obj, batch_data[-1], x_masks, obj_masks)
            else:
                # Fixed: this previously referenced the undefined name
                # `batch_`, which raised NameError.
                # NOTE(review): masks is still the plain Python list (or None)
                # built above, so .sum raises AttributeError here. The
                # intended operand cannot be determined from this function --
                # confirm.
                aggregate_cost += train_instance(x, obj, batch_data[-1], masks.sum(axis=-1))
            if _args.verbose == 2:
                report_progress(i + 1, nbatches, tic, aggregate_cost)
    if _args.verbose == 2:
        print('\n>> Epoch completed in %.2f (sec) << training cost: %.2f' % (
            time.time() - tic, aggregate_cost))