def train(self, x, y, window_size, learning_rate):
        # Build a context window around every word and cast to int32 for
        # the embedding lookup.
        cwords = contextwin(x, window_size)
        words = [numpy.asarray(w).astype('int32') for w in cwords]
        labels = y

        self.sentence_train(words, labels, learning_rate)
        if self.normal:
            self.normalize()
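The contextwin helper used by these snippets is defined elsewhere in the project. A minimal sketch of the classic version, assuming an odd window size and -1 as the padding index (both assumptions; the later snippets pass explicit left/right padding instead):

def contextwin(sentence, window_size, pad=-1):
    # Pad the sentence on both sides, then slide a fixed-size window so
    # every word index is surrounded by its neighbours.
    assert window_size % 2 == 1 and window_size >= 1
    half = window_size // 2
    padded = half * [pad] + list(sentence) + half * [pad]
    return [padded[i:i + window_size] for i in range(len(sentence))]

# contextwin([0, 1, 2, 3], 3)
# -> [[-1, 0, 1], [0, 1, 2], [1, 2, 3], [2, 3, -1]]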
Example #3
    #reader = Reader(md)

    with open(os.path.join(directory_model, 'reader.pkl'), 'rb') as f:
        reader = pickle.load(f)

    num_tags = len(reader.tag_dict)
    num_words = len(reader.word_dict)
    model = JordanRnn(args.hidden, num_tags, num_words, args.num_features, args.window)
    print('... loading models')
    model.load(directory_model)

    print('>>> READY')
    while True:
        sent = input()
        coded = reader.codify_string(sent)
        framed = numpy.asarray(
            utils.contextwin(coded, args.window,
                             reader.get_padding_left(),
                             reader.get_padding_right()),
            dtype=numpy.int32)
        coded_tags = model.classify(framed)
        tags = [reader.reverse_tag_dict[t] for t in coded_tags]

        print('[INPUT] ' + str(sent))
        print('[CODED] ' + str(coded))
        print('[ TAG ] ' + str(coded_tags))
        print('[UNTAG] ' + str(tags))
        print()


Example #4
def test_lstm(**kwargs):
    """
    Wrapper function for training and testing LSTM

    Note: the ATIS fold is hardcoded to 3 in this wrapper and is not an
    accepted keyword argument.

    :type lr: float
    :param lr: learning rate used (factor for the stochastic gradient).

    :type nepochs: int
    :param nepochs: maximal number of epochs to run the optimizer.

    :type win: int
    :param win: number of words in the context window.

    :type nhidden: int
    :param nhidden: number of hidden units.

    :type emb_dimension: int
    :param emb_dimension: dimension of word embedding.

    :type verbose: boolean
    :param verbose: whether to print an epoch summary.

    :type decay: boolean
    :param decay: decay the learning rate when improvement stops.

    :type savemodel: boolean
    :param savemodel: whether to save the trained model.

    :type normal: boolean
    :param normal: whether to normalize word embeddings after each update.

    :type folder: string
    :param folder: path to the folder where results will be stored.

    """
    # process input arguments
    param = {
        'experiment': 'standard',
        'lr': 0.1,
        'verbose': True,
        'decay': True,
        'win': 3,
        'nhidden': 300,
        'nhidden2': 300,
        'seed': 345,
        'emb_dimension': 90,
        'nepochs': 40,
        'savemodel': False,
        'normal': True,
        'layer_norm': False,
        'minibatch_size': 4978,
        'folder': '../result'
    }

    param_diff = set(kwargs.keys()) - set(param.keys())
    if param_diff:
        raise KeyError("invalid arguments: " + str(tuple(param_diff)))
    param.update(kwargs)

    if param['verbose']:
        for k, v in param.items():
            print("%s: %s" % (k, v))

    # create result folder if not exists
    check_dir(param['folder'])

    # load the dataset (ATIS fold 3 is hardcoded here)
    print('... loading the dataset')
    train_set, valid_set, test_set, dic = load_data(3)

    train_set = list(train_set)
    valid_set = list(valid_set)

    # Add validation set to train set
    for i in range(3):
        train_set[i] += valid_set[i]

    # create mapping from index to label, and index to word
    idx2label = dict((k, v) for v, k in dic['labels2idx'].items())
    idx2word = dict((k, v) for v, k in dic['words2idx'].items())

    # unpack dataset
    train_lex, train_ne, train_y = train_set
    test_lex, test_ne, test_y = test_set

    n_trainbatches = len(train_lex) // param['minibatch_size']

    print("Sentences in train: %d, Words in train: %d" %
          (count_of_words_and_sentences(train_lex)))
    print("Sentences in test: %d, Words in test: %d" %
          (count_of_words_and_sentences(test_lex)))

    vocsize = len(dic['words2idx'])
    nclasses = len(dic['labels2idx'])
    nsentences = len(train_lex)

    groundtruth_test = [[idx2label[x] for x in y] for y in test_y]
    words_test = [[idx2word[x] for x in w] for w in test_lex]

    # instantiate the model
    numpy.random.seed(param['seed'])
    random.seed(param['seed'])

    print('... building the model')
    lstm = LSTM(n_hidden=param['nhidden'],
                n_hidden2=param['nhidden2'],
                n_out=nclasses,
                n_emb=vocsize,
                dim_emb=param['emb_dimension'],
                cwind_size=param['win'],
                normal=param['normal'],
                layer_norm=param['layer_norm'],
                experiment=param['experiment'])

    # train with early stopping on validation set
    print('... training')
    best_f1 = -numpy.inf
    param['clr'] = param['lr']
    for e in range(param['nepochs']):

        # shuffle
        shuffle([train_lex, train_ne, train_y], param['seed'])

        param['ce'] = e
        tic = timeit.default_timer()

        for minibatch_index in range(n_trainbatches):

            for i in range(minibatch_index * param['minibatch_size'],
                           (1 + minibatch_index) * param['minibatch_size']):
                x = train_lex[i]
                y = train_y[i]
                res = lstm.train(x, y, param['win'], param['clr'])

            predictions_test = [[
                idx2label[x] for x in lstm.classify(
                    numpy.asarray(contextwin(x, param['win'])).astype('int32'))
            ] for x in test_lex]

            # evaluation: compute precision, recall and F1 with conlleval.pl
            res_test = conlleval(predictions_test, groundtruth_test,
                                 words_test,
                                 param['folder'] + '/current.test.txt',
                                 param['folder'])

            if res_test['f1'] > best_f1:

                if param['savemodel']:
                    lstm.save(param['folder'])

                best_lstm = copy.deepcopy(lstm)
                best_f1 = res_test['f1']

                if param['verbose']:
                    print(
                        'NEW BEST: epoch %d, minibatch %d/%d, best test F1: %.3f'
                        % (e, minibatch_index + 1, n_trainbatches,
                           res_test['f1']))

                param['tf1'] = res_test['f1']
                param['tp'] = res_test['p']
                param['tr'] = res_test['r']
                param['be'] = e

                os.rename(param['folder'] + '/current.test.txt',
                          param['folder'] + '/best.test.txt')
            else:
                if param['verbose']:
                    print('')

        # learning rate decay if no improvement in 10 epochs
        if param['decay'] and abs(param['be'] - param['ce']) >= 10:
            param['clr'] *= 0.5
            print("Decay happened. New Learning Rate:", param['clr'])
            lstm = best_lstm

        if param['clr'] < 0.00001:
            break

    print('BEST RESULT: epoch', param['be'], 'best test F1', param['tf1'],
          'with the model in', param['folder'])

    return lstm, dic
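A hedged usage sketch for the wrapper above; the overridden values are arbitrary, and any unknown keyword would raise the KeyError shown earlier:

lstm, dic = test_lstm(nepochs=20, nhidden=200, savemodel=True, verbose=False)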
Example #5
def test_lstm_parity(n_bit, fil):
    n_hidden = 1
    n_epochs = 1000
    # learning_rate = 0.15 works for 8-bit strings; 0.5 for 12-bit.
    learning_rate = 0.5
    n_win = 7
    verbose = True
    f = fil
    print('... loading the dataset')
    print('... loading the dataset', file=f)
    # generate datasets
    train_set_x, train_set_y = gen_parity_pair(n_bit, 1000)
    valid_set_x, valid_set_y = gen_parity_pair(n_bit, 500)
    test_set_x, test_set_y = gen_parity_pair(n_bit, 100)

    numpy.random.seed(100)
    #We need additional labels

    train_set_y = create_additioinal_label(train_set_x)

    valid_set_y = create_additioinal_label(valid_set_x)

    test_set_y = create_additioinal_label(test_set_x)

    n_out = 2

    print('... building the model')
    print('... building the model', file=f)
    lstm = LSTM(
        nh=n_hidden,
        nc=n_out,
        cs=n_win,
    )

    start_time = timeit.default_timer()
    # train with early stopping on validation set
    print('... training')
    print('... training', file=f)
    best_perform = numpy.inf

    for e in range(n_epochs):

        print('epoch: %d' % e)
        print('epoch: %d' % e, file=f)
        for i, (x, y) in enumerate(zip(train_set_x, train_set_y)):
            lstm.train(x, y, n_win, learning_rate)

        pred_train = numpy.asarray([
            lstm.classify(numpy.asarray(contextwin(x, n_win)).astype('int32'))
            for x in train_set_x
        ])
        pred_valid = numpy.asarray([
            lstm.classify(numpy.asarray(contextwin(x, n_win)).astype('int32'))
            for x in valid_set_x
        ])
        pred_test = numpy.asarray([
            lstm.classify(numpy.asarray(contextwin(x, n_win)).astype('int32'))
            for x in test_set_x
        ])

        # Mean squared error on the final (parity) bit
        res_train = numpy.mean(
            (train_set_y[:, n_bit - 1] - pred_train[:, n_bit - 1]) ** 2)
        res_valid = numpy.mean(
            (valid_set_y[:, n_bit - 1] - pred_valid[:, n_bit - 1]) ** 2)
        res_test = numpy.mean(
            (test_set_y[:, n_bit - 1] - pred_test[:, n_bit - 1]) ** 2)

        print('cost(mse): %f' % res_train)
        print('cost(mse): %f' % res_train, file=f)

        if res_valid < best_perform:

            best_perform = res_valid

            if verbose:
                msg = ('NEW BEST: epoch %i, valid error %.4f %%, '
                       'best test error %.4f %%'
                       % (e, res_valid * 100., res_test * 100.))
                print(msg)
                print(msg, file=f)
            valid_error, test_error = res_valid, res_test
            best_epoch = e
        else:
            print('')
            print('', file=f)

        # learning rate decay if no improvement in 10 epochs
        if abs(best_epoch - e) >= 10:
            learning_rate *= 0.5

        if learning_rate < 1e-5:
            break

    msg = ('BEST RESULT: epoch %i, valid error %.4f %%, best test error %.4f %%'
           % (best_epoch, valid_error * 100., test_error * 100.))
    print(msg)
    print(msg, file=f)
    end_time = timeit.default_timer()
    print(' ran for %.2fm' % ((end_time - start_time) / 60.))
    print(' ran for %.2fm' % ((end_time - start_time) / 60.), file=f)
    f.close()
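A hedged usage sketch (the file name is arbitrary); note that test_lstm_parity closes the log file itself:

log_file = open('parity_12bit.log', 'w')
test_lstm_parity(12, log_file)  # closes log_file on completion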
Example #6
    md = Metadata(args.filename)
    directory_model = 'bestModel'

    if args.load_reader:
        with open(os.path.join(directory_model, 'reader.pkl'), 'rb') as f:
            reader = pickle.load(f)
    else:
        reader = Reader(md)
        reader.save(directory_model)

    # Generate the training set
    num_sentences = len(reader.sentences)
    num_words = len(reader.word_dict)
    codified_sentences = [numpy.asarray(
            utils.contextwin([t.codified_word for t in s], args.window,
                             reader.get_padding_left(),
                             reader.get_padding_right()),
            dtype=numpy.int32)
            for s in reader.sentences]
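    # Each entry is then an int32 array of shape (len(s), args.window):
    # one context window per word (shape inferred, not verified here).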

    #print('codified_sentences', codified_sentences)
    #sentences_shared = theano.shared(codified_sentences)

    num_tags = len(reader.tag_dict)
    codified_tags = [numpy.asarray([t.codified_tag for t in s], dtype=numpy.int32)
                     for s in reader.sentences]

    #print('codified_tags', codified_tags)
    #tags_shared = theano.shared(codified_tags)

    model = JordanRnn(args.hidden, num_tags, num_words, args.num_features, args.window)
    print('... loading models')
    model.load(directory_model)
Example #7
    # Generate the training set
    num_sentences = len(reader.sentences)
    num_words = len(reader.word_dict)
    num_tags = len(reader.tag_dict)

    if args.validation_filename:
        valid_md = Metadata(args, args.validation_filename, args.fixed_embeddings or args.learn_embeddings)
        valid_reader = Reader(valid_md)
        valid_reader.word_dict = reader.word_dict
        valid_reader.tag_dict = reader.tag_dict
        valid_reader.codify_sentences()

    if args.fixed_embeddings:
        codified_sentences = [numpy.concatenate(numpy.asarray(
                utils.contextwin([reader.get_embedding(t.codified_word) for t in s],
                                 args.window,
                                 reader.get_padding_left(),
                                 reader.get_padding_right()),
                dtype=theano.config.floatX), axis=0)
                for s in reader.sentences]

        if args.validation_filename:
            codified_sentences_valid = [numpy.concatenate(numpy.asarray(
                    utils.contextwin([reader.get_embedding(t.codified_word) for t in s],
                                     args.window,
                                     reader.get_padding_left(),
                                     reader.get_padding_right()),
                    dtype=theano.config.floatX), axis=0)
                    for s in valid_reader.sentences]

    else:

        codified_sentences = [numpy.asarray(
                utils.contextwin([t.codified_word for t in s], args.window,